Diffstat (limited to 'src/gallium/auxiliary')
-rw-r--r--  src/gallium/auxiliary/cso_cache/cso_hash.c | 6
-rw-r--r--  src/gallium/auxiliary/cso_cache/cso_hash.h | 6
-rw-r--r--  src/gallium/auxiliary/draw/Makefile | 1
-rw-r--r--  src/gallium/auxiliary/draw/SConscript | 1
-rw-r--r--  src/gallium/auxiliary/draw/draw_private.h | 2
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs.c | 5
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs.h | 4
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_exec.c | 15
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_ppc.c | 239
-rw-r--r--  src/gallium/auxiliary/gallivm/gallivm_cpu.cpp | 11
-rw-r--r--  src/gallium/auxiliary/gallivm/gallivm_p.h | 4
-rw-r--r--  src/gallium/auxiliary/gallivm/instructionssoa.cpp | 245
-rw-r--r--  src/gallium/auxiliary/gallivm/instructionssoa.h | 1
-rw-r--r--  src/gallium/auxiliary/gallivm/storage.cpp | 2
-rw-r--r--  src/gallium/auxiliary/gallivm/storagesoa.cpp | 97
-rw-r--r--  src/gallium/auxiliary/gallivm/storagesoa.h | 17
-rw-r--r--  src/gallium/auxiliary/gallivm/tgsitollvm.cpp | 17
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_buffer.h | 15
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c | 3
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c | 12
-rw-r--r--  src/gallium/auxiliary/rtasm/Makefile | 1
-rw-r--r--  src/gallium/auxiliary/rtasm/SConscript | 1
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_execmem.c | 17
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc.c | 959
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc.h | 335
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 659
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 469
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 66
-rw-r--r--  src/gallium/auxiliary/rtasm/rtasm_x86sse.h | 11
-rw-r--r--  src/gallium/auxiliary/tgsi/Makefile | 1
-rw-r--r--  src/gallium/auxiliary/tgsi/SConscript | 1
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_build.c | 24
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_exec.c | 141
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_exec.h | 10
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_parse.c | 54
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_ppc.c | 1329
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_ppc.h | 51
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_sse2.c | 359
-rw-r--r--  src/gallium/auxiliary/util/Makefile | 1
-rw-r--r--  src/gallium/auxiliary/util/SConscript | 5
-rw-r--r--  src/gallium/auxiliary/util/p_debug.c | 53
-rw-r--r--  src/gallium/auxiliary/util/u_keymap.c | 309
-rw-r--r--  src/gallium/auxiliary/util/u_keymap.h | 68
-rw-r--r--  src/gallium/auxiliary/util/u_math.h | 2
-rw-r--r--  src/gallium/auxiliary/util/u_mm.c | 12
-rw-r--r--  src/gallium/auxiliary/util/u_mm.h | 12
-rw-r--r--  src/gallium/auxiliary/util/u_sse.h | 77
-rw-r--r--  src/gallium/auxiliary/util/u_tile.c | 35
-rw-r--r--  src/gallium/auxiliary/util/u_time.c | 2
49 files changed, 5113 insertions, 654 deletions
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index 7f0044c5a7..4e7664f9bf 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -431,3 +431,9 @@ struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter
--hash->data.d->size;
return ret;
}
+
+boolean cso_hash_contains(struct cso_hash *hash, unsigned key)
+{
+ struct cso_node **node = cso_hash_find_node(hash, key);
+ return (*node != hash->data.e);
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h
index 85f3e276c6..5891c325fa 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.h
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.h
@@ -44,6 +44,7 @@
#ifndef CSO_HASH_H
#define CSO_HASH_H
+#include "pipe/p_compiler.h"
#ifdef __cplusplus
extern "C" {
@@ -95,6 +96,11 @@ struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash);
*/
struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key);
+/**
+ * Returns true if a value with the given key exists in the hash
+ */
+boolean cso_hash_contains(struct cso_hash *hash, unsigned key);
+
int cso_hash_iter_is_null(struct cso_hash_iter iter);
unsigned cso_hash_iter_key(struct cso_hash_iter iter);
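
A minimal usage sketch of the new cso_hash_contains() helper (not part of the commit): insert an object under a 32-bit key only if it is not already present. cso_hash_find(), cso_hash_iter_data() and cso_hash_insert() are the existing cso_hash API; the cache_object() wrapper itself is hypothetical.

/* Hypothetical helper built on the new query, for illustration only. */
#include "cso_cache/cso_hash.h"

static void *
cache_object(struct cso_hash *hash, unsigned key, void *obj)
{
   if (cso_hash_contains(hash, key)) {
      /* already cached -- return the existing entry */
      return cso_hash_iter_data(cso_hash_find(hash, key));
   }
   cso_hash_insert(hash, key, obj);
   return obj;
}
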
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index f2e36a89e9..bdbf5a08ed 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -40,6 +40,7 @@ C_SOURCES = \
draw_vs_aos_machine.c \
draw_vs_exec.c \
draw_vs_llvm.c \
+ draw_vs_ppc.c \
draw_vs_sse.c
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
index 544a04918b..5f05aa324a 100644
--- a/src/gallium/auxiliary/draw/SConscript
+++ b/src/gallium/auxiliary/draw/SConscript
@@ -38,6 +38,7 @@ draw = env.ConvenienceLibrary(
'draw_vs_aos_machine.c',
'draw_vs_exec.c',
'draw_vs_llvm.c',
+ 'draw_vs_ppc.c',
'draw_vs_sse.c',
'draw_vs_varient.c'
])
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 37c4c87f87..5d531146c5 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -196,7 +196,7 @@ struct draw_context
const float (*aligned_constants)[4];
- float (*aligned_constant_storage)[4];
+ const float (*aligned_constant_storage)[4];
unsigned const_storage_size;
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 34adbd49b0..7f305304ff 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -85,7 +85,10 @@ draw_create_vertex_shader(struct draw_context *draw,
if (!vs) {
vs = draw_create_vs_sse( draw, shader );
if (!vs) {
- vs = draw_create_vs_exec( draw, shader );
+ vs = draw_create_vs_ppc( draw, shader );
+ if (!vs) {
+ vs = draw_create_vs_exec( draw, shader );
+ }
}
}
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 68c24abad3..89ae158751 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -158,6 +158,10 @@ draw_create_vs_sse(struct draw_context *draw,
const struct pipe_shader_state *templ);
struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+ const struct pipe_shader_state *templ);
+
+struct draw_vertex_shader *
draw_create_vs_llvm(struct draw_context *draw,
const struct pipe_shader_state *templ);
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 82d27d4493..80c3606657 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -62,12 +62,15 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
{
struct exec_vertex_shader *evs = exec_vertex_shader(shader);
- /* specify the vertex program to interpret/execute */
- tgsi_exec_machine_bind_shader(evs->machine,
- shader->state.tokens,
- PIPE_MAX_SAMPLERS,
- NULL /*samplers*/ );
-
+ /* Specify the vertex program to interpret/execute.
+ * Avoid rebinding when possible.
+ */
+ if (evs->machine->Tokens != shader->state.tokens) {
+ tgsi_exec_machine_bind_shader(evs->machine,
+ shader->state.tokens,
+ PIPE_MAX_SAMPLERS,
+ NULL /*samplers*/ );
+ }
}
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
new file mode 100644
index 0000000000..8b75136144
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -0,0 +1,239 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ * Brian Paul
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_config.h"
+
+#include "draw_vs.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "rtasm/rtasm_cpu.h"
+#include "rtasm/rtasm_ppc.h"
+#include "tgsi/tgsi_ppc.h"
+#include "tgsi/tgsi_parse.h"
+
+
+
+typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
+ float (*outputs)[4][4],
+ float (*temps)[4][4],
+ float (*immeds)[4],
+ float (*consts)[4],
+ const float *builtins);
+
+
+struct draw_ppc_vertex_shader {
+ struct draw_vertex_shader base;
+ struct ppc_function ppc_program;
+
+ codegen_function func;
+};
+
+
+static void
+vs_ppc_prepare( struct draw_vertex_shader *base,
+ struct draw_context *draw )
+{
+ /* nothing */
+}
+
+
+/**
+ * Simplified vertex shader interface for the pt paths. Given the
+ * complexity of code-generating all the above operations together,
+ * it's time to try doing all the other stuff separately.
+ */
+static void
+vs_ppc_run_linear( struct draw_vertex_shader *base,
+ const float (*input)[4],
+ float (*output)[4],
+ const float (*constants)[4],
+ unsigned count,
+ unsigned input_stride,
+ unsigned output_stride )
+{
+ struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+ unsigned int i;
+
+#define MAX_VERTICES 4
+
+ /* loop over verts */
+ for (i = 0; i < count; i += MAX_VERTICES) {
+ const uint max_vertices = MIN2(MAX_VERTICES, count - i);
+ float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB;
+ float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB;
+ float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB;
+ uint attr;
+
+ /* convert (up to) four input verts to SoA format */
+ for (attr = 0; attr < base->info.num_inputs; attr++) {
+ const float *vIn = (const float *) input;
+ uint vert;
+ for (vert = 0; vert < max_vertices; vert++) {
+#if 0
+ if (attr==0)
+ printf("Input v%d a%d: %f %f %f %f\n",
+ vert, attr, vIn[0], vIn[1], vIn[2], vIn[3]);
+#endif
+ inputs_soa[attr][0][vert] = vIn[attr * 4 + 0];
+ inputs_soa[attr][1][vert] = vIn[attr * 4 + 1];
+ inputs_soa[attr][2][vert] = vIn[attr * 4 + 2];
+ inputs_soa[attr][3][vert] = vIn[attr * 4 + 3];
+ vIn += input_stride / 4;
+ }
+ }
+
+ /* run compiled shader
+ */
+ shader->func(inputs_soa, outputs_soa, temps_soa,
+ (float (*)[4]) shader->base.immediates,
+ (float (*)[4]) constants,
+ ppc_builtin_constants);
+
+ /* convert (up to) four output verts from SoA back to AoS format */
+ for (attr = 0; attr < base->info.num_outputs; attr++) {
+ float *vOut = (float *) output;
+ uint vert;
+ for (vert = 0; vert < max_vertices; vert++) {
+ vOut[attr * 4 + 0] = outputs_soa[attr][0][vert];
+ vOut[attr * 4 + 1] = outputs_soa[attr][1][vert];
+ vOut[attr * 4 + 2] = outputs_soa[attr][2][vert];
+ vOut[attr * 4 + 3] = outputs_soa[attr][3][vert];
+#if 0
+ if (attr==0)
+ printf("Output v%d a%d: %f %f %f %f\n",
+ vert, attr, vOut[0], vOut[1], vOut[2], vOut[3]);
+#endif
+ vOut += output_stride / 4;
+ }
+ }
+
+ /* advance to next group of four input/output verts */
+ input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
+ output = (float (*)[4])((char *)output + output_stride * max_vertices);
+ }
+}
+
+
+static void
+vs_ppc_delete( struct draw_vertex_shader *base )
+{
+ struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+
+ ppc_release_func( &shader->ppc_program );
+
+ align_free( (void *) shader->base.immediates );
+
+ FREE( (void*) shader->base.state.tokens );
+ FREE( shader );
+}
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+ const struct pipe_shader_state *templ)
+{
+ struct draw_ppc_vertex_shader *vs;
+
+ vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
+ if (vs == NULL)
+ return NULL;
+
+ /* we make a private copy of the tokens */
+ vs->base.state.tokens = tgsi_dup_tokens(templ->tokens);
+ if (!vs->base.state.tokens)
+ goto fail;
+
+ tgsi_scan_shader(templ->tokens, &vs->base.info);
+
+ vs->base.draw = draw;
+#if 0
+ if (1)
+ vs->base.create_varient = draw_vs_varient_aos_ppc;
+ else
+#endif
+ vs->base.create_varient = draw_vs_varient_generic;
+ vs->base.prepare = vs_ppc_prepare;
+ vs->base.run_linear = vs_ppc_run_linear;
+ vs->base.delete = vs_ppc_delete;
+
+ vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
+ sizeof(float), 16);
+
+ ppc_init_func( &vs->ppc_program );
+
+ if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
+ &vs->ppc_program,
+ (float (*)[4]) vs->base.immediates,
+ TRUE ))
+ goto fail;
+
+ vs->func = (codegen_function) ppc_get_func( &vs->ppc_program );
+ if (!vs->func) {
+ goto fail;
+ }
+
+ return &vs->base;
+
+fail:
+ /*
+ debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n");
+ */
+
+ ppc_release_func( &vs->ppc_program );
+
+ FREE(vs);
+ return NULL;
+}
+
+
+
+#else /* PIPE_ARCH_PPC */
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc( struct draw_context *draw,
+ const struct pipe_shader_state *templ )
+{
+ return (void *) 0;
+}
+
+
+#endif /* PIPE_ARCH_PPC */
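
The heart of draw_vs_ppc.c is the AoS/SoA repacking around the generated code: up to four vertices are transposed so that each attribute channel becomes one 4-wide vector (soa[attr][channel][vertex]), which is the layout the generated Altivec code operates on. Below is a stripped-down sketch of that transpose, independent of the draw structs; NUM_ATTRS and the function name are illustrative only.

/* Illustration only: transpose up to four attr-major AoS vertices into
 * the SoA layout consumed by the generated function.
 */
#include <assert.h>

#define NUM_ATTRS 4

static void
aos_to_soa(float soa[NUM_ATTRS][4][4],   /* [attr][channel][vertex] */
           const float *aos,             /* first vertex, xyzw per attribute */
           unsigned num_verts,           /* 1..4 */
           unsigned stride_floats)       /* floats from one vertex to the next */
{
   unsigned attr, vert;
   assert(num_verts <= 4);
   for (attr = 0; attr < NUM_ATTRS; attr++) {
      for (vert = 0; vert < num_verts; vert++) {
         const float *v = aos + vert * stride_floats + attr * 4;
         soa[attr][0][vert] = v[0];
         soa[attr][1][vert] = v[1];
         soa[attr][2][vert] = v[2];
         soa[attr][3][vert] = v[3];
      }
   }
}

The output side of vs_ppc_run_linear() is the mirror image: the same indices, with soa on the right-hand side.
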
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index 3a4a41e544..93a9748bdb 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -158,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog
llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
llvm::ExecutionEngine *ee = cpu->engine;
assert(ee);
- /*FIXME : remove */
- ee->DisableLazyCompilation();
+ /*FIXME : why was this disabled ? we need it for pow/sqrt/... */
+ ee->DisableLazyCompilation(false);
ee->addModuleProvider(mp);
llvm::Function *func = func_for_shader(prog);
@@ -179,8 +179,7 @@ struct gallivm_cpu_engine * gallivm_global_cpu_engine()
typedef void (*vertex_shader_runner)(void *ainputs,
void *dests,
- float (*aconsts)[4],
- void *temps);
+ float (*aconsts)[4]);
#define MAX_TGSI_VERTICES 4
/*!
@@ -202,7 +201,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
unsigned int i, j;
unsigned slot;
vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-
assert(runner);
for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
@@ -224,8 +222,7 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
/* run shader */
runner(machine->Inputs,
machine->Outputs,
- (float (*)[4]) constants,
- machine->Temps);
+ (float (*)[4]) constants);
/* Unswizzle all output results
*/
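
With the temps argument gone, the LLVM-compiled vertex shader is now a three-argument function; temporaries are allocated inside the generated code (see the storagesoa.cpp changes further down). The sketch below shows only the new calling convention -- the wrapper and its parameter names are illustrative, while prog->function is the real JIT-compiled entry point.

/* Illustration only: invoking the three-argument runner after this
 * change (temporaries are no longer passed in).
 */
typedef void (*vertex_shader_runner)(void *inputs,
                                     void *outputs,
                                     float (*consts)[4]);

static void
run_compiled_vs(void *jit_entry,            /* e.g. prog->function */
                void *inputs, void *outputs,
                const float (*constants)[4])
{
   vertex_shader_runner runner = (vertex_shader_runner) jit_entry;
   runner(inputs, outputs, (float (*)[4]) constants);
}
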
diff --git a/src/gallium/auxiliary/gallivm/gallivm_p.h b/src/gallium/auxiliary/gallivm/gallivm_p.h
index ebf3e11cd5..d2c5852bdf 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_p.h
+++ b/src/gallium/auxiliary/gallivm/gallivm_p.h
@@ -101,10 +101,10 @@ static INLINE int gallivm_w_swizzle(int swizzle)
return w;
}
-#endif /* MESA_LLVM */
-
#if defined __cplusplus
}
#endif
+#endif /* MESA_LLVM */
+
#endif
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index a658072551..d5600fd22d 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
return res;
}
-std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
-{
- std::vector<llvm::Value*> res(4);
-
- //Extract x's
- llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
- m_storage->constantInt(0),
- name("extractX"));
- //cast it to an unsigned int
- x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
-
- res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
- //only x is valid. the others shouldn't be necessary
- /*
- res[1] = Constant::getNullValue(m_floatVecType);
- res[2] = Constant::getNullValue(m_floatVecType);
- res[3] = Constant::getNullValue(m_floatVecType);
- */
-
- return res;
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
- res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
- res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
- res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
-
- return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
- res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
- res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
- res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
-
- return res;
-}
-
void InstructionsSoa::end()
{
m_builder.CreateRetVoid();
}
-std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2,
- const std::vector<llvm::Value*> in3)
-{
- std::vector<llvm::Value*> res = mul(in1, in2);
- return add(res, in3);
-}
-
std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
{
std::vector<llvm::Value*> res(4);
@@ -171,6 +114,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
return res;
}
+llvm::IRBuilder<>* InstructionsSoa::getIRBuilder()
+{
+ return &m_builder;
+}
+
void InstructionsSoa::createFunctionMap()
{
m_functionsMap[TGSI_OPCODE_ABS] = "abs";
@@ -274,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i
return callBuiltin(func, in1);
}
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ std::vector<llvm::Value*> res(4);
+
+ res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
+ res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
+ res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
+ res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
+
+ return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
+{
+ std::vector<llvm::Value*> res(4);
+
+ //Extract x's
+ llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
+ m_storage->constantInt(0),
+ name("extractX"));
+ //cast it to an unsigned int
+ x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
+
+ res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
+ //only x is valid. the others shouldn't be necessary
+ /*
+ res[1] = Constant::getNullValue(m_floatVecType);
+ res[2] = Constant::getNullValue(m_floatVecType);
+ res[3] = Constant::getNullValue(m_floatVecType);
+ */
+
+ return res;
+}
+
std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
const std::vector<llvm::Value*> in2)
{
@@ -281,6 +264,59 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
return callBuiltin(func, in1, in2);
}
+std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
+{
+ llvm::Function *func = function(TGSI_OPCODE_LIT);
+ return callBuiltin(func, in);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2,
+ const std::vector<llvm::Value*> in3)
+{
+ std::vector<llvm::Value*> res = mul(in1, in2);
+ return add(res, in3);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ llvm::Function *func = function(TGSI_OPCODE_MAX);
+ return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ llvm::Function *func = function(TGSI_OPCODE_MIN);
+ return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ std::vector<llvm::Value*> res(4);
+
+ res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
+ res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
+ res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
+ res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
+
+ return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ llvm::Function *func = function(TGSI_OPCODE_POWER);
+ return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
+{
+ llvm::Function *func = function(TGSI_OPCODE_RSQ);
+ return callBuiltin(func, in);
+}
std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
const std::vector<llvm::Value*> in2)
@@ -289,6 +325,37 @@ std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> i
return callBuiltin(func, in1, in2);
}
+std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2)
+{
+ std::vector<llvm::Value*> res(4);
+
+ res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
+ res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
+ res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
+ res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
+
+ return res;
+}
+
+void checkFunction(Function *func)
+{
+ for (Function::const_iterator BI = func->begin(), BE = func->end();
+ BI != BE; ++BI) {
+ const BasicBlock &BB = *BI;
+ for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
+ II != IE; ++II) {
+ const Instruction &I = *II;
+ std::cout<< "Instr = "<<I;
+ for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
+ const Value *Op = I.getOperand(op);
+ std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
+ //I->setOperand(op, V);
+ }
+ }
+ }
+}
+
llvm::Value * InstructionsSoa::allocaTemp()
{
VectorType *vector = VectorType::get(Type::FloatTy, 4);
@@ -408,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std
return allocaToResult(allocaPtr);
}
-std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_POWER);
- return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_MIN);
- return callBuiltin(func, in1, in2);
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_MAX);
- return callBuiltin(func, in1, in2);
-}
-
-void checkFunction(Function *func)
-{
- for (Function::const_iterator BI = func->begin(), BE = func->end();
- BI != BE; ++BI) {
- const BasicBlock &BB = *BI;
- for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
- II != IE; ++II) {
- const Instruction &I = *II;
- std::cout<< "Instr = "<<I;
- for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
- const Value *Op = I.getOperand(op);
- std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
- //I->setOperand(op, V);
- }
- }
- }
-}
-
void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
{
assert(originalFunc);
@@ -492,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
}
}
-std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
- res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
- res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
- res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
-
- return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
-{
- llvm::Function *func = function(TGSI_OPCODE_LIT);
- return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
-{
- llvm::Function *func = function(TGSI_OPCODE_RSQ);
- return callBuiltin(func, in);
-}
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3817fdc904..d6831e0a6b 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -76,6 +76,7 @@ public:
void end();
std::vector<llvm::Value*> extractVector(llvm::Value *vector);
+ llvm::IRBuilder<>* getIRBuilder();
private:
const char * name(const char *prefix) const;
llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
diff --git a/src/gallium/auxiliary/gallivm/storage.cpp b/src/gallium/auxiliary/gallivm/storage.cpp
index 6f373f6dd5..73df24c976 100644
--- a/src/gallium/auxiliary/gallivm/storage.cpp
+++ b/src/gallium/auxiliary/gallivm/storage.cpp
@@ -323,7 +323,7 @@ llvm::Value * Storage::elemIdx(llvm::Value *ptr, int idx,
if (indIdx) {
getElem = GetElementPtrInst::Create(ptr,
- BinaryOperator::create(Instruction::Add,
+ BinaryOperator::Create(Instruction::Add,
indIdx,
constantInt(idx),
name("add"),
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 78d754371f..e1e5cabcf5 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -48,13 +48,11 @@ using namespace llvm;
StorageSoa::StorageSoa(llvm::BasicBlock *block,
llvm::Value *input,
llvm::Value *output,
- llvm::Value *consts,
- llvm::Value *temps)
+ llvm::Value *consts)
: m_block(block),
m_input(input),
m_output(output),
m_consts(consts),
- m_temps(temps),
m_immediates(0),
m_idx(0)
{
@@ -93,7 +91,7 @@ void StorageSoa::declareImmediates()
std::vector<float> vals(4);
std::vector<Constant*> channelArray;
- vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0];
+ vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
llvm::Constant *xChannel = createConstGlobalVector(vals);
vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
@@ -144,22 +142,43 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
return res;
}
-std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx)
+llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc)
{
- std::vector<llvm::Value*> res(4);
- llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
+ std::vector<llvm::Value*> x(4);
+ x[0] = m_builder->CreateExtractElement(vector,
+ constantInt(cc),
+ name("x"));
+
+ VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
+ Constant *constVector = Constant::getNullValue(vectorType);
+ Value *res = m_builder->CreateInsertElement(constVector, x[0],
+ constantInt(0),
+ name("vecx"));
+ res = m_builder->CreateInsertElement(res, x[0], constantInt(1),
+ name("vecxx"));
+ res = m_builder->CreateInsertElement(res, x[0], constantInt(2),
+ name("vecxxx"));
+ res = m_builder->CreateInsertElement(res, x[0], constantInt(3),
+ name("vecxxxx"));
+ return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
+{
+ llvm::Value* res;
+ std::vector<llvm::Value*> res2(4);
+ llvm::Value *xChannel;
xChannel = elementPointer(m_consts, idx, 0);
- yChannel = elementPointer(m_consts, idx, 1);
- zChannel = elementPointer(m_consts, idx, 2);
- wChannel = elementPointer(m_consts, idx, 3);
- res[0] = alignedArrayLoad(xChannel);
- res[1] = alignedArrayLoad(yChannel);
- res[2] = alignedArrayLoad(zChannel);
- res[3] = alignedArrayLoad(wChannel);
+ res = alignedArrayLoad(xChannel);
- return res;
+ res2[0]=unpackConstElement(m_builder, res,0);
+ res2[1]=unpackConstElement(m_builder, res,1);
+ res2[2]=unpackConstElement(m_builder, res,2);
+ res2[3]=unpackConstElement(m_builder, res,3);
+
+ return res2;
}
std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
@@ -174,14 +193,15 @@ std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
return res;
}
-std::vector<llvm::Value*> StorageSoa::tempElement(llvm::Value *idx)
+std::vector<llvm::Value*> StorageSoa::tempElement(llvm::IRBuilder<>* m_builder, int idx)
{
std::vector<llvm::Value*> res(4);
+ llvm::Value *temp = m_temps[idx];
- res[0] = element(m_temps, idx, 0);
- res[1] = element(m_temps, idx, 1);
- res[2] = element(m_temps, idx, 2);
- res[3] = element(m_temps, idx, 3);
+ res[0] = element(temp, constantInt(0), 0);
+ res[1] = element(temp, constantInt(0), 1);
+ res[2] = element(temp, constantInt(0), 2);
+ res[3] = element(temp, constantInt(0), 3);
return res;
}
@@ -260,6 +280,12 @@ llvm::Module * StorageSoa::currentModule() const
return m_block->getParent()->getParent();
}
+llvm::Constant * StorageSoa::createConstGlobalFloat(const float val)
+{
+ Constant*c = ConstantFP::get(APFloat(val));
+ return c;
+}
+
llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
{
VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -278,7 +304,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v
}
std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
- llvm::Value *indIdx)
+ llvm::IRBuilder<>* m_builder,llvm::Value *indIdx)
{
std::vector<llvm::Value*> val(4);
@@ -299,10 +325,10 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
val = outputElement(realIndex);
break;
case TGSI_FILE_TEMPORARY:
- val = tempElement(realIndex);
+ val = tempElement(m_builder, idx);
break;
case TGSI_FILE_CONSTANT:
- val = constElement(realIndex);
+ val = constElement(m_builder, realIndex);
break;
case TGSI_FILE_IMMEDIATE:
val = immediateElement(realIndex);
@@ -328,19 +354,39 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
return res;
}
+llvm::Value * StorageSoa::allocaTemp(llvm::IRBuilder<>* m_builder)
+{
+ VectorType *vector = VectorType::get(Type::FloatTy, 4);
+ ArrayType *vecArray = ArrayType::get(vector, 4);
+ AllocaInst *alloca = new AllocaInst(vecArray, "temp",
+ m_builder->GetInsertBlock());
+
+ return alloca;
+}
+
+
void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
- int mask)
+ int mask, llvm::IRBuilder<>* m_builder)
{
llvm::Value *out = 0;
+ llvm::Value *realIndex = 0;
switch(type) {
case TGSI_FILE_OUTPUT:
out = m_output;
+ realIndex = constantInt(idx);
break;
case TGSI_FILE_TEMPORARY:
- out = m_temps;
+ // if that temp doesn't already exist, alloca it
+ if (m_temps.find(idx) == m_temps.end())
+ m_temps[idx] = allocaTemp(m_builder);
+
+ out = m_temps[idx];
+
+ realIndex = constantInt(0);
break;
case TGSI_FILE_INPUT:
out = m_input;
+ realIndex = constantInt(idx);
break;
case TGSI_FILE_ADDRESS: {
llvm::Value *addr = m_addresses[idx];
@@ -358,7 +404,6 @@ void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm
assert(0);
break;
}
- llvm::Value *realIndex = constantInt(idx);
if ((mask & TGSI_WRITEMASK_X)) {
llvm::Value *xChannel = elementPointer(out, realIndex, 0);
new StoreInst(val[0], xChannel, false, m_block);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index ae2fc7c6ae..56886f85e7 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -29,6 +29,7 @@
#define STORAGESOA_H
#include <pipe/p_shader_tokens.h>
+#include <llvm/Support/IRBuilder.h>
#include <vector>
#include <list>
@@ -51,14 +52,13 @@ public:
StorageSoa(llvm::BasicBlock *block,
llvm::Value *input,
llvm::Value *output,
- llvm::Value *consts,
- llvm::Value *temps);
+ llvm::Value *consts);
std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle,
- llvm::Value *indIdx =0);
+ llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
- int mask);
+ int mask, llvm::IRBuilder<>* m_builder);
void addImmediate(float *vec);
void declareImmediates();
@@ -76,12 +76,14 @@ private:
const char *name(const char *prefix) const;
llvm::Value *alignedArrayLoad(llvm::Value *val);
llvm::Module *currentModule() const;
+ llvm::Constant *createConstGlobalFloat(const float val);
llvm::Constant *createConstGlobalVector(const std::vector<float> &vec);
std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
- std::vector<llvm::Value*> constElement(llvm::Value *indIdx);
+ llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
+ std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
- std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
+ std::vector<llvm::Value*> tempElement(llvm::IRBuilder<>* m_builder, int idx);
std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
private:
llvm::BasicBlock *m_block;
@@ -89,12 +91,13 @@ private:
llvm::Value *m_input;
llvm::Value *m_output;
llvm::Value *m_consts;
- llvm::Value *m_temps;
+ std::map<int, llvm::Value*> m_temps;
llvm::GlobalVariable *m_immediates;
std::map<int, llvm::Value*> m_addresses;
std::vector<std::vector<float> > m_immediatesToFlush;
+ llvm::Value * allocaTemp(llvm::IRBuilder<>* m_builder);
mutable std::map<int, llvm::ConstantInt*> m_constInts;
mutable char m_name[32];
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 7292c0e366..c11b88af9e 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -52,8 +52,7 @@ static inline FunctionType *vertexShaderFunctionType()
// pass are castable to the following:
// [4 x <4 x float>] inputs,
// [4 x <4 x float>] output,
- // [4 x [4 x float]] consts,
- // [4 x <4 x float>] temps
+ // [1 x [4 x float]] consts,
std::vector<const Type*> funcArgs;
VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -61,13 +60,12 @@ static inline FunctionType *vertexShaderFunctionType()
PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
ArrayType *floatArray = ArrayType::get(Type::FloatTy, 4);
- ArrayType *constsArray = ArrayType::get(floatArray, 4);
+ ArrayType *constsArray = ArrayType::get(floatArray, 1);
PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
funcArgs.push_back(vectorArrayPtr);//inputs
funcArgs.push_back(vectorArrayPtr);//output
funcArgs.push_back(constsArrayPtr);//consts
- funcArgs.push_back(vectorArrayPtr);//temps
FunctionType *functionType = FunctionType::get(
/*Result=*/Type::VoidTy,
@@ -707,9 +705,8 @@ translate_instructionir(llvm::Module *module,
if (src->SrcRegister.Indirect) {
indIdx = storage->addrElement(src->SrcRegisterInd.Index);
}
-
val = storage->load((enum tgsi_file_type)src->SrcRegister.File,
- src->SrcRegister.Index, swizzle, indIdx);
+ src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx);
inputs[i] = val;
}
@@ -1025,9 +1022,9 @@ translate_instructionir(llvm::Module *module,
/* store results */
for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
storage->store((enum tgsi_file_type)dst->DstRegister.File,
- dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+ dst->DstRegister.Index, out, dst->DstRegister.WriteMask,
+ instr->getIRBuilder() );
}
}
@@ -1122,8 +1119,6 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
output->setName("outputs");
Value *consts = args++;
consts->setName("consts");
- Value *temps = args++;
- temps->setName("temps");
BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
@@ -1132,7 +1127,7 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
fi = tgsi_default_full_instruction();
fd = tgsi_default_full_declaration();
- StorageSoa storage(label_entry, input, output, consts, temps);
+ StorageSoa storage(label_entry, input, output, consts);
InstructionsSoa instr(mod, shader, label_entry, &storage);
while(!tgsi_parse_end_of_tokens(&parse)) {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 8505d333bd..19db8a6a91 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -177,12 +177,16 @@ pb_get_base_buffer( struct pb_buffer *buf,
}
+/**
+ * Don't call this directly. Use pb_reference instead.
+ */
static INLINE void
pb_destroy(struct pb_buffer *buf)
{
assert(buf);
if(!buf)
return;
+ assert(buf->base.refcount == 0);
buf->vtbl->destroy(buf);
}
@@ -193,11 +197,16 @@ static INLINE void
pb_reference(struct pb_buffer **dst,
struct pb_buffer *src)
{
- if (src)
+ if (src) {
+ assert(src->base.refcount);
src->base.refcount++;
+ }
- if (*dst && --(*dst)->base.refcount == 0)
- pb_destroy( *dst );
+ if (*dst) {
+ assert((*dst)->base.refcount);
+ if(--(*dst)->base.refcount == 0)
+ pb_destroy( *dst );
+ }
*dst = src;
}
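
pb_reference() is now the only sanctioned way to take or drop a buffer reference; pb_destroy() asserts a zero refcount and is not meant to be called directly any more (see the fenced_bufmgr fix below). A minimal usage sketch, assuming a struct that holds a single buffer reference; my_surface and its functions are illustrative only.

/* Illustration only: holding and releasing a pipebuffer reference. */
#include "pipebuffer/pb_buffer.h"

struct my_surface {
   struct pb_buffer *buf;   /* NULL or a counted reference */
};

static void
my_surface_set_buffer(struct my_surface *surf, struct pb_buffer *buf)
{
   /* takes a reference on buf and drops the old one, destroying it
    * if that was the last reference */
   pb_reference(&surf->buf, buf);
}

static void
my_surface_cleanup(struct my_surface *surf)
{
   pb_reference(&surf->buf, NULL);   /* never call pb_destroy() here */
}
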
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index 633ee70a75..e2594ea236 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -86,8 +86,7 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf);
if(!fenced_buf) {
- assert(buf->base.refcount == 1);
- pb_destroy(buf);
+ pb_reference(&buf, NULL);
}
return fenced_buf;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index fe80ca30ee..a976d3041a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -100,7 +100,7 @@ mm_buffer_destroy(struct pb_buffer *buf)
assert(buf->base.refcount == 0);
pipe_mutex_lock(mm->mutex);
- mmFreeMem(mm_buf->block);
+ u_mmFreeMem(mm_buf->block);
FREE(buf);
pipe_mutex_unlock(mm->mutex);
}
@@ -175,14 +175,14 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
mm_buf->mgr = mm;
- mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+ mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
if(!mm_buf->block) {
debug_printf("warning: heap full\n");
#if 0
mmDumpMemInfo(mm->heap);
#endif
- mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+ mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
if(!mm_buf->block) {
FREE(mm_buf);
pipe_mutex_unlock(mm->mutex);
@@ -213,7 +213,7 @@ mm_bufmgr_destroy(struct pb_manager *mgr)
pipe_mutex_lock(mm->mutex);
- mmDestroy(mm->heap);
+ u_mmDestroy(mm->heap);
pb_unmap(mm->buffer);
pb_reference(&mm->buffer, NULL);
@@ -254,7 +254,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
if(!mm->map)
goto failure;
- mm->heap = mmInit(0, size);
+ mm->heap = u_mmInit(0, size);
if (!mm->heap)
goto failure;
@@ -262,7 +262,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
failure:
if(mm->heap)
- mmDestroy(mm->heap);
+ u_mmDestroy(mm->heap);
if(mm->map)
pb_unmap(mm->buffer);
if(mm)
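
The memory-manager helpers from util/u_mm.[ch] now carry a u_ prefix (u_mmInit, u_mmAllocMem, u_mmFreeMem, u_mmDestroy) so they no longer collide with Mesa's classic mm.c symbols. A small sketch of the renamed API; the sizes and the example function are illustrative only.

/* Illustration only: sub-allocating from a heap with the renamed API. */
#include "util/u_mm.h"

static void
u_mm_example(void)
{
   struct mem_block *heap, *block;

   heap = u_mmInit(0, 1024 * 1024);          /* manage 1 MB starting at offset 0 */
   if (!heap)
      return;

   block = u_mmAllocMem(heap, 4096, 5, 0);   /* 4 KB, 2^5 = 32-byte aligned */
   if (block) {
      /* block->ofs is the allocation's offset within the heap */
      u_mmFreeMem(block);
   }

   u_mmDestroy(heap);
}
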
diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index 39b8a4dbd7..252dc5274a 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -7,6 +7,7 @@ C_SOURCES = \
rtasm_cpu.c \
rtasm_execmem.c \
rtasm_x86sse.c \
+ rtasm_ppc.c \
rtasm_ppc_spe.c
include ../../Makefile.template
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index 8ea25922aa..eb48368acc 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -6,6 +6,7 @@ rtasm = env.ConvenienceLibrary(
'rtasm_cpu.c',
'rtasm_execmem.c',
'rtasm_x86sse.c',
+ 'rtasm_ppc.c',
'rtasm_ppc_spe.c',
])
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index f16191cb61..be7433baf8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -38,12 +38,13 @@
#include "rtasm_execmem.h"
-#if defined(__linux__)
+#if defined(PIPE_OS_LINUX)
+
/*
* Allocate a large block of memory which can hold code then dole it out
* in pieces by means of the generic memory manager code.
-*/
+ */
#include <unistd.h>
#include <sys/mman.h>
@@ -62,7 +63,7 @@ static void
init_heap(void)
{
if (!exec_heap)
- exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
+ exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE );
if (!exec_mem)
exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE,
@@ -83,7 +84,7 @@ rtasm_exec_malloc(size_t size)
if (exec_heap) {
size = (size + 31) & ~31; /* next multiple of 32 bytes */
- block = mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
+ block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
}
if (block)
@@ -103,17 +104,17 @@ rtasm_exec_free(void *addr)
pipe_mutex_lock(exec_mutex);
if (exec_heap) {
- struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
+ struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
if (block)
- mmFreeMem(block);
+ u_mmFreeMem(block);
}
pipe_mutex_unlock(exec_mutex);
}
-#else
+#else /* PIPE_OS_LINUX */
/*
* Just use regular memory.
@@ -133,4 +134,4 @@ rtasm_exec_free(void *addr)
}
-#endif
+#endif /* PIPE_OS_LINUX */
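
rtasm_exec_malloc()/rtasm_exec_free() hand out memory that is safe to execute (an mmap'd PROT_EXEC heap on Linux, plain malloc otherwise), and the new PPC emitter below uses it for its instruction store. The usual pattern is sketched here; install_code() is hypothetical, and nothing in it generates instructions -- 'code' must already be a complete, valid function for the host CPU.

/* Illustration only: copy finished machine code into executable memory
 * and hand back a callable pointer.
 */
#include <string.h>
#include "rtasm/rtasm_execmem.h"

typedef void (*generated_func)(void);

static generated_func
install_code(const void *code, size_t size)
{
   void *mem = rtasm_exec_malloc(size);
   if (!mem)
      return NULL;
   memcpy(mem, code, size);
   /* caller is responsible for rtasm_exec_free((void *) fn) later */
   return (generated_func) mem;
}

On PowerPC a real caller would additionally need to flush the data cache and invalidate the instruction cache over the copied range before calling the code.
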
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
new file mode 100644
index 0000000000..b65bfa7bbd
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -0,0 +1,959 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * For reference, see http://www.power.org/resources/reading/PowerISA_V2.05.pdf
+ * ABI info: http://www.cs.utsa.edu/~whaley/teach/cs6463FHPO/LEC/lec12_ho.pdf
+ *
+ * Other PPC refs:
+ * http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2
+ * http://www.ibm.com/developerworks/eserver/library/es-archguide-v2.html
+ * http://www.freescale.com/files/product/doc/MPCFPE32B.pdf
+ *
+ * \author Brian Paul
+ */
+
+
+#include <stdio.h>
+#include "util/u_memory.h"
+#include "pipe/p_debug.h"
+#include "rtasm_execmem.h"
+#include "rtasm_ppc.h"
+
+
+void
+ppc_init_func(struct ppc_function *p)
+{
+ uint i;
+
+ p->num_inst = 0;
+ p->max_inst = 100; /* first guess at buffer size */
+ p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
+ p->reg_used = 0x0;
+ p->fp_used = 0x0;
+ p->vec_used = 0x0;
+
+ /* only allow using gp registers 3..12 for now */
+ for (i = 0; i < 3; i++)
+ ppc_reserve_register(p, i);
+ for (i = 12; i < PPC_NUM_REGS; i++)
+ ppc_reserve_register(p, i);
+}
+
+
+void
+ppc_release_func(struct ppc_function *p)
+{
+ assert(p->num_inst <= p->max_inst);
+ if (p->store != NULL) {
+ rtasm_exec_free(p->store);
+ }
+ p->store = NULL;
+}
+
+
+uint
+ppc_num_instructions(const struct ppc_function *p)
+{
+ return p->num_inst;
+}
+
+
+void (*ppc_get_func(struct ppc_function *p))(void)
+{
+#if 0
+ DUMP_END();
+ if (DISASSEM && p->store)
+ debug_printf("disassemble %p %p\n", p->store, p->csr);
+
+ if (p->store == p->error_overflow)
+ return (void (*)(void)) NULL;
+ else
+#endif
+ return (void (*)(void)) p->store;
+}
+
+
+void
+ppc_dump_func(const struct ppc_function *p)
+{
+ uint i;
+ for (i = 0; i < p->num_inst; i++) {
+ debug_printf("%3u: 0x%08x\n", i, p->store[i]);
+ }
+}
+
+
+/**
+ * Mark a register as being unavailable.
+ */
+int
+ppc_reserve_register(struct ppc_function *p, int reg)
+{
+ assert(reg < PPC_NUM_REGS);
+ p->reg_used |= (1 << reg);
+ return reg;
+}
+
+
+/**
+ * Allocate a general purpose register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_register(struct ppc_function *p)
+{
+ unsigned i;
+ for (i = 0; i < PPC_NUM_REGS; i++) {
+ const uint64_t mask = 1 << i;
+ if ((p->reg_used & mask) == 0) {
+ p->reg_used |= mask;
+ return i;
+ }
+ }
+ return -1;
+}
+
+
+/**
+ * Mark the given general purpose register as "unallocated".
+ */
+void
+ppc_release_register(struct ppc_function *p, int reg)
+{
+ assert(reg < PPC_NUM_REGS);
+ assert(p->reg_used & (1 << reg));
+ p->reg_used &= ~(1 << reg);
+}
+
+
+/**
+ * Allocate a floating point register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_fp_register(struct ppc_function *p)
+{
+ unsigned i;
+ for (i = 0; i < PPC_NUM_FP_REGS; i++) {
+ const uint64_t mask = 1 << i;
+ if ((p->fp_used & mask) == 0) {
+ p->fp_used |= mask;
+ return i;
+ }
+ }
+ return -1;
+}
+
+
+/**
+ * Mark the given floating point register as "unallocated".
+ */
+void
+ppc_release_fp_register(struct ppc_function *p, int reg)
+{
+ assert(reg < PPC_NUM_FP_REGS);
+ assert(p->fp_used & (1 << reg));
+ p->fp_used &= ~(1 << reg);
+}
+
+
+/**
+ * Allocate a vector register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p)
+{
+ unsigned i;
+ for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
+ const uint64_t mask = 1 << i;
+ if ((p->vec_used & mask) == 0) {
+ p->vec_used |= mask;
+ return i;
+ }
+ }
+ return -1;
+}
+
+
+/**
+ * Mark the given vector register as "unallocated".
+ */
+void
+ppc_release_vec_register(struct ppc_function *p, int reg)
+{
+ assert(reg < PPC_NUM_VEC_REGS);
+ assert(p->vec_used & (1 << reg));
+ p->vec_used &= ~(1 << reg);
+}
+
+
+/**
+ * Append instruction to instruction buffer. Grow buffer if out of room.
+ */
+static void
+emit_instruction(struct ppc_function *p, uint32_t inst_bits)
+{
+ if (!p->store)
+ return; /* out of memory, drop the instruction */
+
+ if (p->num_inst == p->max_inst) {
+ /* allocate larger buffer */
+ uint32_t *newbuf;
+ p->max_inst *= 2; /* 2x larger */
+ newbuf = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
+ if (newbuf) {
+ memcpy(newbuf, p->store, p->num_inst * PPC_INST_SIZE);
+ }
+ rtasm_exec_free(p->store);
+ p->store = newbuf;
+ if (!p->store) {
+ /* out of memory */
+ p->num_inst = 0;
+ return;
+ }
+ }
+
+ p->store[p->num_inst++] = inst_bits;
+}
+
+
+union vx_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned vD:5;
+ unsigned vA:5;
+ unsigned vB:5;
+ unsigned op2:11;
+ } inst;
+};
+
+static INLINE void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+ union vx_inst inst;
+ inst.inst.op = 4;
+ inst.inst.vD = vD;
+ inst.inst.vA = vA;
+ inst.inst.vB = vB;
+ inst.inst.op2 = op2;
+ emit_instruction(p, inst.bits);
+};
+
+
+union vxr_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned vD:5;
+ unsigned vA:5;
+ unsigned vB:5;
+ unsigned rC:1;
+ unsigned op2:10;
+ } inst;
+};
+
+static INLINE void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+ union vxr_inst inst;
+ inst.inst.op = 4;
+ inst.inst.vD = vD;
+ inst.inst.vA = vA;
+ inst.inst.vB = vB;
+ inst.inst.rC = 0;
+ inst.inst.op2 = op2;
+ emit_instruction(p, inst.bits);
+};
+
+
+union va_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned vD:5;
+ unsigned vA:5;
+ unsigned vB:5;
+ unsigned vC:5;
+ unsigned op2:6;
+ } inst;
+};
+
+static INLINE void
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+{
+ union va_inst inst;
+ inst.inst.op = 4;
+ inst.inst.vD = vD;
+ inst.inst.vA = vA;
+ inst.inst.vB = vB;
+ inst.inst.vC = vC;
+ inst.inst.op2 = op2;
+ emit_instruction(p, inst.bits);
+};
+
+
+union i_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned li:24;
+ unsigned aa:1;
+ unsigned lk:1;
+ } inst;
+};
+
+static INLINE void
+emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
+{
+ union i_inst inst;
+ inst.inst.op = op;
+ inst.inst.li = li;
+ inst.inst.aa = aa;
+ inst.inst.lk = lk;
+ emit_instruction(p, inst.bits);
+}
+
+
+union xl_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned bo:5;
+ unsigned bi:5;
+ unsigned unused:3;
+ unsigned bh:2;
+ unsigned op2:10;
+ unsigned lk:1;
+ } inst;
+};
+
+static INLINE void
+emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
+ uint op2, uint lk)
+{
+ union xl_inst inst;
+ inst.inst.op = op;
+ inst.inst.bo = bo;
+ inst.inst.bi = bi;
+ inst.inst.unused = 0x0;
+ inst.inst.bh = bh;
+ inst.inst.op2 = op2;
+ inst.inst.lk = lk;
+ emit_instruction(p, inst.bits);
+}
+
+static INLINE void
+dump_xl(const char *name, uint inst)
+{
+ union xl_inst i;
+
+ i.bits = inst;
+ debug_printf("%s = 0x%08x\n", name, inst);
+ debug_printf(" op: %d 0x%x\n", i.inst.op, i.inst.op);
+ debug_printf(" bo: %d 0x%x\n", i.inst.bo, i.inst.bo);
+ debug_printf(" bi: %d 0x%x\n", i.inst.bi, i.inst.bi);
+ debug_printf(" unused: %d 0x%x\n", i.inst.unused, i.inst.unused);
+ debug_printf(" bh: %d 0x%x\n", i.inst.bh, i.inst.bh);
+ debug_printf(" op2: %d 0x%x\n", i.inst.op2, i.inst.op2);
+ debug_printf(" lk: %d 0x%x\n", i.inst.lk, i.inst.lk);
+}
+
+
+union x_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned vrs:5;
+ unsigned ra:5;
+ unsigned rb:5;
+ unsigned op2:10;
+ unsigned unused:1;
+ } inst;
+};
+
+static INLINE void
+emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
+{
+ union x_inst inst;
+ inst.inst.op = op;
+ inst.inst.vrs = vrs;
+ inst.inst.ra = ra;
+ inst.inst.rb = rb;
+ inst.inst.op2 = op2;
+ inst.inst.unused = 0x0;
+ emit_instruction(p, inst.bits);
+}
+
+
+union d_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned rt:5;
+ unsigned ra:5;
+ unsigned si:16;
+ } inst;
+};
+
+static INLINE void
+emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
+{
+ union d_inst inst;
+ assert(si >= -32768);
+ assert(si <= 32767);
+ inst.inst.op = op;
+ inst.inst.rt = rt;
+ inst.inst.ra = ra;
+ inst.inst.si = (unsigned) (si & 0xffff);
+ emit_instruction(p, inst.bits);
+};
+
+
+union a_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned frt:5;
+ unsigned fra:5;
+ unsigned frb:5;
+ unsigned unused:5;
+ unsigned op2:5;
+ unsigned rc:1;
+ } inst;
+};
+
+static INLINE void
+emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
+ uint rc)
+{
+ union a_inst inst;
+ inst.inst.op = op;
+ inst.inst.frt = frt;
+ inst.inst.fra = fra;
+ inst.inst.frb = frb;
+ inst.inst.unused = 0x0;
+ inst.inst.op2 = op2;
+ inst.inst.rc = rc;
+ emit_instruction(p, inst.bits);
+};
+
+
+union xo_inst {
+ uint32_t bits;
+ struct {
+ unsigned op:6;
+ unsigned rt:5;
+ unsigned ra:5;
+ unsigned rb:5;
+ unsigned oe:1;
+ unsigned op2:9;
+ unsigned rc:1;
+ } inst;
+};
+
+static INLINE void
+emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
+ uint op2, uint rc)
+{
+ union xo_inst inst;
+ inst.inst.op = op;
+ inst.inst.rt = rt;
+ inst.inst.ra = ra;
+ inst.inst.rb = rb;
+ inst.inst.oe = oe;
+ inst.inst.op2 = op2;
+ inst.inst.rc = rc;
+ emit_instruction(p, inst.bits);
+}
+
+
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+void
+ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 10, vD, vA, vB);
+}
+
+/** vector float subtract */
+void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 74, vD, vA, vB);
+}
+
+/** vector float min */
+void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1098, vD, vA, vB);
+}
+
+/** vector float max */
+void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1034, vD, vA, vB);
+}
+
+/** vector float mult add: vD = vA * vB + vC */
+void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
+}
+
+/** vector float negative mult subtract: vD = vA - vB * vC */
+void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 47, vD, vB, vA, vC); /* note arg order */
+}
+
+/** vector float compare greater than */
+void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vxr(p, 710, vD, vA, vB);
+}
+
+/** vector float compare greater than or equal to */
+void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vxr(p, 454, vD, vA, vB);
+}
+
+/** vector float compare equal */
+void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vxr(p, 198, vD, vA, vB);
+}
+
+/** vector float 2^x */
+void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 394, vD, 0, vB);
+}
+
+/** vector float log2(x) */
+void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 458, vD, 0, vB);
+}
+
+/** vector float reciprocal estimate */
+void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 266, vD, 0, vB);
+}
+
+/** vector float reciprocal sqrt estimate */
+void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 330, vD, 0, vB);
+}
+
+/** vector float round to negative infinity */
+void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 714, vD, 0, vB);
+}
+
+/** vector float round to positive infinity */
+void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 650, vD, 0, vB);
+}
+
+/** vector float round to nearest int */
+void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 522, vD, 0, vB);
+}
+
+/** vector float round to int toward zero */
+void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
+{
+ emit_vx(p, 586, vD, 0, vB);
+}
+
+/** vector store: store vR at mem[vA+vB] */
+void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+ emit_x(p, 31, vR, vA, vB, 231);
+}
+
+/** vector load: vR = mem[vA+vB] */
+void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+ emit_x(p, 31, vR, vA, vB, 103);
+}
+
+/** load vector element word: vR = mem_word[ra+rb] */
+void
+ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
+{
+ emit_x(p, 31, vr, ra, rb, 71);
+}
+
+
+
+
+/**
+ ** vector bitwise operations
+ **/
+
+/** vector and */
+void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1028, vD, vA, vB);
+}
+
+/** vector and complement */
+void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1092, vD, vA, vB);
+}
+
+/** vector or */
+void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1156, vD, vA, vB);
+}
+
+/** vector nor */
+void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1284, vD, vA, vB);
+}
+
+/** vector xor */
+void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 1220, vD, vA, vB);
+}
+
+/** Pseudo-instruction: vector move */
+void
+ppc_vmove(struct ppc_function *p, uint vD, uint vA)
+{
+ ppc_vor(p, vD, vA, vA);
+}
+
+/** Set vector register to {0,0,0,0} */
+void
+ppc_vzero(struct ppc_function *p, uint vr)
+{
+ ppc_vxor(p, vr, vr, vr);
+}
+
+
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 43, vD, vA, vB, vC);
+}
+
+/** vector select */
+void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+ emit_va(p, 42, vD, vA, vB, vC);
+}
+
+/** vector splat byte */
+void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+ emit_vx(p, 42, vD, imm, vB);
+}
+
+/** vector splat half word */
+void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+ emit_vx(p, 588, vD, imm, vB);
+}
+
+/** vector splat word */
+void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+ emit_vx(p, 652, vD, imm, vB);
+}
+
+/** vector splat signed immediate word */
+void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm)
+{
+   assert(imm >= -16);
+   assert(imm <= 15);
+ emit_vx(p, 908, vD, imm, 0);
+}
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+ emit_vx(p, 388, vD, vA, vB);
+}
+
+
+
+
+/**
+ ** integer arithmetic
+ **/
+
+/** rt = ra + imm */
+void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+ emit_d(p, 14, rt, ra, imm);
+}
+
+/** rt = ra + (imm << 16) */
+void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+ emit_d(p, 15, rt, ra, imm);
+}
+
+/** rt = ra + rb */
+void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+ emit_xo(p, 31, rt, ra, rb, 0, 266, 0);
+}
+
+/** rt = ra AND rb */
+void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+ emit_x(p, 31, ra, rt, rb, 28); /* note argument order */
+}
+
+/** rt = ra AND imm */
+void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+ emit_d(p, 28, ra, rt, imm); /* note argument order */
+}
+
+/** rt = ra OR rb */
+void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+ emit_x(p, 31, ra, rt, rb, 444); /* note argument order */
+}
+
+/** rt = ra OR imm */
+void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+ emit_d(p, 24, ra, rt, imm); /* note argument order */
+}
+
+/** rt = ra XOR rb */
+void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+ emit_x(p, 31, ra, rt, rb, 316); /* note argument order */
+}
+
+/** rt = ra XOR imm */
+void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+ emit_d(p, 26, ra, rt, imm); /* note argument order */
+}
+
+/** pseudo instruction: move: rt = ra */
+void
+ppc_mr(struct ppc_function *p, uint rt, uint ra)
+{
+ ppc_or(p, rt, ra, ra);
+}
+
+/** pseudo instruction: load immediate: rt = imm */
+void
+ppc_li(struct ppc_function *p, uint rt, int imm)
+{
+ ppc_addi(p, rt, 0, imm);
+}
+
+/** rt = imm << 16 */
+void
+ppc_lis(struct ppc_function *p, uint rt, int imm)
+{
+ ppc_addis(p, rt, 0, imm);
+}
+
+/** rt = imm */
+void
+ppc_load_int(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_lis(p, rt, (imm >> 16));          /* rt = imm & 0xffff0000 */
+ ppc_ori(p, rt, rt, (imm & 0xffff)); /* rt = rt | (imm & 0xffff) */
+}
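/* Illustrative only (not part of this patch): loading an arbitrary 32-bit
 * constant splits it into a high and a low halfword.  Assuming p is an
 * initialized ppc_function and r7 is a free register:
 *
 *    ppc_load_int(p, 7, 0x12345678);
 *
 * emits   lis r7, 0x1234         (r7 = 0x12340000)
 *         ori r7, r7, 0x5678     (r7 = 0x12345678)
 */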
+
+
+
+
+/**
+ ** integer load/store
+ **/
+
+/** store rs at memory[(ra)+d],
+ * then update ra = (ra)+d
+ */
+void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d)
+{
+ emit_d(p, 37, rs, ra, d);
+}
+
+/** store rs at memory[(ra)+d] */
+void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d)
+{
+ emit_d(p, 36, rs, ra, d);
+}
+
+/** Load rt = mem[(ra)+d]; then set the high 32 bits of rt to zero. */
+void
+ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d)
+{
+ emit_d(p, 32, rt, ra, d);
+}
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+/** add: frt = fra + frb */
+void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+ emit_a(p, 63, frt, fra, frb, 21, 0);
+}
+
+/** sub: frt = fra - frb */
+void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+ emit_a(p, 63, frt, fra, frb, 20, 0);
+}
+
+/** convert float to int (round toward zero): rt = (int) fra */
+void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint fra)
+{
+ emit_x(p, 63, rt, 0, fra, 15);
+}
+
+/** store frs at mem[(ra)+offset] */
+void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset)
+{
+ emit_d(p, 52, frs, ra, offset);
+}
+
+/** store the low 32 bits of frs (as an integer word) at mem[(ra)+(rb)] */
+void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb)
+{
+ emit_x(p, 31, frs, ra, rb, 983);
+}
+
+/** load frt = mem[(ra)+offset] */
+void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset)
+{
+ emit_d(p, 48, frt, ra, offset);
+}
+
+
+
+
+
+/**
+ ** branch instructions
+ **/
+
+/** BLR: Branch to link register (p. 35) */
+void
+ppc_blr(struct ppc_function *p)
+{
+ emit_i(p, 18, 0, 0, 1);
+}
+
+/** Branch Conditional to Link Register (p. 36) */
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg)
+{
+ emit_xl(p, 19, condOp, condReg, branchHint, 16, 0);
+}
+
+/** Pseudo instruction: return from subroutine */
+void
+ppc_return(struct ppc_function *p)
+{
+ ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
+}
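A minimal usage sketch of the new emitter (illustrative, not part of this patch; it assumes only the entry points declared in rtasm_ppc.h below):

   struct ppc_function f;
   int v;

   ppc_init_func(&f);
   v = ppc_allocate_vec_register(&f);   /* grab a free AltiVec register */
   ppc_vzero(&f, v);                    /* v = {0, 0, 0, 0} */
   ppc_return(&f);                      /* emit the subroutine return */
   /* ppc_get_func(&f) now returns a callable pointer to the code */
   ppc_release_func(&f);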
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
new file mode 100644
index 0000000000..08212a2a25
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -0,0 +1,335 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#ifndef RTASM_PPC_H
+#define RTASM_PPC_H
+
+
+#include "pipe/p_compiler.h"
+
+
+#define PPC_INST_SIZE 4 /**< 4 bytes / instruction */
+
+#define PPC_NUM_REGS 32
+#define PPC_NUM_FP_REGS 32
+#define PPC_NUM_VEC_REGS 32
+
+/** Stack pointer register */
+#define PPC_REG_SP 1
+
+/** Branch conditions */
+#define BRANCH_COND_ALWAYS 0x14 /* binary 1z1zz (z=ignored) */
+
+/** Branch hints */
+#define BRANCH_HINT_SUB_RETURN 0x0 /* binary 00 */
+
+
+struct ppc_function
+{
+ uint32_t *store; /**< instruction buffer */
+ uint num_inst;
+ uint max_inst;
+   uint32_t reg_used;   /**< used/free general-purpose registers bitmask */
+   uint32_t fp_used;    /**< used/free floating point registers bitmask */
+   uint32_t vec_used;   /**< used/free vector registers bitmask */
+};
+
+
+
+extern void ppc_init_func(struct ppc_function *p);
+extern void ppc_release_func(struct ppc_function *p);
+extern uint ppc_num_instructions(const struct ppc_function *p);
+extern void (*ppc_get_func( struct ppc_function *p ))( void );
+extern void ppc_dump_func(const struct ppc_function *p);
+
+extern int ppc_reserve_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_register(struct ppc_function *p);
+extern void ppc_release_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_fp_register(struct ppc_function *p);
+extern void ppc_release_fp_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_vec_register(struct ppc_function *p);
+extern void ppc_release_vec_register(struct ppc_function *p, int reg);
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+extern void
+ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float subtract */
+extern void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float min */
+extern void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float max */
+extern void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float mult add: vD = vA * vB + vC */
+extern void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float negative mult subtract: vD = vA - vB * vC */
+extern void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float compare greater than */
+extern void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare greater than or equal to */
+extern void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare equal */
+extern void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float 2^x */
+extern void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float log2(x) */
+extern void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocal estimate */
+extern void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocal sqrt estimate */
+extern void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to negative infinity */
+extern void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to positive infinity */
+extern void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to nearest int */
+extern void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to int toward zero */
+extern void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
+
+
+/** vector store: store vR at mem[vA+vB] */
+extern void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** vector load: vR = mem[vA+vB] */
+extern void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** load vector element word: vR = mem_word[ra+rb] */
+extern void
+ppc_lvewx(struct ppc_function *p, uint vR, uint ra, uint rb);
+
+
+
+/**
+ ** vector bitwise operations
+ **/
+
+
+/** vector and */
+extern void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector and complement */
+extern void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector or */
+extern void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector nor */
+extern void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector xor */
+extern void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** Pseudo-instruction: vector move */
+extern void
+ppc_vmove(struct ppc_function *p, uint vD, uint vA);
+
+/** Set vector register to {0,0,0,0} */
+extern void
+ppc_vzero(struct ppc_function *p, uint vr);
+
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+extern void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector select */
+extern void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector splat byte */
+extern void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat half word */
+extern void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat word */
+extern void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat signed immediate word */
+extern void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm);
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+extern void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+
+/**
+ ** scalar arithmetic
+ **/
+
+extern void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_mr(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_li(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_lis(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_load_int(struct ppc_function *p, uint rt, int imm);
+
+
+
+/**
+ ** scalar load/store
+ **/
+
+extern void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_lwz(struct ppc_function *p, uint rs, uint ra, int d);
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+extern void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint fra);
+
+extern void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset);
+
+extern void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb);
+
+extern void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset);
+
+
+
+/**
+ ** branch instructions
+ **/
+
+extern void
+ppc_blr(struct ppc_function *p);
+
+extern void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg);
+
+extern void
+ppc_return(struct ppc_function *p);
+
+
+#endif /* RTASM_PPC_H */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a04cc6c4ff..1bd9f1c8dd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -27,12 +27,16 @@
* Real-time assembly generation interface for Cell B.E. SPEs.
*
* \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
*/
+
+#include <stdio.h>
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "rtasm_ppc_spe.h"
+
#ifdef GALLIUM_CELL
/**
* SPE instruction types
@@ -143,21 +147,91 @@ union spe_inst_RI18 {
/*@}*/
+static void
+indent(const struct spe_function *p)
+{
+ int i;
+ for (i = 0; i < p->indent; i++) {
+ putchar(' ');
+ }
+}
+
+
+static const char *
+rem_prefix(const char *longname)
+{
+ return longname + 4;
+}
+
+
+static const char *
+reg_name(int reg)
+{
+ switch (reg) {
+ case SPE_REG_SP:
+ return "$sp";
+ case SPE_REG_RA:
+ return "$lr";
+ default:
+ {
+ /* cycle through four buffers to handle multiple calls per printf */
+ static char buf[4][10];
+ static int b = 0;
+ b = (b + 1) % 4;
+ sprintf(buf[b], "$%d", reg);
+ return buf[b];
+ }
+ }
+}
+
+
+static void
+emit_instruction(struct spe_function *p, uint32_t inst_bits)
+{
+ if (!p->store)
+ return; /* out of memory, drop the instruction */
+
+ if (p->num_inst == p->max_inst) {
+ /* allocate larger buffer */
+ uint32_t *newbuf;
+ p->max_inst *= 2; /* 2x larger */
+ newbuf = align_malloc(p->max_inst * SPE_INST_SIZE, 16);
+ if (newbuf) {
+ memcpy(newbuf, p->store, p->num_inst * SPE_INST_SIZE);
+ }
+ align_free(p->store);
+ p->store = newbuf;
+ if (!p->store) {
+ /* out of memory */
+ p->num_inst = 0;
+ return;
+ }
+ }
+
+ p->store[p->num_inst++] = inst_bits;
+}
+
+
+
static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, unsigned rB)
+ unsigned rA, unsigned rB, const char *name)
{
union spe_inst_RR inst;
inst.inst.op = op;
inst.inst.rB = rB;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, %s\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
+ }
}
static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, unsigned rB, unsigned rC)
+ unsigned rA, unsigned rB, unsigned rC, const char *name)
{
union spe_inst_RRR inst;
inst.inst.op = op;
@@ -165,73 +239,105 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
inst.inst.rB = rB;
inst.inst.rA = rA;
inst.inst.rC = rC;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
+ reg_name(rA), reg_name(rB), reg_name(rC));
+ }
}
static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, int imm)
+ unsigned rA, int imm, const char *name)
{
union spe_inst_RI7 inst;
inst.inst.op = op;
inst.inst.i7 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+ }
}
static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, int imm)
+ unsigned rA, int imm, const char *name)
{
union spe_inst_RI8 inst;
inst.inst.op = op;
inst.inst.i8 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+ }
}
static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
- unsigned rA, int imm)
+ unsigned rA, int imm, const char *name)
{
union spe_inst_RI10 inst;
inst.inst.op = op;
inst.inst.i10 = imm;
inst.inst.rA = rA;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+ }
+}
+
+
+/** As above, but do range checking on signed immediate value */
+static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
+ unsigned rA, int imm, const char *name)
+{
+ assert(imm <= 511);
+ assert(imm >= -512);
+ emit_RI10(p, op, rT, rA, imm, name);
}
static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
- int imm)
+ int imm, const char *name)
{
union spe_inst_RI16 inst;
inst.inst.op = op;
inst.inst.i16 = imm;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
+ }
}
static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
- int imm)
+ int imm, const char *name)
{
union spe_inst_RI18 inst;
inst.inst.op = op;
inst.inst.i18 = imm;
inst.inst.rT = rT;
- p->store[p->num_inst++] = inst.bits;
- assert(p->num_inst <= p->max_inst);
+ emit_instruction(p, inst.bits);
+ if (p->print) {
+ indent(p);
+ printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
+ }
}
@@ -240,80 +346,101 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
#define EMIT_(_name, _op) \
void _name (struct spe_function *p, unsigned rT) \
{ \
- emit_RR(p, _op, rT, 0, 0); \
+ emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
}
#define EMIT_R(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA) \
{ \
- emit_RR(p, _op, rT, rA, 0); \
+ emit_RR(p, _op, rT, rA, 0, __FUNCTION__); \
}
#define EMIT_RR(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
{ \
- emit_RR(p, _op, rT, rA, rB); \
+ emit_RR(p, _op, rT, rA, rB, __FUNCTION__); \
}
#define EMIT_RRR(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
{ \
- emit_RRR(p, _op, rT, rA, rB, rC); \
+ emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__); \
}
#define EMIT_RI7(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
- emit_RI7(p, _op, rT, rA, imm); \
+ emit_RI7(p, _op, rT, rA, imm, __FUNCTION__); \
}
#define EMIT_RI8(_name, _op, bias) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
- emit_RI8(p, _op, rT, rA, bias - imm); \
+ emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__); \
}
#define EMIT_RI10(_name, _op) \
void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
{ \
- emit_RI10(p, _op, rT, rA, imm); \
+ emit_RI10(p, _op, rT, rA, imm, __FUNCTION__); \
+}
+
+#define EMIT_RI10s(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+ emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__); \
}
#define EMIT_RI16(_name, _op) \
void _name (struct spe_function *p, unsigned rT, int imm) \
{ \
- emit_RI16(p, _op, rT, imm); \
+ emit_RI16(p, _op, rT, imm, __FUNCTION__); \
}
#define EMIT_RI18(_name, _op) \
void _name (struct spe_function *p, unsigned rT, int imm) \
{ \
- emit_RI18(p, _op, rT, imm); \
+ emit_RI18(p, _op, rT, imm, __FUNCTION__); \
}
#define EMIT_I16(_name, _op) \
void _name (struct spe_function *p, int imm) \
{ \
- emit_RI16(p, _op, 0, imm); \
+ emit_RI16(p, _op, 0, imm, __FUNCTION__); \
}
#include "rtasm_ppc_spe.h"
+
/**
* Initialize an spe_function.
- * \param code_size size of instruction buffer to allocate, in bytes.
+ * \param code_size initial size of instruction buffer to allocate, in bytes.
+ * If zero, use a default.
*/
void spe_init_func(struct spe_function *p, unsigned code_size)
{
- p->store = align_malloc(code_size, 16);
+ unsigned int i;
+
+ if (!code_size)
+ code_size = 64;
+
p->num_inst = 0;
p->max_inst = code_size / SPE_INST_SIZE;
+ p->store = align_malloc(code_size, 16);
+
+ p->set_count = 0;
+ memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
/* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
*/
- p->regs[0] = ~7;
- p->regs[1] = (1U << (80 - 64)) - 1;
+ p->regs[0] = p->regs[1] = p->regs[2] = 1;
+ for (i = 80; i <= 127; i++) {
+ p->regs[i] = 1;
+ }
+
+ p->print = false;
+ p->indent = 0;
}
@@ -327,20 +454,23 @@ void spe_release_func(struct spe_function *p)
}
+/** Return current code size in bytes. */
+unsigned spe_code_size(const struct spe_function *p)
+{
+ return p->num_inst * SPE_INST_SIZE;
+}
+
+
/**
- * Alloate a SPE register.
+ * Allocate a SPE register.
* \return register index or -1 if none left.
*/
int spe_allocate_available_register(struct spe_function *p)
{
unsigned i;
for (i = 0; i < SPE_NUM_REGS; i++) {
- const uint64_t mask = (1ULL << (i % 64));
- const unsigned idx = i / 64;
-
- assert(idx < 2);
- if ((p->regs[idx] & mask) != 0) {
- p->regs[idx] &= ~mask;
+ if (p->regs[i] == 0) {
+ p->regs[i] = 1;
return i;
}
}
@@ -354,31 +484,160 @@ int spe_allocate_available_register(struct spe_function *p)
*/
int spe_allocate_register(struct spe_function *p, int reg)
{
- const unsigned idx = reg / 64;
- const unsigned bit = reg % 64;
-
assert(reg < SPE_NUM_REGS);
- assert((p->regs[idx] & (1ULL << bit)) != 0);
-
- p->regs[idx] &= ~(1ULL << bit);
+ assert(p->regs[reg] == 0);
+ p->regs[reg] = 1;
return reg;
}
/**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated". Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
*/
void spe_release_register(struct spe_function *p, int reg)
{
- const unsigned idx = reg / 64;
- const unsigned bit = reg % 64;
+ assert(reg < SPE_NUM_REGS);
+ assert(p->regs[reg] == 1);
- assert(idx < 2);
+ p->regs[reg] = 0;
+}
- assert(reg < SPE_NUM_REGS);
- assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers. This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+ unsigned int i;
+
+ /* Keep track of the set count. If it ever wraps around to 0,
+ * we're in trouble.
+ */
+ p->set_count++;
+ assert(p->set_count > 0);
+
+ /* Increment the allocation count of all registers currently
+ * allocated. Then any registers that are allocated in this set
+ * will be the only ones with a count of 1; they'll all be released
+ * when the register set is released.
+ */
+ for (i = 0; i < SPE_NUM_REGS; i++) {
+ if (p->regs[i] > 0)
+ p->regs[i]++;
+ }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+ unsigned int i;
+
+ /* If the set count drops below zero, we're in trouble. */
+ assert(p->set_count > 0);
+ p->set_count--;
+
+ /* Drop the allocation level of all registers. Any allocated
+ * during this register set will drop to 0 and then become
+ * available.
+ */
+ for (i = 0; i < SPE_NUM_REGS; i++) {
+ if (p->regs[i] > 0)
+ p->regs[i]--;
+ }
+}
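/* Illustrative only (not in this patch): bracketing a code-gen sequence
 * with a register set releases all of its temporaries in one call.
 * Here rT, rA and rB are assumed to be registers owned by the caller.
 *
 *    spe_allocate_register_set(p);
 *    t0 = spe_allocate_available_register(p);
 *    t1 = spe_allocate_available_register(p);
 *    spe_fm(p, t0, rA, rA);           t0 = rA * rA
 *    spe_fma(p, t1, rB, rB, t0);      t1 = rB * rB + t0
 *    spe_move(p, rT, t1);
 *    spe_release_register_set(p);     frees t0 and t1 together
 */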
- p->regs[idx] |= (1ULL << bit);
+
+unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+ unsigned i, num = 0;
+ /* only count registers in the range available to callers */
+ for (i = 2; i < 80; i++) {
+ if (p->regs[i]) {
+ used[num++] = i;
+ }
+ }
+ return num;
+}
+
+
+void
+spe_print_code(struct spe_function *p, boolean enable)
+{
+ p->print = enable;
+}
+
+
+void
+spe_indent(struct spe_function *p, int spaces)
+{
+ p->indent += spaces;
+}
+
+
+void
+spe_comment(struct spe_function *p, int rel_indent, const char *s)
+{
+ if (p->print) {
+ p->indent += rel_indent;
+ indent(p);
+ p->indent -= rel_indent;
+ printf("# %s\n", s);
+ }
+}
+
+
+/**
+ * Load quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+ const boolean pSave = p->print;
+
+ /* offset must be a multiple of 16 */
+ assert(offset % 16 == 0);
+ /* offset must fit in 10-bit signed int field, after shifting */
+ assert((offset >> 4) <= 511);
+ assert((offset >> 4) >= -512);
+
+ p->print = FALSE;
+ emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
+ p->print = pSave;
+
+ if (p->print) {
+ indent(p);
+ printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+ }
+}
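/* Example (illustrative): spe_lqd(p, rT, SPE_REG_SP, 32) passes the
 * alignment checks, encodes the I10 field as 32 >> 4 = 2, and is dumped
 * as "lqd $<rT>, 32($sp)" when p->print is enabled.
 */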
+
+
+/**
+ * Store quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+ const boolean pSave = p->print;
+
+ /* offset must be a multiple of 16 */
+ assert(offset % 16 == 0);
+ /* offset must fit in 10-bit signed int field, after shifting */
+ assert((offset >> 4) <= 511);
+ assert((offset >> 4) >= -512);
+
+ p->print = FALSE;
+ emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
+ p->print = pSave;
+
+ if (p->print) {
+ indent(p);
+ printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+ }
}
@@ -392,51 +651,51 @@ void spe_release_register(struct spe_function *p, int reg)
/** Branch Indirect to address in rA */
void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Interupt Return */
void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect and set link on external data */
void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
int e)
{
- emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect and set link. Save PC in rT, jump to rA. */
void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
int e)
{
- emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect if zero word. If rT.word[0]==0, jump to rA. */
void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect if non-zero word. If rT.word[0]!=0, jump to rA. */
void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect if zero halfword. If rT.halfword[1]==0, jump to rA. */
void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
/** Branch indirect if non-zero halfword. If rT.halfword[1]!=0, jump to rA. */
void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
{
- emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+ emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
}
@@ -505,30 +764,226 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
}
else {
spe_ilhu(p, rT, i >> 16);
- spe_iohl(p, rT, i & 0xffff);
+ if (i & 0xffff)
+ spe_iohl(p, rT, i & 0xffff);
+ }
+}
+
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+ /* If the whole value is in the lower 18 bits, use ila, which
+ * doesn't sign-extend. Otherwise, if the two halfwords of
+ * the constant are identical, use ilh. Otherwise, if every byte of
+ * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+ * Bytes Immediate (fsmbi) to load the value in a single instruction.
+ * Otherwise, in the general case, we have to use ilhu followed by iohl.
+ */
+ if ((ui & 0x0003ffff) == ui) {
+ spe_ila(p, rT, ui);
+ }
+ else if ((ui >> 16) == (ui & 0xffff)) {
+ spe_ilh(p, rT, ui & 0xffff);
+ }
+ else if (
+ ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+ ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+ ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+ ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+ ) {
+ unsigned int mask = 0;
+ /* fsmbi duplicates each bit in the given mask eight times,
+ * using a 16-bit value to initialize a 16-byte quadword.
+ * Each 4-bit nybble of the mask corresponds to a full word
+ * of the result; look at the value and figure out the mask
+ * (replicated for each word in the quadword), and then
+ * form the "select mask" to get the value.
+ */
+ if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+ if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+ if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+ if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+ spe_fsmbi(p, rT, mask);
+ }
+ else {
+ /* The general case: this usually uses two instructions, but
+ * may use only one if the low-order 16 bits of each word are 0.
+ */
+ spe_ilhu(p, rT, ui >> 16);
+ if (ui & 0xffff)
+ spe_iohl(p, rT, ui & 0xffff);
+ }
+}
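/* Worked example (illustrative): spe_load_uint(p, rT, 0xff0000ff) takes
 * the fsmbi path: the low byte and the high byte of each word are 0xff,
 * so mask = 0x1111 | 0x8888 = 0x9999, and
 *     fsmbi rT, 0x9999
 * yields bytes ff 00 00 ff in every word, i.e. 0xff0000ff -- a single
 * instruction instead of ilhu + iohl.
 */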
+
+/**
+ * This function is constructed identically to spe_xor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+ /* If we can, emit a single instruction, either And Byte Immediate
+ * (which uses the same constant across each byte), And Halfword Immediate
+ * (which sign-extends a 10-bit immediate to 16 bits and uses that
+ * across each halfword), or And Word Immediate (which sign-extends
+ * a 10-bit immediate to 32 bits).
+ *
+ * Otherwise, we'll need to use a temporary register.
+ */
+ unsigned int tmp;
+
+ /* If the upper 23 bits are all 0s or all 1s, sign extension
+ * will work and we can use And Word Immediate
+ */
+ tmp = ui & 0xfffffe00;
+ if (tmp == 0xfffffe00 || tmp == 0) {
+ spe_andi(p, rT, rA, ui & 0x000003ff);
+ return;
+ }
+
+ /* If the ui field is symmetric along halfword boundaries and
+ * the upper 7 bits of each halfword are all 0s or 1s, we
+ * can use And Halfword Immediate
+ */
+ tmp = ui & 0xfe00fe00;
+ if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+ spe_andhi(p, rT, rA, ui & 0x000003ff);
+ return;
+ }
+
+ /* If the ui field is symmetric in each byte, then we can use
+ * the And Byte Immediate instruction.
+ */
+ tmp = ui & 0x000000ff;
+ if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+ spe_andbi(p, rT, rA, tmp);
+ return;
+ }
+
+ /* Otherwise, we'll have to use a temporary register. */
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ spe_load_uint(p, tmp_reg, ui);
+ spe_and(p, rT, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
+}
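/* Examples (illustrative): spe_and_uint(p, rT, rA, 0x000000ff) hits the
 * And Word Immediate case and emits andi rT, rA, 0xff, while
 * spe_and_uint(p, rT, rA, 0x00ff00ff) matches the halfword test and emits
 * andhi rT, rA, 0xff; only irregular masks fall back to the
 * spe_load_uint + spe_and pair through a temporary register.
 */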
+
+
+/**
+ * This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+ /* If we can, emit a single instruction, either Exclusive Or Byte
+ * Immediate (which uses the same constant across each byte), Exclusive
+ * Or Halfword Immediate (which sign-extends a 10-bit immediate to
+ * 16 bits and uses that across each halfword), or Exclusive Or Word
+ * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+ *
+ * Otherwise, we'll need to use a temporary register.
+ */
+ unsigned int tmp;
+
+ /* If the upper 23 bits are all 0s or all 1s, sign extension
+ * will work and we can use Exclusive Or Word Immediate
+ */
+ tmp = ui & 0xfffffe00;
+ if (tmp == 0xfffffe00 || tmp == 0) {
+ spe_xori(p, rT, rA, ui & 0x000003ff);
+ return;
+ }
+
+ /* If the ui field is symmetric along halfword boundaries and
+ * the upper 7 bits of each halfword are all 0s or 1s, we
+ * can use Exclusive Or Halfword Immediate
+ */
+ tmp = ui & 0xfe00fe00;
+ if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+ spe_xorhi(p, rT, rA, ui & 0x000003ff);
+ return;
+ }
+
+ /* If the ui field is symmetric in each byte, then we can use
+ * the Exclusive Or Byte Immediate instruction.
+ */
+ tmp = ui & 0x000000ff;
+ if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+ spe_xorbi(p, rT, rA, tmp);
+ return;
}
+
+ /* Otherwise, we'll have to use a temporary register. */
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ spe_load_uint(p, tmp_reg, ui);
+ spe_xor(p, rT, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
}
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+ /* If the comparison value is 9 bits or less, it fits inside a
+ * Compare Equal Word Immediate instruction.
+ */
+ if ((ui & 0x000001ff) == ui) {
+ spe_ceqi(p, rT, rA, ui);
+ }
+ /* Otherwise, we're going to have to load a word first. */
+ else {
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ spe_load_uint(p, tmp_reg, ui);
+ spe_ceq(p, rT, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
+ }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value fits in 9 bits, it can be encoded in the
+    * sign-extended 10-bit immediate of a Compare Logical Greater Than
+    * Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+ spe_clgti(p, rT, rA, ui);
+ }
+ /* Otherwise, we're going to have to load a word first. */
+ else {
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ spe_load_uint(p, tmp_reg, ui);
+ spe_clgt(p, rT, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
+ }
+}
void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
{
- spe_ila(p, rT, 66051);
- spe_shufb(p, rT, rA, rA, rT);
+ /* Use a temporary, just in case rT == rA */
+ unsigned int tmp_reg = spe_allocate_available_register(p);
+ /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+ spe_ila(p, tmp_reg, 0x00010203);
+ spe_shufb(p, rT, rA, rA, tmp_reg);
+ spe_release_register(p, tmp_reg);
}
void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
{
- spe_nor(p, rT, rT, rT);
+ spe_nor(p, rT, rA, rA);
}
void
spe_move(struct spe_function *p, unsigned rT, unsigned rA)
{
- spe_ori(p, rT, rA, 0);
+ /* Use different instructions depending on the instruction address
+ * to take advantage of the dual pipelines.
+ */
+ if (p->num_inst & 1)
+ spe_shlqbyi(p, rT, rA, 0); /* odd pipe */
+ else
+ spe_ori(p, rT, rA, 0); /* even pipe */
}
@@ -539,4 +994,70 @@ spe_zero(struct spe_function *p, unsigned rT)
}
+void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+{
+ assert(word >= 0);
+ assert(word <= 3);
+
+ if (word == 0) {
+ int tmp1 = rT;
+ spe_ila(p, tmp1, 66051);
+ spe_shufb(p, rT, rA, rA, tmp1);
+ }
+ else {
+ /* XXX review this, we may not need the rotqbyi instruction */
+ int tmp1 = rT;
+ int tmp2 = spe_allocate_available_register(p);
+
+ spe_ila(p, tmp1, 66051);
+ spe_rotqbyi(p, tmp2, rA, 4 * word);
+ spe_shufb(p, rT, tmp2, tmp2, tmp1);
+
+ spe_release_register(p, tmp2);
+ }
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ *
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rB > rA, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rT == rB.  But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+ unsigned int compare_reg = spe_allocate_available_register(p);
+ spe_fcgt(p, compare_reg, rA, rB);
+ spe_selb(p, rT, rA, rB, compare_reg);
+ spe_release_register(p, compare_reg);
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ *
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the registers on spe_selb() have been reversed,
+ * so that the larger of the two is selected instead of the smaller.
+ */
+void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+ unsigned int compare_reg = spe_allocate_available_register(p);
+ spe_fcgt(p, compare_reg, rA, rB);
+ spe_selb(p, rT, rB, rA, compare_reg);
+ spe_release_register(p, compare_reg);
+}
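/* Illustrative combination (not in this patch): clamp each float element
 * of rX to [0, 1], assuming rZero and rOne already hold splatted 0.0f
 * and 1.0f:
 *    spe_float_max(p, rT, rX, rZero);      rT = max(rX, 0.0)
 *    spe_float_min(p, rT, rT, rOne);       rT = min(rT, 1.0)
 */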
+
#endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d95e5aace3..7c211ffc51 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -28,6 +28,7 @@
* For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
*
* \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
*/
#ifndef RTASM_PPC_SPE_H
@@ -39,10 +40,10 @@
/** number of general-purpose SIMD registers */
#define SPE_NUM_REGS 128
-/** Return Address register */
+/** Return Address register (aka $lr / Link Register) */
#define SPE_REG_RA 0
-/** Stack Pointer register */
+/** Stack Pointer register (aka $sp) */
#define SPE_REG_SP 1
@@ -52,221 +53,252 @@ struct spe_function
uint num_inst;
uint max_inst;
- /**
- * Mask of used / unused registers
- *
- * Each set bit corresponds to an available register. Each cleared bit
- * corresponds to an allocated register.
+ /**
+ * The "set count" reflects the number of nested register sets
+ * are allowed. In the unlikely case that we exceed the set count,
+ * register allocation will start to be confused, which is critical
+ * enough that we check for it.
+ */
+ unsigned char set_count;
+
+ /**
+ * Flags for used and unused registers. Each byte corresponds to a
+ * register; a 0 in that byte means that the register is available.
+ * A value of 1 means that the register was allocated in the current
+ * register set. Any other value N means that the register was allocated
+ * N register sets ago.
*
* \sa
* spe_allocate_register, spe_allocate_available_register,
- * spe_release_register
+    * spe_allocate_register_set, spe_release_register_set, spe_release_register
*/
- uint64_t regs[SPE_NUM_REGS / 64];
+ unsigned char regs[SPE_NUM_REGS];
+
+ boolean print; /**< print/dump instructions as they're emitted? */
+ int indent; /**< number of spaces to indent */
};
+
extern void spe_init_func(struct spe_function *p, unsigned code_size);
extern void spe_release_func(struct spe_function *p);
+extern unsigned spe_code_size(const struct spe_function *p);
extern int spe_allocate_available_register(struct spe_function *p);
extern int spe_allocate_register(struct spe_function *p, int reg);
extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
+
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+
+extern void spe_print_code(struct spe_function *p, boolean enable);
+extern void spe_indent(struct spe_function *p, int spaces);
+extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
+
#endif /* RTASM_PPC_SPE_H */
#ifndef EMIT_
-#define EMIT_(name, _op) \
- extern void _name (struct spe_function *p, unsigned rT)
+#define EMIT_(_name, _op) \
+ extern void _name (struct spe_function *p, unsigned rT);
#define EMIT_R(_name, _op) \
- extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
+ extern void _name (struct spe_function *p, unsigned rT, unsigned rA);
#define EMIT_RR(_name, _op) \
extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
- unsigned rB)
+ unsigned rB);
#define EMIT_RRR(_name, _op) \
extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
- unsigned rB, unsigned rC)
+ unsigned rB, unsigned rC);
#define EMIT_RI7(_name, _op) \
extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
- int imm)
+ int imm);
#define EMIT_RI8(_name, _op, bias) \
extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
- int imm)
+ int imm);
#define EMIT_RI10(_name, _op) \
extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
- int imm)
+ int imm);
+#define EMIT_RI10s(_name, _op) \
+ extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+ int imm);
#define EMIT_RI16(_name, _op) \
- extern void _name (struct spe_function *p, unsigned rT, int imm)
+ extern void _name (struct spe_function *p, unsigned rT, int imm);
#define EMIT_RI18(_name, _op) \
- extern void _name (struct spe_function *p, unsigned rT, int imm)
+ extern void _name (struct spe_function *p, unsigned rT, int imm);
#define EMIT_I16(_name, _op) \
- extern void _name (struct spe_function *p, int imm)
+ extern void _name (struct spe_function *p, int imm);
#define UNDEF_EMIT_MACROS
#endif /* EMIT_ */
/* Memory load / store instructions
*/
-EMIT_RI10(spe_lqd, 0x034);
-EMIT_RR (spe_lqx, 0x1c4);
-EMIT_RI16(spe_lqa, 0x061);
-EMIT_RI16(spe_lqr, 0x067);
-EMIT_RI10(spe_stqd, 0x024);
-EMIT_RR (spe_stqx, 0x144);
-EMIT_RI16(spe_stqa, 0x041);
-EMIT_RI16(spe_stqr, 0x047);
-EMIT_RI7 (spe_cbd, 0x1f4);
-EMIT_RR (spe_cbx, 0x1d4);
-EMIT_RI7 (spe_chd, 0x1f5);
-EMIT_RI7 (spe_chx, 0x1d5);
-EMIT_RI7 (spe_cwd, 0x1f6);
-EMIT_RI7 (spe_cwx, 0x1d6);
-EMIT_RI7 (spe_cdd, 0x1f7);
-EMIT_RI7 (spe_cdx, 0x1d7);
+EMIT_RR (spe_lqx, 0x1c4)
+EMIT_RI16(spe_lqa, 0x061)
+EMIT_RI16(spe_lqr, 0x067)
+EMIT_RR (spe_stqx, 0x144)
+EMIT_RI16(spe_stqa, 0x041)
+EMIT_RI16(spe_stqr, 0x047)
+EMIT_RI7 (spe_cbd, 0x1f4)
+EMIT_RR (spe_cbx, 0x1d4)
+EMIT_RI7 (spe_chd, 0x1f5)
+EMIT_RI7 (spe_chx, 0x1d5)
+EMIT_RI7 (spe_cwd, 0x1f6)
+EMIT_RI7 (spe_cwx, 0x1d6)
+EMIT_RI7 (spe_cdd, 0x1f7)
+EMIT_RI7 (spe_cdx, 0x1d7)
/* Constant formation instructions
*/
-EMIT_RI16(spe_ilh, 0x083);
-EMIT_RI16(spe_ilhu, 0x082);
-EMIT_RI16(spe_il, 0x081);
-EMIT_RI18(spe_ila, 0x021);
-EMIT_RI16(spe_iohl, 0x0c1);
-EMIT_RI16(spe_fsmbi, 0x065);
+EMIT_RI16(spe_ilh, 0x083)
+EMIT_RI16(spe_ilhu, 0x082)
+EMIT_RI16(spe_il, 0x081)
+EMIT_RI18(spe_ila, 0x021)
+EMIT_RI16(spe_iohl, 0x0c1)
+EMIT_RI16(spe_fsmbi, 0x065)
/* Integer and logical instructions
*/
-EMIT_RR (spe_ah, 0x0c8);
-EMIT_RI10(spe_ahi, 0x01d);
-EMIT_RR (spe_a, 0x0c0);
-EMIT_RI10(spe_ai, 0x01c);
-EMIT_RR (spe_sfh, 0x048);
-EMIT_RI10(spe_sfhi, 0x00d);
-EMIT_RR (spe_sf, 0x040);
-EMIT_RI10(spe_sfi, 0x00c);
-EMIT_RR (spe_addx, 0x340);
-EMIT_RR (spe_cg, 0x0c2);
-EMIT_RR (spe_cgx, 0x342);
-EMIT_RR (spe_sfx, 0x341);
-EMIT_RR (spe_bg, 0x042);
-EMIT_RR (spe_bgx, 0x343);
-EMIT_RR (spe_mpy, 0x3c4);
-EMIT_RR (spe_mpyu, 0x3cc);
-EMIT_RI10(spe_mpyi, 0x074);
-EMIT_RI10(spe_mpyui, 0x075);
-EMIT_RRR (spe_mpya, 0x00c);
-EMIT_RR (spe_mpyh, 0x3c5);
-EMIT_RR (spe_mpys, 0x3c7);
-EMIT_RR (spe_mpyhh, 0x3c6);
-EMIT_RR (spe_mpyhha, 0x346);
-EMIT_RR (spe_mpyhhu, 0x3ce);
-EMIT_RR (spe_mpyhhau, 0x34e);
-EMIT_R (spe_clz, 0x2a5);
-EMIT_R (spe_cntb, 0x2b4);
-EMIT_R (spe_fsmb, 0x1b6);
-EMIT_R (spe_fsmh, 0x1b5);
-EMIT_R (spe_fsm, 0x1b4);
-EMIT_R (spe_gbb, 0x1b2);
-EMIT_R (spe_gbh, 0x1b1);
-EMIT_R (spe_gb, 0x1b0);
-EMIT_RR (spe_avgb, 0x0d3);
-EMIT_RR (spe_absdb, 0x053);
-EMIT_RR (spe_sumb, 0x253);
-EMIT_R (spe_xsbh, 0x2b6);
-EMIT_R (spe_xshw, 0x2ae);
-EMIT_R (spe_xswd, 0x2a6);
-EMIT_RR (spe_and, 0x0c1);
-EMIT_RR (spe_andc, 0x2c1);
-EMIT_RI10(spe_andbi, 0x016);
-EMIT_RI10(spe_andhi, 0x015);
-EMIT_RI10(spe_andi, 0x014);
-EMIT_RR (spe_or, 0x041);
-EMIT_RR (spe_orc, 0x2c9);
-EMIT_RI10(spe_orbi, 0x006);
-EMIT_RI10(spe_orhi, 0x005);
-EMIT_RI10(spe_ori, 0x004);
-EMIT_R (spe_orx, 0x1f0);
-EMIT_RR (spe_xor, 0x241);
-EMIT_RI10(spe_xorbi, 0x026);
-EMIT_RI10(spe_xorhi, 0x025);
-EMIT_RI10(spe_xori, 0x024);
-EMIT_RR (spe_nand, 0x0c9);
-EMIT_RR (spe_nor, 0x049);
-EMIT_RR (spe_eqv, 0x249);
-EMIT_RRR (spe_selb, 0x008);
-EMIT_RRR (spe_shufb, 0x00b);
+EMIT_RR (spe_ah, 0x0c8)
+EMIT_RI10(spe_ahi, 0x01d)
+EMIT_RR (spe_a, 0x0c0)
+EMIT_RI10s(spe_ai, 0x01c)
+EMIT_RR (spe_sfh, 0x048)
+EMIT_RI10(spe_sfhi, 0x00d)
+EMIT_RR (spe_sf, 0x040)
+EMIT_RI10(spe_sfi, 0x00c)
+EMIT_RR (spe_addx, 0x340)
+EMIT_RR (spe_cg, 0x0c2)
+EMIT_RR (spe_cgx, 0x342)
+EMIT_RR (spe_sfx, 0x341)
+EMIT_RR (spe_bg, 0x042)
+EMIT_RR (spe_bgx, 0x343)
+EMIT_RR (spe_mpy, 0x3c4)
+EMIT_RR (spe_mpyu, 0x3cc)
+EMIT_RI10(spe_mpyi, 0x074)
+EMIT_RI10(spe_mpyui, 0x075)
+EMIT_RRR (spe_mpya, 0x00c)
+EMIT_RR (spe_mpyh, 0x3c5)
+EMIT_RR (spe_mpys, 0x3c7)
+EMIT_RR (spe_mpyhh, 0x3c6)
+EMIT_RR (spe_mpyhha, 0x346)
+EMIT_RR (spe_mpyhhu, 0x3ce)
+EMIT_RR (spe_mpyhhau, 0x34e)
+EMIT_R (spe_clz, 0x2a5)
+EMIT_R (spe_cntb, 0x2b4)
+EMIT_R (spe_fsmb, 0x1b6)
+EMIT_R (spe_fsmh, 0x1b5)
+EMIT_R (spe_fsm, 0x1b4)
+EMIT_R (spe_gbb, 0x1b2)
+EMIT_R (spe_gbh, 0x1b1)
+EMIT_R (spe_gb, 0x1b0)
+EMIT_RR (spe_avgb, 0x0d3)
+EMIT_RR (spe_absdb, 0x053)
+EMIT_RR (spe_sumb, 0x253)
+EMIT_R (spe_xsbh, 0x2b6)
+EMIT_R (spe_xshw, 0x2ae)
+EMIT_R (spe_xswd, 0x2a6)
+EMIT_RR (spe_and, 0x0c1)
+EMIT_RR (spe_andc, 0x2c1)
+EMIT_RI10s(spe_andbi, 0x016)
+EMIT_RI10s(spe_andhi, 0x015)
+EMIT_RI10s(spe_andi, 0x014)
+EMIT_RR (spe_or, 0x041)
+EMIT_RR (spe_orc, 0x2c9)
+EMIT_RI10s(spe_orbi, 0x006)
+EMIT_RI10s(spe_orhi, 0x005)
+EMIT_RI10s(spe_ori, 0x004)
+EMIT_R (spe_orx, 0x1f0)
+EMIT_RR (spe_xor, 0x241)
+EMIT_RI10s(spe_xorbi, 0x046)
+EMIT_RI10s(spe_xorhi, 0x045)
+EMIT_RI10s(spe_xori, 0x044)
+EMIT_RR (spe_nand, 0x0c9)
+EMIT_RR (spe_nor, 0x049)
+EMIT_RR (spe_eqv, 0x249)
+EMIT_RRR (spe_selb, 0x008)
+EMIT_RRR (spe_shufb, 0x00b)
/* Shift and rotate instructions
*/
-EMIT_RR (spe_shlh, 0x05f);
-EMIT_RI7 (spe_shlhi, 0x07f);
-EMIT_RR (spe_shl, 0x05b);
-EMIT_RI7 (spe_shli, 0x07b);
-EMIT_RR (spe_shlqbi, 0x1db);
-EMIT_RI7 (spe_shlqbii, 0x1fb);
-EMIT_RR (spe_shlqby, 0x1df);
-EMIT_RI7 (spe_shlqbyi, 0x1ff);
-EMIT_RR (spe_shlqbybi, 0x1cf);
-EMIT_RR (spe_roth, 0x05c);
-EMIT_RI7 (spe_rothi, 0x07c);
-EMIT_RR (spe_rot, 0x058);
-EMIT_RI7 (spe_roti, 0x078);
-EMIT_RR (spe_rotqby, 0x1dc);
-EMIT_RI7 (spe_rotqbyi, 0x1fc);
-EMIT_RR (spe_rotqbybi, 0x1cc);
-EMIT_RR (spe_rotqbi, 0x1d8);
-EMIT_RI7 (spe_rotqbii, 0x1f8);
-EMIT_RR (spe_rothm, 0x05d);
-EMIT_RI7 (spe_rothmi, 0x07d);
-EMIT_RR (spe_rotm, 0x059);
-EMIT_RI7 (spe_rotmi, 0x079);
-EMIT_RR (spe_rotqmby, 0x1dd);
-EMIT_RI7 (spe_rotqmbyi, 0x1fd);
-EMIT_RR (spe_rotqmbybi, 0x1cd);
-EMIT_RR (spe_rotqmbi, 0x1c9);
-EMIT_RI7 (spe_rotqmbii, 0x1f9);
-EMIT_RR (spe_rotmah, 0x05e);
-EMIT_RI7 (spe_rotmahi, 0x07e);
-EMIT_RR (spe_rotma, 0x05a);
-EMIT_RI7 (spe_rotmai, 0x07a);
+EMIT_RR (spe_shlh, 0x05f)
+EMIT_RI7 (spe_shlhi, 0x07f)
+EMIT_RR (spe_shl, 0x05b)
+EMIT_RI7 (spe_shli, 0x07b)
+EMIT_RR (spe_shlqbi, 0x1db)
+EMIT_RI7 (spe_shlqbii, 0x1fb)
+EMIT_RR (spe_shlqby, 0x1df)
+EMIT_RI7 (spe_shlqbyi, 0x1ff)
+EMIT_RR (spe_shlqbybi, 0x1cf)
+EMIT_RR (spe_roth, 0x05c)
+EMIT_RI7 (spe_rothi, 0x07c)
+EMIT_RR (spe_rot, 0x058)
+EMIT_RI7 (spe_roti, 0x078)
+EMIT_RR (spe_rotqby, 0x1dc)
+EMIT_RI7 (spe_rotqbyi, 0x1fc)
+EMIT_RR (spe_rotqbybi, 0x1cc)
+EMIT_RR (spe_rotqbi, 0x1d8)
+EMIT_RI7 (spe_rotqbii, 0x1f8)
+EMIT_RR (spe_rothm, 0x05d)
+EMIT_RI7 (spe_rothmi, 0x07d)
+EMIT_RR (spe_rotm, 0x059)
+EMIT_RI7 (spe_rotmi, 0x079)
+EMIT_RR (spe_rotqmby, 0x1dd)
+EMIT_RI7 (spe_rotqmbyi, 0x1fd)
+EMIT_RR (spe_rotqmbybi, 0x1cd)
+EMIT_RR (spe_rotqmbi, 0x1c9)
+EMIT_RI7 (spe_rotqmbii, 0x1f9)
+EMIT_RR (spe_rotmah, 0x05e)
+EMIT_RI7 (spe_rotmahi, 0x07e)
+EMIT_RR (spe_rotma, 0x05a)
+EMIT_RI7 (spe_rotmai, 0x07a)
/* Compare, branch, and halt instructions
*/
-EMIT_RR (spe_heq, 0x3d8);
-EMIT_RI10(spe_heqi, 0x07f);
-EMIT_RR (spe_hgt, 0x258);
-EMIT_RI10(spe_hgti, 0x04f);
-EMIT_RR (spe_hlgt, 0x2d8);
-EMIT_RI10(spe_hlgti, 0x05f);
-EMIT_RR (spe_ceqb, 0x3d0);
-EMIT_RI10(spe_ceqbi, 0x07e);
-EMIT_RR (spe_ceqh, 0x3c8);
-EMIT_RI10(spe_ceqhi, 0x07d);
-EMIT_RR (spe_ceq, 0x3c0);
-EMIT_RI10(spe_ceqi, 0x07c);
-EMIT_RR (spe_cgtb, 0x250);
-EMIT_RI10(spe_cgtbi, 0x04e);
-EMIT_RR (spe_cgth, 0x248);
-EMIT_RI10(spe_cgthi, 0x04d);
-EMIT_RR (spe_cgt, 0x240);
-EMIT_RI10(spe_cgti, 0x04c);
-EMIT_RR (spe_clgtb, 0x2d0);
-EMIT_RI10(spe_clgtbi, 0x05e);
-EMIT_RR (spe_clgth, 0x2c8);
-EMIT_RI10(spe_clgthi, 0x05d);
-EMIT_RR (spe_clgt, 0x2c0);
-EMIT_RI10(spe_clgti, 0x05c);
-EMIT_I16 (spe_br, 0x064);
-EMIT_I16 (spe_bra, 0x060);
-EMIT_RI16(spe_brsl, 0x066);
-EMIT_RI16(spe_brasl, 0x062);
-EMIT_RI16(spe_brnz, 0x042);
-EMIT_RI16(spe_brz, 0x040);
-EMIT_RI16(spe_brhnz, 0x046);
-EMIT_RI16(spe_brhz, 0x044);
+EMIT_RR (spe_heq, 0x3d8)
+EMIT_RI10(spe_heqi, 0x07f)
+EMIT_RR (spe_hgt, 0x258)
+EMIT_RI10(spe_hgti, 0x04f)
+EMIT_RR (spe_hlgt, 0x2d8)
+EMIT_RI10(spe_hlgti, 0x05f)
+EMIT_RR (spe_ceqb, 0x3d0)
+EMIT_RI10(spe_ceqbi, 0x07e)
+EMIT_RR (spe_ceqh, 0x3c8)
+EMIT_RI10(spe_ceqhi, 0x07d)
+EMIT_RR (spe_ceq, 0x3c0)
+EMIT_RI10(spe_ceqi, 0x07c)
+EMIT_RR (spe_cgtb, 0x250)
+EMIT_RI10(spe_cgtbi, 0x04e)
+EMIT_RR (spe_cgth, 0x248)
+EMIT_RI10(spe_cgthi, 0x04d)
+EMIT_RR (spe_cgt, 0x240)
+EMIT_RI10(spe_cgti, 0x04c)
+EMIT_RR (spe_clgtb, 0x2d0)
+EMIT_RI10(spe_clgtbi, 0x05e)
+EMIT_RR (spe_clgth, 0x2c8)
+EMIT_RI10(spe_clgthi, 0x05d)
+EMIT_RR (spe_clgt, 0x2c0)
+EMIT_RI10(spe_clgti, 0x05c)
+EMIT_I16 (spe_br, 0x064)
+EMIT_I16 (spe_bra, 0x060)
+EMIT_RI16(spe_brsl, 0x066)
+EMIT_RI16(spe_brasl, 0x062)
+EMIT_RI16(spe_brnz, 0x042)
+EMIT_RI16(spe_brz, 0x040)
+EMIT_RI16(spe_brhnz, 0x046)
+EMIT_RI16(spe_brhz, 0x044)
+
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
@@ -292,13 +324,33 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
extern void
spe_load_int(struct spe_function *p, unsigned rT, int i);
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
/** Replicate word 0 of rA across rT. */
extern void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
/** rT = rA. */
extern void
@@ -308,49 +360,61 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA);
extern void
spe_zero(struct spe_function *p, unsigned rT);
+/** rT = splat(rA, word) */
+extern void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
/* Floating-point instructions
*/
-EMIT_RR (spe_fa, 0x2c4);
-EMIT_RR (spe_dfa, 0x2cc);
-EMIT_RR (spe_fs, 0x2c5);
-EMIT_RR (spe_dfs, 0x2cd);
-EMIT_RR (spe_fm, 0x2c6);
-EMIT_RR (spe_dfm, 0x2ce);
-EMIT_RRR (spe_fma, 0x00e);
-EMIT_RR (spe_dfma, 0x35c);
-EMIT_RRR (spe_fnms, 0x00d);
-EMIT_RR (spe_dfnms, 0x35e);
-EMIT_RRR (spe_fms, 0x00f);
-EMIT_RR (spe_dfms, 0x35d);
-EMIT_RR (spe_dfnma, 0x35f);
-EMIT_R (spe_frest, 0x1b8);
-EMIT_R (spe_frsqest, 0x1b9);
-EMIT_RR (spe_fi, 0x3d4);
-EMIT_RI8 (spe_csflt, 0x1da, 155);
-EMIT_RI8 (spe_cflts, 0x1d8, 173);
-EMIT_RI8 (spe_cuflt, 0x1db, 155);
-EMIT_RI8 (spe_cfltu, 0x1d9, 173);
-EMIT_R (spe_frds, 0x3b9);
-EMIT_R (spe_fesd, 0x3b8);
-EMIT_RR (spe_dfceq, 0x3c3);
-EMIT_RR (spe_dfcmeq, 0x3cb);
-EMIT_RR (spe_dfcgt, 0x2c3);
-EMIT_RR (spe_dfcmgt, 0x2cb);
-EMIT_RI7 (spe_dftsv, 0x3bf);
-EMIT_RR (spe_fceq, 0x3c2);
-EMIT_RR (spe_fcmeq, 0x3ca);
-EMIT_RR (spe_fcgt, 0x2c2);
-EMIT_RR (spe_fcmgt, 0x2ca);
-EMIT_R (spe_fscrwr, 0x3ba);
-EMIT_ (spe_fscrrd, 0x398);
+EMIT_RR (spe_fa, 0x2c4)
+EMIT_RR (spe_dfa, 0x2cc)
+EMIT_RR (spe_fs, 0x2c5)
+EMIT_RR (spe_dfs, 0x2cd)
+EMIT_RR (spe_fm, 0x2c6)
+EMIT_RR (spe_dfm, 0x2ce)
+EMIT_RRR (spe_fma, 0x00e)
+EMIT_RR (spe_dfma, 0x35c)
+EMIT_RRR (spe_fnms, 0x00d)
+EMIT_RR (spe_dfnms, 0x35e)
+EMIT_RRR (spe_fms, 0x00f)
+EMIT_RR (spe_dfms, 0x35d)
+EMIT_RR (spe_dfnma, 0x35f)
+EMIT_R (spe_frest, 0x1b8)
+EMIT_R (spe_frsqest, 0x1b9)
+EMIT_RR (spe_fi, 0x3d4)
+EMIT_RI8 (spe_csflt, 0x1da, 155)
+EMIT_RI8 (spe_cflts, 0x1d8, 173)
+EMIT_RI8 (spe_cuflt, 0x1db, 155)
+EMIT_RI8 (spe_cfltu, 0x1d9, 173)
+EMIT_R (spe_frds, 0x3b9)
+EMIT_R (spe_fesd, 0x3b8)
+EMIT_RR (spe_dfceq, 0x3c3)
+EMIT_RR (spe_dfcmeq, 0x3cb)
+EMIT_RR (spe_dfcgt, 0x2c3)
+EMIT_RR (spe_dfcmgt, 0x2cb)
+EMIT_RI7 (spe_dftsv, 0x3bf)
+EMIT_RR (spe_fceq, 0x3c2)
+EMIT_RR (spe_fcmeq, 0x3ca)
+EMIT_RR (spe_fcgt, 0x2c2)
+EMIT_RR (spe_fcmgt, 0x2ca)
+EMIT_R (spe_fscrwr, 0x3ba)
+EMIT_ (spe_fscrrd, 0x398)
/* Channel instructions
*/
-EMIT_R (spe_rdch, 0x00d);
-EMIT_R (spe_rdchcnt, 0x00f);
-EMIT_R (spe_wrch, 0x10d);
+EMIT_R (spe_rdch, 0x00d)
+EMIT_R (spe_rdchcnt, 0x00f)
+EMIT_R (spe_wrch, 0x10d)
#ifdef UNDEF_EMIT_MACROS
@@ -361,6 +425,7 @@ EMIT_R (spe_wrch, 0x10d);
#undef EMIT_RI7
#undef EMIT_RI8
#undef EMIT_RI10
+#undef EMIT_RI10s
#undef EMIT_RI16
#undef EMIT_RI18
#undef EMIT_I16
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index ad9d8f8ced..99ee74cf14 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -240,7 +240,8 @@ static void emit_modrm( struct x86_function *p,
/* Oh-oh we've stumbled into the SIB thing.
*/
if (regmem.file == file_REG32 &&
- regmem.idx == reg_SP) {
+ regmem.idx == reg_SP &&
+ regmem.mod != mod_REG) {
emit_1ub(p, 0x24); /* simplistic! */
}
@@ -439,25 +440,70 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
}
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
{
DUMP_RI( dst, imm );
+ assert(dst.file == file_REG32);
assert(dst.mod == mod_REG);
emit_1ub(p, 0xb8 + dst.idx);
emit_1i(p, imm);
}
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm )
+/**
+ * Immediate group 1 instructions.
+ */
+static INLINE void
+x86_group1_imm( struct x86_function *p,
+ unsigned op, struct x86_reg dst, int imm )
{
- DUMP_RI( dst, imm );
+ assert(dst.file == file_REG32);
assert(dst.mod == mod_REG);
- emit_1ub(p, 0x80);
- emit_modrm_noreg(p, 0, dst);
- emit_1ub(p, imm);
+ if(-0x80 <= imm && imm < 0x80) {
+ emit_1ub(p, 0x83);
+ emit_modrm_noreg(p, op, dst);
+ emit_1b(p, (char)imm);
+ }
+ else {
+ emit_1ub(p, 0x81);
+ emit_modrm_noreg(p, op, dst);
+ emit_1i(p, imm);
+ }
+}
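+
+/* For example, x86_sub_imm(p, x86_make_reg(file_REG32, reg_SP), 16) uses the
+ * sign-extended imm8 form and assembles to "83 ec 10" (sub esp, 16) rather
+ * than the 6-byte "81 ec 10 00 00 00" encoding.
+ */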
+
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 0, dst, imm);
+}
+
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 1, dst, imm);
+}
+
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 4, dst, imm);
+}
+
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 5, dst, imm);
+}
+
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 6, dst, imm);
+}
+
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ DUMP_RI( dst, imm );
+ x86_group1_imm(p, 7, dst, imm);
}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index af79f07dd3..1b5eaaca85 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -152,12 +152,13 @@ void x86_jmp( struct x86_function *p, int label );
/* void x86_call( struct x86_function *p, void (*label)() ); */
void x86_call( struct x86_function *p, struct x86_reg reg);
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm );
/* Macro for sse_shufps() and sse2_pshufd():
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index c7155a9316..d7df9490cf 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -11,6 +11,7 @@ C_SOURCES = \
tgsi_info.c \
tgsi_iterate.c \
tgsi_parse.c \
+ tgsi_ppc.c \
tgsi_scan.c \
tgsi_sse2.c \
tgsi_text.c \
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
index 45bf3f6d57..8200cce42f 100644
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ b/src/gallium/auxiliary/tgsi/SConscript
@@ -12,6 +12,7 @@ tgsi = env.ConvenienceLibrary(
'tgsi_parse.c',
'tgsi_sanity.c',
'tgsi_scan.c',
+ 'tgsi_ppc.c',
'tgsi_sse2.c',
'tgsi_text.c',
'tgsi_transform.c',
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 74614d3688..38fcaf8829 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -793,10 +793,14 @@ tgsi_default_instruction_ext_nv( void )
return instruction_ext_nv;
}
-union token_u32
+
+/** test for inequality of 32-bit values pointed to by a and b */
+static INLINE boolean
+compare32(const void *a, const void *b)
{
- unsigned u32;
-};
+ return *((uint32_t *) a) != *((uint32_t *) b);
+}
+
unsigned
tgsi_compare_instruction_ext_nv(
@@ -805,7 +809,7 @@ tgsi_compare_instruction_ext_nv(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_instruction_ext_nv
@@ -864,7 +868,7 @@ tgsi_compare_instruction_ext_label(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_instruction_ext_label
@@ -905,7 +909,7 @@ tgsi_compare_instruction_ext_texture(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_instruction_ext_texture
@@ -1027,7 +1031,7 @@ tgsi_compare_src_register_ext_swz(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_src_register_ext_swz
@@ -1095,7 +1099,7 @@ tgsi_compare_src_register_ext_mod(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_src_register_ext_mod
@@ -1241,7 +1245,7 @@ tgsi_compare_dst_register_ext_concode(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_dst_register_ext_concode
@@ -1299,7 +1303,7 @@ tgsi_compare_dst_register_ext_modulate(
{
a.Padding = b.Padding = 0;
a.Extended = b.Extended = 0;
- return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+ return compare32(&a, &b);
}
struct tgsi_dst_register_ext_modulate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 41dffc3dba..96567772b7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -467,17 +467,6 @@ micro_exp2(
}
static void
-micro_f2it(
- union tgsi_exec_channel *dst,
- const union tgsi_exec_channel *src )
-{
- dst->i[0] = (int) src->f[0];
- dst->i[1] = (int) src->f[1];
- dst->i[2] = (int) src->f[2];
- dst->i[3] = (int) src->f[3];
-}
-
-static void
micro_f2ut(
union tgsi_exec_channel *dst,
const union tgsi_exec_channel *src )
@@ -958,14 +947,22 @@ fetch_src_file_channel(
switch( file ) {
case TGSI_FILE_CONSTANT:
assert(mach->Consts);
- assert(index->i[0] >= 0);
- assert(index->i[1] >= 0);
- assert(index->i[2] >= 0);
- assert(index->i[3] >= 0);
- chan->f[0] = mach->Consts[index->i[0]][swizzle];
- chan->f[1] = mach->Consts[index->i[1]][swizzle];
- chan->f[2] = mach->Consts[index->i[2]][swizzle];
- chan->f[3] = mach->Consts[index->i[3]][swizzle];
+ if (index->i[0] < 0)
+ chan->f[0] = 0.0f;
+ else
+ chan->f[0] = mach->Consts[index->i[0]][swizzle];
+ if (index->i[1] < 0)
+ chan->f[1] = 0.0f;
+ else
+ chan->f[1] = mach->Consts[index->i[1]][swizzle];
+ if (index->i[2] < 0)
+ chan->f[2] = 0.0f;
+ else
+ chan->f[2] = mach->Consts[index->i[2]][swizzle];
+ if (index->i[3] < 0)
+ chan->f[3] = 0.0f;
+ else
+ chan->f[3] = mach->Consts[index->i[3]][swizzle];
break;
case TGSI_FILE_INPUT:
@@ -1037,11 +1034,28 @@ fetch_source(
union tgsi_exec_channel index;
uint swizzle;
+ /* We start with a direct index into a register file.
+ *
+ * file[1],
+ * where:
+ * file = SrcRegister.File
+ * [1] = SrcRegister.Index
+ */
index.i[0] =
index.i[1] =
index.i[2] =
index.i[3] = reg->SrcRegister.Index;
+ /* There is an extra source register that indirectly subscripts
+ * a register file. The direct index now becomes an offset
+ * that is being added to the indirect register.
+ *
+ * file[ind[2].x+1],
+ * where:
+ * ind = SrcRegisterInd.File
+ * [2] = SrcRegisterInd.Index
+ * .x = SrcRegisterInd.SwizzleX
+ */
if (reg->SrcRegister.Indirect) {
union tgsi_exec_channel index2;
union tgsi_exec_channel indir_index;
@@ -1064,10 +1078,10 @@ fetch_source(
&indir_index );
/* add value of address register to the offset */
- index.i[0] += indir_index.i[0];
- index.i[1] += indir_index.i[1];
- index.i[2] += indir_index.i[2];
- index.i[3] += indir_index.i[3];
+ index.i[0] += (int) indir_index.f[0];
+ index.i[1] += (int) indir_index.f[1];
+ index.i[2] += (int) indir_index.f[2];
+ index.i[3] += (int) indir_index.f[3];
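+      /* Note: the ADDR register file stores float values (see the
+       * TGSI_OPCODE_ARL change below), so convert to int here. */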
/* for disabled execution channels, zero-out the index to
* avoid using a potential garbage value.
@@ -1078,19 +1092,31 @@ fetch_source(
}
}
- if( reg->SrcRegister.Dimension ) {
- switch( reg->SrcRegister.File ) {
+ /* There is an extra source register that is a second
+ * subscript to a register file. Effectively it means that
+ * the register file is actually a 2D array of registers.
+ *
+ * file[1][3] == file[1*sizeof(file[1])+3],
+ * where:
+ * [3] = SrcRegisterDim.Index
+ */
+ if (reg->SrcRegister.Dimension) {
+ /* The size of the first-order array depends on the register file type.
+ * We need to multiply the index to the first array to get an effective,
+ * "flat" index that points to the beginning of the second-order array.
+ */
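+      /* For example, INPUT[1][3] with a stride of TGSI_EXEC_MAX_INPUT_ATTRIBS
+       * (17) gives the flat index 1 * 17 + 3 = 20.
+       */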
+ switch (reg->SrcRegister.File) {
case TGSI_FILE_INPUT:
- index.i[0] *= 17;
- index.i[1] *= 17;
- index.i[2] *= 17;
- index.i[3] *= 17;
+ index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+ index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+ index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+ index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
break;
case TGSI_FILE_CONSTANT:
- index.i[0] *= 4096;
- index.i[1] *= 4096;
- index.i[2] *= 4096;
- index.i[3] *= 4096;
+ index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
+ index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
+ index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
+ index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
break;
default:
assert( 0 );
@@ -1101,6 +1127,17 @@ fetch_source(
index.i[2] += reg->SrcRegisterDim.Index;
index.i[3] += reg->SrcRegisterDim.Index;
+ /* Again, the second subscript index can be addressed indirectly
+ * identically to the first one.
+ * Nothing stops us from indirectly addressing the indirect register,
+ * but there is no need for that, so we won't exercise it.
+ *
+ * file[1][ind[4].y+3],
+ * where:
+ * ind = SrcRegisterDimInd.File
+ * [4] = SrcRegisterDimInd.Index
+ * .y = SrcRegisterDimInd.SwizzleX
+ */
if (reg->SrcRegisterDim.Indirect) {
union tgsi_exec_channel index2;
union tgsi_exec_channel indir_index;
@@ -1120,10 +1157,10 @@ fetch_source(
&index2,
&indir_index );
- index.i[0] += indir_index.i[0];
- index.i[1] += indir_index.i[1];
- index.i[2] += indir_index.i[2];
- index.i[3] += indir_index.i[3];
+ index.i[0] += (int) indir_index.f[0];
+ index.i[1] += (int) indir_index.f[1];
+ index.i[2] += (int) indir_index.f[2];
+ index.i[3] += (int) indir_index.f[3];
/* for disabled execution channels, zero-out the index to
* avoid using a potential garbage value.
@@ -1133,6 +1170,11 @@ fetch_source(
index.i[i] = 0;
}
}
+
+ /* If by any chance there was a need for a 3D array of register
+ * files, we would have to check whether SrcRegisterDim is followed
+ * by a dimension register and continue the saga.
+ */
}
swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
@@ -1701,6 +1743,7 @@ exec_declaration(
break;
default:
+ eval = NULL;
assert( 0 );
}
@@ -1743,7 +1786,7 @@ exec_instruction(
case TGSI_OPCODE_ARL:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_f2it( &r[0], &r[0] );
+ micro_trunc( &r[0], &r[0] );
STORE( &r[0], 0, chan_index );
}
break;
@@ -2033,7 +2076,21 @@ exec_instruction(
case TGSI_OPCODE_DOT2ADD:
/* TGSI_OPCODE_DP2A */
- assert (0);
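+      /* Computes dst = src0.x * src1.x + src0.y * src1.y + src2.x,
+       * replicated to all enabled destination channels.
+       */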
+ FETCH( &r[0], 0, CHAN_X );
+ FETCH( &r[1], 1, CHAN_X );
+ micro_mul( &r[0], &r[0], &r[1] );
+
+ FETCH( &r[1], 0, CHAN_Y );
+ FETCH( &r[2], 1, CHAN_Y );
+ micro_mul( &r[1], &r[1], &r[2] );
+ micro_add( &r[0], &r[0], &r[1] );
+
+ FETCH( &r[2], 2, CHAN_X );
+ micro_add( &r[0], &r[0], &r[2] );
+
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( &r[0], 0, chan_index );
+ }
break;
case TGSI_OPCODE_INDEX:
@@ -2478,7 +2535,8 @@ exec_instruction(
micro_mul( &dot, &r[2], &r[2] );
micro_add( &tmp, &tmp, &dot );
- /* tmp = 1 / tmp */
+ /* tmp = 1 / sqrt(tmp) */
+ micro_sqrt( &tmp, &tmp );
micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
/* note: w channel is undefined */
@@ -2511,7 +2569,8 @@ exec_instruction(
micro_mul( &dot, &r[3], &r[3] );
micro_add( &tmp, &tmp, &dot );
- /* tmp = 1 / tmp */
+ /* tmp = 1 / sqrt(tmp) */
+ micro_sqrt( &tmp, &tmp );
micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index fc40a25e09..ac4b239910 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -178,6 +178,16 @@ struct tgsi_exec_labels
#define TGSI_EXEC_MAX_LOOP_NESTING 20
#define TGSI_EXEC_MAX_CALL_NESTING 20
+/* The maximum number of input attributes per vertex. For 2D
+ * input register files, this is the stride between two 1D
+ * arrays.
+ */
+#define TGSI_EXEC_MAX_INPUT_ATTRIBS 17
+
+/* The maximum number of constant vectors per constant buffer.
+ */
+#define TGSI_EXEC_MAX_CONST_BUFFER 4096
+
/**
* Run-time virtual machine state for executing TGSI shader.
*/
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 3757486ba9..2cd56e413a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens(
1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
}
+
+/**
+ * This function is used to work around type punning/aliasing
+ * warnings.  The warnings seem harmless on x86 but on PPC they cause
+ * real failures.
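+ * Copying the 32-bit token with memcpy() is well-defined and compilers
+ * typically reduce the fixed 4-byte copy to a single load/store.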
+ */
+static INLINE void
+copy_token(void *dst, const void *src)
+{
+ memcpy(dst, src, 4);
+}
+
+
+/**
+ * Get next 4-byte token, return it at address specified by 'token'
+ */
static void
next_token(
struct tgsi_parse_context *ctx,
void *token )
{
assert( !tgsi_parse_end_of_tokens( ctx ) );
-
- *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+ copy_token(token, &ctx->Tokens[ctx->Position]);
+ ctx->Position++;
}
+
void
tgsi_parse_token(
struct tgsi_parse_context *ctx )
@@ -116,7 +133,7 @@ tgsi_parse_token(
struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
*decl = tgsi_default_full_declaration();
- decl->Declaration = *(struct tgsi_declaration *) &token;
+ copy_token(&decl->Declaration, &token);
next_token( ctx, &decl->DeclarationRange );
@@ -132,8 +149,7 @@ tgsi_parse_token(
struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
*imm = tgsi_default_full_immediate();
- imm->Immediate = *(struct tgsi_immediate *) &token;
-
+ copy_token(&imm->Immediate, &token);
assert( !imm->Immediate.Extended );
switch (imm->Immediate.DataType) {
@@ -158,8 +174,7 @@ tgsi_parse_token(
unsigned extended;
*inst = tgsi_default_full_instruction();
- inst->Instruction = *(struct tgsi_instruction *) &token;
-
+ copy_token(&inst->Instruction, &token);
extended = inst->Instruction.Extended;
while( extended ) {
@@ -169,18 +184,15 @@ tgsi_parse_token(
switch( token.Type ) {
case TGSI_INSTRUCTION_EXT_TYPE_NV:
- inst->InstructionExtNv =
- *(struct tgsi_instruction_ext_nv *) &token;
+ copy_token(&inst->InstructionExtNv, &token);
break;
case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
- inst->InstructionExtLabel =
- *(struct tgsi_instruction_ext_label *) &token;
+ copy_token(&inst->InstructionExtLabel, &token);
break;
case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
- inst->InstructionExtTexture =
- *(struct tgsi_instruction_ext_texture *) &token;
+ copy_token(&inst->InstructionExtTexture, &token);
break;
default:
@@ -212,13 +224,13 @@ tgsi_parse_token(
switch( token.Type ) {
case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
- inst->FullDstRegisters[i].DstRegisterExtConcode =
- *(struct tgsi_dst_register_ext_concode *) &token;
+ copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
+ &token);
break;
case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
- inst->FullDstRegisters[i].DstRegisterExtModulate =
- *(struct tgsi_dst_register_ext_modulate *) &token;
+ copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
+ &token);
break;
default:
@@ -245,13 +257,13 @@ tgsi_parse_token(
switch( token.Type ) {
case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
- inst->FullSrcRegisters[i].SrcRegisterExtSwz =
- *(struct tgsi_src_register_ext_swz *) &token;
+ copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
+ &token);
break;
case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
- inst->FullSrcRegisters[i].SrcRegisterExtMod =
- *(struct tgsi_src_register_ext_mod *) &token;
+ copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
+ &token);
break;
default:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
new file mode 100644
index 0000000000..a92b1902e3
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -0,0 +1,1329 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * TGSI to PowerPC code generation.
+ */
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_sse.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_dump.h"
+#include "tgsi_exec.h"
+#include "tgsi_ppc.h"
+#include "rtasm/rtasm_ppc.h"
+
+
+/**
+ * PPC/AltiVec has no simple way to form arbitrary vector float immediates,
+ * so we load them from this memory-resident array instead:
+ */
+const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+   1.0f, -128.0f, 128.0f, 0.0f
+};
+
+
+#define FOR_EACH_CHANNEL( CHAN )\
+ for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+ ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+ if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+ FOR_EACH_CHANNEL( CHAN )\
+ IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+
+/**
+ * How many TGSI temps should be implemented with real PPC vector registers
+ * rather than memory.
+ */
+#define MAX_PPC_TEMPS 4
+
+
+struct reg_chan_vec
+{
+ struct tgsi_full_src_register src;
+ uint chan;
+ uint vec;
+};
+
+
+/**
+ * Context/state used during code gen.
+ */
+struct gen_context
+{
+ struct ppc_function *f;
+ int inputs_reg; /**< GP register pointing to input params */
+ int outputs_reg; /**< GP register pointing to output params */
+ int temps_reg; /**< GP register pointing to temporary "registers" */
+ int immed_reg; /**< GP register pointing to immediates buffer */
+ int const_reg; /**< GP register pointing to constants buffer */
+   int builtins_reg;   /**< GP register pointing to built-in constants */
+
+ int offset_reg; /**< used to reduce redundant li instructions */
+ int offset_value;
+
+ int one_vec; /**< vector register with {1.0, 1.0, 1.0, 1.0} */
+ int bit31_vec; /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
+
+ /**
+ * Map TGSI temps to PPC vector temps.
+ * We have 32 PPC vector regs. Use 16 of them for storing 4 TGSI temps.
+ * XXX currently only do this for TGSI temps [0..MAX_PPC_TEMPS-1].
+ */
+ int temps_map[MAX_PPC_TEMPS][4];
+
+ /**
+ * Cache of src registers.
+ * This is used to avoid redundant load instructions.
+ */
+ struct {
+ struct tgsi_full_src_register src;
+ uint chan;
+ uint vec;
+ } regs[12]; /* 3 src regs, 4 channels */
+ uint num_regs;
+};
+
+
+/**
+ * Initialize code generation context.
+ */
+static void
+init_gen_context(struct gen_context *gen, struct ppc_function *func)
+{
+ uint i;
+
+ memset(gen, 0, sizeof(*gen));
+ gen->f = func;
+ gen->inputs_reg = ppc_reserve_register(func, 3); /* first function param */
+ gen->outputs_reg = ppc_reserve_register(func, 4); /* second function param */
+ gen->temps_reg = ppc_reserve_register(func, 5); /* ... */
+ gen->immed_reg = ppc_reserve_register(func, 6);
+ gen->const_reg = ppc_reserve_register(func, 7);
+ gen->builtins_reg = ppc_reserve_register(func, 8);
+ gen->one_vec = -1;
+ gen->bit31_vec = -1;
+ gen->offset_reg = -1;
+ gen->offset_value = -9999999;
+ for (i = 0; i < MAX_PPC_TEMPS; i++) {
+ gen->temps_map[i][0] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][1] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][2] = ppc_allocate_vec_register(gen->f);
+ gen->temps_map[i][3] = ppc_allocate_vec_register(gen->f);
+ }
+}
+
+
+/**
+ * All PPC vector load/store instructions form an effective address
+ * by adding the contents of two registers. For example:
+ * lvx v2,r8,r9 # v2 = memory[r8 + r9]
+ * stvx v2,r8,r9 # memory[r8 + r9] = v2;
+ * So our lvx/stvx instructions are typically preceded by an 'li' instruction
+ * to load r9 (above) with an immediate (an offset).
+ * This code emits that 'li' instruction, but only if the offset value is
+ * different than the previous 'li'.
+ * This optimization seems to save about 10% in the instruction count.
+ * Note that at the top of a new basic block (such as a loop body) we must
+ * unconditionally emit a fresh 'li', since the cached offset may be stale
+ * there; see reset_li_offset().
+ */
+static int
+emit_li_offset(struct gen_context *gen, int offset)
+{
+ if (gen->offset_reg <= 0) {
+ /* allocate a GP register for storing load/store offset */
+ gen->offset_reg = ppc_allocate_register(gen->f);
+ }
+
+ /* emit new 'li' if offset is changing */
+ if (gen->offset_value < 0 || gen->offset_value != offset) {
+ gen->offset_value = offset;
+ ppc_li(gen->f, gen->offset_reg, offset);
+ }
+
+ return gen->offset_reg;
+}
+
+
+/**
+ * Forces subsequent emit_li_offset() calls to emit an 'li'.
+ * To be called at the top of basic blocks.
+ */
+static void
+reset_li_offset(struct gen_context *gen)
+{
+ gen->offset_value = -9999999;
+}
+
+
+
+/**
+ * Load the given vector register with {value, value, value, value}.
+ * The value must be in the ppc_builtin_constants[] array.
+ * We wouldn't need this if there were a simple way to load PPC vector
+ * registers with immediate values!
+ */
+static void
+load_constant_vec(struct gen_context *gen, int dst_vec, float value)
+{
+ uint pos;
+ for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
+ if (ppc_builtin_constants[pos] == value) {
+ int offset = pos * 4;
+ int offset_reg = emit_li_offset(gen, offset);
+
+ /* Load 4-byte word into vector register.
+ * The vector slot depends on the effective address we load from.
+ * We know that our builtins start at a 16-byte boundary so we
+ * know that 'swizzle' tells us which vector slot will have the
+ * loaded word. The other vector slots will be undefined.
+ */
+ ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
+ /* splat word[pos % 4] across the vector reg */
+ ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
+ return;
+ }
+ }
+ assert(0 && "Need to add new constant to ppc_builtin_constants array");
+}
+
+
+/**
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ */
+static int
+gen_one_vec(struct gen_context *gen)
+{
+ if (gen->one_vec < 0) {
+ gen->one_vec = ppc_allocate_vec_register(gen->f);
+ load_constant_vec(gen, gen->one_vec, 1.0f);
+ }
+ return gen->one_vec;
+}
+
+/**
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
+ */
+static int
+gen_get_bit31_vec(struct gen_context *gen)
+{
+ if (gen->bit31_vec < 0) {
+ gen->bit31_vec = ppc_allocate_vec_register(gen->f);
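+      /* vspltisw(-1) fills each word with 0xffffffff; vslw then shifts each
+       * word left by its own low five bits (31), leaving only bit 31 set.
+       */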
+ ppc_vspltisw(gen->f, gen->bit31_vec, -1);
+ ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
+ }
+ return gen->bit31_vec;
+}
+
+
+/**
+ * Register fetch. Return PPC vector register with result.
+ */
+static int
+emit_fetch(struct gen_context *gen,
+ const struct tgsi_full_src_register *reg,
+ const unsigned chan_index)
+{
+ uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+ int dst_vec = -1;
+
+ switch (swizzle) {
+ case TGSI_EXTSWIZZLE_X:
+ case TGSI_EXTSWIZZLE_Y:
+ case TGSI_EXTSWIZZLE_Z:
+ case TGSI_EXTSWIZZLE_W:
+ switch (reg->SrcRegister.File) {
+ case TGSI_FILE_INPUT:
+ {
+ int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
+ ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
+ }
+ break;
+ case TGSI_FILE_TEMPORARY:
+ if (reg->SrcRegister.Index < MAX_PPC_TEMPS) {
+ /* use PPC vec register */
+ dst_vec = gen->temps_map[reg->SrcRegister.Index][swizzle];
+ }
+ else {
+ /* use memory-based temp register "file" */
+ int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
+ ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
+ }
+ break;
+ case TGSI_FILE_IMMEDIATE:
+ {
+ int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
+ /* Load 4-byte word into vector register.
+ * The vector slot depends on the effective address we load from.
+ * We know that our immediates start at a 16-byte boundary so we
+ * know that 'swizzle' tells us which vector slot will have the
+ * loaded word. The other vector slots will be undefined.
+ */
+ ppc_lvewx(gen->f, dst_vec, gen->immed_reg, offset_reg);
+ /* splat word[swizzle] across the vector reg */
+ ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
+ }
+ break;
+ case TGSI_FILE_CONSTANT:
+ {
+ int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+ int offset_reg = emit_li_offset(gen, offset);
+ dst_vec = ppc_allocate_vec_register(gen->f);
+ /* Load 4-byte word into vector register.
+ * The vector slot depends on the effective address we load from.
+ * We know that our constants start at a 16-byte boundary so we
+ * know that 'swizzle' tells us which vector slot will have the
+ * loaded word. The other vector slots will be undefined.
+ */
+ ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
+ /* splat word[swizzle] across the vector reg */
+ ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
+ }
+ break;
+ default:
+ assert( 0 );
+ }
+ break;
+ case TGSI_EXTSWIZZLE_ZERO:
+      dst_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vzero(gen->f, dst_vec);
+ break;
+ case TGSI_EXTSWIZZLE_ONE:
+ {
+ int one_vec = gen_one_vec(gen);
+ dst_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vmove(gen->f, dst_vec, one_vec);
+ }
+ break;
+ default:
+ assert( 0 );
+ }
+
+ assert(dst_vec >= 0);
+
+ {
+ uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+ if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+ int bit31_vec = gen_get_bit31_vec(gen);
+
+ switch (sign_op) {
+ case TGSI_UTIL_SIGN_CLEAR:
+ /* vec = vec & ~bit31 */
+ ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec);
+ break;
+ case TGSI_UTIL_SIGN_SET:
+ /* vec = vec | bit31 */
+ ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec);
+ break;
+ case TGSI_UTIL_SIGN_TOGGLE:
+ /* vec = vec ^ bit31 */
+ ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec);
+ break;
+ default:
+ assert(0);
+ }
+ }
+ }
+
+ return dst_vec;
+}
+
+
+
+/**
+ * Test if two TGSI src registers refer to the same memory location.
+ * We use this to avoid redundant register loads.
+ */
+static boolean
+equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a,
+ const struct tgsi_full_src_register *b, uint chan_b)
+{
+ int swz_a, swz_b;
+ int sign_a, sign_b;
+ if (a->SrcRegister.File != b->SrcRegister.File)
+ return FALSE;
+ if (a->SrcRegister.Index != b->SrcRegister.Index)
+ return FALSE;
+ swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a);
+ swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b);
+ if (swz_a != swz_b)
+ return FALSE;
+ sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a);
+ sign_b = tgsi_util_get_full_src_register_sign_mode(b, chan_b);
+ if (sign_a != sign_b)
+ return FALSE;
+ return TRUE;
+}
+
+
+/**
+ * Given a TGSI src register and channel index, return the PPC vector
+ * register containing the value. We use a cache to prevent re-loading
+ * the same register multiple times.
+ * \return index of PPC vector register with the desired src operand
+ */
+static int
+get_src_vec(struct gen_context *gen,
+ struct tgsi_full_instruction *inst, int src_reg, uint chan)
+{
+   const struct tgsi_full_src_register *src =
+ &inst->FullSrcRegisters[src_reg];
+ int vec;
+ uint i;
+
+ /* check the cache */
+ for (i = 0; i < gen->num_regs; i++) {
+ if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) {
+ /* cache hit */
+ assert(gen->regs[i].vec >= 0);
+ return gen->regs[i].vec;
+ }
+ }
+
+ /* cache miss: allocate new vec reg and emit fetch/load code */
+ vec = emit_fetch(gen, src, chan);
+ gen->regs[gen->num_regs].src = *src;
+ gen->regs[gen->num_regs].chan = chan;
+ gen->regs[gen->num_regs].vec = vec;
+ gen->num_regs++;
+
+ assert(gen->num_regs <= Elements(gen->regs));
+
+ assert(vec >= 0);
+
+ return vec;
+}
+
+
+/**
+ * Clear the src operand cache. To be called at the end of each emit function.
+ */
+static void
+release_src_vecs(struct gen_context *gen)
+{
+ uint i;
+ for (i = 0; i < gen->num_regs; i++) {
+      const struct tgsi_full_src_register src = gen->regs[i].src;
+ if (!(src.SrcRegister.File == TGSI_FILE_TEMPORARY &&
+ src.SrcRegister.Index < MAX_PPC_TEMPS)) {
+ ppc_release_vec_register(gen->f, gen->regs[i].vec);
+ }
+ }
+ gen->num_regs = 0;
+}
+
+
+
+static int
+get_dst_vec(struct gen_context *gen,
+ const struct tgsi_full_instruction *inst,
+ unsigned chan_index)
+{
+ const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
+ if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
+ reg->DstRegister.Index < MAX_PPC_TEMPS) {
+ int vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+ return vec;
+ }
+ else {
+ return ppc_allocate_vec_register(gen->f);
+ }
+}
+
+
+/**
+ * Register store. Store 'src_vec' at location indicated by 'reg'.
+ * \param free_vec Should the src_vec be released when done?
+ */
+static void
+emit_store(struct gen_context *gen,
+ int src_vec,
+ const struct tgsi_full_instruction *inst,
+ unsigned chan_index,
+ boolean free_vec)
+{
+ const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
+ switch (reg->DstRegister.File) {
+ case TGSI_FILE_OUTPUT:
+ {
+ int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+ int offset_reg = emit_li_offset(gen, offset);
+ ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
+ }
+ break;
+ case TGSI_FILE_TEMPORARY:
+ if (reg->DstRegister.Index < MAX_PPC_TEMPS) {
+ if (!free_vec) {
+ int dst_vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+ if (dst_vec != src_vec)
+ ppc_vmove(gen->f, dst_vec, src_vec);
+ }
+ free_vec = FALSE;
+ }
+ else {
+ int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+ int offset_reg = emit_li_offset(gen, offset);
+ ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
+ }
+ break;
+#if 0
+ case TGSI_FILE_ADDRESS:
+ emit_addrs(
+ func,
+ xmm,
+ reg->DstRegister.Index,
+ chan_index );
+ break;
+#endif
+ default:
+ assert( 0 );
+ }
+
+#if 0
+ switch( inst->Instruction.Saturate ) {
+ case TGSI_SAT_NONE:
+ break;
+
+ case TGSI_SAT_ZERO_ONE:
+ /* assert( 0 ); */
+ break;
+
+ case TGSI_SAT_MINUS_PLUS_ONE:
+ assert( 0 );
+ break;
+ }
+#endif
+
+ if (free_vec)
+ ppc_release_vec_register(gen->f, src_vec);
+}
+
+
+static void
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int v0, v1;
+ uint chan_index;
+
+ v0 = get_src_vec(gen, inst, 0, CHAN_X);
+ v1 = ppc_allocate_vec_register(gen->f);
+
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_RSQ:
+ /* v1 = 1.0 / sqrt(v0) */
+ ppc_vrsqrtefp(gen->f, v1, v0);
+ break;
+ case TGSI_OPCODE_RCP:
+ /* v1 = 1.0 / v0 */
+ ppc_vrefp(gen->f, v1, v0);
+ break;
+ default:
+ assert(0);
+ }
+
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ emit_store(gen, v1, inst, chan_index, FALSE);
+ }
+
+ release_src_vecs(gen);
+ ppc_release_vec_register(gen->f, v1);
+}
+
+
+static void
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ uint chan_index;
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+ int v0 = get_src_vec(gen, inst, 0, chan_index); /* v0 = srcreg[0] */
+ int v1 = get_dst_vec(gen, inst, chan_index);
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_ABS:
+ /* turn off the most significant bit of each vector float word */
+ {
+ int bit31_vec = gen_get_bit31_vec(gen);
+ ppc_vandc(gen->f, v1, v0, bit31_vec); /* v1 = v0 & ~bit31 */
+ }
+ break;
+ case TGSI_OPCODE_FLOOR:
+ ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */
+ break;
+ case TGSI_OPCODE_FRAC:
+ ppc_vrfim(gen->f, v1, v0); /* tmp = floor(v0) */
+ ppc_vsubfp(gen->f, v1, v0, v1); /* v1 = v0 - v1 */
+ break;
+ case TGSI_OPCODE_EXPBASE2:
+ ppc_vexptefp(gen->f, v1, v0); /* v1 = 2^v0 */
+ break;
+ case TGSI_OPCODE_LOGBASE2:
+ /* XXX this may be broken! */
+ ppc_vlogefp(gen->f, v1, v0); /* v1 = log2(v0) */
+ break;
+ case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_SWZ:
+ if (v0 != v1)
+ ppc_vmove(gen->f, v1, v0);
+ break;
+ default:
+ assert(0);
+ }
+ emit_store(gen, v1, inst, chan_index, TRUE); /* store v0 */
+ }
+
+ release_src_vecs(gen);
+}
+
+
+static void
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int zero_vec = -1;
+ uint chan;
+
+ if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
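+      /* AltiVec has no plain single-precision vector multiply, so MUL is
+       * emitted below as vmaddfp(v0, v1, zero): a multiply-add with a
+       * zero addend.
+       */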
+ zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+ }
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_dst_vec(gen, inst, chan);
+
+ /* emit binop */
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_ADD:
+ ppc_vaddfp(gen->f, v2, v0, v1);
+ break;
+ case TGSI_OPCODE_SUB:
+ ppc_vsubfp(gen->f, v2, v0, v1);
+ break;
+ case TGSI_OPCODE_MUL:
+ ppc_vmaddfp(gen->f, v2, v0, v1, zero_vec);
+ break;
+ case TGSI_OPCODE_MIN:
+ ppc_vminfp(gen->f, v2, v0, v1);
+ break;
+ case TGSI_OPCODE_MAX:
+ ppc_vmaxfp(gen->f, v2, v0, v1);
+ break;
+ default:
+ assert(0);
+ }
+
+ /* store v2 */
+ emit_store(gen, v2, inst, chan, TRUE);
+ }
+
+ if (inst->Instruction.Opcode == TGSI_OPCODE_MUL)
+ ppc_release_vec_register(gen->f, zero_vec);
+
+ release_src_vecs(gen);
+}
+
+
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ uint chan;
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_src_vec(gen, inst, 2, chan);
+ int v3 = get_dst_vec(gen, inst, chan);
+
+ /* emit ALU */
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_MAD:
+ ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */
+ break;
+ case TGSI_OPCODE_LRP:
+ ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */
+ ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */
+ break;
+ default:
+ assert(0);
+ }
+
+ /* store v3 */
+ emit_store(gen, v3, inst, chan, TRUE);
+ }
+
+ release_src_vecs(gen);
+}
+
+
+/**
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
+ */
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ uint chan;
+ int one_vec = gen_one_vec(gen);
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ /* fetch src operands */
+ int v0 = get_src_vec(gen, inst, 0, chan);
+ int v1 = get_src_vec(gen, inst, 1, chan);
+ int v2 = get_dst_vec(gen, inst, chan);
+ boolean complement = FALSE;
+
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_SNE:
+ complement = TRUE;
+ /* fall-through */
+ case TGSI_OPCODE_SEQ:
+ ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+ break;
+
+ case TGSI_OPCODE_SGE:
+ complement = TRUE;
+ /* fall-through */
+ case TGSI_OPCODE_SLT:
+ ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+ break;
+
+ case TGSI_OPCODE_SLE:
+ complement = TRUE;
+ /* fall-through */
+ case TGSI_OPCODE_SGT:
+ ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+ break;
+ default:
+ assert(0);
+ }
+
+ /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
+
+ if (complement)
+ ppc_vandc(gen->f, v2, one_vec, v2); /* v2 = one_vec & ~v2 */
+ else
+ ppc_vand(gen->f, v2, one_vec, v2); /* v2 = one_vec & v2 */
+
+ /* store v2 */
+ emit_store(gen, v2, inst, chan, TRUE);
+ }
+
+ release_src_vecs(gen);
+}
+
+
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int v0, v1, v2;
+ uint chan_index;
+
+ v2 = ppc_allocate_vec_register(gen->f);
+
+ ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */
+
+ v0 = get_src_vec(gen, inst, 0, CHAN_X); /* v0 = src0.XXXX */
+ v1 = get_src_vec(gen, inst, 1, CHAN_X); /* v1 = src1.XXXX */
+ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+
+ v0 = get_src_vec(gen, inst, 0, CHAN_Y); /* v0 = src0.YYYY */
+ v1 = get_src_vec(gen, inst, 1, CHAN_Y); /* v1 = src1.YYYY */
+ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+
+ v0 = get_src_vec(gen, inst, 0, CHAN_Z); /* v0 = src0.ZZZZ */
+ v1 = get_src_vec(gen, inst, 1, CHAN_Z); /* v1 = src1.ZZZZ */
+ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+
+ if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+ v0 = get_src_vec(gen, inst, 0, CHAN_W); /* v0 = src0.WWWW */
+ v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+ ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+ }
+ else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
+ v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+ ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */
+ }
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+ emit_store(gen, v2, inst, chan_index, FALSE); /* store v2, free v2 later */
+ }
+
+ release_src_vecs(gen);
+
+ ppc_release_vec_register(gen->f, v2);
+}
+
+
+/** Approximation for vr = pow(va, vb) */
+static void
+ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
+{
+ /* pow(a,b) ~= exp2(log2(a) * b) */
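+   /* Note: vlogefp/vexptefp are AltiVec estimate instructions, so this
+    * yields only a low-precision approximation of pow().
+    */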
+ int t_vec = ppc_allocate_vec_register(f);
+ int zero_vec = ppc_allocate_vec_register(f);
+
+ ppc_vzero(f, zero_vec);
+
+ ppc_vlogefp(f, t_vec, va); /* t = log2(va) */
+ ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec); /* t = t * vb */
+ ppc_vexptefp(f, vr, t_vec); /* vr = 2^t */
+
+ ppc_release_vec_register(f, t_vec);
+ ppc_release_vec_register(f, zero_vec);
+}
+
+
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int one_vec = gen_one_vec(gen);
+
+ /* Compute X */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ emit_store(gen, one_vec, inst, CHAN_X, FALSE);
+ }
+
+ /* Compute Y, Z */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ int x_vec;
+ int zero_vec = ppc_allocate_vec_register(gen->f);
+
+ x_vec = get_src_vec(gen, inst, 0, CHAN_X); /* x_vec = src[0].x */
+
+ ppc_vzero(gen->f, zero_vec); /* zero = {0,0,0,0} */
+ ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ emit_store(gen, x_vec, inst, CHAN_Y, FALSE);
+ }
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ int y_vec, w_vec;
+ int z_vec = ppc_allocate_vec_register(gen->f);
+ int pow_vec = ppc_allocate_vec_register(gen->f);
+ int pos_vec = ppc_allocate_vec_register(gen->f);
+ int p128_vec = ppc_allocate_vec_register(gen->f);
+ int n128_vec = ppc_allocate_vec_register(gen->f);
+
+ y_vec = get_src_vec(gen, inst, 0, CHAN_Y); /* y_vec = src[0].y */
+ ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
+
+ w_vec = get_src_vec(gen, inst, 0, CHAN_W); /* w_vec = src[0].w */
+
+ /* clamp W to [-128, 128] */
+ load_constant_vec(gen, p128_vec, 128.0f);
+ load_constant_vec(gen, n128_vec, -128.0f);
+ ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */
+ ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */
+
+      /* if tmp.x > 0
+ * z = pow(tmp.y, tmp.w)
+ * else
+ * z = 0.0
+ */
+ ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec); /* pow = pow(y, w) */
+ ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
+ ppc_vand(gen->f, z_vec, pow_vec, pos_vec); /* z = pow & pos */
+
+ emit_store(gen, z_vec, inst, CHAN_Z, FALSE);
+
+ ppc_release_vec_register(gen->f, z_vec);
+ ppc_release_vec_register(gen->f, pow_vec);
+ ppc_release_vec_register(gen->f, pos_vec);
+ ppc_release_vec_register(gen->f, p128_vec);
+ ppc_release_vec_register(gen->f, n128_vec);
+ }
+
+ ppc_release_vec_register(gen->f, zero_vec);
+ }
+
+ /* Compute W */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+ }
+
+ release_src_vecs(gen);
+}
+
+
+static void
+emit_exp(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ const int one_vec = gen_one_vec(gen);
+ int src_vec;
+
+ /* get src arg */
+ src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+ /* Compute X = 2^floor(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_X);
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */
+ ppc_vexptefp(gen->f, dst_vec, tmp_vec); /* dst = 2 ^ tmp */
+ emit_store(gen, dst_vec, inst, CHAN_X, TRUE);
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Y = src - floor(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Y);
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */
+ ppc_vsubfp(gen->f, dst_vec, src_vec, tmp_vec); /* dst = src - tmp */
+ emit_store(gen, dst_vec, inst, CHAN_Y, TRUE);
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Z = RoughApprox2ToX(src) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+ ppc_vexptefp(gen->f, dst_vec, src_vec); /* dst = 2 ^ src */
+ emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+ }
+
+ /* Compute W = 1.0 */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+ }
+
+ release_src_vecs(gen);
+}
+
+
+static void
+emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ const int bit31_vec = gen_get_bit31_vec(gen);
+ const int one_vec = gen_one_vec(gen);
+ int src_vec, abs_vec;
+
+ /* get src arg */
+ src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+ /* compute abs(src) */
+ abs_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vandc(gen->f, abs_vec, src_vec, bit31_vec); /* abs = src & ~bit31 */
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) &&
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+
+ /* compute tmp = floor(log2(abs)) */
+ int tmp_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vlogefp(gen->f, tmp_vec, abs_vec); /* tmp = log2(abs) */
+ ppc_vrfim(gen->f, tmp_vec, tmp_vec); /* tmp = floor(tmp); */
+
+ /* Compute X = tmp */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+ }
+
+ /* Compute Y = abs / 2^tmp */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ const int zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+ ppc_vexptefp(gen->f, tmp_vec, tmp_vec); /* tmp = 2 ^ tmp */
+ ppc_vrefp(gen->f, tmp_vec, tmp_vec); /* tmp = 1 / tmp */
+ /* tmp = abs * tmp + zero */
+ ppc_vmaddfp(gen->f, tmp_vec, abs_vec, tmp_vec, zero_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+ ppc_release_vec_register(gen->f, zero_vec);
+ }
+
+ ppc_release_vec_register(gen->f, tmp_vec);
+ }
+
+ /* Compute Z = RoughApproxLog2(abs) */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+ ppc_vlogefp(gen->f, dst_vec, abs_vec); /* dst = log2(abs) */
+ emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+ }
+
+ /* Compute W = 1.0 */
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+ }
+
+ ppc_release_vec_register(gen->f, abs_vec);
+ release_src_vecs(gen);
+}
+
+
+static void
+emit_pow(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int s0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+ int s1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+ int pow_vec = ppc_allocate_vec_register(gen->f);
+ int chan;
+
+ ppc_vec_pow(gen->f, pow_vec, s0_vec, s1_vec);
+
+ FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+ emit_store(gen, pow_vec, inst, chan, FALSE);
+ }
+
+ ppc_release_vec_register(gen->f, pow_vec);
+
+ release_src_vecs(gen);
+}
+
+
+static void
+emit_xpd(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+ int x0_vec, y0_vec, z0_vec;
+ int x1_vec, y1_vec, z1_vec;
+ int zero_vec, tmp_vec;
+ int tmp2_vec;
+
+ zero_vec = ppc_allocate_vec_register(gen->f);
+ ppc_vzero(gen->f, zero_vec);
+
+ tmp_vec = ppc_allocate_vec_register(gen->f);
+ tmp2_vec = ppc_allocate_vec_register(gen->f);
+
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ x0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+ x1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+ }
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ y0_vec = get_src_vec(gen, inst, 0, CHAN_Y);
+ y1_vec = get_src_vec(gen, inst, 1, CHAN_Y);
+ }
+ if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+ IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ z0_vec = get_src_vec(gen, inst, 0, CHAN_Z);
+ z1_vec = get_src_vec(gen, inst, 1, CHAN_Z);
+ }
+
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) {
+ /* tmp = y0 * z1 */
+ ppc_vmaddfp(gen->f, tmp_vec, y0_vec, z1_vec, zero_vec);
+ /* tmp = tmp - z0 * y1*/
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, z0_vec, y1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+ }
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) {
+ /* tmp = z0 * x1 */
+ ppc_vmaddfp(gen->f, tmp_vec, z0_vec, x1_vec, zero_vec);
+ /* tmp = tmp - x0 * z1 */
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, x0_vec, z1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+ }
+ IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) {
+ /* tmp = x0 * y1 */
+ ppc_vmaddfp(gen->f, tmp_vec, x0_vec, y1_vec, zero_vec);
+ /* tmp = tmp - y0 * x1 */
+ ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, y0_vec, x1_vec);
+ emit_store(gen, tmp_vec, inst, CHAN_Z, FALSE);
+ }
+ /* W is undefined */
+
+   ppc_release_vec_register(gen->f, tmp_vec);
+   ppc_release_vec_register(gen->f, tmp2_vec);
+   ppc_release_vec_register(gen->f, zero_vec);
+ release_src_vecs(gen);
+}
+
+static int
+emit_instruction(struct gen_context *gen,
+ struct tgsi_full_instruction *inst)
+{
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_SWZ:
+ case TGSI_OPCODE_ABS:
+ case TGSI_OPCODE_FLOOR:
+ case TGSI_OPCODE_FRAC:
+ case TGSI_OPCODE_EXPBASE2:
+ case TGSI_OPCODE_LOGBASE2:
+ emit_unaryop(gen, inst);
+ break;
+ case TGSI_OPCODE_RSQ:
+ case TGSI_OPCODE_RCP:
+ emit_scalar_unaryop(gen, inst);
+ break;
+ case TGSI_OPCODE_ADD:
+ case TGSI_OPCODE_SUB:
+ case TGSI_OPCODE_MUL:
+ case TGSI_OPCODE_MIN:
+ case TGSI_OPCODE_MAX:
+ emit_binop(gen, inst);
+ break;
+ case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_SNE:
+ case TGSI_OPCODE_SLT:
+ case TGSI_OPCODE_SGT:
+ case TGSI_OPCODE_SLE:
+ case TGSI_OPCODE_SGE:
+ emit_inequality(gen, inst);
+ break;
+ case TGSI_OPCODE_MAD:
+ case TGSI_OPCODE_LRP:
+ emit_triop(gen, inst);
+ break;
+ case TGSI_OPCODE_DP3:
+ case TGSI_OPCODE_DP4:
+ case TGSI_OPCODE_DPH:
+ emit_dotprod(gen, inst);
+ break;
+ case TGSI_OPCODE_LIT:
+ emit_lit(gen, inst);
+ break;
+ case TGSI_OPCODE_LOG:
+ emit_log(gen, inst);
+ break;
+ case TGSI_OPCODE_EXP:
+ emit_exp(gen, inst);
+ break;
+ case TGSI_OPCODE_POW:
+ emit_pow(gen, inst);
+ break;
+ case TGSI_OPCODE_XPD:
+ emit_xpd(gen, inst);
+ break;
+ case TGSI_OPCODE_END:
+ /* normal end */
+ return 1;
+ default:
+ return 0;
+ }
+ return 1;
+}
+
+
+static void
+emit_declaration(
+ struct ppc_function *func,
+ struct tgsi_full_declaration *decl )
+{
+ if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+#if 0
+ unsigned first, last, mask;
+ unsigned i, j;
+
+ first = decl->DeclarationRange.First;
+ last = decl->DeclarationRange.Last;
+ mask = decl->Declaration.UsageMask;
+
+ for( i = first; i <= last; i++ ) {
+ for( j = 0; j < NUM_CHANNELS; j++ ) {
+ if( mask & (1 << j) ) {
+ switch( decl->Declaration.Interpolate ) {
+ case TGSI_INTERPOLATE_CONSTANT:
+ emit_coef_a0( func, 0, i, j );
+ emit_inputs( func, 0, i, j );
+ break;
+
+ case TGSI_INTERPOLATE_LINEAR:
+ emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+ emit_coef_dadx( func, 1, i, j );
+ emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+ emit_coef_dady( func, 3, i, j );
+ emit_mul( func, 0, 1 ); /* x * dadx */
+ emit_coef_a0( func, 4, i, j );
+ emit_mul( func, 2, 3 ); /* y * dady */
+ emit_add( func, 0, 4 ); /* x * dadx + a0 */
+ emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
+ emit_inputs( func, 0, i, j );
+ break;
+
+ case TGSI_INTERPOLATE_PERSPECTIVE:
+ emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+ emit_coef_dadx( func, 1, i, j );
+ emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+ emit_coef_dady( func, 3, i, j );
+ emit_mul( func, 0, 1 ); /* x * dadx */
+ emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+ emit_coef_a0( func, 5, i, j );
+ emit_rcp( func, 4, 4 ); /* 1.0 / w */
+ emit_mul( func, 2, 3 ); /* y * dady */
+ emit_add( func, 0, 5 ); /* x * dadx + a0 */
+ emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
+ emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
+ emit_inputs( func, 0, i, j );
+ break;
+
+ default:
+ assert( 0 );
+ break;
+ }
+ }
+ }
+ }
+#endif
+ }
+}
+
+
+
+static void
+emit_prologue(struct ppc_function *func)
+{
+ /* XXX set up stack frame */
+}
+
+
+static void
+emit_epilogue(struct ppc_function *func)
+{
+ ppc_return(func);
+ /* XXX restore prev stack frame */
+ debug_printf("PPC: Emitted %u instructions\n", func->num_inst);
+}
+
+
+
+/**
+ * Translate a TGSI vertex/fragment shader to PPC code.
+ *
+ * \param tokens the TGSI input shader
+ * \param func the output PPC code/function
+ * \param immediates buffer to place immediates, later passed to PPC func
+ * \return TRUE for success, FALSE if translation failed
+ */
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+ struct ppc_function *func,
+ float (*immediates)[4],
+ boolean do_swizzles )
+{
+ static int use_ppc_asm = -1;
+ struct tgsi_parse_context parse;
+ /*boolean instruction_phase = FALSE;*/
+ unsigned ok = 1;
+ uint num_immediates = 0;
+ struct gen_context gen;
+
+ if (use_ppc_asm < 0) {
+ /* If GALLIUM_NOPPC is set, don't use PPC codegen */
+ use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE);
+ }
+ if (!use_ppc_asm)
+ return FALSE;
+
+ if (0) {
+ debug_printf("\n********* TGSI->PPC ********\n");
+ tgsi_dump(tokens, 0);
+ }
+
+ util_init_math();
+
+ init_gen_context(&gen, func);
+
+ emit_prologue(func);
+
+ tgsi_parse_init( &parse, tokens );
+
+ while (!tgsi_parse_end_of_tokens(&parse) && ok) {
+ tgsi_parse_token(&parse);
+
+ switch (parse.FullToken.Token.Type) {
+ case TGSI_TOKEN_TYPE_DECLARATION:
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+ emit_declaration(func, &parse.FullToken.FullDeclaration );
+ }
+ break;
+
+ case TGSI_TOKEN_TYPE_INSTRUCTION:
+ ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
+
+ if (!ok) {
+ debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n",
+ parse.FullToken.FullInstruction.Instruction.Opcode,
+ parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+ "vertex shader" : "fragment shader");
+ }
+ break;
+
+ case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* copy the immediate components into a float[4]; each component
+          * is splatted across a vector when fetched (see emit_fetch).
+          */
+ {
+ const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+ uint i;
+ assert(size <= 4);
+ assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
+ for (i = 0; i < size; i++) {
+ immediates[num_immediates][i] =
+ parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+ }
+ num_immediates++;
+ }
+ break;
+
+ default:
+ ok = 0;
+ assert( 0 );
+ }
+ }
+
+ emit_epilogue(func);
+
+ tgsi_parse_free( &parse );
+
+ if (ppc_num_instructions(func) == 0) {
+ /* ran out of memory for instructions */
+ ok = FALSE;
+ }
+
+ if (!ok)
+ debug_printf("TGSI->PPC translation failed\n");
+
+ return ok;
+}
+
+#endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
new file mode 100644
index 0000000000..829ec075e7
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -0,0 +1,51 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef TGSI_PPC_H
+#define TGSI_PPC_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+struct tgsi_token;
+struct ppc_function;
+
+extern const float ppc_builtin_constants[];
+
+
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+ struct ppc_function *function,
+ float (*immediates)[4],
+ boolean do_swizzles);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_PPC_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4d59106dbf..8dfd2ced08 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,9 +25,16 @@
*
**************************************************************************/
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86)
+
#include "pipe/p_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#if defined(PIPE_ARCH_SSE)
+#include "util/u_sse.h"
+#endif
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi_exec.h"
@@ -35,8 +42,6 @@
#include "rtasm/rtasm_x86sse.h"
-#ifdef PIPE_ARCH_X86
-
/* for 1/sqrt()
*
* This costs about 100fps (close to 10%) in gears:
@@ -509,10 +514,31 @@ emit_coef_dady(
* Function call helpers.
*/
+/**
+ * NOTE: In gcc, if the called function uses SSE intrinsics, it must be
+ * defined with __attribute__((force_align_arg_pointer)), since we do not
+ * guarantee that the stack pointer is 16-byte aligned as it would otherwise
+ * expect.
+ */
static void
-emit_push_gp(
- struct x86_function *func )
+emit_func_call_dst(
+ struct x86_function *func,
+ unsigned xmm_save,
+ unsigned xmm_dst,
+ void (PIPE_CDECL *code)() )
{
+ struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+ unsigned i, n;
+ unsigned xmm_mask;
+
+ /* Bitmask of the xmm registers to save */
+ xmm_mask = (1 << xmm_save) - 1;
+ xmm_mask &= ~(1 << xmm_dst);
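+   /* xmm_save presumably gives the number of xmm registers currently in use,
+    * so xmm0..xmm_save-1 are preserved across the call; the destination
+    * register is excluded because it is staged through TEMP_R0 separately.
+    */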
+
+ sse_movaps(
+ func,
+ get_temp( TEMP_R0, 0 ),
+ make_xmm( xmm_dst ) );
+
x86_push(
func,
x86_make_reg( file_REG32, reg_AX) );
@@ -522,12 +548,49 @@ emit_push_gp(
x86_push(
func,
x86_make_reg( file_REG32, reg_DX) );
-}
+
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_mask & (1 << i))
+ ++n;
+
+ x86_sub_imm(
+ func,
+ x86_make_reg( file_REG32, reg_SP ),
+ n*16);
+
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_mask & (1 << i)) {
+ sse_movups(
+ func,
+ x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+ make_xmm( i ) );
+ ++n;
+ }
+
+ x86_lea(
+ func,
+ ecx,
+ get_temp( TEMP_R0, 0 ) );
+
+ x86_push( func, ecx );
+ x86_mov_reg_imm( func, ecx, (unsigned long) code );
+ x86_call( func, ecx );
+ x86_pop(func, ecx );
+
+ for(i = 0, n = 0; i < 8; ++i)
+ if(xmm_mask & (1 << i)) {
+ sse_movups(
+ func,
+ make_xmm( i ),
+ x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+ ++n;
+ }
+
+ x86_add_imm(
+ func,
+ x86_make_reg( file_REG32, reg_SP ),
+ n*16);
-static void
-x86_pop_gp(
- struct x86_function *func )
-{
/* Restore GP registers in a reverse order.
*/
x86_pop(
@@ -539,39 +602,6 @@ x86_pop_gp(
x86_pop(
func,
x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst(
- struct x86_function *func,
- unsigned xmm_dst,
- void (PIPE_CDECL *code)() )
-{
- sse_movaps(
- func,
- get_temp( TEMP_R0, 0 ),
- make_xmm( xmm_dst ) );
-
- emit_push_gp(
- func );
-
- {
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
- x86_lea(
- func,
- ecx,
- get_temp( TEMP_R0, 0 ) );
-
- x86_push( func, ecx );
- x86_mov_reg_imm( func, ecx, (unsigned long) code );
- x86_call( func, ecx );
- x86_pop(func, ecx );
- }
-
-
- x86_pop_gp(
- func );
sse_movaps(
func,
@@ -582,6 +612,7 @@ emit_func_call_dst(
static void
emit_func_call_dst_src(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst,
unsigned xmm_src,
void (PIPE_CDECL *code)() )
@@ -593,10 +624,119 @@ emit_func_call_dst_src(
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
code );
}
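
Aside (not part of the patch): the xmm_save/xmm_dst bookkeeping introduced in
emit_func_call_dst() above can be checked with a small standalone C sketch; the
helper name, the example arguments, and the printf output are illustrative only.

#include <stdio.h>

/* Sketch of the register-save mask computed by emit_func_call_dst():
 * registers xmm0..xmm_save-1 are assumed live across the call, except the
 * destination register, which is saved separately through TEMP_R0. */
static unsigned
xmm_save_mask(unsigned xmm_save, unsigned xmm_dst)
{
   unsigned mask = (1u << xmm_save) - 1;   /* xmm0 .. xmm_save-1 */
   mask &= ~(1u << xmm_dst);               /* xmm_dst goes through TEMP_R0 */
   return mask;
}

int
main(void)
{
   /* e.g. emit_pow( func, 3, 1, 2 ): spill xmm0 and xmm2, but not xmm1 */
   printf("mask = 0x%x\n", xmm_save_mask(3, 1));   /* prints mask = 0x5 */
   return 0;
}
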
+
+#if defined(PIPE_ARCH_SSE)
+
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
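
For reference (not part of the patch), the POLY* macros above evaluate a
minimax polynomial in Horner form; expanding POLY2, for example, gives

   POLY2(x, c0, c1, c2) = ((c2 * x) + c1) * x + c0 = c0 + c1*x + c2*x^2

i.e. the constants are the coefficients in ascending order of degree.
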
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128
+exp2f4(__m128 x)
+{
+ __m128i ipart;
+ __m128 fpart, expipart, expfpart;
+
+ x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+ x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+ /* ipart = int(x - 0.5) */
+ ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+ /* fpart = x - ipart */
+ fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+ /* expipart = (float) (1 << ipart) */
+ expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+ /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+ expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+ expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+ expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+ expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+ return _mm_mul_ps(expipart, expfpart);
+}
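
A hedged scalar sketch of the same decomposition (the function name is
hypothetical and the C99 library calls stand in for the polynomial; not part
of the patch):

#include <math.h>

static float
exp2f_ref(float x)
{
   float ipart = rintf(x - 0.5f);   /* like _mm_cvtps_epi32(x - 0.5) */
   float fpart = x - ipart;         /* roughly in [-0.5, 0.5] */
   /* exp2f4() forms 2**ipart by shifting the biased exponent
    * (ipart + 127) << 23 straight into an IEEE-754 float, and
    * approximates 2**fpart with the POLY* fit above. */
   return ldexpf(exp2f(fpart), (int) ipart);
}
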
+
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128
+log2f4(__m128 x)
+{
+ __m128i expmask = _mm_set1_epi32(0x7f800000);
+ __m128i mantmask = _mm_set1_epi32(0x007fffff);
+ __m128 one = _mm_set1_ps(1.0f);
+
+ __m128i i = _mm_castps_si128(x);
+
+ /* exp = (float) exponent(x) */
+ __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+ /* mant = (float) mantissa(x) */
+ __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+ __m128 logmant;
+
+ /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
+ * These coefficients can be generated with
+ * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+ */
+#if LOG_POLY_DEGREE == 6
+ logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+ logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
+ logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+ return _mm_add_ps(logmant, exp);
+}
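
Likewise, a hedged scalar sketch of the exponent/mantissa split used above
(hypothetical function name, with the exact log2f standing in for the
polynomial; not part of the patch):

#include <math.h>
#include <stdint.h>
#include <string.h>

static float
log2f_ref(float x)
{
   uint32_t i, m;
   float mant, exponent;

   memcpy(&i, &x, sizeof i);                   /* like _mm_castps_si128 */
   exponent = (float) (int) ((i >> 23) & 0xff) - 127.0f;
   m = (i & 0x007fffff) | 0x3f800000;          /* biased exponent forced to 127 */
   memcpy(&mant, &m, sizeof mant);             /* mant is in [1, 2[ */
   /* log2f4() approximates log2(mant) with POLY*(mant) * (mant - 1). */
   return exponent + log2f(mant);
}
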
+
+
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+ return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+#endif /* PIPE_ARCH_SSE */
+
+
+
/**
* Low-level instruction translators.
*/
@@ -639,38 +779,42 @@ cos4f(
static void
emit_cos(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
cos4f );
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
ex24f(
float *store )
{
-#if FAST_MATH
+#if defined(PIPE_ARCH_SSE)
+ _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else
store[0] = util_fast_exp2( store[0] );
store[1] = util_fast_exp2( store[1] );
store[2] = util_fast_exp2( store[2] );
store[3] = util_fast_exp2( store[3] );
-#else
- store[0] = powf( 2.0f, store[0] );
- store[1] = powf( 2.0f, store[1] );
- store[2] = powf( 2.0f, store[2] );
- store[3] = powf( 2.0f, store[3] );
#endif
}
static void
emit_ex2(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
ex24f );
}
@@ -710,10 +854,12 @@ flr4f(
static void
emit_flr(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
flr4f );
}
@@ -731,31 +877,42 @@ frc4f(
static void
emit_frc(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
frc4f );
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
lg24f(
float *store )
{
+#if defined(PIPE_ARCH_SSE)
+ _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else
store[0] = util_fast_log2( store[0] );
store[1] = util_fast_log2( store[1] );
store[2] = util_fast_log2( store[2] );
store[3] = util_fast_log2( store[3] );
+#endif
}
static void
emit_lg2(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst )
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
lg24f );
}
@@ -797,30 +954,32 @@ emit_neg(
}
static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
pow4f(
float *store )
{
-#if FAST_MATH
+#if defined(PIPE_ARCH_SSE)
+ _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
+#else
store[0] = util_fast_pow( store[0], store[4] );
store[1] = util_fast_pow( store[1], store[5] );
store[2] = util_fast_pow( store[2], store[6] );
store[3] = util_fast_pow( store[3], store[7] );
-#else
- store[0] = powf( store[0], store[4] );
- store[1] = powf( store[1], store[5] );
- store[2] = powf( store[2], store[6] );
- store[3] = powf( store[3], store[7] );
#endif
}
static void
emit_pow(
struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst,
unsigned xmm_src )
{
emit_func_call_dst_src(
func,
+ xmm_save,
xmm_dst,
xmm_src,
pow4f );
@@ -913,10 +1072,12 @@ sin4f(
static void
emit_sin (struct x86_function *func,
+ unsigned xmm_save,
unsigned xmm_dst)
{
emit_func_call_dst(
func,
+ xmm_save,
xmm_dst,
sin4f );
}
@@ -1336,7 +1497,7 @@ emit_instruction(
get_temp(
TGSI_EXEC_TEMP_MINUS_128_I,
TGSI_EXEC_TEMP_MINUS_128_C ) );
- emit_pow( func, 1, 2 );
+ emit_pow( func, 3, 1, 2 );
FETCH( func, *inst, 0, 0, CHAN_X );
sse_xorps(
func,
@@ -1382,11 +1543,11 @@ emit_instruction(
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
emit_MOV( func, 1, 0 );
- emit_flr( func, 1 );
+ emit_flr( func, 2, 1 );
/* dst.x = ex2(floor(src.x)) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
emit_MOV( func, 2, 1 );
- emit_ex2( func, 2 );
+ emit_ex2( func, 3, 2 );
STORE( func, *inst, 2, 0, CHAN_X );
}
/* dst.y = src.x - floor(src.x) */
@@ -1398,7 +1559,7 @@ emit_instruction(
}
/* dst.z = ex2(src.x) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
- emit_ex2( func, 0 );
+ emit_ex2( func, 3, 0 );
STORE( func, *inst, 0, 0, CHAN_Z );
}
}
@@ -1416,21 +1577,21 @@ emit_instruction(
FETCH( func, *inst, 0, 0, CHAN_X );
emit_abs( func, 0 );
emit_MOV( func, 1, 0 );
- emit_lg2( func, 1 );
+ emit_lg2( func, 2, 1 );
/* dst.z = lg2(abs(src.x)) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
STORE( func, *inst, 1, 0, CHAN_Z );
}
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
- emit_flr( func, 1 );
+ emit_flr( func, 2, 1 );
/* dst.x = floor(lg2(abs(src.x))) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
STORE( func, *inst, 1, 0, CHAN_X );
}
/* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
- emit_ex2( func, 1 );
+ emit_ex2( func, 2, 1 );
emit_rcp( func, 1, 1 );
emit_mul( func, 0, 1 );
STORE( func, *inst, 0, 0, CHAN_Y );
@@ -1605,7 +1766,18 @@ emit_instruction(
case TGSI_OPCODE_DOT2ADD:
/* TGSI_OPCODE_DP2A */
- return 0;
+ FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
+ FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
+ emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
+ FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
+ FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
+ emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
+ }
break;
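
For clarity (sketch only, hypothetical helper name; not part of the patch),
the DOT2ADD/DP2A sequence above computes, per enabled destination channel:

static float
dp2a_ref(const float src0[4], const float src1[4], const float src2[4])
{
   return src0[0] * src1[0] + src0[1] * src1[1] + src2[0];
}
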
case TGSI_OPCODE_INDEX:
@@ -1620,7 +1792,7 @@ emit_instruction(
/* TGSI_OPCODE_FRC */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
- emit_frc( func, 0 );
+ emit_frc( func, 0, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
break;
@@ -1633,7 +1805,7 @@ emit_instruction(
/* TGSI_OPCODE_FLR */
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( func, *inst, 0, 0, chan_index );
- emit_flr( func, 0 );
+ emit_flr( func, 0, 0 );
STORE( func, *inst, 0, 0, chan_index );
}
break;
@@ -1645,7 +1817,7 @@ emit_instruction(
case TGSI_OPCODE_EXPBASE2:
/* TGSI_OPCODE_EX2 */
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_ex2( func, 0 );
+ emit_ex2( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1654,7 +1826,7 @@ emit_instruction(
case TGSI_OPCODE_LOGBASE2:
/* TGSI_OPCODE_LG2 */
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_lg2( func, 0 );
+ emit_lg2( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1664,7 +1836,7 @@ emit_instruction(
/* TGSI_OPCODE_POW */
FETCH( func, *inst, 0, 0, CHAN_X );
FETCH( func, *inst, 1, 1, CHAN_X );
- emit_pow( func, 0, 1 );
+ emit_pow( func, 0, 0, 1 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1755,7 +1927,7 @@ emit_instruction(
case TGSI_OPCODE_COS:
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_cos( func, 0 );
+ emit_cos( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1814,7 +1986,7 @@ emit_instruction(
case TGSI_OPCODE_SIN:
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_sin( func, 0 );
+ emit_sin( func, 0, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( func, *inst, 0, 0, chan_index );
}
@@ -1908,12 +2080,12 @@ emit_instruction(
case TGSI_OPCODE_SCS:
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_cos( func, 0 );
+ emit_cos( func, 0, 0 );
STORE( func, *inst, 0, 0, CHAN_X );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_sin( func, 0 );
+ emit_sin( func, 0, 0 );
STORE( func, *inst, 0, 0, CHAN_Y );
}
IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
@@ -1939,7 +2111,39 @@ emit_instruction(
break;
case TGSI_OPCODE_NRM:
- return 0;
+ /* fall-through */
+ case TGSI_OPCODE_NRM4:
+ /* 3 or 4-component normalization */
+ {
+ uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+ /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
+ FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
+ FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
+ FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
+ if (dims == 4) {
+ FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
+ }
+ emit_MOV( func, 0, 4 ); /* xmm0 = xmm4 */
+ emit_mul( func, 0, 4 ); /* xmm0 *= xmm4 */
+ emit_MOV( func, 1, 5 ); /* xmm1 = xmm5 */
+ emit_mul( func, 1, 5 ); /* xmm1 *= xmm5 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ emit_MOV( func, 1, 6 ); /* xmm1 = xmm6 */
+ emit_mul( func, 1, 6 ); /* xmm1 *= xmm6 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ if (dims == 4) {
+ emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */
+ emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */
+ emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
+ }
+ emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ if (chan_index < dims) {
+ emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
+ STORE( func, *inst, 4+chan_index, 0, chan_index );
+ }
+ }
+ }
break;
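
The scalar equivalent of the NRM/NRM4 sequence above, as a hedged sketch
(hypothetical helper name, not part of the patch):

#include <math.h>

static void
nrm_ref(float dst[4], const float src[4], unsigned dims)
{
   float sum = 0.0f, scale;
   unsigned i;

   for (i = 0; i < dims; i++)
      sum += src[i] * src[i];
   scale = 1.0f / sqrtf(sum);       /* emit_rsqrt() above */
   for (i = 0; i < dims; i++)
      dst[i] = src[i] * scale;
}
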
case TGSI_OPCODE_DIV:
@@ -1947,7 +2151,16 @@ emit_instruction(
break;
case TGSI_OPCODE_DP2:
- return 0;
+ FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
+ FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
+ emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
+ FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
+ FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
+ emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
+ emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
+ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
+ }
break;
case TGSI_OPCODE_TXL:
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index d3951e4e7d..b3d1045a8f 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
u_gen_mipmap.c \
u_handle_table.c \
u_hash_table.c \
+ u_keymap.c \
u_math.c \
u_mm.c \
u_rect.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index e65c17b1cc..8a04955a16 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -11,13 +11,14 @@ util = env.ConvenienceLibrary(
'u_gen_mipmap.c',
'u_handle_table.c',
'u_hash_table.c',
+ 'u_keymap.c',
'u_math.c',
'u_mm.c',
'u_rect.c',
'u_simple_shaders.c',
'u_snprintf.c',
- 'u_stream_stdc.c',
- 'u_stream_wd.c',
+ 'u_stream_stdc.c',
+ 'u_stream_wd.c',
'u_tile.c',
'u_time.c',
])
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index 6ff3e6e0a6..0d019808b0 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -36,6 +36,13 @@
#include <windows.h>
#include <winddi.h>
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <windows.h>
+#include <types.h>
+
#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
#ifndef WIN32_LEAN_AND_MEAN
@@ -98,7 +105,35 @@ void _debug_vprintf(const char *format, va_list ap)
OutputDebugStringA(buf);
buf[0] = '\0';
}
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+ wchar_t *wide_format;
+ long wide_str_len;
+ char buf[512];
+ int ret;
+#if (_WIN32_WCE < 600)
+ ret = vsprintf(buf, format, ap);
+ if(ret < 0){
+ sprintf(buf, "Can't handle debug print!");
+ ret = 25;
+ }
+#else
+ ret = vsprintf_s(buf, 512, format, ap);
+ if(ret < 0){
+ sprintf_s(buf, 512, "Can't handle debug print!");
+ ret = 25;
+ }
+#endif
+ buf[ret] = '\0';
+ /* Format is ascii - needs to be converted to wchar_t for printing */
+ wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0);
+ wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t));
+ if (wide_format) {
+ MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1,
+ wide_format, wide_str_len);
+ NKDbgPrintfW(wide_format, wide_format);
+ free(wide_format);
+ }
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
/* TODO */
#else /* !PIPE_SUBSYSTEM_WINDOWS */
#ifdef DEBUG
@@ -308,6 +343,13 @@ debug_get_flags_option(const char *name,
str = _debug_get_option(name);
if(!str)
result = dfault;
+ else if (!util_strcmp(str, "help")) {
+ result = dfault;
+ while (flags->name) {
+ debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value);
+ flags++;
+ }
+ }
else {
result = 0;
while( flags->name ) {
@@ -317,7 +359,12 @@ debug_get_flags_option(const char *name,
}
}
- debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+ if (str) {
+ debug_printf("%s: %s = 0x%lx (%s)\n", __FUNCTION__, name, result, str);
+ }
+ else {
+ debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+ }
return result;
}
@@ -627,6 +674,7 @@ void
debug_dump_surface_bmp(const char *filename,
struct pipe_surface *surface)
{
+#ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
struct util_stream *stream;
unsigned surface_usage;
struct bmp_file_header bmfh;
@@ -693,6 +741,7 @@ error2:
FREE(rgba);
error1:
;
+#endif
}
#endif
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
new file mode 100644
index 0000000000..01b17ddb1b
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -0,0 +1,309 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Key lookup/associative container.
+ *
+ * Like Jose's u_hash_table, based on CSO cache code for now.
+ *
+ * Author: Brian Paul
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_error.h"
+
+#include "cso_cache/cso_hash.h"
+
+#include "util/u_memory.h"
+#include "util/u_keymap.h"
+
+
+struct keymap
+{
+ struct cso_hash *cso;
+ unsigned key_size;
+ unsigned max_entries; /* XXX not obeyed yet */
+ unsigned num_entries;
+ keymap_delete_func delete_func;
+};
+
+
+struct keymap_item
+{
+ void *key, *value;
+};
+
+
+/**
+ * This is the default key-delete function, used when the client doesn't
+ * provide one.
+ */
+static void
+default_delete_func(const struct keymap *map,
+ const void *key, void *data, void *user)
+{
+ FREE((void*) data);
+}
+
+
+static INLINE struct keymap_item *
+hash_table_item(struct cso_hash_iter iter)
+{
+ return (struct keymap_item *) cso_hash_iter_data(iter);
+}
+
+
+/**
+ * Return a 4-byte hash key for a block of bytes.
+ */
+static unsigned
+hash(const void *key, unsigned keySize)
+{
+ unsigned i, hash;
+
+ keySize /= 4; /* convert from bytes to uints */
+
+ hash = 0;
+ for (i = 0; i < keySize; i++) {
+ hash ^= (i + 1) * ((const unsigned *) key)[i];
+ }
+
+ /*hash = hash ^ (hash >> 11) ^ (hash >> 22);*/
+
+ return hash;
+}
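
Worked example (illustrative, not part of the patch): for an 8-byte key whose
two 32-bit words are 0x10 and 0x20, keySize/4 = 2 and
hash = (1 * 0x10) ^ (2 * 0x20) = 0x10 ^ 0x40 = 0x50.
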
+
+
+/**
+ * Create a new map.
+ * \param keySize size of the keys in bytes
+ * \param maxEntries max number of entries to allow (~0 = infinity)
+ * \param deleteFunc optional callback to call when entries
+ * are deleted/replaced
+ */
+struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+ keymap_delete_func deleteFunc)
+{
+ struct keymap *map = MALLOC_STRUCT(keymap);
+ if (!map)
+ return NULL;
+
+ map->cso = cso_hash_create();
+ if (!map->cso) {
+ FREE(map);
+ return NULL;
+ }
+
+ map->max_entries = maxEntries;
+ map->num_entries = 0;
+ map->key_size = keySize;
+ map->delete_func = deleteFunc ? deleteFunc : default_delete_func;
+
+ return map;
+}
+
+
+/**
+ * Delete/free a keymap and all entries. The deleteFunc that was given at
+ * create time will be called for each entry.
+ * \param user user-provided pointer passed through to the delete callback
+ */
+void
+util_delete_keymap(struct keymap *map, void *user)
+{
+ util_keymap_remove_all(map, user);
+ cso_hash_delete(map->cso);
+ FREE(map);
+}
+
+
+static INLINE struct cso_hash_iter
+hash_table_find_iter(const struct keymap *map, const void *key,
+ unsigned key_hash)
+{
+ struct cso_hash_iter iter;
+ struct keymap_item *item;
+
+ iter = cso_hash_find(map->cso, key_hash);
+ while (!cso_hash_iter_is_null(iter)) {
+ item = (struct keymap_item *) cso_hash_iter_data(iter);
+ if (!memcmp(item->key, key, map->key_size))
+ break;
+ iter = cso_hash_iter_next(iter);
+ }
+
+ return iter;
+}
+
+
+static INLINE struct keymap_item *
+hash_table_find_item(const struct keymap *map, const void *key,
+ unsigned key_hash)
+{
+ struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash);
+ if (cso_hash_iter_is_null(iter)) {
+ return NULL;
+ }
+ else {
+ return hash_table_item(iter);
+ }
+}
+
+
+/**
+ * Insert a new key + data pointer into the table.
+ * Note: we create a copy of the key, but not the data!
+ * If the key is already present in the table, replace the existing
+ * entry (calling the delete callback on the previous entry).
+ * If the maximum capacity of the map is reached, an old entry will be
+ * deleted (the delete callback will be called).  Note: the max_entries
+ * limit is not enforced yet.
+ */
+boolean
+util_keymap_insert(struct keymap *map, const void *key,
+ const void *data, void *user)
+{
+ unsigned key_hash;
+ struct keymap_item *item;
+ struct cso_hash_iter iter;
+
+ assert(map);
+
+ key_hash = hash(key, map->key_size);
+
+ item = hash_table_find_item(map, key, key_hash);
+ if (item) {
+ /* call delete callback for old entry/item */
+ map->delete_func(map, item->key, item->value, user);
+ item->value = (void *) data;
+ return TRUE;
+ }
+
+ item = MALLOC_STRUCT(keymap_item);
+ if (!item)
+ return FALSE;
+
+ item->key = mem_dup(key, map->key_size);
+ item->value = (void *) data;
+
+ iter = cso_hash_insert(map->cso, key_hash, item);
+ if (cso_hash_iter_is_null(iter)) {
+ FREE(item);
+ return FALSE;
+ }
+
+ map->num_entries++;
+
+ return TRUE;
+}
+
+
+/**
+ * Look up a key in the map and return the associated data pointer.
+ */
+const void *
+util_keymap_lookup(const struct keymap *map, const void *key)
+{
+ unsigned key_hash;
+ struct keymap_item *item;
+
+ assert(map);
+
+ key_hash = hash(key, map->key_size);
+
+ item = hash_table_find_item(map, key, key_hash);
+ if (!item)
+ return NULL;
+
+ return item->value;
+}
+
+
+/**
+ * Remove an entry from the map.
+ * The delete callback will be called if the given key/entry is found.
+ * \param user passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove(struct keymap *map, const void *key, void *user)
+{
+ unsigned key_hash;
+ struct cso_hash_iter iter;
+ struct keymap_item *item;
+
+ assert(map);
+
+ key_hash = hash(key, map->key_size);
+
+ iter = hash_table_find_iter(map, key, key_hash);
+ if (cso_hash_iter_is_null(iter))
+ return;
+
+ item = hash_table_item(iter);
+ assert(item);
+ map->delete_func(map, item->key, item->value, user);
+ FREE(item->key);
+ FREE(item);
+
+ map->num_entries--;
+
+ cso_hash_erase(map->cso, iter);
+}
+
+
+/**
+ * Remove all entries from the map, calling the delete callback for each.
+ * \param user passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove_all(struct keymap *map, void *user)
+{
+ struct cso_hash_iter iter;
+ struct keymap_item *item;
+
+ assert(map);
+
+ iter = cso_hash_first_node(map->cso);
+ while (!cso_hash_iter_is_null(iter)) {
+ item = (struct keymap_item *)
+ cso_hash_take(map->cso, cso_hash_iter_key(iter));
+ map->delete_func(map, item->key, item->value, user);
+ FREE(item->key);
+ FREE(item);
+ iter = cso_hash_first_node(map->cso);
+ }
+}
+
+
+extern void
+util_keymap_info(const struct keymap *map)
+{
+ debug_printf("Keymap %p: %u of max %u entries\n",
+ (void *) map, map->num_entries, map->max_entries);
+}
diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h
new file mode 100644
index 0000000000..8d60a76fc3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.h
@@ -0,0 +1,68 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_KEYMAP_H
+#define U_KEYMAP_H
+
+#include "pipe/p_compiler.h"
+
+
+/** opaque keymap type */
+struct keymap;
+
+
+/** Entry delete callback function type */
+typedef void (*keymap_delete_func)(const struct keymap *map,
+ const void *key, void *data,
+ void *user);
+
+
+extern struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+ keymap_delete_func deleteFunc);
+
+extern void
+util_delete_keymap(struct keymap *map, void *user);
+
+extern boolean
+util_keymap_insert(struct keymap *map, const void *key,
+ const void *data, void *user);
+
+extern const void *
+util_keymap_lookup(const struct keymap *map, const void *key);
+
+extern void
+util_keymap_remove(struct keymap *map, const void *key, void *user);
+
+extern void
+util_keymap_remove_all(struct keymap *map, void *user);
+
+extern void
+util_keymap_info(const struct keymap *map);
+
+
+#endif /* U_KEYMAP_H */
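
A hedged usage sketch of the keymap API declared above (the key structure,
callback, and sizes are illustrative only, not part of the patch):

#include "util/u_keymap.h"
#include "util/u_memory.h"

struct my_key {                 /* hypothetical 8-byte key */
   unsigned a, b;
};

static void
my_delete(const struct keymap *map, const void *key, void *data, void *user)
{
   (void) map; (void) key; (void) user;
   FREE(data);                  /* we store heap-allocated values below */
}

static void
keymap_example(void)
{
   struct my_key key;
   struct keymap *map = util_new_keymap(sizeof key, 32, my_delete);
   void *value = MALLOC(16);

   key.a = 1;
   key.b = 2;
   if (map && value)
      util_keymap_insert(map, &key, value, NULL);   /* key is copied, value is not */

   (void) util_keymap_lookup(map, &key);            /* returns 'value' */

   if (map)
      util_delete_keymap(map, NULL);                /* my_delete called per entry */
}
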
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 1ae3234423..aa4fa17b59 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -68,7 +68,7 @@ __inline double ceil(double val)
return ceil_val;
}
-#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
+#ifndef PIPE_SUBSYSTEM_WINDOWS_CE_OGL
__inline double floor(double val)
{
double floor_val;
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 01dd67c810..45ce257b5e 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -31,7 +31,7 @@
void
-mmDumpMemInfo(const struct mem_block *heap)
+u_mmDumpMemInfo(const struct mem_block *heap)
{
debug_printf("Memory heap %p:\n", (void *)heap);
if (heap == 0) {
@@ -58,7 +58,7 @@ mmDumpMemInfo(const struct mem_block *heap)
}
struct mem_block *
-mmInit(int ofs, int size)
+u_mmInit(int ofs, int size)
{
struct mem_block *heap, *block;
@@ -165,7 +165,7 @@ SliceBlock(struct mem_block *p,
struct mem_block *
-mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
{
struct mem_block *p;
const int mask = (1 << align2)-1;
@@ -202,7 +202,7 @@ mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
struct mem_block *
-mmFindBlock(struct mem_block *heap, int start)
+u_mmFindBlock(struct mem_block *heap, int start)
{
struct mem_block *p;
@@ -241,7 +241,7 @@ Join2Blocks(struct mem_block *p)
}
int
-mmFreeMem(struct mem_block *b)
+u_mmFreeMem(struct mem_block *b)
{
if (!b)
return 0;
@@ -270,7 +270,7 @@ mmFreeMem(struct mem_block *b)
void
-mmDestroy(struct mem_block *heap)
+u_mmDestroy(struct mem_block *heap)
{
struct mem_block *p;
diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h
index b226b101cb..ce20e48763 100644
--- a/src/gallium/auxiliary/util/u_mm.h
+++ b/src/gallium/auxiliary/util/u_mm.h
@@ -49,7 +49,7 @@ struct mem_block {
* input: total size in bytes
* return: a heap pointer if OK, NULL if error
*/
-extern struct mem_block *mmInit(int ofs, int size);
+extern struct mem_block *u_mmInit(int ofs, int size);
/**
* Allocate 'size' bytes with 2^align2 bytes alignment,
@@ -61,7 +61,7 @@ extern struct mem_block *mmInit(int ofs, int size);
* startSearch = linear offset from start of heap to begin search
* return: pointer to the allocated block, 0 if error
*/
-extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2,
+extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2,
int startSearch);
/**
@@ -69,23 +69,23 @@ extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2
* input: pointer to a block
* return: 0 if OK, -1 if error
*/
-extern int mmFreeMem(struct mem_block *b);
+extern int u_mmFreeMem(struct mem_block *b);
/**
* Free block starts at offset
* input: pointer to a heap, start offset
* return: pointer to a block
*/
-extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
+extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start);
/**
* destroy MM
*/
-extern void mmDestroy(struct mem_block *mmInit);
+extern void u_mmDestroy(struct mem_block *mmInit);
/**
* For debugging purposes.
*/
-extern void mmDumpMemInfo(const struct mem_block *mmInit);
+extern void u_mmDumpMemInfo(const struct mem_block *mmInit);
#endif
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
new file mode 100644
index 0000000000..e2a8491e62
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -0,0 +1,77 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * SSE intrinsics portability header.
+ *
+ * Although the SSE intrinsics are supported by all modern x86 and x86-64
+ * compilers, some intrinsics are missing from some implementations
+ * (especially older MSVC versions). This header abstracts that away.
+ */
+
+#ifndef U_SSE_H_
+#define U_SSE_H_
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_SSE)
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+
+/* MSVC versions before _MSC_VER 1500 do not provide the _mm_castxxx_yyy intrinsics */
+#if defined(_MSC_VER) && _MSC_VER < 1500
+
+union __declspec(align(16)) m128_types {
+ __m128 m128;
+ __m128i m128i;
+ __m128d m128d;
+};
+
+static __inline __m128
+_mm_castsi128_ps(__m128i a)
+{
+ union m128_types u;
+ u.m128i = a;
+ return u.m128;
+}
+
+static __inline __m128i
+_mm_castps_si128(__m128 a)
+{
+ union m128_types u;
+ u.m128 = a;
+ return u.m128i;
+}
+
+#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
+
+#endif /* PIPE_ARCH_SSE */
+
+#endif /* U_SSE_H_ */
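
A minimal usage sketch (assumption, not part of the header): the cast
intrinsics, or the fallbacks above on older MSVC, reinterpret bits without any
conversion, e.g. to clear the sign bit of four packed floats:

#include "pipe/p_compiler.h"
#include "util/u_sse.h"

#if defined(PIPE_ARCH_SSE)
static INLINE __m128
fabs4(__m128 x)
{
   /* mask off the IEEE-754 sign bit of each packed float */
   return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
}
#endif
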
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 853c503f4f..32f6b072a0 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -460,7 +460,7 @@ l8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned r;
r = float_to_ubyte(pRow[0]);
- *dst++ = r;
+ *dst++ = (ubyte) r;
}
p += src_stride;
}
@@ -504,7 +504,7 @@ a8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned a;
a = float_to_ubyte(pRow[3]);
- *dst++ = a;
+ *dst++ = (ubyte) a;
}
p += src_stride;
}
@@ -634,7 +634,7 @@ i8_put_tile_rgba(ubyte *dst,
for (j = 0; j < w; j++, pRow += 4) {
unsigned r;
r = float_to_ubyte(pRow[0]);
- *dst++ = r;
+ *dst++ = (ubyte) r;
}
p += src_stride;
}
@@ -769,6 +769,32 @@ z24s8_get_tile_rgba(const unsigned *src,
}
+/*** PIPE_FORMAT_Z32_FLOAT ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z32f_get_tile_rgba(const float *src,
+ unsigned w, unsigned h,
+ float *p,
+ unsigned dst_stride)
+{
+ unsigned i, j;
+
+ for (i = 0; i < h; i++) {
+ float *pRow = p;
+ for (j = 0; j < w; j++, pRow += 4) {
+ pRow[0] =
+ pRow[1] =
+ pRow[2] =
+ pRow[3] = *src++;
+ }
+ p += dst_stride;
+ }
+}
+
+
/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
/**
@@ -913,6 +939,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
case PIPE_FORMAT_Z24S8_UNORM:
z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
break;
+ case PIPE_FORMAT_Z32_FLOAT:
+ z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
+ break;
case PIPE_FORMAT_YCBCR:
ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
break;
diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c
index bf7d1d1c8d..57b80e5604 100644
--- a/src/gallium/auxiliary/util/u_time.c
+++ b/src/gallium/auxiliary/util/u_time.c
@@ -200,7 +200,7 @@ util_time_timeout(const struct util_time *start,
}
-#if defined(PIPE_SUBSYSYEM_WINDOWS_DISPLAY)
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
void util_time_sleep(unsigned usecs)
{
LONGLONG start, curr, end;