summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/llvmpipe
diff options
context:
space:
mode:
authorKeith Whitwell <keithw@vmware.com>2010-10-17 19:03:42 -0700
committerKeith Whitwell <keithw@vmware.com>2010-10-17 19:09:42 -0700
commit0072acd447dc6be652e63752e50215c3105322c8 (patch)
tree847d1763b54772d336a04e606f8248291c3092b7 /src/gallium/drivers/llvmpipe
parent543fb77ddece7e1806e8eaa0d65bb2a945ef9a75 (diff)
parentca2b2ac131933b4171b519813df1aaa3a81621cd (diff)
Merge remote branch 'origin/master' into lp-setup-llvm
Conflicts: src/gallium/drivers/llvmpipe/lp_setup_coef.c src/gallium/drivers/llvmpipe/lp_setup_coef.h src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c src/gallium/drivers/llvmpipe/lp_setup_point.c src/gallium/drivers/llvmpipe/lp_setup_tri.c src/gallium/drivers/llvmpipe/lp_state_derived.c src/gallium/drivers/llvmpipe/lp_state_fs.h
Diffstat (limited to 'src/gallium/drivers/llvmpipe')
-rw-r--r--src/gallium/drivers/llvmpipe/Makefile4
-rw-r--r--src/gallium/drivers/llvmpipe/README38
-rw-r--r--src/gallium/drivers/llvmpipe/SConscript8
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_alpha.c6
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_alpha.h3
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c3
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c46
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_depth.c412
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_depth.h22
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_interp.c45
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_interp.h6
-rw-r--r--src/gallium/drivers/llvmpipe/lp_jit.c21
-rw-r--r--src/gallium/drivers/llvmpipe/lp_jit.h11
-rw-r--r--src/gallium/drivers/llvmpipe/lp_query.c5
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast.c64
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast.h64
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast_debug.c39
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast_priv.h19
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast_tri.c286
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h136
-rw-r--r--src/gallium/drivers/llvmpipe/lp_scene.c4
-rw-r--r--src/gallium/drivers/llvmpipe/lp_scene.h28
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup.c148
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup.h8
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup_context.h8
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup_line.c187
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup_point.c292
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup_tri.c505
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup_vbuf.c6
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_derived.c14
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_fs.c493
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_fs.h5
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_rasterizer.c5
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_sampler.c6
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_setup.c2
-rw-r--r--src/gallium/drivers/llvmpipe/lp_test_blend.c41
-rw-r--r--src/gallium/drivers/llvmpipe/lp_test_main.c33
-rw-r--r--src/gallium/drivers/llvmpipe/lp_test_round.c24
-rw-r--r--src/gallium/drivers/llvmpipe/lp_test_sincos.c5
-rw-r--r--src/gallium/drivers/llvmpipe/lp_tex_sample.c9
-rw-r--r--src/gallium/drivers/llvmpipe/lp_tile_soa.py267
41 files changed, 2086 insertions, 1242 deletions
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 379f14b43d..726a19db2b 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -64,12 +64,12 @@ PROGS := lp_test_format \
# Need this for the lp_test_*.o files
CLEAN_EXTRA = *.o
+include ../../Makefile.template
+
lp_test_sincos.o : sse_mathfun.h
PROGS_DEPS := ../../auxiliary/libgallium.a
-include ../../Makefile.template
-
lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_pack.py ../../auxiliary/util/u_format.csv
python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 8b5539d2c5..ec30d4d708 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -131,6 +131,44 @@ replacing the native ICD driver, but it's quite an advanced usage, so if you
need to ask, don't even try it.
+Profiling
+=========
+
+To profile llvmpipe you should pass the options
+
+ scons debug=no profile=yes <same-as-before>
+
+This will ensure that frame pointers are used both in C and JIT functions, and
+that no tail call optimizations are done by gcc.
+
+
+To better profile JIT code you'll need to build LLVM with oprofile integration.
+
+ source_dir=$PWD/llvm-2.6
+ build_dir=$source_dir/build/profile
+ install_dir=$source_dir-profile
+
+ mkdir -p "$build_dir"
+ cd "$build_dir" && \
+ $source_dir/configure \
+ --prefix=$install_dir \
+ --enable-optimized \
+ --disable-profiling \
+ --enable-targets=host-only \
+ --with-oprofile
+
+ make -C "$build_dir"
+ make -C "$build_dir" install
+
+ find "$install_dir/lib" -iname '*.a' -print0 | xargs -0 strip --strip-debug
+
+The you should define
+
+ export LLVM=/path/to/llvm-2.6-profile
+
+and rebuild.
+
+
Unit testing
============
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index f893878daa..d63bdd72a7 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -27,13 +27,7 @@ env.Depends('lp_tile_soa.c', [
])
-# Only enable SSSE3 for lp_tile_soa_sse3.c
-ssse3_env = env.Clone()
-if env['gcc'] \
- and distutils.version.LooseVersion(env['CCVERSION']) >= distutils.version.LooseVersion('4.3') \
- and env['machine'] in ('x86', 'x86_64') :
- ssse3_env.Append(CCFLAGS = ['-mssse3'])
-lp_tile_soa_os = ssse3_env.SharedObject('lp_tile_soa.c')
+lp_tile_soa_os = env.SharedObject('lp_tile_soa.c')
llvmpipe = env.ConvenienceLibrary(
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
index e28efe778f..e50643790c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder,
struct lp_type type,
struct lp_build_mask_context *mask,
LLVMValueRef alpha,
- LLVMValueRef ref)
+ LLVMValueRef ref,
+ boolean do_branch)
{
struct lp_build_context bld;
LLVMValueRef test;
@@ -60,4 +61,7 @@ lp_build_alpha_test(LLVMBuilderRef builder,
lp_build_name(test, "alpha_mask");
lp_build_mask_update(mask, test);
+
+ if (do_branch)
+ lp_build_mask_check(mask);
}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
index 44603b418c..27ca8aad4d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder,
struct lp_type type,
struct lp_build_mask_context *mask,
LLVMValueRef alpha,
- LLVMValueRef ref);
+ LLVMValueRef ref,
+ boolean do_branch);
#endif /* !LP_BLD_ALPHA_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index b5924cbb7d..d1c9b88f9b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -320,9 +320,6 @@ lp_build_blend_aos(LLVMBuilderRef builder,
if(!blend->rt[rt].blend_enable)
return src;
- /* It makes no sense to blend unless values are normalized */
- assert(type.norm);
-
/* Setup build context */
memset(&bld, 0, sizeof bld);
lp_build_context_init(&bld.base, builder, type);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
index b9c7a6ceed..30d261e979 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
@@ -1,6 +1,6 @@
/**************************************************************************
*
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -195,6 +195,13 @@ lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
}
+static boolean
+lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor)
+{
+ return dst_factor == (src_factor ^ 0x10);
+}
+
+
/**
* Generate blend code in SOA mode.
* \param rt render target index (to index the blend / colormask state)
@@ -243,8 +250,41 @@ lp_build_blend_soa(LLVMBuilderRef builder,
unsigned func = i < 3 ? blend->rt[rt].rgb_func : blend->rt[rt].alpha_func;
boolean func_commutative = lp_build_blend_func_commutative(func);
- /* It makes no sense to blend unless values are normalized */
- assert(type.norm);
+ if (func == PIPE_BLEND_ADD &&
+ lp_build_blend_factor_complementary(src_factor, dst_factor) && 0) {
+ /*
+ * Special case linear interpolation, (i.e., complementary factors).
+ */
+
+ LLVMValueRef weight;
+ if (src_factor < dst_factor) {
+ weight = lp_build_blend_soa_factor(&bld, src_factor, i);
+ res[i] = lp_build_lerp(&bld.base, weight, dst[i], src[i]);
+ } else {
+ weight = lp_build_blend_soa_factor(&bld, dst_factor, i);
+ res[i] = lp_build_lerp(&bld.base, weight, src[i], dst[i]);
+ }
+ continue;
+ }
+
+ if ((func == PIPE_BLEND_ADD ||
+ func == PIPE_BLEND_SUBTRACT ||
+ func == PIPE_BLEND_REVERSE_SUBTRACT) &&
+ src_factor == dst_factor &&
+ type.floating) {
+ /*
+ * Special common factor.
+ *
+ * XXX: Only for floating points for now, since saturation will
+ * cause different results.
+ */
+
+ LLVMValueRef factor;
+ factor = lp_build_blend_soa_factor(&bld, src_factor, i);
+ res[i] = lp_build_blend_func(&bld.base, func, src[i], dst[i]);
+ res[i] = lp_build_mul(&bld.base, res[i], factor);
+ continue;
+ }
/*
* Compute src/dst factors.
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 7561899a74..7eb76d4fb3 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -1,6 +1,6 @@
/**************************************************************************
*
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -53,15 +53,8 @@
* ... ... ... ... ... ... ... ... ...
*
*
- * Stencil test:
- * Two-sided stencil test is supported but probably not as efficient as
- * it could be. Currently, we use if/then/else constructs to do the
- * operations for front vs. back-facing polygons. We could probably do
- * both the front and back arithmetic then use a Select() instruction to
- * choose the result depending on polyon orientation. We'd have to
- * measure performance both ways and see which is better.
- *
* @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Brian Paul <jfonseca@vmware.com>
*/
#include "pipe/p_state.h"
@@ -71,6 +64,7 @@
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_intr.h"
@@ -128,57 +122,32 @@ lp_build_stencil_test_single(struct lp_build_context *bld,
/**
* Do the one or two-sided stencil test comparison.
* \sa lp_build_stencil_test_single
- * \param face an integer indicating front (+) or back (-) facing polygon.
- * If NULL, assume front-facing.
+ * \param front_facing an integer vector mask, indicating front (~0) or back
+ * (0) facing polygon. If NULL, assume front-facing.
*/
static LLVMValueRef
lp_build_stencil_test(struct lp_build_context *bld,
const struct pipe_stencil_state stencil[2],
LLVMValueRef stencilRefs[2],
LLVMValueRef stencilVals,
- LLVMValueRef face)
+ LLVMValueRef front_facing)
{
LLVMValueRef res;
assert(stencil[0].enabled);
- if (stencil[1].enabled && face) {
- /* do two-sided test */
- struct lp_build_flow_context *flow_ctx;
- struct lp_build_if_state if_ctx;
- LLVMValueRef front_facing;
- LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
- LLVMValueRef result = bld->undef;
-
- flow_ctx = lp_build_flow_create(bld->builder);
- lp_build_flow_scope_begin(flow_ctx);
+ /* do front face test */
+ res = lp_build_stencil_test_single(bld, &stencil[0],
+ stencilRefs[0], stencilVals);
- lp_build_flow_scope_declare(flow_ctx, &result);
+ if (stencil[1].enabled && front_facing) {
+ /* do back face test */
+ LLVMValueRef back_res;
- /* front_facing = face > 0.0 */
- front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
-
- lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
- {
- result = lp_build_stencil_test_single(bld, &stencil[0],
- stencilRefs[0], stencilVals);
- }
- lp_build_else(&if_ctx);
- {
- result = lp_build_stencil_test_single(bld, &stencil[1],
- stencilRefs[1], stencilVals);
- }
- lp_build_endif(&if_ctx);
+ back_res = lp_build_stencil_test_single(bld, &stencil[1],
+ stencilRefs[1], stencilVals);
- lp_build_flow_scope_end(flow_ctx);
- lp_build_flow_destroy(flow_ctx);
-
- res = result;
- }
- else {
- /* do single-side test */
- res = lp_build_stencil_test_single(bld, &stencil[0],
- stencilRefs[0], stencilVals);
+ res = lp_build_select(bld, front_facing, res, back_res);
}
return res;
@@ -195,14 +164,12 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
const struct pipe_stencil_state *stencil,
enum stencil_op op,
LLVMValueRef stencilRef,
- LLVMValueRef stencilVals,
- LLVMValueRef mask)
+ LLVMValueRef stencilVals)
{
- const unsigned stencilMax = 255; /* XXX fix */
struct lp_type type = bld->type;
LLVMValueRef res;
- LLVMValueRef max = lp_build_const_int_vec(type, stencilMax);
+ LLVMValueRef max = lp_build_const_int_vec(type, 0xff);
unsigned stencil_op;
assert(type.sign);
@@ -255,19 +222,7 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
break;
default:
assert(0 && "bad stencil op mode");
- res = NULL;
- }
-
- if (stencil->writemask != stencilMax) {
- /* mask &= stencil->writemask */
- LLVMValueRef writemask = lp_build_const_int_vec(type, stencil->writemask);
- mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
- /* res = (res & mask) | (stencilVals & ~mask) */
- res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
- }
- else {
- /* res = mask ? res : stencilVals */
- res = lp_build_select(bld, mask, res, stencilVals);
+ res = bld->undef;
}
return res;
@@ -284,49 +239,40 @@ lp_build_stencil_op(struct lp_build_context *bld,
LLVMValueRef stencilRefs[2],
LLVMValueRef stencilVals,
LLVMValueRef mask,
- LLVMValueRef face)
+ LLVMValueRef front_facing)
{
- assert(stencil[0].enabled);
-
- if (stencil[1].enabled && face) {
- /* do two-sided op */
- struct lp_build_flow_context *flow_ctx;
- struct lp_build_if_state if_ctx;
- LLVMValueRef front_facing;
- LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
- LLVMValueRef result = bld->undef;
+ LLVMValueRef res;
- flow_ctx = lp_build_flow_create(bld->builder);
- lp_build_flow_scope_begin(flow_ctx);
+ assert(stencil[0].enabled);
- lp_build_flow_scope_declare(flow_ctx, &result);
+ /* do front face op */
+ res = lp_build_stencil_op_single(bld, &stencil[0], op,
+ stencilRefs[0], stencilVals);
- /* front_facing = face > 0.0 */
- front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
+ if (stencil[1].enabled && front_facing) {
+ /* do back face op */
+ LLVMValueRef back_res;
- lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
- {
- result = lp_build_stencil_op_single(bld, &stencil[0], op,
- stencilRefs[0], stencilVals, mask);
- }
- lp_build_else(&if_ctx);
- {
- result = lp_build_stencil_op_single(bld, &stencil[1], op,
- stencilRefs[1], stencilVals, mask);
- }
- lp_build_endif(&if_ctx);
+ back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
+ stencilRefs[1], stencilVals);
- lp_build_flow_scope_end(flow_ctx);
- lp_build_flow_destroy(flow_ctx);
+ res = lp_build_select(bld, front_facing, res, back_res);
+ }
- return result;
+ if (stencil->writemask != 0xff) {
+ /* mask &= stencil->writemask */
+ LLVMValueRef writemask = lp_build_const_int_vec(bld->type, stencil->writemask);
+ mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
+ /* res = (res & mask) | (stencilVals & ~mask) */
+ res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
}
else {
- /* do single-sided op */
- return lp_build_stencil_op_single(bld, &stencil[0], op,
- stencilRefs[0], stencilVals, mask);
+ /* res = mask ? res : stencilVals */
+ res = lp_build_select(bld, mask, res, stencilVals);
}
+
+ return res;
}
@@ -358,8 +304,13 @@ lp_depth_type(const struct util_format_description *format_desc,
}
else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
assert(format_desc->block.bits <= 32);
- if(format_desc->channel[swizzle].normalized)
- type.norm = TRUE;
+ assert(format_desc->channel[swizzle].normalized);
+ if (format_desc->channel[swizzle].size < format_desc->block.bits) {
+ /* Prefer signed integers when possible, as SSE has less support
+ * for unsigned comparison;
+ */
+ type.sign = TRUE;
+ }
}
else
assert(0);
@@ -381,7 +332,7 @@ lp_depth_type(const struct util_format_description *format_desc,
*/
static boolean
get_z_shift_and_mask(const struct util_format_description *format_desc,
- unsigned *shift, unsigned *mask)
+ unsigned *shift, unsigned *width, unsigned *mask)
{
const unsigned total_bits = format_desc->block.bits;
unsigned z_swizzle;
@@ -397,12 +348,14 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
return FALSE;
+ *width = format_desc->channel[z_swizzle].size;
+
padding_right = 0;
for (chan = 0; chan < z_swizzle; ++chan)
padding_right += format_desc->channel[chan].size;
padding_left =
- total_bits - (padding_right + format_desc->channel[z_swizzle].size);
+ total_bits - (padding_right + *width);
if (padding_left || padding_right) {
unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
@@ -413,7 +366,7 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
*mask = 0xffffffff;
}
- *shift = padding_left;
+ *shift = padding_right;
return TRUE;
}
@@ -457,7 +410,7 @@ get_s_shift_and_mask(const struct util_format_description *format_desc,
* \param maskvalue is the depth test mask.
* \param counter is a pointer of the uint32 counter.
*/
-static void
+void
lp_build_occlusion_count(LLVMBuilderRef builder,
struct lp_type type,
LLVMValueRef maskvalue,
@@ -494,31 +447,57 @@ lp_build_occlusion_count(LLVMBuilderRef builder,
* \param format_desc description of the depth/stencil surface
* \param mask the alive/dead pixel mask for the quad (vector)
* \param stencil_refs the front/back stencil ref values (scalar)
- * \param z_src the incoming depth/stencil values (a 2x2 quad)
+ * \param z_src the incoming depth/stencil values (a 2x2 quad, float32)
* \param zs_dst_ptr pointer to depth/stencil values in framebuffer
- * \param facing contains float value indicating front/back facing polygon
+ * \param facing contains boolean value indicating front/back facing polygon
*/
void
lp_build_depth_stencil_test(LLVMBuilderRef builder,
const struct pipe_depth_state *depth,
const struct pipe_stencil_state stencil[2],
- struct lp_type type,
+ struct lp_type z_src_type,
const struct util_format_description *format_desc,
struct lp_build_mask_context *mask,
LLVMValueRef stencil_refs[2],
LLVMValueRef z_src,
LLVMValueRef zs_dst_ptr,
LLVMValueRef face,
- LLVMValueRef counter)
+ LLVMValueRef *zs_value,
+ boolean do_branch)
{
- struct lp_build_context bld;
- struct lp_build_context sbld;
+ struct lp_type z_type;
+ struct lp_build_context z_bld;
+ struct lp_build_context s_bld;
struct lp_type s_type;
+ unsigned z_shift = 0, z_width = 0, z_mask = 0;
LLVMValueRef zs_dst, z_dst = NULL;
LLVMValueRef stencil_vals = NULL;
LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
- LLVMValueRef orig_mask = mask->value;
+ LLVMValueRef orig_mask = lp_build_mask_value(mask);
+ LLVMValueRef front_facing = NULL;
+
+
+ /*
+ * Depths are expected to be between 0 and 1, even if they are stored in
+ * floats. Setting these bits here will ensure that the lp_build_conv() call
+ * below won't try to unnecessarily clamp the incoming values.
+ */
+ if(z_src_type.floating) {
+ z_src_type.sign = FALSE;
+ z_src_type.norm = TRUE;
+ }
+ else {
+ assert(!z_src_type.sign);
+ assert(z_src_type.norm);
+ }
+
+ /* Pick the depth type. */
+ z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+
+ /* FIXME: Cope with a depth test type with a different bit width. */
+ assert(z_type.width == z_src_type.width);
+ assert(z_type.length == z_src_type.length);
/* Sanity checking */
{
@@ -540,8 +519,8 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
}
assert(z_swizzle < 4);
- assert(format_desc->block.bits == type.width);
- if (type.floating) {
+ assert(format_desc->block.bits == z_type.width);
+ if (z_type.floating) {
assert(z_swizzle == 0);
assert(format_desc->channel[z_swizzle].type ==
UTIL_FORMAT_TYPE_FLOAT);
@@ -552,54 +531,56 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
assert(format_desc->channel[z_swizzle].type ==
UTIL_FORMAT_TYPE_UNSIGNED);
assert(format_desc->channel[z_swizzle].normalized);
- assert(!type.fixed);
- assert(!type.sign);
- assert(type.norm);
+ assert(!z_type.fixed);
}
}
/* Setup build context for Z vals */
- lp_build_context_init(&bld, builder, type);
+ lp_build_context_init(&z_bld, builder, z_type);
/* Setup build context for stencil vals */
- s_type = lp_type_int_vec(type.width);
- lp_build_context_init(&sbld, builder, s_type);
+ s_type = lp_type_int_vec(z_type.width);
+ lp_build_context_init(&s_bld, builder, s_type);
/* Load current z/stencil value from z/stencil buffer */
+ zs_dst_ptr = LLVMBuildBitCast(builder,
+ zs_dst_ptr,
+ LLVMPointerType(z_bld.vec_type, 0), "");
zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
- lp_build_name(zs_dst, "zsbufval");
+ lp_build_name(zs_dst, "zs_dst");
/* Compute and apply the Z/stencil bitmasks and shifts.
*/
{
- unsigned z_shift, z_mask;
unsigned s_shift, s_mask;
- if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
- if (z_shift) {
- LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
- z_src = LLVMBuildLShr(builder, z_src, shift, "");
- }
-
+ if (get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask)) {
if (z_mask != 0xffffffff) {
- LLVMValueRef mask = lp_build_const_int_vec(type, z_mask);
- z_src = LLVMBuildAnd(builder, z_src, mask, "");
- z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
- z_bitmask = mask; /* used below */
+ z_bitmask = lp_build_const_int_vec(z_type, z_mask);
}
- else {
+
+ /*
+ * Align the framebuffer Z 's LSB to the right.
+ */
+ if (z_shift) {
+ LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+ z_dst = LLVMBuildLShr(builder, zs_dst, shift, "z_dst");
+ } else if (z_bitmask) {
+ /* TODO: Instead of loading a mask from memory and ANDing, it's
+ * probably faster to just shake the bits with two shifts. */
+ z_dst = LLVMBuildAnd(builder, zs_dst, z_bitmask, "z_dst");
+ } else {
z_dst = zs_dst;
+ lp_build_name(z_dst, "z_dst");
}
-
- lp_build_name(z_dst, "zsbuf.z");
}
if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
if (s_shift) {
- LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
+ LLVMValueRef shift = lp_build_const_int_vec(s_type, s_shift);
stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
stencil_shift = shift; /* used below */
}
@@ -608,35 +589,85 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
}
if (s_mask != 0xffffffff) {
- LLVMValueRef mask = lp_build_const_int_vec(type, s_mask);
+ LLVMValueRef mask = lp_build_const_int_vec(s_type, s_mask);
stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
}
- lp_build_name(stencil_vals, "stencil");
+ lp_build_name(stencil_vals, "s_dst");
}
}
-
if (stencil[0].enabled) {
+
+ if (face) {
+ LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
+
+ /* front_facing = face != 0 ? ~0 : 0 */
+ front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
+ front_facing = LLVMBuildSExt(builder, front_facing,
+ LLVMIntType(s_bld.type.length*s_bld.type.width),
+ "");
+ front_facing = LLVMBuildBitCast(builder, front_facing,
+ s_bld.int_vec_type, "");
+ }
+
/* convert scalar stencil refs into vectors */
- stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
- stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
+ stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
+ stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);
- s_pass_mask = lp_build_stencil_test(&sbld, stencil,
- stencil_refs, stencil_vals, face);
+ s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
+ stencil_refs, stencil_vals,
+ front_facing);
/* apply stencil-fail operator */
{
- LLVMValueRef s_fail_mask = lp_build_andnot(&bld, orig_mask, s_pass_mask);
- stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
+ LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
+ stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
stencil_refs, stencil_vals,
- s_fail_mask, face);
+ s_fail_mask, front_facing);
}
}
if (depth->enabled) {
+ /*
+ * Convert fragment Z to the desired type, aligning the LSB to the right.
+ */
+
+ assert(z_type.width == z_src_type.width);
+ assert(z_type.length == z_src_type.length);
+ assert(lp_check_value(z_src_type, z_src));
+ if (z_src_type.floating) {
+ /*
+ * Convert from floating point values
+ */
+
+ if (!z_type.floating) {
+ z_src = lp_build_clamped_float_to_unsigned_norm(builder,
+ z_src_type,
+ z_width,
+ z_src);
+ }
+ } else {
+ /*
+ * Convert from unsigned normalized values.
+ */
+
+ assert(!z_src_type.sign);
+ assert(!z_src_type.fixed);
+ assert(z_src_type.norm);
+ assert(!z_type.floating);
+ if (z_src_type.width > z_width) {
+ LLVMValueRef shift = lp_build_const_int_vec(z_src_type,
+ z_src_type.width - z_width);
+ z_src = LLVMBuildLShr(builder, z_src, shift, "");
+ }
+ }
+ assert(lp_check_value(z_type, z_src));
+
+ lp_build_name(z_src, "z_src");
+
/* compare src Z to dst Z, returning 'pass' mask */
- z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);
+ z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
if (!stencil[0].enabled) {
/* We can potentially skip all remaining operations here, but only
@@ -644,28 +675,28 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
* buffer values. Don't need to update Z buffer values.
*/
lp_build_mask_update(mask, z_pass);
+
+ if (do_branch) {
+ lp_build_mask_check(mask);
+ do_branch = FALSE;
+ }
}
if (depth->writemask) {
- LLVMValueRef zselectmask = mask->value;
+ LLVMValueRef zselectmask;
/* mask off bits that failed Z test */
- zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");
+ zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
/* mask off bits that failed stencil test */
if (s_pass_mask) {
zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
}
- /* if combined Z/stencil format, mask off the stencil bits */
- if (z_bitmask) {
- zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, "");
- }
-
/* Mix the old and new Z buffer values.
- * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i])
+ * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
*/
- z_dst = lp_build_select_bitwise(&bld, zselectmask, z_src, z_dst);
+ z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
}
if (stencil[0].enabled) {
@@ -673,33 +704,35 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
LLVMValueRef z_fail_mask, z_pass_mask;
/* apply Z-fail operator */
- z_fail_mask = lp_build_andnot(&bld, orig_mask, z_pass);
- stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
+ z_fail_mask = lp_build_andnot(&z_bld, orig_mask, z_pass);
+ stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
stencil_refs, stencil_vals,
- z_fail_mask, face);
+ z_fail_mask, front_facing);
/* apply Z-pass operator */
- z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, "");
- stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+ z_pass_mask = LLVMBuildAnd(z_bld.builder, orig_mask, z_pass, "");
+ stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
stencil_refs, stencil_vals,
- z_pass_mask, face);
+ z_pass_mask, front_facing);
}
}
else {
/* No depth test: apply Z-pass operator to stencil buffer values which
* passed the stencil test.
*/
- s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
- stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+ s_pass_mask = LLVMBuildAnd(s_bld.builder, orig_mask, s_pass_mask, "");
+ stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
stencil_refs, stencil_vals,
- s_pass_mask, face);
+ s_pass_mask, front_facing);
}
- /* The Z bits are already in the right place but we may need to shift the
- * stencil bits before ORing Z with Stencil to make the final pixel value.
- */
+ /* Put Z and ztencil bits in the right place */
+ if (z_dst && z_shift) {
+ LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+ z_dst = LLVMBuildShl(builder, z_dst, shift, "");
+ }
if (stencil_vals && stencil_shift)
- stencil_vals = LLVMBuildShl(bld.builder, stencil_vals,
+ stencil_vals = LLVMBuildShl(s_bld.builder, stencil_vals,
stencil_shift, "");
/* Finally, merge/store the z/stencil values */
@@ -707,13 +740,13 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
(stencil[0].enabled && stencil[0].writemask)) {
if (z_dst && stencil_vals)
- zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, "");
+ zs_dst = LLVMBuildOr(z_bld.builder, z_dst, stencil_vals, "");
else if (z_dst)
zs_dst = z_dst;
else
zs_dst = stencil_vals;
- LLVMBuildStore(builder, zs_dst, zs_dst_ptr);
+ *zs_value = zs_dst;
}
if (s_pass_mask)
@@ -722,6 +755,47 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
if (depth->enabled && stencil[0].enabled)
lp_build_mask_update(mask, z_pass);
- if (counter)
- lp_build_occlusion_count(builder, type, mask->value, counter);
+ if (do_branch)
+ lp_build_mask_check(mask);
+
+}
+
+
+void
+lp_build_depth_write(LLVMBuilderRef builder,
+ const struct util_format_description *format_desc,
+ LLVMValueRef zs_dst_ptr,
+ LLVMValueRef zs_value)
+{
+ zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr,
+ LLVMPointerType(LLVMTypeOf(zs_value), 0), "");
+
+ LLVMBuildStore(builder, zs_value, zs_dst_ptr);
+}
+
+
+void
+lp_build_deferred_depth_write(LLVMBuilderRef builder,
+ struct lp_type z_src_type,
+ const struct util_format_description *format_desc,
+ struct lp_build_mask_context *mask,
+ LLVMValueRef zs_dst_ptr,
+ LLVMValueRef zs_value)
+{
+ struct lp_type z_type;
+ struct lp_build_context z_bld;
+ LLVMValueRef z_dst;
+
+ /* XXX: pointlessly redo type logic:
+ */
+ z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+ lp_build_context_init(&z_bld, builder, z_type);
+
+ zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr,
+ LLVMPointerType(z_bld.vec_type, 0), "");
+
+ z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
+ z_dst = lp_build_select(&z_bld, lp_build_mask_value(mask), zs_value, z_dst);
+
+ LLVMBuildStore(builder, z_dst, zs_dst_ptr);
}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index e257a5bd7d..a54ef3a711 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -61,7 +61,27 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
LLVMValueRef zs_src,
LLVMValueRef zs_dst_ptr,
LLVMValueRef facing,
- LLVMValueRef counter);
+ LLVMValueRef *zs_value,
+ boolean do_branch);
+void
+lp_build_depth_write(LLVMBuilderRef builder,
+ const struct util_format_description *format_desc,
+ LLVMValueRef zs_dst_ptr,
+ LLVMValueRef zs_value);
+
+void
+lp_build_deferred_depth_write(LLVMBuilderRef builder,
+ struct lp_type z_src_type,
+ const struct util_format_description *format_desc,
+ struct lp_build_mask_context *mask,
+ LLVMValueRef zs_dst_ptr,
+ LLVMValueRef zs_value);
+
+void
+lp_build_occlusion_count(LLVMBuilderRef builder,
+ struct lp_type type,
+ LLVMValueRef maskvalue,
+ LLVMValueRef counter);
#endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 2a374f8c39..c9da8900d0 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -206,7 +206,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
/*
- * a = a0 + x * dadx + y * dady
+ * a = a0 + (x * dadx + y * dady)
*/
if (attrib == 0 && chan == 0) {
@@ -219,11 +219,11 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
a = a0;
if (interp != LP_INTERP_CONSTANT &&
interp != LP_INTERP_FACING) {
- LLVMValueRef tmp;
- tmp = LLVMBuildFMul(builder, bld->x, dadx, "");
- a = LLVMBuildFAdd(builder, a, tmp, "");
- tmp = LLVMBuildFMul(builder, bld->y, dady, "");
- a = LLVMBuildFAdd(builder, a, tmp, "");
+ LLVMValueRef ax, ay, axy;
+ ax = LLVMBuildFMul(builder, bld->x, dadx, "");
+ ay = LLVMBuildFMul(builder, bld->y, dady, "");
+ axy = LLVMBuildFAdd(builder, ax, ay, "");
+ a = LLVMBuildFAdd(builder, a, axy, "");
}
}
@@ -272,7 +272,10 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
* This is called when we move from one quad to the next.
*/
static void
-attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
+attribs_update(struct lp_build_interp_soa_context *bld,
+ int quad_index,
+ int start,
+ int end)
{
struct lp_build_context *coeff_bld = &bld->coeff_bld;
LLVMValueRef shuffle = lp_build_const_int_vec(coeff_bld->type, quad_index);
@@ -282,7 +285,7 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
assert(quad_index < 4);
- for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+ for(attrib = start; attrib < end; ++attrib) {
const unsigned mask = bld->mask[attrib];
const unsigned interp = bld->interp[attrib];
for(chan = 0; chan < NUM_CHANNELS; ++chan) {
@@ -350,6 +353,14 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
}
#endif
+ if (attrib == 0 && chan == 2) {
+ /* FIXME: Depth values can exceed 1.0, due to the fact that
+ * setup interpolation coefficients refer to (0,0) which causes
+ * precision loss. So we must clamp to 1.0 here to avoid artifacts
+ */
+ a = lp_build_min(coeff_bld, a, coeff_bld->one);
+ }
+
attrib_name(a, attrib, chan, "");
}
bld->attribs[attrib][chan] = a;
@@ -434,8 +445,6 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
pos_init(bld, x0, y0);
coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
-
- attribs_update(bld, 0);
}
@@ -443,10 +452,20 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
* Advance the position and inputs to the given quad within the block.
*/
void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
- int quad_index)
+lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
+ int quad_index)
+{
+ assert(quad_index < 4);
+
+ attribs_update(bld, quad_index, 1, bld->num_attribs);
+}
+
+void
+lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
+ int quad_index)
{
assert(quad_index < 4);
- attribs_update(bld, quad_index);
+ attribs_update(bld, quad_index, 0, 1);
}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 37479fca9d..a7ebdd1bfa 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -113,7 +113,11 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
LLVMValueRef y);
void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
+lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
+ int quad_index);
+
+void
+lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
int quad_index);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 8e6dfb293d..e09ec504ab 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -64,6 +64,11 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
elem_types[LP_JIT_TEXTURE_DATA] =
LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
LP_MAX_TEXTURE_LEVELS);
+ elem_types[LP_JIT_TEXTURE_MIN_LOD] = LLVMFloatType();
+ elem_types[LP_JIT_TEXTURE_MAX_LOD] = LLVMFloatType();
+ elem_types[LP_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType();
+ elem_types[LP_JIT_TEXTURE_BORDER_COLOR] =
+ LLVMArrayType(LLVMFloatType(), 4);
texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
@@ -88,6 +93,19 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, data,
screen->target, texture_type,
LP_JIT_TEXTURE_DATA);
+ LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, min_lod,
+ screen->target, texture_type,
+ LP_JIT_TEXTURE_MIN_LOD);
+ LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, max_lod,
+ screen->target, texture_type,
+ LP_JIT_TEXTURE_MAX_LOD);
+ LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, lod_bias,
+ screen->target, texture_type,
+ LP_JIT_TEXTURE_LOD_BIAS);
+ LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, border_color,
+ screen->target, texture_type,
+ LP_JIT_TEXTURE_BORDER_COLOR);
+
LP_CHECK_STRUCT_SIZE(struct lp_jit_texture,
screen->target, texture_type);
@@ -144,9 +162,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
void
lp_jit_screen_cleanup(struct llvmpipe_screen *screen)
{
- if(screen->engine)
- LLVMDisposeExecutionEngine(screen->engine);
-
if(screen->pass)
LLVMDisposePassManager(screen->pass);
}
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index c94189413a..114f21f2d1 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -54,6 +54,11 @@ struct lp_jit_texture
uint32_t row_stride[LP_MAX_TEXTURE_LEVELS];
uint32_t img_stride[LP_MAX_TEXTURE_LEVELS];
const void *data[LP_MAX_TEXTURE_LEVELS];
+ /* sampler state, actually */
+ float min_lod;
+ float max_lod;
+ float lod_bias;
+ float border_color[4];
};
@@ -65,6 +70,10 @@ enum {
LP_JIT_TEXTURE_ROW_STRIDE,
LP_JIT_TEXTURE_IMG_STRIDE,
LP_JIT_TEXTURE_DATA,
+ LP_JIT_TEXTURE_MIN_LOD,
+ LP_JIT_TEXTURE_MAX_LOD,
+ LP_JIT_TEXTURE_LOD_BIAS,
+ LP_JIT_TEXTURE_BORDER_COLOR,
LP_JIT_TEXTURE_NUM_FIELDS /* number of fields above */
};
@@ -135,7 +144,7 @@ typedef void
(*lp_jit_frag_func)(const struct lp_jit_context *context,
uint32_t x,
uint32_t y,
- float facing,
+ uint32_t facing,
const void *a0,
const void *dadx,
const void *dady,
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
index ff0e207a54..84c66dd36e 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -92,8 +92,9 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
int i;
if (!pq->fence) {
- assert(0); /* query not in issued state */
- return FALSE;
+ /* no fence because there was no scene, so results is zero */
+ *result = 0;
+ return TRUE;
}
if (!lp_fence_signalled(pq->fence)) {
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index d7e6415e13..d358a98394 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -211,8 +211,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
const struct lp_scene *scene = task->scene;
- unsigned clear_value = arg.clear_zstencil.value;
- unsigned clear_mask = arg.clear_zstencil.mask;
+ uint32_t clear_value = arg.clear_zstencil.value;
+ uint32_t clear_mask = arg.clear_zstencil.mask;
const unsigned height = TILE_SIZE / TILE_VECTOR_HEIGHT;
const unsigned width = TILE_SIZE * TILE_VECTOR_HEIGHT;
const unsigned block_size = scene->zsbuf.blocksize;
@@ -220,7 +220,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
uint8_t *dst;
unsigned i, j;
- LP_DBG(DEBUG_RAST, "%s 0x%x%x\n", __FUNCTION__, clear_value, clear_mask);
+ LP_DBG(DEBUG_RAST, "%s: value=0x%08x, mask=0x%08x\n",
+ __FUNCTION__, clear_value, clear_mask);
/*
* Clear the aera of the swizzled depth/depth buffer matching this tile, in
@@ -232,16 +233,31 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
dst = task->depth_tile;
+ clear_value &= clear_mask;
+
switch (block_size) {
case 1:
+ assert(clear_mask == 0xff);
memset(dst, (uint8_t) clear_value, height * width);
break;
case 2:
- for (i = 0; i < height; i++) {
- uint16_t *row = (uint16_t *)dst;
- for (j = 0; j < width; j++)
- *row++ = (uint16_t) clear_value;
- dst += dst_stride;
+ if (clear_mask == 0xffff) {
+ for (i = 0; i < height; i++) {
+ uint16_t *row = (uint16_t *)dst;
+ for (j = 0; j < width; j++)
+ *row++ = (uint16_t) clear_value;
+ dst += dst_stride;
+ }
+ }
+ else {
+ for (i = 0; i < height; i++) {
+ uint16_t *row = (uint16_t *)dst;
+ for (j = 0; j < width; j++) {
+ uint16_t tmp = ~clear_mask & *row;
+ *row++ = clear_value | tmp;
+ }
+ dst += dst_stride;
+ }
}
break;
case 4:
@@ -258,7 +274,7 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
uint32_t *row = (uint32_t *)dst;
for (j = 0; j < width; j++) {
uint32_t tmp = ~clear_mask & *row;
- *row++ = (clear_value & clear_mask) | tmp;
+ *row++ = clear_value | tmp;
}
dst += dst_stride;
}
@@ -318,7 +334,7 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
{
const struct lp_scene *scene = task->scene;
const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
- const struct lp_rast_state *state = inputs->state;
+ const struct lp_rast_state *state = task->state;
struct lp_fragment_shader_variant *variant = state->variant;
const unsigned tile_x = task->x, tile_y = task->y;
unsigned x, y;
@@ -349,10 +365,10 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
BEGIN_JIT_CALL(state);
variant->jit_function[RAST_WHOLE]( &state->jit_context,
tile_x + x, tile_y + y,
- inputs->facing,
- inputs->a0,
- inputs->dadx,
- inputs->dady,
+ inputs->frontfacing,
+ GET_A0(inputs),
+ GET_DADX(inputs),
+ GET_DADY(inputs),
color,
depth,
0xffff,
@@ -398,7 +414,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
unsigned x, unsigned y,
unsigned mask)
{
- const struct lp_rast_state *state = inputs->state;
+ const struct lp_rast_state *state = task->state;
struct lp_fragment_shader_variant *variant = state->variant;
const struct lp_scene *scene = task->scene;
uint8_t *color[PIPE_MAX_COLOR_BUFS];
@@ -430,10 +446,10 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
BEGIN_JIT_CALL(state);
variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
x, y,
- inputs->facing,
- inputs->a0,
- inputs->dadx,
- inputs->dady,
+ inputs->frontfacing,
+ GET_A0(inputs),
+ GET_DADX(inputs),
+ GET_DADY(inputs),
color,
depth,
mask,
@@ -474,6 +490,14 @@ lp_rast_end_query(struct lp_rasterizer_task *task,
}
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ task->state = arg.state;
+}
+
+
/**
* Set top row and left column of the tile's pixels to white. For debugging.
@@ -581,10 +605,12 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
lp_rast_triangle_8,
lp_rast_triangle_3_4,
lp_rast_triangle_3_16,
+ lp_rast_triangle_4_16,
lp_rast_shade_tile,
lp_rast_shade_tile_opaque,
lp_rast_begin_query,
lp_rast_end_query,
+ lp_rast_set_state,
};
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c55b97a9d1..a64c152cf8 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -78,30 +78,28 @@ struct lp_rast_state {
* These pointers point into the bin data buffer.
*/
struct lp_rast_shader_inputs {
- float facing; /** Positive for front-facing, negative for back-facing */
- unsigned disable:1; /** Partially binned, disable this command */
- unsigned opaque:1; /** Is opaque */
-
- float (*a0)[4];
- float (*dadx)[4];
- float (*dady)[4];
-
- const struct lp_rast_state *state;
+ unsigned frontfacing:1; /** True for front-facing */
+ unsigned disable:1; /** Partially binned, disable this command */
+ unsigned opaque:1; /** Is opaque */
+ unsigned pad0:29; /* wasted space */
+ unsigned stride; /* how much to advance data between a0, dadx, dady */
+ unsigned pad2; /* wasted space */
+ unsigned pad3; /* wasted space */
+ /* followed by a0, dadx, dady and planes[] */
};
-
+/* Note: the order of these values is important as they are loaded by
+ * sse code in rasterization:
+ */
struct lp_rast_plane {
- /* one-pixel sized trivial accept offsets for each plane */
- int ei;
-
- /* one-pixel sized trivial reject offsets for each plane */
- int eo;
-
/* edge function values at minx,miny ?? */
int c;
int dcdx;
int dcdy;
+
+ /* one-pixel sized trivial reject offsets for each plane */
+ int eo;
};
/**
@@ -111,17 +109,24 @@ struct lp_rast_plane {
* Objects of this type are put into the lp_setup_context::data buffer.
*/
struct lp_rast_triangle {
- /* inputs for the shader */
- struct lp_rast_shader_inputs inputs;
-
#ifdef DEBUG
float v[3][2];
+ float pad0;
+ float pad1;
#endif
- struct lp_rast_plane plane[8]; /* NOTE: may allocate fewer planes */
+ /* inputs for the shader */
+ struct lp_rast_shader_inputs inputs;
+ /* planes are also allocated here */
};
+#define GET_A0(inputs) ((float (*)[4])((inputs)+1))
+#define GET_DADX(inputs) ((float (*)[4])((char *)((inputs) + 1) + (inputs)->stride))
+#define GET_DADY(inputs) ((float (*)[4])((char *)((inputs) + 1) + 2 * (inputs)->stride))
+#define GET_PLANES(tri) ((struct lp_rast_plane *)((char *)(&(tri)->inputs + 1) + 3 * (tri)->inputs.stride))
+
+
struct lp_rasterizer *
lp_rast_create( unsigned num_threads );
@@ -149,9 +154,10 @@ union lp_rast_cmd_arg {
const struct lp_rast_state *set_state;
uint8_t clear_color[4];
struct {
- unsigned value;
- unsigned mask;
+ uint32_t value;
+ uint32_t mask;
} clear_zstencil;
+ const struct lp_rast_state *state;
struct lp_fence *fence;
struct llvmpipe_query *query_obj;
};
@@ -238,12 +244,14 @@ lp_rast_arg_null( void )
#define LP_RAST_OP_TRIANGLE_8 0x9
#define LP_RAST_OP_TRIANGLE_3_4 0xa
#define LP_RAST_OP_TRIANGLE_3_16 0xb
-#define LP_RAST_OP_SHADE_TILE 0xc
-#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xd
-#define LP_RAST_OP_BEGIN_QUERY 0xe
-#define LP_RAST_OP_END_QUERY 0xf
-
-#define LP_RAST_OP_MAX 0x10
+#define LP_RAST_OP_TRIANGLE_4_16 0xc
+#define LP_RAST_OP_SHADE_TILE 0xd
+#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xe
+#define LP_RAST_OP_BEGIN_QUERY 0xf
+#define LP_RAST_OP_END_QUERY 0x10
+#define LP_RAST_OP_SET_STATE 0x11
+
+#define LP_RAST_OP_MAX 0x12
#define LP_RAST_OP_MASK 0xff
void
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
index 9fc78645a3..e2783aa568 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
@@ -12,6 +12,7 @@ static INLINE int u_bit_scan(unsigned *mask)
struct tile {
int coverage;
int overdraw;
+ const struct lp_rast_state *state;
char data[TILE_SIZE][TILE_SIZE];
};
@@ -42,10 +43,12 @@ static const char *cmd_names[LP_RAST_OP_MAX] =
"triangle_8",
"triangle_3_4",
"triangle_3_16",
+ "triangle_4_16",
"shade_tile",
"shade_tile_opaque",
"begin_query",
"end_query",
+ "set_state",
};
static const char *cmd_name(unsigned cmd)
@@ -55,31 +58,31 @@ static const char *cmd_name(unsigned cmd)
}
static const struct lp_fragment_shader_variant *
-get_variant( const struct cmd_block *block,
- int k )
+get_variant( const struct lp_rast_state *state,
+ const struct cmd_block *block,
+ int k )
{
if (block->cmd[k] == LP_RAST_OP_SHADE_TILE ||
- block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE)
- return block->arg[k].shade_tile->state->variant;
-
- if (block->cmd[k] == LP_RAST_OP_TRIANGLE_1 ||
+ block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE ||
+ block->cmd[k] == LP_RAST_OP_TRIANGLE_1 ||
block->cmd[k] == LP_RAST_OP_TRIANGLE_2 ||
block->cmd[k] == LP_RAST_OP_TRIANGLE_3 ||
block->cmd[k] == LP_RAST_OP_TRIANGLE_4 ||
block->cmd[k] == LP_RAST_OP_TRIANGLE_5 ||
block->cmd[k] == LP_RAST_OP_TRIANGLE_6 ||
block->cmd[k] == LP_RAST_OP_TRIANGLE_7)
- return block->arg[k].triangle.tri->inputs.state->variant;
+ return state->variant;
return NULL;
}
static boolean
-is_blend( const struct cmd_block *block,
+is_blend( const struct lp_rast_state *state,
+ const struct cmd_block *block,
int k )
{
- const struct lp_fragment_shader_variant *variant = get_variant(block, k);
+ const struct lp_fragment_shader_variant *variant = get_variant(state, block, k);
if (variant)
return variant->key.blend.rt[0].blend_enable;
@@ -92,6 +95,7 @@ is_blend( const struct cmd_block *block,
static void
debug_bin( const struct cmd_bin *bin )
{
+ const struct lp_rast_state *state;
const struct cmd_block *head = bin->head;
int i, j = 0;
@@ -99,9 +103,12 @@ debug_bin( const struct cmd_bin *bin )
while (head) {
for (i = 0; i < head->count; i++, j++) {
+ if (head->cmd[i] == LP_RAST_OP_SET_STATE)
+ state = head->arg[i].state;
+
debug_printf("%d: %s %s\n", j,
cmd_name(head->cmd[i]),
- is_blend(head, i) ? "blended" : "");
+ is_blend(state, head, i) ? "blended" : "");
}
head = head->next;
}
@@ -133,7 +140,7 @@ debug_shade_tile(int x, int y,
char val)
{
const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
- boolean blend = inputs->state->variant->key.blend.rt[0].blend_enable;
+ boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;
unsigned i,j;
if (inputs->disable)
@@ -171,11 +178,12 @@ debug_triangle(int tilex, int tiley,
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
unsigned plane_mask = arg.triangle.plane_mask;
+ const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
struct lp_rast_plane plane[8];
int x, y;
int count = 0;
unsigned i, nr_planes = 0;
- boolean blend = tri->inputs.state->variant->key.blend.rt[0].blend_enable;
+ boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;
if (tri->inputs.disable) {
/* This triangle was partially binned and has been disabled */
@@ -183,7 +191,7 @@ debug_triangle(int tilex, int tiley,
}
while (plane_mask) {
- plane[nr_planes] = tri->plane[u_bit_scan(&plane_mask)];
+ plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)];
plane[nr_planes].c = (plane[nr_planes].c +
plane[nr_planes].dcdy * tiley -
plane[nr_planes].dcdx * tilex);
@@ -235,12 +243,15 @@ do_debug_bin( struct tile *tile,
for (block = bin->head; block; block = block->next) {
for (k = 0; k < block->count; k++, j++) {
- boolean blend = is_blend(block, k);
+ boolean blend = is_blend(tile->state, block, k);
char val = get_label(j);
int count = 0;
if (print_cmds)
debug_printf("%c: %15s", val, cmd_name(block->cmd[k]));
+
+ if (block->cmd[k] == LP_RAST_OP_SET_STATE)
+ tile->state = block->arg[k].state;
if (block->cmd[k] == LP_RAST_OP_CLEAR_COLOR ||
block->cmd[k] == LP_RAST_OP_CLEAR_ZSTENCIL)
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 7370119e96..b30408f097 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -77,6 +77,7 @@ struct cmd_bin;
struct lp_rasterizer_task
{
const struct cmd_bin *bin;
+ const struct lp_rast_state *state;
struct lp_scene *scene;
unsigned x, y; /**< Pos of this tile in framebuffer, in pixels */
@@ -244,7 +245,7 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
unsigned x, unsigned y )
{
const struct lp_scene *scene = task->scene;
- const struct lp_rast_state *state = inputs->state;
+ const struct lp_rast_state *state = task->state;
struct lp_fragment_shader_variant *variant = state->variant;
uint8_t *color[PIPE_MAX_COLOR_BUFS];
void *depth;
@@ -260,10 +261,10 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
BEGIN_JIT_CALL(state);
variant->jit_function[RAST_WHOLE]( &state->jit_context,
x, y,
- inputs->facing,
- inputs->a0,
- inputs->dadx,
- inputs->dady,
+ inputs->frontfacing,
+ GET_A0(inputs),
+ GET_DADX(inputs),
+ GET_DADY(inputs),
color,
depth,
0xffff,
@@ -293,6 +294,14 @@ void lp_rast_triangle_3_4(struct lp_rasterizer_task *,
void lp_rast_triangle_3_16( struct lp_rasterizer_task *,
const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_4_16( struct lp_rasterizer_task *,
+ const union lp_rast_cmd_arg );
+
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg);
+
void
lp_debug_bin( const struct cmd_bin *bin );
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index a1f309d4b0..042c315635 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -123,6 +123,16 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
}
void
+lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ union lp_rast_cmd_arg arg2;
+ arg2.triangle.tri = arg.triangle.tri;
+ arg2.triangle.plane_mask = (1<<4)-1;
+ lp_rast_triangle_3(task, arg2);
+}
+
+void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
@@ -230,144 +240,207 @@ sign_bits4(const __m128i *cstep, int cdiff)
}
-/* Special case for 3 plane triangle which is contained entirely
- * within a 16x16 block.
- */
+#define NR_PLANES 3
+
+
+
+
+
+
+
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
- const struct lp_rast_plane *plane = tri->plane;
- unsigned mask = arg.triangle.plane_mask;
- const int x = task->x + (mask & 0xff);
- const int y = task->y + (mask >> 8);
- unsigned outmask, inmask, partmask, partial_mask;
- unsigned j;
- __m128i cstep4[3][4];
-
- outmask = 0; /* outside one or more trivial reject planes */
- partmask = 0; /* outside one or more trivial accept planes */
-
- for (j = 0; j < 3; j++) {
- const int dcdx = -plane[j].dcdx * 4;
- const int dcdy = plane[j].dcdy * 4;
- __m128i xdcdy = _mm_set1_epi32(dcdy);
-
- cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
- cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
- cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
- cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
-
- {
- const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
- const int cox = plane[j].eo * 4;
- const int cio = plane[j].ei * 4 - 1;
-
- outmask |= sign_bits4(cstep4[j], c + cox);
- partmask |= sign_bits4(cstep4[j], c + cio);
- }
- }
+ const struct lp_rast_plane *plane = GET_PLANES(tri);
+ int x = (arg.triangle.plane_mask & 0xff) + task->x;
+ int y = (arg.triangle.plane_mask >> 8) + task->y;
+ unsigned i, j;
+
+ struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+ unsigned nr = 0;
+
+ __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+ __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+ __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+ __m128i zero = _mm_setzero_si128();
+
+ __m128i c;
+ __m128i dcdx;
+ __m128i dcdy;
+ __m128i rej4;
+
+ __m128i dcdx2;
+ __m128i dcdx3;
+
+ __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+ __m128i unused;
+
+ transpose4_epi32(&p0, &p1, &p2, &zero,
+ &c, &dcdx, &dcdy, &rej4);
+
+ /* Adjust dcdx;
+ */
+ dcdx = _mm_sub_epi32(zero, dcdx);
- if (outmask == 0xffff)
- return;
+ c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+ c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+ rej4 = _mm_slli_epi32(rej4, 2);
- /* Mask of sub-blocks which are inside all trivial accept planes:
- */
- inmask = ~partmask & 0xffff;
+ dcdx2 = _mm_add_epi32(dcdx, dcdx);
+ dcdx3 = _mm_add_epi32(dcdx2, dcdx);
- /* Mask of sub-blocks which are inside all trivial reject planes,
- * but outside at least one trivial accept plane:
- */
- partial_mask = partmask & ~outmask;
+ transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+ &span_0, &span_1, &span_2, &unused);
- assert((partial_mask & inmask) == 0);
+ for (i = 0; i < 4; i++) {
+ __m128i cx = c;
- /* Iterate over partials:
- */
- while (partial_mask) {
- int i = ffs(partial_mask) - 1;
- int ix = (i & 3) * 4;
- int iy = (i >> 2) * 4;
- int px = x + ix;
- int py = y + iy;
- unsigned mask = 0xffff;
-
- partial_mask &= ~(1 << i);
-
- for (j = 0; j < 3; j++) {
- const int cx = (plane[j].c
- - plane[j].dcdx * px
- + plane[j].dcdy * py) * 4;
-
- mask &= ~sign_bits4(cstep4[j], cx);
- }
+ for (j = 0; j < 4; j++) {
+ __m128i c4rej = _mm_add_epi32(cx, rej4);
+ __m128i rej_masks = _mm_srai_epi32(c4rej, 31);
- if (mask)
- lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
- }
+ /* if (is_zero(rej_masks)) */
+ if (_mm_movemask_epi8(rej_masks) == 0) {
+ __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
+ __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
+ __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);
- /* Iterate over fulls:
- */
- while (inmask) {
- int i = ffs(inmask) - 1;
- int ix = (i & 3) * 4;
- int iy = (i >> 2) * 4;
- int px = x + ix;
- int py = y + iy;
+ __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
+
+ __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+ __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+ __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
+
+ __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+ __m128i c_01 = _mm_packs_epi32(c_0, c_1);
+
+ __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+ __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+ __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
+
+ __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
- inmask &= ~(1 << i);
+ __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+ __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+ __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
- block_full_4(task, tri, px, py);
+ __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+ __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+ __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+ unsigned mask = _mm_movemask_epi8(c_0123);
+
+ out[nr].i = i;
+ out[nr].j = j;
+ out[nr].mask = mask;
+ if (mask != 0xffff)
+ nr++;
+ }
+ cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
+ }
+
+ c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
}
+
+ for (i = 0; i < nr; i++)
+ lp_rast_shade_quads_mask(task,
+ &tri->inputs,
+ x + 4 * out[i].j,
+ y + 4 * out[i].i,
+ 0xffff & ~out[i].mask);
}
+
+
+
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
- const union lp_rast_cmd_arg arg)
+ const union lp_rast_cmd_arg arg)
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
- const struct lp_rast_plane *plane = tri->plane;
- unsigned mask = arg.triangle.plane_mask;
- const int x = task->x + (mask & 0xff);
- const int y = task->y + (mask >> 8);
- unsigned j;
-
- /* Iterate over partials:
+ const struct lp_rast_plane *plane = GET_PLANES(tri);
+ int x = (arg.triangle.plane_mask & 0xff) + task->x;
+ int y = (arg.triangle.plane_mask >> 8) + task->y;
+
+ __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+ __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+ __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+ __m128i zero = _mm_setzero_si128();
+
+ __m128i c;
+ __m128i dcdx;
+ __m128i dcdy;
+
+ __m128i dcdx2;
+ __m128i dcdx3;
+
+ __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+ __m128i unused;
+
+ transpose4_epi32(&p0, &p1, &p2, &zero,
+ &c, &dcdx, &dcdy, &unused);
+
+ /* Adjust dcdx;
*/
+ dcdx = _mm_sub_epi32(zero, dcdx);
+
+ c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+ c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+
+ dcdx2 = _mm_add_epi32(dcdx, dcdx);
+ dcdx3 = _mm_add_epi32(dcdx2, dcdx);
+
+ transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+ &span_0, &span_1, &span_2, &unused);
+
+
{
- unsigned mask = 0xffff;
+ __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
+ __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
+ __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
+
+ __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
- for (j = 0; j < 3; j++) {
- const int cx = (plane[j].c
- - plane[j].dcdx * x
- + plane[j].dcdy * y);
+ __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+ __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+ __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
- const int dcdx = -plane[j].dcdx;
- const int dcdy = plane[j].dcdy;
- __m128i xdcdy = _mm_set1_epi32(dcdy);
+ __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+ __m128i c_01 = _mm_packs_epi32(c_0, c_1);
- __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
- __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
- __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
- __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+ __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+ __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+ __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
- __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
- __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
- __m128i result = _mm_packs_epi16(cstep01, cstep23);
+ __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
- /* Extract the sign bits
- */
- mask &= ~_mm_movemask_epi8(result);
- }
+ __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+ __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+ __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
- if (mask)
- lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
+ __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+ __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+ __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+ unsigned mask = _mm_movemask_epi8(c_0123);
+
+ if (mask != 0xffff)
+ lp_rast_shade_quads_mask(task,
+ &tri->inputs,
+ x,
+ y,
+ 0xffff & ~mask);
}
}
-
+#undef NR_PLANES
#endif
@@ -383,10 +456,13 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
#define TAG(x) x##_3
#define NR_PLANES 3
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"
#define TAG(x) x##_4
#define NR_PLANES 4
+#define TRI_16 lp_rast_triangle_4_16
#include "lp_rast_tri_tmp.h"
#define TAG(x) x##_5
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 9830a43ba5..4825d651c0 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -82,7 +82,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
const int dcdx = -plane[j].dcdx * 4;
const int dcdy = plane[j].dcdy * 4;
const int cox = plane[j].eo * 4;
- const int cio = plane[j].ei * 4 - 1;
+ const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+ const int cio = ei * 4 - 1;
build_masks(c[j] + cox,
cio - cox,
@@ -156,6 +157,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
{
const struct lp_rast_triangle *tri = arg.triangle.tri;
unsigned plane_mask = arg.triangle.plane_mask;
+ const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
const int x = task->x, y = task->y;
struct lp_rast_plane plane[NR_PLANES];
int c[NR_PLANES];
@@ -172,7 +174,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
while (plane_mask) {
int i = ffs(plane_mask) - 1;
- plane[j] = tri->plane[i];
+ plane[j] = tri_plane[i];
plane_mask &= ~(1 << i);
c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
@@ -180,7 +182,8 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
const int dcdx = -plane[j].dcdx * 16;
const int dcdy = plane[j].dcdy * 16;
const int cox = plane[j].eo * 16;
- const int cio = plane[j].ei * 16 - 1;
+ const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+ const int cio = ei * 16 - 1;
build_masks(c[j] + cox,
cio - cox,
@@ -245,6 +248,133 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
}
}
+#if defined(PIPE_ARCH_SSE) && defined(TRI_16)
+/* XXX: special case this when intersection is not required.
+ * - tile completely within bbox,
+ * - bbox completely within tile.
+ */
+void
+TRI_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ const struct lp_rast_triangle *tri = arg.triangle.tri;
+ const struct lp_rast_plane *plane = GET_PLANES(tri);
+ unsigned mask = arg.triangle.plane_mask;
+ unsigned outmask, partial_mask;
+ unsigned j;
+ __m128i cstep4[NR_PLANES][4];
+
+ int x = (mask & 0xff);
+ int y = (mask >> 8);
+
+ outmask = 0; /* outside one or more trivial reject planes */
+
+ x += task->x;
+ y += task->y;
+
+ for (j = 0; j < NR_PLANES; j++) {
+ const int dcdx = -plane[j].dcdx * 4;
+ const int dcdy = plane[j].dcdy * 4;
+ __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+ cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
+ cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
+ cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
+ cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
+
+ {
+ const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+ const int cox = plane[j].eo * 4;
+
+ outmask |= sign_bits4(cstep4[j], c + cox);
+ }
+ }
+
+ if (outmask == 0xffff)
+ return;
+
+
+ /* Mask of sub-blocks which are inside all trivial reject planes,
+ * but outside at least one trivial accept plane:
+ */
+ partial_mask = 0xffff & ~outmask;
+
+ /* Iterate over partials:
+ */
+ while (partial_mask) {
+ int i = ffs(partial_mask) - 1;
+ int ix = (i & 3) * 4;
+ int iy = (i >> 2) * 4;
+ int px = x + ix;
+ int py = y + iy;
+ unsigned mask = 0xffff;
+
+ partial_mask &= ~(1 << i);
+
+ for (j = 0; j < NR_PLANES; j++) {
+ const int cx = (plane[j].c - 1
+ - plane[j].dcdx * px
+ + plane[j].dcdy * py) * 4;
+
+ mask &= ~sign_bits4(cstep4[j], cx);
+ }
+
+ if (mask)
+ lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
+ }
+}
+#endif
+
+#if defined(PIPE_ARCH_SSE) && defined(TRI_4)
+void
+TRI_4(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ const struct lp_rast_triangle *tri = arg.triangle.tri;
+ const struct lp_rast_plane *plane = GET_PLANES(tri);
+ unsigned mask = arg.triangle.plane_mask;
+ const int x = task->x + (mask & 0xff);
+ const int y = task->y + (mask >> 8);
+ unsigned j;
+
+ /* Iterate over partials:
+ */
+ {
+ unsigned mask = 0xffff;
+
+ for (j = 0; j < NR_PLANES; j++) {
+ const int cx = (plane[j].c
+ - plane[j].dcdx * x
+ + plane[j].dcdy * y);
+
+ const int dcdx = -plane[j].dcdx;
+ const int dcdy = plane[j].dcdy;
+ __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+ __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
+ __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+ __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+ __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+ __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+ __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+ __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+ /* Extract the sign bits
+ */
+ mask &= ~_mm_movemask_epi8(result);
+ }
+
+ if (mask)
+ lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
+ }
+}
+#endif
+
+
+
#undef TAG
+#undef TRI_4
+#undef TRI_16
#undef NR_PLANES
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index 8b504f23a3..a4fdf7cff3 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -203,7 +203,9 @@ lp_scene_end_rasterization(struct lp_scene *scene )
for (i = 0; i < scene->tiles_x; i++) {
for (j = 0; j < scene->tiles_y; j++) {
struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
- bin->head = bin->tail = NULL;
+ bin->head = NULL;
+ bin->tail = NULL;
+ bin->last_state = NULL;
}
}
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index dbef7692e4..622c522f11 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -41,6 +41,7 @@
#include "lp_debug.h"
struct lp_scene_queue;
+struct lp_rast_state;
/* We're limited to 2K by 2K for 32bit fixed point rasterization.
* Will need a 64-bit version for larger framebuffers.
@@ -94,6 +95,7 @@ struct data_block {
struct cmd_bin {
ushort x;
ushort y;
+ const struct lp_rast_state *last_state; /* most recent state set in bin */
struct cmd_block *head;
struct cmd_block *tail;
};
@@ -297,7 +299,7 @@ lp_scene_bin_command( struct lp_scene *scene,
assert(x < scene->tiles_x);
assert(y < scene->tiles_y);
- assert(cmd <= LP_RAST_OP_END_QUERY);
+ assert(cmd < LP_RAST_OP_MAX);
if (tail == NULL || tail->count == CMD_BLOCK_MAX) {
tail = lp_scene_new_cmd_block( scene, bin );
@@ -318,6 +320,30 @@ lp_scene_bin_command( struct lp_scene *scene,
}
+static INLINE boolean
+lp_scene_bin_cmd_with_state( struct lp_scene *scene,
+ unsigned x, unsigned y,
+ const struct lp_rast_state *state,
+ unsigned cmd,
+ union lp_rast_cmd_arg arg )
+{
+ struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);
+
+ if (state != bin->last_state) {
+ bin->last_state = state;
+ if (!lp_scene_bin_command(scene, x, y,
+ LP_RAST_OP_SET_STATE,
+ lp_rast_arg_state(state)))
+ return FALSE;
+ }
+
+ if (!lp_scene_bin_command( scene, x, y, cmd, arg ))
+ return FALSE;
+
+ return TRUE;
+}
+
+
/* Add a command to all active bins.
*/
static INLINE boolean
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 3854fd70af..6118434d3d 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -56,7 +56,7 @@
#include "draw/draw_vbuf.h"
-static void set_scene_state( struct lp_setup_context *, enum setup_state,
+static boolean set_scene_state( struct lp_setup_context *, enum setup_state,
const char *reason);
static boolean try_update_scene_state( struct lp_setup_context *setup );
@@ -167,7 +167,7 @@ lp_setup_rasterize_scene( struct lp_setup_context *setup )
-static void
+static boolean
begin_binning( struct lp_setup_context *setup )
{
struct lp_scene *scene = setup->scene;
@@ -181,6 +181,8 @@ begin_binning( struct lp_setup_context *setup )
/* Always create a fence:
*/
scene->fence = lp_fence_create(MAX2(1, setup->num_threads));
+ if (!scene->fence)
+ return FALSE;
/* Initialize the bin flags and x/y coords:
*/
@@ -192,7 +194,8 @@ begin_binning( struct lp_setup_context *setup )
}
ok = try_update_scene_state(setup);
- assert(ok);
+ if (!ok)
+ return FALSE;
if (setup->fb.zsbuf &&
((setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) &&
@@ -208,7 +211,8 @@ begin_binning( struct lp_setup_context *setup )
ok = lp_scene_bin_everywhere( scene,
LP_RAST_OP_CLEAR_COLOR,
setup->clear.color );
- assert(ok);
+ if (!ok)
+ return FALSE;
}
}
@@ -216,12 +220,14 @@ begin_binning( struct lp_setup_context *setup )
if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) {
if (!need_zsload)
scene->has_depthstencil_clear = TRUE;
+
ok = lp_scene_bin_everywhere( scene,
LP_RAST_OP_CLEAR_ZSTENCIL,
lp_rast_arg_clearzs(
setup->clear.zsvalue,
setup->clear.zsmask));
- assert(ok);
+ if (!ok)
+ return FALSE;
}
}
@@ -229,15 +235,16 @@ begin_binning( struct lp_setup_context *setup )
ok = lp_scene_bin_everywhere( scene,
LP_RAST_OP_BEGIN_QUERY,
lp_rast_arg_query(setup->active_query) );
- assert(ok);
+ if (!ok)
+ return FALSE;
}
-
setup->clear.flags = 0;
setup->clear.zsmask = 0;
setup->clear.zsvalue = 0;
LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__);
+ return TRUE;
}
@@ -246,12 +253,12 @@ begin_binning( struct lp_setup_context *setup )
*
* TODO: fast path for fullscreen clears and no triangles.
*/
-static void
+static boolean
execute_clears( struct lp_setup_context *setup )
{
LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
- begin_binning( setup );
+ return begin_binning( setup );
}
const char *states[] = {
@@ -262,7 +269,7 @@ const char *states[] = {
};
-static void
+static boolean
set_scene_state( struct lp_setup_context *setup,
enum setup_state new_state,
const char *reason)
@@ -270,7 +277,7 @@ set_scene_state( struct lp_setup_context *setup,
unsigned old_state = setup->state;
if (old_state == new_state)
- return;
+ return TRUE;
if (LP_DEBUG & DEBUG_SCENE) {
debug_printf("%s old %s new %s%s%s\n",
@@ -294,12 +301,14 @@ set_scene_state( struct lp_setup_context *setup,
break;
case SETUP_ACTIVE:
- begin_binning( setup );
+ if (!begin_binning( setup ))
+ goto fail;
break;
case SETUP_FLUSHED:
if (old_state == SETUP_CLEARED)
- execute_clears( setup );
+ if (!execute_clears( setup ))
+ goto fail;
lp_setup_rasterize_scene( setup );
assert(setup->scene == NULL);
@@ -307,9 +316,21 @@ set_scene_state( struct lp_setup_context *setup,
default:
assert(0 && "invalid setup state mode");
+ goto fail;
}
setup->state = new_state;
+ return TRUE;
+
+fail:
+ if (setup->scene) {
+ lp_scene_end_rasterization(setup->scene);
+ setup->scene = NULL;
+ }
+
+ setup->state = SETUP_FLUSHED;
+ lp_setup_reset( setup );
+ return FALSE;
}
@@ -377,16 +398,19 @@ lp_setup_try_clear( struct lp_setup_context *setup,
}
if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
- unsigned zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
- unsigned smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
+ uint32_t zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
+ uint32_t smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
zsvalue = util_pack_z_stencil(setup->fb.zsbuf->format,
depth,
stencil);
- zsmask = util_pack_uint_z_stencil(setup->fb.zsbuf->format,
+
+ zsmask = util_pack_mask_z_stencil(setup->fb.zsbuf->format,
zmask,
smask);
+
+ zsvalue &= zsmask;
}
if (setup->state == SETUP_ACTIVE) {
@@ -431,7 +455,7 @@ lp_setup_try_clear( struct lp_setup_context *setup,
if (flags & PIPE_CLEAR_COLOR) {
memcpy(setup->clear.color.clear_color,
&color_arg,
- sizeof color_arg);
+ sizeof setup->clear.color.clear_color);
}
}
@@ -490,12 +514,14 @@ void
lp_setup_set_point_state( struct lp_setup_context *setup,
float point_size,
boolean point_size_per_vertex,
- uint sprite)
+ uint sprite_coord_enable,
+ uint sprite_coord_origin)
{
LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
setup->point_size = point_size;
- setup->sprite = sprite;
+ setup->sprite_coord_enable = sprite_coord_enable;
+ setup->sprite_coord_origin = sprite_coord_origin;
setup->point_size_per_vertex = point_size_per_vertex;
}
@@ -624,7 +650,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
struct pipe_sampler_view *view = i < num ? views[i] : NULL;
- if(view) {
+ if (view) {
struct pipe_resource *tex = view->texture;
struct llvmpipe_resource *lp_tex = llvmpipe_resource(tex);
struct lp_jit_texture *jit_tex;
@@ -683,6 +709,38 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
/**
+ * Called during state validation when LP_NEW_SAMPLER is set.
+ */
+void
+lp_setup_set_fragment_sampler_state(struct lp_setup_context *setup,
+ unsigned num,
+ const struct pipe_sampler_state **samplers)
+{
+ unsigned i;
+
+ LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+ assert(num <= PIPE_MAX_SAMPLERS);
+
+ for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+ const struct pipe_sampler_state *sampler = i < num ? samplers[i] : NULL;
+
+ if (sampler) {
+ struct lp_jit_texture *jit_tex;
+ jit_tex = &setup->fs.current.jit_context.textures[i];
+
+ jit_tex->min_lod = sampler->min_lod;
+ jit_tex->max_lod = sampler->max_lod;
+ jit_tex->lod_bias = sampler->lod_bias;
+ COPY_4V(jit_tex->border_color, sampler->border_color);
+ }
+ }
+
+ setup->dirty |= LP_SETUP_NEW_FS;
+}
+
+
+/**
* Is the given texture referenced by any scene?
* Note: we have to check all scenes including any scenes currently
* being rendered and the current scene being built.
@@ -839,7 +897,7 @@ try_update_scene_state( struct lp_setup_context *setup )
return TRUE;
}
-void
+boolean
lp_setup_update_state( struct lp_setup_context *setup,
boolean update_scene )
{
@@ -870,20 +928,38 @@ lp_setup_update_state( struct lp_setup_context *setup,
setup->setup.variant->key.size) == 0);
}
- if (update_scene)
- set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ );
+ if (update_scene) {
+ if (!set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ ))
+ return FALSE;
+ }
/* Only call into update_scene_state() if we already have a
* scene:
*/
if (update_scene && setup->scene) {
assert(setup->state == SETUP_ACTIVE);
- if (!try_update_scene_state(setup)) {
- lp_setup_flush_and_restart(setup);
- if (!try_update_scene_state(setup))
- assert(0);
- }
+
+ if (try_update_scene_state(setup))
+ return TRUE;
+
+ /* Update failed, try to restart the scene.
+ *
+ * Cannot call lp_setup_flush_and_restart() directly here
+ * because of potential recursion.
+ */
+ if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__))
+ return FALSE;
+
+ if (!set_scene_state(setup, SETUP_ACTIVE, __FUNCTION__))
+ return FALSE;
+
+ if (!setup->scene)
+ return FALSE;
+
+ return try_update_scene_state(setup);
}
+
+ return TRUE;
}
@@ -987,12 +1063,12 @@ lp_setup_begin_query(struct lp_setup_context *setup,
LP_RAST_OP_BEGIN_QUERY,
lp_rast_arg_query(pq))) {
- lp_setup_flush_and_restart(setup);
+ if (!lp_setup_flush_and_restart(setup))
+ return;
if (!lp_scene_bin_everywhere(setup->scene,
LP_RAST_OP_BEGIN_QUERY,
lp_rast_arg_query(pq))) {
- assert(0);
return;
}
}
@@ -1036,14 +1112,20 @@ lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq)
}
-void
+boolean
lp_setup_flush_and_restart(struct lp_setup_context *setup)
{
if (0) debug_printf("%s\n", __FUNCTION__);
assert(setup->state == SETUP_ACTIVE);
- set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__);
- lp_setup_update_state(setup, TRUE);
+
+ if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__))
+ return FALSE;
+
+ if (!lp_setup_update_state(setup, TRUE))
+ return FALSE;
+
+ return TRUE;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 19078ebbcb..ebb18f8134 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -85,7 +85,8 @@ void
lp_setup_set_point_state( struct lp_setup_context *setup,
float point_size,
boolean point_size_per_vertex,
- uint sprite);
+ uint sprite_coord_enable,
+ uint sprite_coord_origin);
void
lp_setup_set_setup_variant( struct lp_setup_context *setup,
@@ -121,6 +122,11 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
unsigned num,
struct pipe_sampler_view **views);
+void
+lp_setup_set_fragment_sampler_state(struct lp_setup_context *setup,
+ unsigned num,
+ const struct pipe_sampler_state **samplers);
+
unsigned
lp_setup_is_resource_referenced( const struct lp_setup_context *setup,
const struct pipe_resource *texture );
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index ff3b69afa8..dc2533bedc 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -76,7 +76,7 @@ struct lp_setup_context
uint prim;
uint vertex_size;
uint nr_vertices;
- uint sprite;
+ uint sprite_coord_enable, sprite_coord_origin;
uint vertex_buffer_size;
void *vertex_buffer;
@@ -164,12 +164,12 @@ void lp_setup_choose_point( struct lp_setup_context *setup );
void lp_setup_init_vbuf(struct lp_setup_context *setup);
-void lp_setup_update_state( struct lp_setup_context *setup,
+boolean lp_setup_update_state( struct lp_setup_context *setup,
boolean update_scene);
void lp_setup_destroy( struct lp_setup_context *setup );
-void lp_setup_flush_and_restart(struct lp_setup_context *setup);
+boolean lp_setup_flush_and_restart(struct lp_setup_context *setup);
void
lp_setup_print_triangle(struct lp_setup_context *setup,
@@ -195,6 +195,4 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
const struct u_rect *bbox,
int nr_planes );
-void lp_setup_flush_and_restart(struct lp_setup_context *setup);
-
#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 928ffdc5cb..827413bb33 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -47,6 +47,10 @@ struct lp_line_info {
const float (*v1)[4];
const float (*v2)[4];
+
+ float (*a0)[4];
+ float (*dadx)[4];
+ float (*dady)[4];
};
@@ -54,14 +58,14 @@ struct lp_line_info {
* Compute a0 for a constant-valued coefficient (GL_FLAT shading).
*/
static void constant_coef( struct lp_setup_context *setup,
- struct lp_rast_triangle *tri,
+ struct lp_line_info *info,
unsigned slot,
const float value,
unsigned i )
{
- tri->inputs.a0[slot][i] = value;
- tri->inputs.dadx[slot][i] = 0.0f;
- tri->inputs.dady[slot][i] = 0.0f;
+ info->a0[slot][i] = value;
+ info->dadx[slot][i] = 0.0f;
+ info->dady[slot][i] = 0.0f;
}
@@ -70,7 +74,6 @@ static void constant_coef( struct lp_setup_context *setup,
* for a triangle.
*/
static void linear_coef( struct lp_setup_context *setup,
- struct lp_rast_triangle *tri,
struct lp_line_info *info,
unsigned slot,
unsigned vert_attr,
@@ -83,10 +86,10 @@ static void linear_coef( struct lp_setup_context *setup,
float dadx = da21 * info->dx * info->oneoverarea;
float dady = da21 * info->dy * info->oneoverarea;
- tri->inputs.dadx[slot][i] = dadx;
- tri->inputs.dady[slot][i] = dady;
+ info->dadx[slot][i] = dadx;
+ info->dady[slot][i] = dady;
- tri->inputs.a0[slot][i] = (a1 -
+ info->a0[slot][i] = (a1 -
(dadx * (info->v1[0][0] - setup->pixel_offset) +
dady * (info->v1[0][1] - setup->pixel_offset)));
}
@@ -101,7 +104,6 @@ static void linear_coef( struct lp_setup_context *setup,
* divide the interpolated value by the interpolated W at that fragment.
*/
static void perspective_coef( struct lp_setup_context *setup,
- struct lp_rast_triangle *tri,
struct lp_line_info *info,
unsigned slot,
unsigned vert_attr,
@@ -116,43 +118,42 @@ static void perspective_coef( struct lp_setup_context *setup,
float dadx = da21 * info->dx * info->oneoverarea;
float dady = da21 * info->dy * info->oneoverarea;
- tri->inputs.dadx[slot][i] = dadx;
- tri->inputs.dady[slot][i] = dady;
+ info->dadx[slot][i] = dadx;
+ info->dady[slot][i] = dady;
- tri->inputs.a0[slot][i] = (a1 -
- (dadx * (info->v1[0][0] - setup->pixel_offset) +
- dady * (info->v1[0][1] - setup->pixel_offset)));
+ info->a0[slot][i] = (a1 -
+ (dadx * (info->v1[0][0] - setup->pixel_offset) +
+ dady * (info->v1[0][1] - setup->pixel_offset)));
}
static void
setup_fragcoord_coef( struct lp_setup_context *setup,
- struct lp_rast_triangle *tri,
struct lp_line_info *info,
unsigned slot,
unsigned usage_mask)
{
/*X*/
if (usage_mask & TGSI_WRITEMASK_X) {
- tri->inputs.a0[slot][0] = 0.0;
- tri->inputs.dadx[slot][0] = 1.0;
- tri->inputs.dady[slot][0] = 0.0;
+ info->a0[slot][0] = 0.0;
+ info->dadx[slot][0] = 1.0;
+ info->dady[slot][0] = 0.0;
}
/*Y*/
if (usage_mask & TGSI_WRITEMASK_Y) {
- tri->inputs.a0[slot][1] = 0.0;
- tri->inputs.dadx[slot][1] = 0.0;
- tri->inputs.dady[slot][1] = 1.0;
+ info->a0[slot][1] = 0.0;
+ info->dadx[slot][1] = 0.0;
+ info->dady[slot][1] = 1.0;
}
/*Z*/
if (usage_mask & TGSI_WRITEMASK_Z) {
- linear_coef(setup, tri, info, slot, 0, 2);
+ linear_coef(setup, info, slot, 0, 2);
}
/*W*/
if (usage_mask & TGSI_WRITEMASK_W) {
- linear_coef(setup, tri, info, slot, 0, 3);
+ linear_coef(setup, info, slot, 0, 3);
}
}
@@ -160,7 +161,6 @@ setup_fragcoord_coef( struct lp_setup_context *setup,
* Compute the tri->coef[] array dadx, dady, a0 values.
*/
static void setup_line_coefficients( struct lp_setup_context *setup,
- struct lp_rast_triangle *tri,
struct lp_line_info *info)
{
const struct lp_setup_variant_key *key = &setup->setup.variant->key;
@@ -179,25 +179,25 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
if (key->flatshade_first) {
for (i = 0; i < NUM_CHANNELS; i++)
if (usage_mask & (1 << i))
- constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i);
+ constant_coef(setup, info, slot+1, info->v1[vert_attr][i], i);
}
else {
for (i = 0; i < NUM_CHANNELS; i++)
if (usage_mask & (1 << i))
- constant_coef(setup, tri, slot+1, info->v2[vert_attr][i], i);
+ constant_coef(setup, info, slot+1, info->v2[vert_attr][i], i);
}
break;
case LP_INTERP_LINEAR:
for (i = 0; i < NUM_CHANNELS; i++)
if (usage_mask & (1 << i))
- linear_coef(setup, tri, info, slot+1, vert_attr, i);
+ linear_coef(setup, info, slot+1, vert_attr, i);
break;
case LP_INTERP_PERSPECTIVE:
for (i = 0; i < NUM_CHANNELS; i++)
if (usage_mask & (1 << i))
- perspective_coef(setup, tri, info, slot+1, vert_attr, i);
+ perspective_coef(setup, info, slot+1, vert_attr, i);
fragcoord_usage_mask |= TGSI_WRITEMASK_W;
break;
@@ -210,6 +210,12 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
fragcoord_usage_mask |= usage_mask;
break;
+ case LP_INTERP_FACING:
+ for (i = 0; i < NUM_CHANNELS; i++)
+ if (usage_mask & (1 << i))
+ constant_coef(setup, info, slot+1, 1.0, i);
+ break;
+
default:
assert(0);
}
@@ -217,7 +223,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
/* The internal position input is in slot zero:
*/
- setup_fragcoord_coef(setup, tri, info, 0,
+ setup_fragcoord_coef(setup, info, 0,
fragcoord_usage_mask);
}
@@ -274,6 +280,7 @@ try_setup_line( struct lp_setup_context *setup,
struct lp_scene *scene = setup->scene;
const struct lp_setup_variant_key *key = &setup->setup.variant->key;
struct lp_rast_triangle *line;
+ struct lp_rast_plane *plane;
struct lp_line_info info;
float width = MAX2(1.0, setup->line_width);
struct u_rect bbox;
@@ -296,6 +303,7 @@ try_setup_line( struct lp_setup_context *setup,
float x2diff;
float y2diff;
float dx, dy;
+ float area;
boolean draw_start;
boolean draw_end;
@@ -315,6 +323,18 @@ try_setup_line( struct lp_setup_context *setup,
dx = v1[0][0] - v2[0][0];
dy = v1[0][1] - v2[0][1];
+ area = (dx * dx + dy * dy);
+ if (area == 0) {
+ LP_COUNT(nr_culled_tris);
+ return TRUE;
+ }
+
+ info.oneoverarea = 1.0f / area;
+ info.dx = dx;
+ info.dy = dy;
+ info.v1 = v1;
+ info.v2 = v2;
+
/* X-MAJOR LINE */
if (fabsf(dx) >= fabsf(dy)) {
@@ -460,7 +480,7 @@ try_setup_line( struct lp_setup_context *setup,
else {
/* do intersection test */
float xintersect = fracf(v2[0][0]) + y2diff * dxdy;
- draw_end = (xintersect < 1.0 && xintersect > 0.0);
+ draw_end = (xintersect < 1.0 && xintersect >= 0.0);
}
/* Are we already drawing start/end?
@@ -498,7 +518,7 @@ try_setup_line( struct lp_setup_context *setup,
x_offset_end = y_offset_end * dxdy;
}
}
-
+
/* x/y positions in fixed point */
x[0] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset) - fixed_width/2;
x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2;
@@ -566,39 +586,35 @@ try_setup_line( struct lp_setup_context *setup,
#endif
/* calculate the deltas */
- line->plane[0].dcdy = x[0] - x[1];
- line->plane[1].dcdy = x[1] - x[2];
- line->plane[2].dcdy = x[2] - x[3];
- line->plane[3].dcdy = x[3] - x[0];
+ plane = GET_PLANES(line);
+ plane[0].dcdy = x[0] - x[1];
+ plane[1].dcdy = x[1] - x[2];
+ plane[2].dcdy = x[2] - x[3];
+ plane[3].dcdy = x[3] - x[0];
- line->plane[0].dcdx = y[0] - y[1];
- line->plane[1].dcdx = y[1] - y[2];
- line->plane[2].dcdx = y[2] - y[3];
- line->plane[3].dcdx = y[3] - y[0];
+ plane[0].dcdx = y[0] - y[1];
+ plane[1].dcdx = y[1] - y[2];
+ plane[2].dcdx = y[2] - y[3];
+ plane[3].dcdx = y[3] - y[0];
- info.oneoverarea = 1.0f / (dx * dx + dy * dy);
- info.dx = dx;
- info.dy = dy;
- info.v1 = v1;
- info.v2 = v2;
-
/* Setup parameter interpolants:
*/
- setup_line_coefficients( setup, line, &info);
+ info.a0 = GET_A0(&line->inputs);
+ info.dadx = GET_DADX(&line->inputs);
+ info.dady = GET_DADY(&line->inputs);
+ setup_line_coefficients(setup, &info);
- line->inputs.facing = 1.0F;
- line->inputs.state = setup->fs.stored;
+ line->inputs.frontfacing = TRUE;
line->inputs.disable = FALSE;
line->inputs.opaque = FALSE;
for (i = 0; i < 4; i++) {
- struct lp_rast_plane *plane = &line->plane[i];
/* half-edge constants, will be interated over the whole render
* target.
*/
- plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
+ plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
/* correct for top-left vs. bottom-left fill convention.
@@ -614,38 +630,34 @@ try_setup_line( struct lp_setup_context *setup,
* to its usual method, in which case it will probably want
* to use the opposite, top-left convention.
*/
- if (plane->dcdx < 0) {
+ if (plane[i].dcdx < 0) {
/* both fill conventions want this - adjust for left edges */
- plane->c++;
+ plane[i].c++;
}
- else if (plane->dcdx == 0) {
+ else if (plane[i].dcdx == 0) {
if (setup->pixel_offset == 0) {
/* correct for top-left fill convention:
*/
- if (plane->dcdy > 0) plane->c++;
+ if (plane[i].dcdy > 0) plane[i].c++;
}
else {
/* correct for bottom-left fill convention:
*/
- if (plane->dcdy < 0) plane->c++;
+ if (plane[i].dcdy < 0) plane[i].c++;
}
}
- plane->dcdx *= FIXED_ONE;
- plane->dcdy *= FIXED_ONE;
+ plane[i].dcdx *= FIXED_ONE;
+ plane[i].dcdy *= FIXED_ONE;
/* find trivial reject offsets for each edge for a single-pixel
* sized block. These will be scaled up at each recursive level to
* match the active blocksize. Scaling in this way works best if
* the blocks are square.
*/
- plane->eo = 0;
- if (plane->dcdx < 0) plane->eo -= plane->dcdx;
- if (plane->dcdy > 0) plane->eo += plane->dcdy;
-
- /* Calculate trivial accept offsets from the above.
- */
- plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+ plane[i].eo = 0;
+ if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+ if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
}
@@ -668,29 +680,25 @@ try_setup_line( struct lp_setup_context *setup,
* these planes elsewhere.
*/
if (nr_planes == 8) {
- line->plane[4].dcdx = -1;
- line->plane[4].dcdy = 0;
- line->plane[4].c = 1-bbox.x0;
- line->plane[4].ei = 0;
- line->plane[4].eo = 1;
-
- line->plane[5].dcdx = 1;
- line->plane[5].dcdy = 0;
- line->plane[5].c = bbox.x1+1;
- line->plane[5].ei = -1;
- line->plane[5].eo = 0;
-
- line->plane[6].dcdx = 0;
- line->plane[6].dcdy = 1;
- line->plane[6].c = 1-bbox.y0;
- line->plane[6].ei = 0;
- line->plane[6].eo = 1;
-
- line->plane[7].dcdx = 0;
- line->plane[7].dcdy = -1;
- line->plane[7].c = bbox.y1+1;
- line->plane[7].ei = -1;
- line->plane[7].eo = 0;
+ plane[4].dcdx = -1;
+ plane[4].dcdy = 0;
+ plane[4].c = 1-bbox.x0;
+ plane[4].eo = 1;
+
+ plane[5].dcdx = 1;
+ plane[5].dcdy = 0;
+ plane[5].c = bbox.x1+1;
+ plane[5].eo = 0;
+
+ plane[6].dcdx = 0;
+ plane[6].dcdy = 1;
+ plane[6].c = 1-bbox.y0;
+ plane[6].eo = 1;
+
+ plane[7].dcdx = 0;
+ plane[7].dcdy = -1;
+ plane[7].c = bbox.y1+1;
+ plane[7].eo = 0;
}
return lp_setup_bin_triangle(setup, line, &bbox, nr_planes);
@@ -703,10 +711,11 @@ static void lp_setup_line( struct lp_setup_context *setup,
{
if (!try_setup_line( setup, v0, v1 ))
{
- lp_setup_flush_and_restart(setup);
+ if (!lp_setup_flush_and_restart(setup))
+ return;
if (!try_setup_line( setup, v0, v1 ))
- assert(0);
+ return;
}
}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index c98966022e..146f1bd07c 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -33,7 +33,6 @@
#include "util/u_math.h"
#include "util/u_memory.h"
#include "lp_perf.h"
-#include "lp_setup_context.h"
#include "lp_rast.h"
#include "lp_state_fs.h"
#include "lp_state_setup.h"
@@ -47,63 +46,118 @@ struct point_info {
int dx01, dx12;
const float (*v0)[4];
+
+ float (*a0)[4];
+ float (*dadx)[4];
+ float (*dady)[4];
};
/**
* Compute a0 for a constant-valued coefficient (GL_FLAT shading).
*/
-static void constant_coef( struct lp_setup_context *setup,
- struct lp_rast_triangle *point,
- unsigned slot,
- const float value,
- unsigned i )
+static void
+constant_coef(struct lp_setup_context *setup,
+ struct point_info *info,
+ unsigned slot,
+ const float value,
+ unsigned i)
{
- point->inputs.a0[slot][i] = value;
- point->inputs.dadx[slot][i] = 0.0f;
- point->inputs.dady[slot][i] = 0.0f;
+ info->a0[slot][i] = value;
+ info->dadx[slot][i] = 0.0f;
+ info->dady[slot][i] = 0.0f;
}
-static void perspective_coef( struct lp_setup_context *setup,
- struct lp_rast_triangle *point,
- const struct point_info *info,
- unsigned slot,
- unsigned vert_attr,
- unsigned i)
+
+static void
+point_persp_coeff(struct lp_setup_context *setup,
+ const struct point_info *info,
+ unsigned slot,
+ unsigned i)
{
- if (i == 0) {
- float dadx = FIXED_ONE / (float)info->dx12;
+ /*
+ * Fragment shader expects pre-multiplied w for LP_INTERP_PERSPECTIVE. A
+ * better stratergy would be to take the primitive in consideration when
+ * generating the fragment shader key, and therefore avoid the per-fragment
+ * perspective divide.
+ */
+
+ float w0 = info->v0[0][3];
+
+ assert(i < 4);
+
+ info->a0[slot][i] = info->v0[slot][i]*w0;
+ info->dadx[slot][i] = 0.0f;
+ info->dady[slot][i] = 0.0f;
+}
+
+
+/**
+ * Setup automatic texcoord coefficients (for sprite rendering).
+ * \param slot the vertex attribute slot to setup
+ * \param i the attribute channel in [0,3]
+ * \param sprite_coord_origin one of PIPE_SPRITE_COORD_x
+ * \param perspective does the shader expects pre-multiplied w, i.e.,
+ * LP_INTERP_PERSPECTIVE is specified in the shader key
+ */
+static void
+texcoord_coef(struct lp_setup_context *setup,
+ const struct point_info *info,
+ unsigned slot,
+ unsigned i,
+ unsigned sprite_coord_origin,
+ boolean perspective)
+{
+ float w0 = info->v0[0][3];
+
+ assert(i < 4);
+
+ if (i == 0) {
+ float dadx = FIXED_ONE / (float)info->dx12;
float dady = 0.0f;
- point->inputs.dadx[slot][i] = dadx;
- point->inputs.dady[slot][i] = dady;
- point->inputs.a0[slot][i] = (0.5 -
- (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
- dady * ((float)info->v0[0][1] - setup->pixel_offset)));
- }
+ float x0 = info->v0[0][0] - setup->pixel_offset;
+ float y0 = info->v0[0][1] - setup->pixel_offset;
- else if (i == 1) {
- float dadx = 0.0f;
- float dady = FIXED_ONE / (float)info->dx12;
-
- point->inputs.dadx[slot][i] = dadx;
- point->inputs.dady[slot][i] = dady;
- point->inputs.a0[slot][i] = (0.5 -
- (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
- dady * ((float)info->v0[0][1] - setup->pixel_offset)));
+ info->dadx[slot][0] = dadx;
+ info->dady[slot][0] = dady;
+ info->a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
+
+ if (perspective) {
+ info->dadx[slot][0] *= w0;
+ info->dady[slot][0] *= w0;
+ info->a0[slot][0] *= w0;
+ }
}
+ else if (i == 1) {
+ float dadx = 0.0f;
+ float dady = FIXED_ONE / (float)info->dx12;
+ float x0 = info->v0[0][0] - setup->pixel_offset;
+ float y0 = info->v0[0][1] - setup->pixel_offset;
+ if (sprite_coord_origin == PIPE_SPRITE_COORD_LOWER_LEFT) {
+ dady = -dady;
+ }
+
+ info->dadx[slot][1] = dadx;
+ info->dady[slot][1] = dady;
+ info->a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
+
+ if (perspective) {
+ info->dadx[slot][1] *= w0;
+ info->dady[slot][1] *= w0;
+ info->a0[slot][1] *= w0;
+ }
+ }
else if (i == 2) {
- point->inputs.a0[slot][i] = 0.0f;
- point->inputs.dadx[slot][i] = 0.0f;
- point->inputs.dady[slot][i] = 0.0f;
+ info->a0[slot][2] = 0.0f;
+ info->dadx[slot][2] = 0.0f;
+ info->dady[slot][2] = 0.0f;
}
-
- else if (i == 3) {
- point->inputs.a0[slot][i] = 1.0f;
- point->inputs.dadx[slot][i] = 0.0f;
- point->inputs.dady[slot][i] = 0.0f;
+ else {
+ info->a0[slot][3] = perspective ? w0 : 1.0f;
+ info->dadx[slot][3] = 0.0f;
+ info->dady[slot][3] = 0.0f;
}
-
}
@@ -115,45 +169,45 @@ static void perspective_coef( struct lp_setup_context *setup,
*/
static void
setup_point_fragcoord_coef(struct lp_setup_context *setup,
- struct lp_rast_triangle *point,
- const struct point_info *info,
+ struct point_info *info,
unsigned slot,
unsigned usage_mask)
{
/*X*/
if (usage_mask & TGSI_WRITEMASK_X) {
- point->inputs.a0[slot][0] = 0.0;
- point->inputs.dadx[slot][0] = 1.0;
- point->inputs.dady[slot][0] = 0.0;
+ info->a0[slot][0] = 0.0;
+ info->dadx[slot][0] = 1.0;
+ info->dady[slot][0] = 0.0;
}
/*Y*/
if (usage_mask & TGSI_WRITEMASK_Y) {
- point->inputs.a0[slot][1] = 0.0;
- point->inputs.dadx[slot][1] = 0.0;
- point->inputs.dady[slot][1] = 1.0;
+ info->a0[slot][1] = 0.0;
+ info->dadx[slot][1] = 0.0;
+ info->dady[slot][1] = 1.0;
}
/*Z*/
if (usage_mask & TGSI_WRITEMASK_Z) {
- constant_coef(setup, point, slot, info->v0[0][2], 2);
+ constant_coef(setup, info, slot, info->v0[0][2], 2);
}
/*W*/
if (usage_mask & TGSI_WRITEMASK_W) {
- constant_coef(setup, point, slot, info->v0[0][3], 3);
+ constant_coef(setup, info, slot, info->v0[0][3], 3);
}
}
+
/**
* Compute the point->coef[] array dadx, dady, a0 values.
*/
static void
setup_point_coefficients( struct lp_setup_context *setup,
- struct lp_rast_triangle *point,
- const struct point_info *info)
+ struct point_info *info)
{
const struct lp_setup_variant_key *key = &setup->setup.variant->key;
+ const struct lp_fragment_shader *shader = setup->fs.current.variant->shader;
unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
unsigned slot;
@@ -162,9 +216,15 @@ setup_point_coefficients( struct lp_setup_context *setup,
for (slot = 0; slot < key->num_inputs; slot++) {
unsigned vert_attr = key->inputs[slot].src_index;
unsigned usage_mask = key->inputs[slot].usage_mask;
+ enum lp_interp interp = key->inputs[slot].interp;
+ boolean perspective = !!(interp == LP_INTERP_PERSPECTIVE);
unsigned i;
+
+ if (perspective & usage_mask) {
+ fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+ }
- switch (key->inputs[slot].interp) {
+ switch (interp) {
case LP_INTERP_POSITION:
/*
* The generated pixel interpolators will pick up the coeffs from
@@ -174,37 +234,63 @@ setup_point_coefficients( struct lp_setup_context *setup,
fragcoord_usage_mask |= usage_mask;
break;
+ case LP_INTERP_LINEAR:
+ /* Sprite tex coords may use linear interpolation someday */
+ /* fall-through */
case LP_INTERP_PERSPECTIVE:
- /* For point sprite textures */
- if (setup->fs.current.variant->shader->info.input_semantic_name[slot]
- == TGSI_SEMANTIC_GENERIC)
- {
- int index = setup->fs.current.variant->shader->info.input_semantic_index[slot];
-
- if (setup->sprite & (1 << index)) {
- for (i = 0; i < NUM_CHANNELS; i++)
- if (usage_mask & (1 << i))
- perspective_coef(setup, point, info, slot+1, vert_attr, i);
- fragcoord_usage_mask |= TGSI_WRITEMASK_W;
- break;
+ /* check if the sprite coord flag is set for this attribute.
+ * If so, set it up so it up so x and y vary from 0 to 1.
+ */
+ if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
+ unsigned semantic_index = shader->info.base.input_semantic_index[slot];
+ /* Note that sprite_coord enable is a bitfield of
+ * PIPE_MAX_SHADER_OUTPUTS bits.
+ */
+ if (semantic_index < PIPE_MAX_SHADER_OUTPUTS &&
+ (setup->sprite_coord_enable & (1 << semantic_index))) {
+ for (i = 0; i < NUM_CHANNELS; i++) {
+ if (usage_mask & (1 << i)) {
+ texcoord_coef(setup, info, slot + 1, i,
+ setup->sprite_coord_origin,
+ perspective);
+ }
+ }
+ break;
}
}
-
- /* Otherwise fallthrough */
- default:
+ /* fall-through */
+ case LP_INTERP_CONSTANT:
for (i = 0; i < NUM_CHANNELS; i++) {
- if (usage_mask & (1 << i))
- constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
+ if (usage_mask & (1 << i)) {
+ if (perspective) {
+ point_persp_coeff(setup, info, slot+1, i);
+ }
+ else {
+ constant_coef(setup, info, slot+1, info->v0[vert_attr][i], i);
+ }
+ }
}
+ break;
+
+ case LP_INTERP_FACING:
+ for (i = 0; i < NUM_CHANNELS; i++)
+ if (usage_mask & (1 << i))
+ constant_coef(setup, info, slot+1, 1.0, i);
+ break;
+
+ default:
+ assert(0);
+ break;
}
}
/* The internal position input is in slot zero:
*/
- setup_point_fragcoord_coef(setup, point, info, 0,
+ setup_point_fragcoord_coef(setup, info, 0,
fragcoord_usage_mask);
}
+
static INLINE int
subpixel_snap(float a)
{
@@ -285,55 +371,57 @@ try_setup_point( struct lp_setup_context *setup,
info.dx12 = fixed_width;
info.dy01 = fixed_width;
info.dy12 = 0;
+ info.a0 = GET_A0(&point->inputs);
+ info.dadx = GET_DADX(&point->inputs);
+ info.dady = GET_DADY(&point->inputs);
/* Setup parameter interpolants:
*/
- setup_point_coefficients(setup, point, &info);
+ setup_point_coefficients(setup, &info);
- point->inputs.facing = 1.0F;
- point->inputs.state = setup->fs.stored;
+ point->inputs.frontfacing = TRUE;
point->inputs.disable = FALSE;
point->inputs.opaque = FALSE;
{
- point->plane[0].dcdx = -1;
- point->plane[0].dcdy = 0;
- point->plane[0].c = 1-bbox.x0;
- point->plane[0].ei = 0;
- point->plane[0].eo = 1;
-
- point->plane[1].dcdx = 1;
- point->plane[1].dcdy = 0;
- point->plane[1].c = bbox.x1+1;
- point->plane[1].ei = -1;
- point->plane[1].eo = 0;
-
- point->plane[2].dcdx = 0;
- point->plane[2].dcdy = 1;
- point->plane[2].c = 1-bbox.y0;
- point->plane[2].ei = 0;
- point->plane[2].eo = 1;
-
- point->plane[3].dcdx = 0;
- point->plane[3].dcdy = -1;
- point->plane[3].c = bbox.y1+1;
- point->plane[3].ei = -1;
- point->plane[3].eo = 0;
+ struct lp_rast_plane *plane = GET_PLANES(point);
+
+ plane[0].dcdx = -1;
+ plane[0].dcdy = 0;
+ plane[0].c = 1-bbox.x0;
+ plane[0].eo = 1;
+
+ plane[1].dcdx = 1;
+ plane[1].dcdy = 0;
+ plane[1].c = bbox.x1+1;
+ plane[1].eo = 0;
+
+ plane[2].dcdx = 0;
+ plane[2].dcdy = 1;
+ plane[2].c = 1-bbox.y0;
+ plane[2].eo = 1;
+
+ plane[3].dcdx = 0;
+ plane[3].dcdy = -1;
+ plane[3].c = bbox.y1+1;
+ plane[3].eo = 0;
}
return lp_setup_bin_triangle(setup, point, &bbox, nr_planes);
}
-static void lp_setup_point( struct lp_setup_context *setup,
- const float (*v0)[4] )
+static void
+lp_setup_point(struct lp_setup_context *setup,
+ const float (*v0)[4])
{
if (!try_setup_point( setup, v0 ))
{
- lp_setup_flush_and_restart(setup);
+ if (!lp_setup_flush_and_restart(setup))
+ return;
if (!try_setup_point( setup, v0 ))
- assert(0);
+ return;
}
}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 43617a6b67..03036cd75d 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -32,6 +32,7 @@
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_rect.h"
+#include "util/u_sse.h"
#include "lp_perf.h"
#include "lp_setup_context.h"
#include "lp_rast.h"
@@ -40,7 +41,9 @@
#define NUM_CHANNELS 4
-
+#if defined(PIPE_ARCH_SSE)
+#include <emmintrin.h>
+#endif
static INLINE int
subpixel_snap(float a)
@@ -70,27 +73,28 @@ fixed_to_float(int a)
*/
struct lp_rast_triangle *
lp_setup_alloc_triangle(struct lp_scene *scene,
- unsigned num_inputs,
+ unsigned nr_inputs,
unsigned nr_planes,
unsigned *tri_size)
{
- unsigned input_array_sz = NUM_CHANNELS * (num_inputs + 1) * sizeof(float);
+ unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
+ unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
struct lp_rast_triangle *tri;
- unsigned tri_bytes, bytes;
- char *inputs;
- tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16);
- bytes = tri_bytes + (3 * input_array_sz);
+ *tri_size = (sizeof(struct lp_rast_triangle) +
+ 3 * input_array_sz +
+ plane_sz);
- tri = lp_scene_alloc_aligned( scene, bytes, 16 );
+ tri = lp_scene_alloc_aligned( scene, *tri_size, 16 );
+ if (tri == NULL)
+ return NULL;
- if (tri) {
- inputs = ((char *)tri) + tri_bytes;
- tri->inputs.a0 = (float (*)[4]) inputs;
- tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz);
- tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz);
+ tri->inputs.stride = input_array_sz;
- *tri_size = bytes;
+ {
+ char *a = (char *)tri;
+ char *b = (char *)&GET_PLANES(tri)[nr_planes];
+ assert(b - a == *tri_size);
}
return tri;
@@ -161,8 +165,9 @@ lp_setup_print_triangle(struct lp_setup_context *setup,
}
+#define MAX_PLANES 8
static unsigned
-lp_rast_tri_tab[9] = {
+lp_rast_tri_tab[MAX_PLANES+1] = {
0, /* should be impossible */
LP_RAST_OP_TRIANGLE_1,
LP_RAST_OP_TRIANGLE_2,
@@ -200,14 +205,16 @@ lp_setup_whole_tile(struct lp_setup_context *setup,
}
LP_COUNT(nr_shade_opaque_64);
- return lp_scene_bin_command( scene, tx, ty,
- LP_RAST_OP_SHADE_TILE_OPAQUE,
- lp_rast_arg_inputs(inputs) );
+ return lp_scene_bin_cmd_with_state( scene, tx, ty,
+ setup->fs.stored,
+ LP_RAST_OP_SHADE_TILE_OPAQUE,
+ lp_rast_arg_inputs(inputs) );
} else {
LP_COUNT(nr_shade_64);
- return lp_scene_bin_command( scene, tx, ty,
- LP_RAST_OP_SHADE_TILE,
- lp_rast_arg_inputs(inputs) );
+ return lp_scene_bin_cmd_with_state( scene, tx, ty,
+ setup->fs.stored,
+ LP_RAST_OP_SHADE_TILE,
+ lp_rast_arg_inputs(inputs) );
}
}
@@ -227,12 +234,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
struct lp_scene *scene = setup->scene;
const struct lp_setup_variant_key *key = &setup->setup.variant->key;
struct lp_rast_triangle *tri;
- int x[3];
- int y[3];
- int area;
+ struct lp_rast_plane *plane;
+ int x[4];
+ int y[4];
struct u_rect bbox;
unsigned tri_bytes;
- int i;
int nr_planes = 3;
if (0)
@@ -249,10 +255,12 @@ do_triangle_ccw(struct lp_setup_context *setup,
x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
+ x[3] = 0;
y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
-
+ y[3] = 0;
+
/* Bounding rectangle (in pixels) */
{
@@ -296,7 +304,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
if (!tri)
return FALSE;
-#ifdef DEBUG
+#if 0
tri->v[0][0] = v0[0][0];
tri->v[1][0] = v1[0][0];
tri->v[2][0] = v2[0][0];
@@ -305,104 +313,173 @@ do_triangle_ccw(struct lp_setup_context *setup,
tri->v[2][1] = v2[0][1];
#endif
- tri->plane[0].dcdy = x[0] - x[1];
- tri->plane[1].dcdy = x[1] - x[2];
- tri->plane[2].dcdy = x[2] - x[0];
-
- tri->plane[0].dcdx = y[0] - y[1];
- tri->plane[1].dcdx = y[1] - y[2];
- tri->plane[2].dcdx = y[2] - y[0];
-
- area = (tri->plane[0].dcdy * tri->plane[2].dcdx -
- tri->plane[2].dcdy * tri->plane[0].dcdx);
-
LP_COUNT(nr_tris);
- /* Cull non-ccw and zero-sized triangles.
- *
- * XXX: subject to overflow??
- */
- if (area <= 0) {
- lp_scene_putback_data( scene, tri_bytes );
- LP_COUNT(nr_culled_tris);
- return TRUE;
- }
-
/* Setup parameter interpolants:
*/
setup->setup.variant->jit_function( v0,
v1,
v2,
frontfacing,
- tri->inputs.a0,
- tri->inputs.dadx,
- tri->inputs.dady,
+ GET_A0(&tri->inputs),
+ GET_DADX(&tri->inputs),
+ GET_DADY(&tri->inputs),
&setup->setup.variant->key );
- tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
+ tri->inputs.frontfacing = frontfacing;
tri->inputs.disable = FALSE;
tri->inputs.opaque = setup->fs.current.variant->opaque;
- tri->inputs.state = setup->fs.stored;
if (0)
lp_dump_setup_coef(&setup->setup.variant->key,
- (const float (*)[4])tri->inputs.a0,
- (const float (*)[4])tri->inputs.dadx,
- (const float (*)[4])tri->inputs.dady);
-
- for (i = 0; i < 3; i++) {
- struct lp_rast_plane *plane = &tri->plane[i];
+ (const float (*)[4])GET_A0(&tri->inputs),
+ (const float (*)[4])GET_DADX(&tri->inputs),
+ (const float (*)[4])GET_DADY(&tri->inputs));
+
+ plane = GET_PLANES(tri);
+
+#if defined(PIPE_ARCH_SSE)
+ {
+ __m128i vertx, verty;
+ __m128i shufx, shufy;
+ __m128i dcdx, dcdy, c;
+ __m128i unused;
+ __m128i dcdx_neg_mask;
+ __m128i dcdy_neg_mask;
+ __m128i dcdx_zero_mask;
+ __m128i top_left_flag;
+ __m128i c_inc_mask, c_inc;
+ __m128i eo, p0, p1, p2;
+ __m128i zero = _mm_setzero_si128();
+
+ vertx = _mm_loadu_si128((__m128i *)x); /* vertex x coords */
+ verty = _mm_loadu_si128((__m128i *)y); /* vertex y coords */
+
+ shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
+ shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
+
+ dcdx = _mm_sub_epi32(verty, shufy);
+ dcdy = _mm_sub_epi32(vertx, shufx);
+
+ dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
+ dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero);
+ dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
+
+ top_left_flag = _mm_set1_epi32((setup->pixel_offset == 0) ? ~0 : 0);
+
+ c_inc_mask = _mm_or_si128(dcdx_neg_mask,
+ _mm_and_si128(dcdx_zero_mask,
+ _mm_xor_si128(dcdy_neg_mask,
+ top_left_flag)));
- /* half-edge constants, will be interated over the whole render
- * target.
+ c_inc = _mm_srli_epi32(c_inc_mask, 31);
+
+ c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
+ mm_mullo_epi32(dcdy, verty));
+
+ c = _mm_add_epi32(c, c_inc);
+
+ /* Scale up to match c:
*/
- plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
-
- /* correct for top-left vs. bottom-left fill convention.
- *
- * note that we're overloading gl_rasterization_rules to mean
- * both (0.5,0.5) pixel centers *and* bottom-left filling
- * convention.
- *
- * GL actually has a top-left filling convention, but GL's
- * notion of "top" differs from gallium's...
- *
- * Also, sometimes (in FBO cases) GL will render upside down
- * to its usual method, in which case it will probably want
- * to use the opposite, top-left convention.
- */
- if (plane->dcdx < 0) {
- /* both fill conventions want this - adjust for left edges */
- plane->c++;
- }
- else if (plane->dcdx == 0) {
- if (setup->pixel_offset == 0) {
- /* correct for top-left fill convention:
- */
- if (plane->dcdy > 0) plane->c++;
+ dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
+ dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
+
+ /* Calculate trivial reject values:
+ */
+ eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
+ _mm_and_si128(dcdx_neg_mask, dcdx));
+
+ /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
+ /* Pointless transpose which gets undone immediately in
+ * rasterization:
+ */
+ transpose4_epi32(&c, &dcdx, &dcdy, &eo,
+ &p0, &p1, &p2, &unused);
+
+ _mm_store_si128((__m128i *)&plane[0], p0);
+ _mm_store_si128((__m128i *)&plane[1], p1);
+ _mm_store_si128((__m128i *)&plane[2], p2);
+ }
+#else
+ {
+ int i;
+ plane[0].dcdy = x[0] - x[1];
+ plane[1].dcdy = x[1] - x[2];
+ plane[2].dcdy = x[2] - x[0];
+ plane[0].dcdx = y[0] - y[1];
+ plane[1].dcdx = y[1] - y[2];
+ plane[2].dcdx = y[2] - y[0];
+
+ for (i = 0; i < 3; i++) {
+ /* half-edge constants, will be interated over the whole render
+ * target.
+ */
+ plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
+
+ /* correct for top-left vs. bottom-left fill convention.
+ *
+ * note that we're overloading gl_rasterization_rules to mean
+ * both (0.5,0.5) pixel centers *and* bottom-left filling
+ * convention.
+ *
+ * GL actually has a top-left filling convention, but GL's
+ * notion of "top" differs from gallium's...
+ *
+ * Also, sometimes (in FBO cases) GL will render upside down
+ * to its usual method, in which case it will probably want
+ * to use the opposite, top-left convention.
+ */
+ if (plane[i].dcdx < 0) {
+ /* both fill conventions want this - adjust for left edges */
+ plane[i].c++;
}
- else {
- /* correct for bottom-left fill convention:
- */
- if (plane->dcdy < 0) plane->c++;
+ else if (plane[i].dcdx == 0) {
+ if (setup->pixel_offset == 0) {
+ /* correct for top-left fill convention:
+ */
+ if (plane[i].dcdy > 0) plane[i].c++;
+ }
+ else {
+ /* correct for bottom-left fill convention:
+ */
+ if (plane[i].dcdy < 0) plane[i].c++;
+ }
}
- }
- plane->dcdx *= FIXED_ONE;
- plane->dcdy *= FIXED_ONE;
+ plane[i].dcdx *= FIXED_ONE;
+ plane[i].dcdy *= FIXED_ONE;
- /* find trivial reject offsets for each edge for a single-pixel
- * sized block. These will be scaled up at each recursive level to
- * match the active blocksize. Scaling in this way works best if
- * the blocks are square.
- */
- plane->eo = 0;
- if (plane->dcdx < 0) plane->eo -= plane->dcdx;
- if (plane->dcdy > 0) plane->eo += plane->dcdy;
+ /* find trivial reject offsets for each edge for a single-pixel
+ * sized block. These will be scaled up at each recursive level to
+ * match the active blocksize. Scaling in this way works best if
+ * the blocks are square.
+ */
+ plane[i].eo = 0;
+ if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+ if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
+ }
+ }
+#endif
- /* Calculate trivial accept offsets from the above.
- */
- plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+ if (0) {
+ debug_printf("p0: %08x/%08x/%08x/%08x\n",
+ plane[0].c,
+ plane[0].dcdx,
+ plane[0].dcdy,
+ plane[0].eo);
+
+ debug_printf("p1: %08x/%08x/%08x/%08x\n",
+ plane[1].c,
+ plane[1].dcdx,
+ plane[1].dcdy,
+ plane[1].eo);
+
+ debug_printf("p0: %08x/%08x/%08x/%08x\n",
+ plane[2].c,
+ plane[2].dcdx,
+ plane[2].dcdy,
+ plane[2].eo);
}
@@ -425,29 +502,25 @@ do_triangle_ccw(struct lp_setup_context *setup,
* these planes elsewhere.
*/
if (nr_planes == 7) {
- tri->plane[3].dcdx = -1;
- tri->plane[3].dcdy = 0;
- tri->plane[3].c = 1-bbox.x0;
- tri->plane[3].ei = 0;
- tri->plane[3].eo = 1;
-
- tri->plane[4].dcdx = 1;
- tri->plane[4].dcdy = 0;
- tri->plane[4].c = bbox.x1+1;
- tri->plane[4].ei = -1;
- tri->plane[4].eo = 0;
-
- tri->plane[5].dcdx = 0;
- tri->plane[5].dcdy = 1;
- tri->plane[5].c = 1-bbox.y0;
- tri->plane[5].ei = 0;
- tri->plane[5].eo = 1;
-
- tri->plane[6].dcdx = 0;
- tri->plane[6].dcdy = -1;
- tri->plane[6].c = bbox.y1+1;
- tri->plane[6].ei = -1;
- tri->plane[6].eo = 0;
+ plane[3].dcdx = -1;
+ plane[3].dcdy = 0;
+ plane[3].c = 1-bbox.x0;
+ plane[3].eo = 1;
+
+ plane[4].dcdx = 1;
+ plane[4].dcdy = 0;
+ plane[4].c = bbox.x1+1;
+ plane[4].eo = 0;
+
+ plane[5].dcdx = 0;
+ plane[5].dcdy = 1;
+ plane[5].c = 1-bbox.y0;
+ plane[5].eo = 1;
+
+ plane[6].dcdx = 0;
+ plane[6].dcdy = -1;
+ plane[6].c = bbox.y1+1;
+ plane[6].eo = 0;
}
return lp_setup_bin_triangle( setup, tri, &bbox, nr_planes );
@@ -500,56 +573,63 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) |
(bbox->y1 - (bbox->y0 & ~3)));
- if (nr_planes == 3) {
- if (sz < 4 && dx < 64)
- {
- /* Triangle is contained in a single 4x4 stamp:
- */
- int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8);
-
- return lp_scene_bin_command( scene,
- bbox->x0/64, bbox->y0/64,
- LP_RAST_OP_TRIANGLE_3_4,
- lp_rast_arg_triangle(tri, mask) );
- }
-
- if (sz < 16 && dx < 64)
- {
- int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8);
-
- /* Triangle is contained in a single 16x16 block:
- */
- return lp_scene_bin_command( scene,
- bbox->x0/64, bbox->y0/64,
- LP_RAST_OP_TRIANGLE_3_16,
- lp_rast_arg_triangle(tri, mask) );
- }
- }
-
-
/* Determine which tile(s) intersect the triangle's bounding box
*/
if (dx < TILE_SIZE)
{
int ix0 = bbox->x0 / TILE_SIZE;
int iy0 = bbox->y0 / TILE_SIZE;
+ int px = bbox->x0 & 63 & ~3;
+ int py = bbox->y0 & 63 & ~3;
+ int mask = px | (py << 8);
assert(iy0 == bbox->y1 / TILE_SIZE &&
ix0 == bbox->x1 / TILE_SIZE);
+ if (nr_planes == 3) {
+ if (sz < 4)
+ {
+ /* Triangle is contained in a single 4x4 stamp:
+ */
+ return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
+ setup->fs.stored,
+ LP_RAST_OP_TRIANGLE_3_4,
+ lp_rast_arg_triangle(tri, mask) );
+ }
+
+ if (sz < 16)
+ {
+ /* Triangle is contained in a single 16x16 block:
+ */
+ return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
+ setup->fs.stored,
+ LP_RAST_OP_TRIANGLE_3_16,
+ lp_rast_arg_triangle(tri, mask) );
+ }
+ }
+ else if (nr_planes == 4 && sz < 16)
+ {
+ return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
+ setup->fs.stored,
+ LP_RAST_OP_TRIANGLE_4_16,
+ lp_rast_arg_triangle(tri, mask) );
+ }
+
+
/* Triangle is contained in a single tile:
*/
- return lp_scene_bin_command( scene, ix0, iy0,
- lp_rast_tri_tab[nr_planes],
- lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
+ return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored,
+ lp_rast_tri_tab[nr_planes],
+ lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
}
else
{
- int c[7];
- int ei[7];
- int eo[7];
- int xstep[7];
- int ystep[7];
+ struct lp_rast_plane *plane = GET_PLANES(tri);
+ int c[MAX_PLANES];
+ int ei[MAX_PLANES];
+ int eo[MAX_PLANES];
+ int xstep[MAX_PLANES];
+ int ystep[MAX_PLANES];
int x, y;
int ix0 = bbox->x0 / TILE_SIZE;
@@ -558,14 +638,17 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
int iy1 = bbox->y1 / TILE_SIZE;
for (i = 0; i < nr_planes; i++) {
- c[i] = (tri->plane[i].c +
- tri->plane[i].dcdy * iy0 * TILE_SIZE -
- tri->plane[i].dcdx * ix0 * TILE_SIZE);
-
- ei[i] = tri->plane[i].ei << TILE_ORDER;
- eo[i] = tri->plane[i].eo << TILE_ORDER;
- xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER);
- ystep[i] = tri->plane[i].dcdy << TILE_ORDER;
+ c[i] = (plane[i].c +
+ plane[i].dcdy * iy0 * TILE_SIZE -
+ plane[i].dcdx * ix0 * TILE_SIZE);
+
+ ei[i] = (plane[i].dcdy -
+ plane[i].dcdx -
+ plane[i].eo) << TILE_ORDER;
+
+ eo[i] = plane[i].eo << TILE_ORDER;
+ xstep[i] = -(plane[i].dcdx << TILE_ORDER);
+ ystep[i] = plane[i].dcdy << TILE_ORDER;
}
@@ -578,7 +661,7 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
for (y = iy0; y <= iy1; y++)
{
boolean in = FALSE; /* are we inside the triangle? */
- int cx[7];
+ int cx[MAX_PLANES];
for (i = 0; i < nr_planes; i++)
cx[i] = c[i];
@@ -607,9 +690,11 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
*/
int count = util_bitcount(partial);
in = TRUE;
- if (!lp_scene_bin_command( scene, x, y,
- lp_rast_tri_tab[count],
- lp_rast_arg_triangle(tri, partial) ))
+
+ if (!lp_scene_bin_cmd_with_state( scene, x, y,
+ setup->fs.stored,
+ lp_rast_tri_tab[count],
+ lp_rast_arg_triangle(tri, partial) ))
goto fail;
LP_COUNT(nr_partially_covered_64);
@@ -648,40 +733,62 @@ fail:
/**
- * Draw triangle if it's CW, cull otherwise.
+ * Try to draw the triangle, restart the scene on failure.
*/
-static void triangle_cw( struct lp_setup_context *setup,
- const float (*v0)[4],
- const float (*v1)[4],
- const float (*v2)[4] )
+static void retry_triangle_ccw( struct lp_setup_context *setup,
+ const float (*v0)[4],
+ const float (*v1)[4],
+ const float (*v2)[4],
+ boolean front)
{
- if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
+ if (!do_triangle_ccw( setup, v0, v1, v2, front ))
{
- lp_setup_flush_and_restart(setup);
+ if (!lp_setup_flush_and_restart(setup))
+ return;
- if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
- assert(0);
+ if (!do_triangle_ccw( setup, v0, v1, v2, front ))
+ return;
}
}
+static INLINE float
+calc_area(const float (*v0)[4],
+ const float (*v1)[4],
+ const float (*v2)[4])
+{
+ float dx01 = v0[0][0] - v1[0][0];
+ float dy01 = v0[0][1] - v1[0][1];
+ float dx20 = v2[0][0] - v0[0][0];
+ float dy20 = v2[0][1] - v0[0][1];
+ return dx01 * dy20 - dx20 * dy01;
+}
+
/**
- * Draw triangle if it's CCW, cull otherwise.
+ * Draw triangle if it's CW, cull otherwise.
*/
-static void triangle_ccw( struct lp_setup_context *setup,
+static void triangle_cw( struct lp_setup_context *setup,
const float (*v0)[4],
const float (*v1)[4],
const float (*v2)[4] )
{
- if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
- {
- lp_setup_flush_and_restart(setup);
- if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
- assert(0);
- }
+ float area = calc_area(v0, v1, v2);
+
+ if (area < 0.0f)
+ retry_triangle_ccw(setup, v0, v2, v1, !setup->ccw_is_frontface);
}
+static void triangle_ccw( struct lp_setup_context *setup,
+ const float (*v0)[4],
+ const float (*v1)[4],
+ const float (*v2)[4])
+{
+ float area = calc_area(v0, v1, v2);
+
+ if (area > 0.0f)
+ retry_triangle_ccw(setup, v0, v1, v2, setup->ccw_is_frontface);
+}
/**
* Draw triangle whether it's CW or CCW.
@@ -691,18 +798,12 @@ static void triangle_both( struct lp_setup_context *setup,
const float (*v1)[4],
const float (*v2)[4] )
{
- /* edge vectors e = v0 - v2, f = v1 - v2 */
- const float ex = v0[0][0] - v2[0][0];
- const float ey = v0[0][1] - v2[0][1];
- const float fx = v1[0][0] - v2[0][0];
- const float fy = v1[0][1] - v2[0][1];
-
- /* det = cross(e,f).z */
- const float det = ex * fy - ey * fx;
- if (det < 0.0f)
- triangle_ccw( setup, v0, v1, v2 );
- else if (det > 0.0f)
- triangle_cw( setup, v0, v1, v2 );
+ float area = calc_area(v0, v1, v2);
+
+ if (area > 0.0f)
+ retry_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface );
+ else if (area < 0.0f)
+ retry_triangle_ccw( setup, v0, v2, v1, !setup->ccw_is_frontface );
}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
index 6308561f24..9c1f0fe793 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -141,7 +141,8 @@ lp_setup_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
const boolean flatshade_first = setup->flatshade_first;
unsigned i;
- lp_setup_update_state(setup, TRUE);
+ if (!lp_setup_update_state(setup, TRUE))
+ return;
switch (setup->prim) {
case PIPE_PRIM_POINTS:
@@ -338,7 +339,8 @@ lp_setup_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
const boolean flatshade_first = setup->flatshade_first;
unsigned i;
- lp_setup_update_state(setup, TRUE);
+ if (!lp_setup_update_state(setup, TRUE))
+ return;
switch (setup->prim) {
case PIPE_PRIM_POINTS:
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index de242aa93c..0f5f7369e0 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -67,14 +67,14 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
- for (i = 0; i < lpfs->info.num_inputs; i++) {
+ for (i = 0; i < lpfs->info.base.num_inputs; i++) {
/*
* Search for each input in current vs output:
*/
vs_index = draw_find_shader_output(llvmpipe->draw,
- lpfs->info.input_semantic_name[i],
- lpfs->info.input_semantic_index[i]);
+ lpfs->info.base.input_semantic_name[i],
+ lpfs->info.base.input_semantic_index[i]);
/*
* Emit the requested fs attribute for all but position.
@@ -94,7 +94,6 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
draw_compute_vertex_size(vinfo);
lp_setup_set_vertex_info(llvmpipe->setup, vinfo);
-
}
@@ -153,11 +152,16 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
lp_setup_set_fs_constants(llvmpipe->setup,
llvmpipe->constants[PIPE_SHADER_FRAGMENT][0]);
- if (llvmpipe->dirty & LP_NEW_SAMPLER_VIEW)
+ if (llvmpipe->dirty & (LP_NEW_SAMPLER_VIEW))
lp_setup_set_fragment_sampler_views(llvmpipe->setup,
llvmpipe->num_fragment_sampler_views,
llvmpipe->fragment_sampler_views);
+ if (llvmpipe->dirty & (LP_NEW_SAMPLER))
+ lp_setup_set_fragment_sampler_state(llvmpipe->setup,
+ llvmpipe->num_samplers,
+ llvmpipe->sampler);
+
llvmpipe->dirty = 0;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index ad058e384a..9fbedac165 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -99,74 +99,12 @@
#include <llvm-c/Analysis.h>
+#include <llvm-c/BitWriter.h>
static unsigned fs_no = 0;
-/**
- * Generate the depth /stencil test code.
- */
-static void
-generate_depth_stencil(LLVMBuilderRef builder,
- const struct lp_fragment_shader_variant_key *key,
- struct lp_type src_type,
- struct lp_build_mask_context *mask,
- LLVMValueRef stencil_refs[2],
- LLVMValueRef src,
- LLVMValueRef dst_ptr,
- LLVMValueRef facing,
- LLVMValueRef counter)
-{
- const struct util_format_description *format_desc;
- struct lp_type dst_type;
-
- if (!key->depth.enabled && !key->stencil[0].enabled && !key->stencil[1].enabled)
- return;
-
- format_desc = util_format_description(key->zsbuf_format);
- assert(format_desc);
-
- /*
- * Depths are expected to be between 0 and 1, even if they are stored in
- * floats. Setting these bits here will ensure that the lp_build_conv() call
- * below won't try to unnecessarily clamp the incoming values.
- */
- if(src_type.floating) {
- src_type.sign = FALSE;
- src_type.norm = TRUE;
- }
- else {
- assert(!src_type.sign);
- assert(src_type.norm);
- }
-
- /* Pick the depth type. */
- dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
-
- /* FIXME: Cope with a depth test type with a different bit width. */
- assert(dst_type.width == src_type.width);
- assert(dst_type.length == src_type.length);
-
- /* Convert fragment Z from float to integer */
- lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
-
- dst_ptr = LLVMBuildBitCast(builder,
- dst_ptr,
- LLVMPointerType(lp_build_vec_type(dst_type), 0), "");
- lp_build_depth_stencil_test(builder,
- &key->depth,
- key->stencil,
- dst_type,
- format_desc,
- mask,
- stencil_refs,
- src,
- dst_ptr,
- facing,
- counter);
-}
-
/**
* Expand the relevent bits of mask_input to a 4-dword mask for the
@@ -248,6 +186,26 @@ generate_quad_mask(LLVMBuilderRef builder,
}
+#define EARLY_DEPTH_TEST 0x1
+#define LATE_DEPTH_TEST 0x2
+#define EARLY_DEPTH_WRITE 0x4
+#define LATE_DEPTH_WRITE 0x8
+
+static int
+find_output_by_semantic( const struct tgsi_shader_info *info,
+ unsigned semantic,
+ unsigned index )
+{
+ int i;
+
+ for (i = 0; i < info->num_outputs; i++)
+ if (info->output_semantic_name[i] == semantic &&
+ info->output_semantic_index[i] == index)
+ return i;
+
+ return -1;
+}
+
/**
* Generate the fragment shader, depth/stencil test, and alpha tests.
@@ -261,7 +219,7 @@ generate_fs(struct lp_fragment_shader *shader,
struct lp_type type,
LLVMValueRef context_ptr,
unsigned i,
- const struct lp_build_interp_soa_context *interp,
+ struct lp_build_interp_soa_context *interp,
struct lp_build_sampler_soa *sampler,
LLVMValueRef *pmask,
LLVMValueRef (*color)[4],
@@ -271,18 +229,52 @@ generate_fs(struct lp_fragment_shader *shader,
LLVMValueRef mask_input,
LLVMValueRef counter)
{
+ const struct util_format_description *zs_format_desc = NULL;
const struct tgsi_token *tokens = shader->base.tokens;
LLVMTypeRef vec_type;
LLVMValueRef consts_ptr;
LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
- LLVMValueRef z = interp->pos[2];
+ LLVMValueRef z;
+ LLVMValueRef zs_value = NULL;
LLVMValueRef stencil_refs[2];
- struct lp_build_flow_context *flow;
struct lp_build_mask_context mask;
- boolean early_depth_stencil_test;
+ boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
+ shader->info.base.num_inputs < 3 &&
+ shader->info.base.num_instructions < 8);
unsigned attrib;
unsigned chan;
unsigned cbuf;
+ unsigned depth_mode;
+
+ if (key->depth.enabled ||
+ key->stencil[0].enabled ||
+ key->stencil[1].enabled) {
+
+ zs_format_desc = util_format_description(key->zsbuf_format);
+ assert(zs_format_desc);
+
+ if (!shader->info.base.writes_z) {
+ if (key->alpha.enabled || shader->info.base.uses_kill)
+ /* With alpha test and kill, can do the depth test early
+ * and hopefully eliminate some quads. But need to do a
+ * special deferred depth write once the final mask value
+ * is known.
+ */
+ depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+ else
+ depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
+ }
+ else {
+ depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+ }
+
+ if (!(key->depth.enabled && key->depth.writemask) &&
+ !(key->stencil[0].enabled && key->stencil[0].writemask))
+ depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
+ }
+ else {
+ depth_mode = 0;
+ }
assert(i < 4);
@@ -293,20 +285,14 @@ generate_fs(struct lp_fragment_shader *shader,
consts_ptr = lp_jit_context_constants(builder, context_ptr);
- flow = lp_build_flow_create(builder);
-
memset(outputs, 0, sizeof outputs);
- lp_build_flow_scope_begin(flow);
-
/* Declare the color and z variables */
for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- color[cbuf][chan] = LLVMGetUndef(vec_type);
- lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
+ color[cbuf][chan] = lp_build_alloca(builder, vec_type, "color");
}
}
- lp_build_flow_scope_declare(flow, &z);
/* do triangle edge testing */
if (partial_mask) {
@@ -318,74 +304,126 @@ generate_fs(struct lp_fragment_shader *shader,
}
/* 'mask' will control execution based on quad's pixel alive/killed state */
- lp_build_mask_begin(&mask, flow, type, *pmask);
-
- early_depth_stencil_test =
- (key->depth.enabled || key->stencil[0].enabled) &&
- !key->alpha.enabled &&
- !shader->info.uses_kill &&
- !shader->info.writes_z;
-
- if (early_depth_stencil_test)
- generate_depth_stencil(builder, key,
- type, &mask,
- stencil_refs, z, depth_ptr, facing, counter);
+ lp_build_mask_begin(&mask, builder, type, *pmask);
+
+ if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
+ lp_build_mask_check(&mask);
+
+ lp_build_interp_soa_update_pos(interp, i);
+ z = interp->pos[2];
+
+ if (depth_mode & EARLY_DEPTH_TEST) {
+ lp_build_depth_stencil_test(builder,
+ &key->depth,
+ key->stencil,
+ type,
+ zs_format_desc,
+ &mask,
+ stencil_refs,
+ z,
+ depth_ptr, facing,
+ &zs_value,
+ !simple_shader);
+
+ if (depth_mode & EARLY_DEPTH_WRITE) {
+ lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+ }
+ }
+ lp_build_interp_soa_update_inputs(interp, i);
+
+ /* Build the actual shader */
lp_build_tgsi_soa(builder, tokens, type, &mask,
consts_ptr, interp->pos, interp->inputs,
- outputs, sampler, &shader->info);
+ outputs, sampler, &shader->info.base);
- /* loop over fragment shader outputs/results */
- for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
- for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- if(outputs[attrib][chan]) {
- LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
- lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]);
-
- switch (shader->info.output_semantic_name[attrib]) {
- case TGSI_SEMANTIC_COLOR:
- {
- unsigned cbuf = shader->info.output_semantic_index[attrib];
-
- lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
-
- /* Alpha test */
- /* XXX: should only test the final assignment to alpha */
- if (cbuf == 0 && chan == 3 && key->alpha.enabled) {
- LLVMValueRef alpha = out;
- LLVMValueRef alpha_ref_value;
- alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
- alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
- lp_build_alpha_test(builder, key->alpha.func, type,
- &mask, alpha, alpha_ref_value);
- }
-
- color[cbuf][chan] = out;
- break;
- }
-
- case TGSI_SEMANTIC_POSITION:
- if(chan == 2)
- z = out;
- break;
- }
- }
+
+ /* Alpha test */
+ if (key->alpha.enabled) {
+ int color0 = find_output_by_semantic(&shader->info.base,
+ TGSI_SEMANTIC_COLOR,
+ 0);
+
+ if (color0 != -1 && outputs[color0][3]) {
+ LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
+ LLVMValueRef alpha_ref_value;
+
+ alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
+ alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
+
+ lp_build_alpha_test(builder, key->alpha.func, type,
+ &mask, alpha, alpha_ref_value,
+ (depth_mode & LATE_DEPTH_TEST) != 0);
}
}
- if (!early_depth_stencil_test)
- generate_depth_stencil(builder, key,
- type, &mask,
- stencil_refs, z, depth_ptr, facing, counter);
+ /* Late Z test */
+ if (depth_mode & LATE_DEPTH_TEST) {
+ int pos0 = find_output_by_semantic(&shader->info.base,
+ TGSI_SEMANTIC_POSITION,
+ 0);
+
+ if (pos0 != -1 && outputs[pos0][2]) {
+ z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
+ }
- lp_build_mask_end(&mask);
+ lp_build_depth_stencil_test(builder,
+ &key->depth,
+ key->stencil,
+ type,
+ zs_format_desc,
+ &mask,
+ stencil_refs,
+ z,
+ depth_ptr, facing,
+ &zs_value,
+ !simple_shader);
+ /* Late Z write */
+ if (depth_mode & LATE_DEPTH_WRITE) {
+ lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+ }
+ }
+ else if ((depth_mode & EARLY_DEPTH_TEST) &&
+ (depth_mode & LATE_DEPTH_WRITE))
+ {
+ /* Need to apply a reduced mask to the depth write. Reload the
+ * depth value, update from zs_value with the new mask value and
+ * write that out.
+ */
+ lp_build_deferred_depth_write(builder,
+ type,
+ zs_format_desc,
+ &mask,
+ depth_ptr,
+ zs_value);
+ }
- lp_build_flow_scope_end(flow);
- lp_build_flow_destroy(flow);
+ /* Color write */
+ for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
+ {
+ if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR &&
+ shader->info.base.output_semantic_index[attrib] < key->nr_cbufs)
+ {
+ unsigned cbuf = shader->info.base.output_semantic_index[attrib];
+ for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+ if(outputs[attrib][chan]) {
+ /* XXX: just initialize outputs to point at colors[] and
+ * skip this.
+ */
+ LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
+ lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
+ LLVMBuildStore(builder, out, color[cbuf][chan]);
+ }
+ }
+ }
+ }
- *pmask = mask.value;
+ if (counter)
+ lp_build_occlusion_count(builder, type,
+ lp_build_mask_value(&mask), counter);
+ *pmask = lp_build_mask_end(&mask);
}
@@ -406,10 +444,10 @@ generate_blend(const struct pipe_blend_state *blend,
LLVMValueRef context_ptr,
LLVMValueRef mask,
LLVMValueRef *src,
- LLVMValueRef dst_ptr)
+ LLVMValueRef dst_ptr,
+ boolean do_branch)
{
struct lp_build_context bld;
- struct lp_build_flow_context *flow;
struct lp_build_mask_context mask_ctx;
LLVMTypeRef vec_type;
LLVMValueRef const_ptr;
@@ -420,10 +458,9 @@ generate_blend(const struct pipe_blend_state *blend,
lp_build_context_init(&bld, builder, type);
- flow = lp_build_flow_create(builder);
-
- /* we'll use this mask context to skip blending if all pixels are dead */
- lp_build_mask_begin(&mask_ctx, flow, type, mask);
+ lp_build_mask_begin(&mask_ctx, builder, type, mask);
+ if (do_branch)
+ lp_build_mask_check(&mask_ctx);
vec_type = lp_build_vec_type(type);
@@ -456,7 +493,6 @@ generate_blend(const struct pipe_blend_state *blend,
}
lp_build_mask_end(&mask_ctx);
- lp_build_flow_destroy(flow);
}
@@ -501,6 +537,7 @@ generate_fragment(struct llvmpipe_screen *screen,
LLVMValueRef blend_mask;
LLVMValueRef function;
LLVMValueRef facing;
+ const struct util_format_description *zs_format_desc;
unsigned num_fs;
unsigned i;
unsigned chan;
@@ -508,8 +545,8 @@ generate_fragment(struct llvmpipe_screen *screen,
/* Adjust color input interpolation according to flatshade state:
*/
- memcpy(inputs, shader->inputs, shader->info.num_inputs * sizeof inputs[0]);
- for (i = 0; i < shader->info.num_inputs; i++) {
+ memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
+ for (i = 0; i < shader->info.base.num_inputs; i++) {
if (inputs[i].interp == LP_INTERP_COLOR) {
if (key->flatshade)
inputs[i].interp = LP_INTERP_CONSTANT;
@@ -553,12 +590,12 @@ generate_fragment(struct llvmpipe_screen *screen,
arg_types[0] = screen->context_ptr_type; /* context */
arg_types[1] = LLVMInt32Type(); /* x */
arg_types[2] = LLVMInt32Type(); /* y */
- arg_types[3] = LLVMFloatType(); /* facing */
+ arg_types[3] = LLVMInt32Type(); /* facing */
arg_types[4] = LLVMPointerType(fs_elem_type, 0); /* a0 */
arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* dadx */
arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dady */
arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0); /* color */
- arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+ arg_types[8] = LLVMPointerType(LLVMInt8Type(), 0); /* depth */
arg_types[9] = LLVMInt32Type(); /* mask_input */
arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */
@@ -616,7 +653,7 @@ generate_fragment(struct llvmpipe_screen *screen,
* already included in the shader key.
*/
lp_build_interp_soa_init(&interp,
- shader->info.num_inputs,
+ shader->info.base.num_inputs,
inputs,
builder, fs_type,
a0_ptr, dadx_ptr, dady_ptr,
@@ -626,15 +663,16 @@ generate_fragment(struct llvmpipe_screen *screen,
sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
/* loop over quads in the block */
+ zs_format_desc = util_format_description(key->zsbuf_format);
+
for(i = 0; i < num_fs; ++i) {
- LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+ LLVMValueRef depth_offset = LLVMConstInt(LLVMInt32Type(),
+ i*fs_type.length*zs_format_desc->block.bits/8,
+ 0);
LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
LLVMValueRef depth_ptr_i;
- if(i != 0)
- lp_build_interp_soa_update(&interp, i);
-
- depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
+ depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &depth_offset, 1, "");
generate_fs(shader, key,
builder,
@@ -670,9 +708,18 @@ generate_fragment(struct llvmpipe_screen *screen,
* Convert the fs's output color and mask to fit to the blending type.
*/
for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+ LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
+
+ for (i = 0; i < num_fs; i++) {
+ fs_color_vals[i] =
+ LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
+ }
+
lp_build_conv(builder, fs_type, blend_type,
- fs_out_color[cbuf][chan], num_fs,
+ fs_color_vals,
+ num_fs,
&blend_in_color[chan], 1);
+
lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
}
@@ -695,14 +742,23 @@ generate_fragment(struct llvmpipe_screen *screen,
/*
* Blending.
*/
- generate_blend(&key->blend,
- rt,
- builder,
- blend_type,
- context_ptr,
- blend_mask,
- blend_in_color,
- color_ptr);
+ {
+ /* Could the 4x4 have been killed?
+ */
+ boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) &&
+ !key->alpha.enabled &&
+ !shader->info.base.uses_kill);
+
+ generate_blend(&key->blend,
+ rt,
+ builder,
+ blend_type,
+ context_ptr,
+ blend_mask,
+ blend_in_color,
+ color_ptr,
+ do_branch);
+ }
}
#ifdef PIPE_ARCH_X86
@@ -727,12 +783,17 @@ generate_fragment(struct llvmpipe_screen *screen,
/* Apply optimizations to LLVM IR */
LLVMRunFunctionPassManager(screen->pass, function);
- if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) {
/* Print the LLVM IR to stderr */
lp_debug_dump_value(function);
debug_printf("\n");
}
+ /* Dump byte code to a file */
+ if (0) {
+ LLVMWriteBitcodeToFile(lp_build_module, "llvmpipe.bc");
+ }
+
/*
* Translate the LLVM IR into machine code.
*/
@@ -741,7 +802,7 @@ generate_fragment(struct llvmpipe_screen *screen,
variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f);
- if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+ if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) {
lp_disassemble(f);
}
lp_func_delete_body(function);
@@ -756,6 +817,9 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
debug_printf("fs variant %p:\n", (void *) key);
+ if (key->flatshade) {
+ debug_printf("flatshade = 1\n");
+ }
for (i = 0; i < key->nr_cbufs; ++i) {
debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
}
@@ -780,6 +844,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
}
+ if (key->occlusion_count) {
+ debug_printf("occlusion_count = 1\n");
+ }
+
if (key->blend.logicop_enable) {
debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
}
@@ -792,31 +860,33 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
}
debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
- for (i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
- if (key->sampler[i].format) {
- debug_printf("sampler[%u] = \n", i);
- debug_printf(" .format = %s\n",
- util_format_name(key->sampler[i].format));
- debug_printf(" .target = %s\n",
- util_dump_tex_target(key->sampler[i].target, TRUE));
- debug_printf(" .pot = %u %u %u\n",
- key->sampler[i].pot_width,
- key->sampler[i].pot_height,
- key->sampler[i].pot_depth);
- debug_printf(" .wrap = %s %s %s\n",
- util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
- util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
- util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
- debug_printf(" .min_img_filter = %s\n",
- util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
- debug_printf(" .min_mip_filter = %s\n",
- util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
- debug_printf(" .mag_img_filter = %s\n",
- util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
- if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
- debug_printf(" .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
- debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords);
- }
+ for (i = 0; i < key->nr_samplers; ++i) {
+ debug_printf("sampler[%u] = \n", i);
+ debug_printf(" .format = %s\n",
+ util_format_name(key->sampler[i].format));
+ debug_printf(" .target = %s\n",
+ util_dump_tex_target(key->sampler[i].target, TRUE));
+ debug_printf(" .pot = %u %u %u\n",
+ key->sampler[i].pot_width,
+ key->sampler[i].pot_height,
+ key->sampler[i].pot_depth);
+ debug_printf(" .wrap = %s %s %s\n",
+ util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+ util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+ util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+ debug_printf(" .min_img_filter = %s\n",
+ util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+ debug_printf(" .min_mip_filter = %s\n",
+ util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+ debug_printf(" .mag_img_filter = %s\n",
+ util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+ if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+ debug_printf(" .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
+ debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+ debug_printf(" .min_max_lod_equal = %u\n", key->sampler[i].min_max_lod_equal);
+ debug_printf(" .lod_bias_non_zero = %u\n", key->sampler[i].lod_bias_non_zero);
+ debug_printf(" .apply_min_lod = %u\n", key->sampler[i].apply_min_lod);
+ debug_printf(" .apply_max_lod = %u\n", key->sampler[i].apply_max_lod);
}
}
@@ -871,7 +941,7 @@ generate_variant(struct llvmpipe_screen *screen,
!key->stencil[0].enabled &&
!key->alpha.enabled &&
!key->depth.enabled &&
- !shader->info.uses_kill
+ !shader->info.base.uses_kill
? TRUE : FALSE;
@@ -896,6 +966,7 @@ static void *
llvmpipe_create_fs_state(struct pipe_context *pipe,
const struct pipe_shader_state *templ)
{
+ struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
struct lp_fragment_shader *shader;
int nr_samplers;
int i;
@@ -908,20 +979,27 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
make_empty_list(&shader->variants);
/* get/save the summary info for this shader */
- tgsi_scan_shader(templ->tokens, &shader->info);
+ lp_build_tgsi_info(templ->tokens, &shader->info);
/* we need to keep a local copy of the tokens */
shader->base.tokens = tgsi_dup_tokens(templ->tokens);
- nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+ shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
+ if (shader->draw_data == NULL) {
+ FREE((void *) shader->base.tokens);
+ FREE(shader);
+ return NULL;
+ }
+
+ nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
sampler[nr_samplers]);
- for (i = 0; i < shader->info.num_inputs; i++) {
- shader->inputs[i].usage_mask = shader->info.input_usage_mask[i];
+ for (i = 0; i < shader->info.base.num_inputs; i++) {
+ shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i];
- switch (shader->info.input_interpolate[i]) {
+ switch (shader->info.base.input_interpolate[i]) {
case TGSI_INTERPOLATE_CONSTANT:
shader->inputs[i].interp = LP_INTERP_CONSTANT;
break;
@@ -936,7 +1014,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
break;
}
- switch (shader->info.input_semantic_name[i]) {
+ switch (shader->info.base.input_semantic_name[i]) {
case TGSI_SEMANTIC_COLOR:
/* Colors may be either linearly or constant interpolated in
* the fragment shader, but that information isn't available
@@ -963,8 +1041,8 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader);
tgsi_dump(templ->tokens, 0);
debug_printf("usage masks:\n");
- for (attrib = 0; attrib < shader->info.num_inputs; ++attrib) {
- unsigned usage_mask = shader->info.input_usage_mask[attrib];
+ for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
+ unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
debug_printf(" IN[%u].%s%s%s%s\n",
attrib,
usage_mask & TGSI_WRITEMASK_X ? "x" : "",
@@ -989,6 +1067,9 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
draw_flush(llvmpipe->draw);
+ draw_bind_fragment_shader(llvmpipe->draw,
+ (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL));
+
llvmpipe->fs = fs;
llvmpipe->dirty |= LP_NEW_FS;
@@ -1046,6 +1127,8 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
li = next;
}
+ draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
+
assert(shader->variants_cached == 0);
FREE((void *) shader->base.tokens);
FREE(shader);
@@ -1087,7 +1170,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
* Return the blend factor equivalent to a destination alpha of one.
*/
static INLINE unsigned
-force_dst_alpha_one(unsigned factor, boolean alpha)
+force_dst_alpha_one(unsigned factor)
{
switch(factor) {
case PIPE_BLENDFACTOR_DST_ALPHA:
@@ -1098,15 +1181,6 @@ force_dst_alpha_one(unsigned factor, boolean alpha)
return PIPE_BLENDFACTOR_ZERO;
}
- if (alpha) {
- switch(factor) {
- case PIPE_BLENDFACTOR_DST_COLOR:
- return PIPE_BLENDFACTOR_ONE;
- case PIPE_BLENDFACTOR_INV_DST_COLOR:
- return PIPE_BLENDFACTOR_ZERO;
- }
- }
-
return factor;
}
@@ -1183,21 +1257,24 @@ make_variant_key(struct llvmpipe_context *lp,
*
* TODO: This should be generalized to all channels for better
* performance, but only alpha causes correctness issues.
+ *
+ * Also, force rgb/alpha func/factors match, to make AoS blending easier.
*/
if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W) {
- blend_rt->rgb_src_factor = force_dst_alpha_one(blend_rt->rgb_src_factor, FALSE);
- blend_rt->rgb_dst_factor = force_dst_alpha_one(blend_rt->rgb_dst_factor, FALSE);
- blend_rt->alpha_src_factor = force_dst_alpha_one(blend_rt->alpha_src_factor, TRUE);
- blend_rt->alpha_dst_factor = force_dst_alpha_one(blend_rt->alpha_dst_factor, TRUE);
+ blend_rt->rgb_src_factor = force_dst_alpha_one(blend_rt->rgb_src_factor);
+ blend_rt->rgb_dst_factor = force_dst_alpha_one(blend_rt->rgb_dst_factor);
+ blend_rt->alpha_func = blend_rt->rgb_func;
+ blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
+ blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
}
}
/* This value will be the same for all the variants of a given shader:
*/
- key->nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+ key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
for(i = 0; i < key->nr_samplers; ++i) {
- if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+ if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
lp_sampler_static_state(&key->sampler[i],
lp->fragment_sampler_views[i],
lp->sampler[i]);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index f73c7801c0..7d58c4936c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -34,6 +34,7 @@
#include "pipe/p_state.h"
#include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */
#include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */
+#include "gallivm/lp_bld_tgsi.h" /* for lp_tgsi_info */
#include "lp_bld_interp.h" /* for struct lp_shader_input */
@@ -97,10 +98,12 @@ struct lp_fragment_shader
{
struct pipe_shader_state base;
- struct tgsi_shader_info info;
+ struct lp_tgsi_info info;
struct lp_fs_variant_list_item variants;
+ struct draw_fragment_shader *draw_data;
+
/* For debugging/profiling purposes */
unsigned variant_key_size;
unsigned no;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
index 0bad7320f3..dbd73812e4 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -78,8 +78,9 @@ llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *handle)
lp_setup_set_point_state( llvmpipe->setup,
llvmpipe->rasterizer->point_size,
llvmpipe->rasterizer->point_size_per_vertex,
- llvmpipe->rasterizer->sprite_coord_enable);
- }
+ llvmpipe->rasterizer->sprite_coord_enable,
+ llvmpipe->rasterizer->sprite_coord_mode);
+ }
llvmpipe->dirty |= LP_NEW_RASTERIZER;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 17a4a0ed02..1dd866195d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -246,9 +246,9 @@ llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp,
struct pipe_sampler_view **views)
{
unsigned i;
- uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
- uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
- const void *data[DRAW_MAX_TEXTURE_LEVELS];
+ uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
+ uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
+ const void *data[PIPE_MAX_TEXTURE_LEVELS];
assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
if (!num)
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index ee4991bf8d..a8dee280dd 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -603,7 +603,7 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
assert(sizeof key->inputs[0] == sizeof(ushort));
- key->num_inputs = fs->info.num_inputs;
+ key->num_inputs = fs->info.base.num_inputs;
key->flatshade_first = lp->rasterizer->flatshade_first;
key->pixel_center_half = lp->rasterizer->gl_rasterization_rules;
key->size = Offset(struct lp_setup_variant_key,
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index d0389f0cb0..8b6b5e1298 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -243,19 +243,6 @@ add_blend_test(LLVMModuleRef module,
}
-/** Add and limit result to ceiling of 1.0 */
-#define ADD_SAT(R, A, B) \
-do { \
- R = (A) + (B); if (R > 1.0f) R = 1.0f; \
-} while (0)
-
-/** Subtract and limit result to floor of 0.0 */
-#define SUB_SAT(R, A, B) \
-do { \
- R = (A) - (B); if (R < 0.0f) R = 0.0f; \
-} while (0)
-
-
static void
compute_blend_ref_term(unsigned rgb_factor,
unsigned alpha_factor,
@@ -423,19 +410,19 @@ compute_blend_ref(const struct pipe_blend_state *blend,
*/
switch (blend->rt[0].rgb_func) {
case PIPE_BLEND_ADD:
- ADD_SAT(res[0], src_term[0], dst_term[0]); /* R */
- ADD_SAT(res[1], src_term[1], dst_term[1]); /* G */
- ADD_SAT(res[2], src_term[2], dst_term[2]); /* B */
+ res[0] = src_term[0] + dst_term[0]; /* R */
+ res[1] = src_term[1] + dst_term[1]; /* G */
+ res[2] = src_term[2] + dst_term[2]; /* B */
break;
case PIPE_BLEND_SUBTRACT:
- SUB_SAT(res[0], src_term[0], dst_term[0]); /* R */
- SUB_SAT(res[1], src_term[1], dst_term[1]); /* G */
- SUB_SAT(res[2], src_term[2], dst_term[2]); /* B */
+ res[0] = src_term[0] - dst_term[0]; /* R */
+ res[1] = src_term[1] - dst_term[1]; /* G */
+ res[2] = src_term[2] - dst_term[2]; /* B */
break;
case PIPE_BLEND_REVERSE_SUBTRACT:
- SUB_SAT(res[0], dst_term[0], src_term[0]); /* R */
- SUB_SAT(res[1], dst_term[1], src_term[1]); /* G */
- SUB_SAT(res[2], dst_term[2], src_term[2]); /* B */
+ res[0] = dst_term[0] - src_term[0]; /* R */
+ res[1] = dst_term[1] - src_term[1]; /* G */
+ res[2] = dst_term[2] - src_term[2]; /* B */
break;
case PIPE_BLEND_MIN:
res[0] = MIN2(src_term[0], dst_term[0]); /* R */
@@ -456,13 +443,13 @@ compute_blend_ref(const struct pipe_blend_state *blend,
*/
switch (blend->rt[0].alpha_func) {
case PIPE_BLEND_ADD:
- ADD_SAT(res[3], src_term[3], dst_term[3]); /* A */
+ res[3] = src_term[3] + dst_term[3]; /* A */
break;
case PIPE_BLEND_SUBTRACT:
- SUB_SAT(res[3], src_term[3], dst_term[3]); /* A */
+ res[3] = src_term[3] - dst_term[3]; /* A */
break;
case PIPE_BLEND_REVERSE_SUBTRACT:
- SUB_SAT(res[3], dst_term[3], src_term[3]); /* A */
+ res[3] = dst_term[3] - src_term[3]; /* A */
break;
case PIPE_BLEND_MIN:
res[3] = MIN2(src_term[3], dst_term[3]); /* A */
@@ -676,6 +663,8 @@ test_one(unsigned verbose,
fprintf(stderr, " Ref%c: ", channel);
dump_vec(stderr, type, ref + j*stride);
fprintf(stderr, "\n");
+
+ fprintf(stderr, "\n");
}
}
}
@@ -773,7 +762,7 @@ blend_funcs[] = {
const struct lp_type blend_types[] = {
/* float, fixed, sign, norm, width, len */
- { TRUE, FALSE, FALSE, TRUE, 32, 4 }, /* f32 x 4 */
+ { TRUE, FALSE, TRUE, FALSE, 32, 4 }, /* f32 x 4 */
{ FALSE, FALSE, FALSE, TRUE, 8, 16 }, /* u8n x 16 */
};
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 7bbbc61d4c..7a0d06ae2c 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -205,16 +205,19 @@ random_elem(struct lp_type type, void *dst, unsigned index)
assert(index < type.length);
value = (double)rand()/(double)RAND_MAX;
if(!type.norm) {
- unsigned long long mask;
- if (type.floating)
- mask = ~(unsigned long long)0;
- else if (type.fixed)
- mask = ((unsigned long long)1 << (type.width / 2)) - 1;
- else if (type.sign)
- mask = ((unsigned long long)1 << (type.width - 1)) - 1;
- else
- mask = ((unsigned long long)1 << type.width) - 1;
- value += (double)(mask & rand());
+ if (type.floating) {
+ value *= 2.0;
+ }
+ else {
+ unsigned long long mask;
+ if (type.fixed)
+ mask = ((unsigned long long)1 << (type.width / 2)) - 1;
+ else if (type.sign)
+ mask = ((unsigned long long)1 << (type.width - 1)) - 1;
+ else
+ mask = ((unsigned long long)1 << type.width) - 1;
+ value += (double)(mask & rand());
+ }
}
if(!type.sign)
if(rand() & 1)
@@ -261,12 +264,18 @@ boolean
compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps)
{
unsigned i;
+ eps *= type.floating ? 8.0 : 2.0;
for (i = 0; i < type.length; ++i) {
double res_elem = read_elem(type, res, i);
double ref_elem = read_elem(type, ref, i);
- double delta = fabs(res_elem - ref_elem);
- if(delta >= 2.0*eps)
+ double delta = res_elem - ref_elem;
+ if (ref_elem < -1.0 || ref_elem > 1.0) {
+ delta /= ref_elem;
+ }
+ delta = fabs(delta);
+ if (delta >= eps) {
return FALSE;
+ }
}
return TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c
index 57b0ee5776..0ca2791592 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_round.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_round.c
@@ -75,10 +75,7 @@ add_test(LLVMModuleRef module, const char *name, lp_func_t lp_func)
LLVMValueRef ret;
struct lp_build_context bld;
- bld.builder = builder;
- bld.type.floating = 1;
- bld.type.width = 32;
- bld.type.length = 4;
+ lp_build_context_init(&bld, builder, lp_float32_vec4_type());
LLVMSetFunctionCallConv(func, LLVMCCallConv);
@@ -100,9 +97,10 @@ printv(char* string, v4sf value)
f[0], f[1], f[2], f[3]);
}
-static void
+static boolean
compare(v4sf x, v4sf y)
{
+ boolean success = TRUE;
float *xp = (float *) &x;
float *yp = (float *) &y;
if (xp[0] != yp[0] ||
@@ -110,7 +108,9 @@ compare(v4sf x, v4sf y)
xp[2] != yp[2] ||
xp[3] != yp[3]) {
printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n");
+ success = FALSE;
}
+ return success;
}
@@ -191,7 +191,7 @@ test_round(unsigned verbose, FILE *fp)
y = round_func(x);
printv("C round(x) ", ref);
printv("LLVM round(x)", y);
- compare(ref, y);
+ success = success && compare(ref, y);
refp[0] = trunc(xp[0]);
refp[1] = trunc(xp[1]);
@@ -200,7 +200,7 @@ test_round(unsigned verbose, FILE *fp)
y = trunc_func(x);
printv("C trunc(x) ", ref);
printv("LLVM trunc(x)", y);
- compare(ref, y);
+ success = success && compare(ref, y);
refp[0] = floor(xp[0]);
refp[1] = floor(xp[1]);
@@ -209,7 +209,7 @@ test_round(unsigned verbose, FILE *fp)
y = floor_func(x);
printv("C floor(x) ", ref);
printv("LLVM floor(x)", y);
- compare(ref, y);
+ success = success && compare(ref, y);
refp[0] = ceil(xp[0]);
refp[1] = ceil(xp[1]);
@@ -218,7 +218,7 @@ test_round(unsigned verbose, FILE *fp)
y = ceil_func(x);
printv("C ceil(x) ", ref);
printv("LLVM ceil(x) ", y);
- compare(ref, y);
+ success = success && compare(ref, y);
}
LLVMFreeMachineCodeForFunction(engine, test_round);
@@ -247,11 +247,7 @@ test_round(unsigned verbose, FILE *fp)
boolean
test_all(unsigned verbose, FILE *fp)
{
- boolean success = TRUE;
-
- test_round(verbose, fp);
-
- return success;
+ return test_round(verbose, fp);
}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_sincos.c b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
index 7ab357f162..79939b1a39 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_sincos.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
@@ -72,10 +72,7 @@ add_sincos_test(LLVMModuleRef module, boolean sin)
LLVMValueRef ret;
struct lp_build_context bld;
- bld.builder = builder;
- bld.type.floating = 1;
- bld.type.width = 32;
- bld.type.length = 4;
+ lp_build_context_init(&bld, builder, lp_float32_vec4_type());
LLVMSetFunctionCallConv(func, LLVMCCallConv);
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index 4e026cc8ff..f417fc8a9e 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -151,6 +151,10 @@ LP_LLVM_TEXTURE_MEMBER(last_level, LP_JIT_TEXTURE_LAST_LEVEL, TRUE)
LP_LLVM_TEXTURE_MEMBER(row_stride, LP_JIT_TEXTURE_ROW_STRIDE, FALSE)
LP_LLVM_TEXTURE_MEMBER(img_stride, LP_JIT_TEXTURE_IMG_STRIDE, FALSE)
LP_LLVM_TEXTURE_MEMBER(data_ptr, LP_JIT_TEXTURE_DATA, FALSE)
+LP_LLVM_TEXTURE_MEMBER(min_lod, LP_JIT_TEXTURE_MIN_LOD, TRUE)
+LP_LLVM_TEXTURE_MEMBER(max_lod, LP_JIT_TEXTURE_MAX_LOD, TRUE)
+LP_LLVM_TEXTURE_MEMBER(lod_bias, LP_JIT_TEXTURE_LOD_BIAS, TRUE)
+LP_LLVM_TEXTURE_MEMBER(border_color, LP_JIT_TEXTURE_BORDER_COLOR, FALSE)
static void
@@ -217,6 +221,11 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
sampler->dynamic_state.base.row_stride = lp_llvm_texture_row_stride;
sampler->dynamic_state.base.img_stride = lp_llvm_texture_img_stride;
sampler->dynamic_state.base.data_ptr = lp_llvm_texture_data_ptr;
+ sampler->dynamic_state.base.min_lod = lp_llvm_texture_min_lod;
+ sampler->dynamic_state.base.max_lod = lp_llvm_texture_max_lod;
+ sampler->dynamic_state.base.lod_bias = lp_llvm_texture_lod_bias;
+ sampler->dynamic_state.base.border_color = lp_llvm_texture_border_color;
+
sampler->dynamic_state.static_state = static_state;
sampler->dynamic_state.context_ptr = context_ptr;
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index 2ba39052ab..e49f9c62fe 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -289,172 +289,141 @@ def generate_format_write(format, src_channel, src_native_type, src_suffix):
print
-def generate_ssse3():
+def generate_sse2():
print '''
#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
-static void
-lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
- const uint8_t *src, unsigned src_stride,
- unsigned x0, unsigned y0)
+static ALWAYS_INLINE void
+swz4( const __m128i * restrict x,
+ const __m128i * restrict y,
+ const __m128i * restrict z,
+ const __m128i * restrict w,
+ __m128i * restrict a,
+ __m128i * restrict b,
+ __m128i * restrict c,
+ __m128i * restrict d)
{
+ __m128i i, j, k, l;
+ __m128i m, n, o, p;
+ __m128i e, f, g, h;
+
+ m = _mm_unpacklo_epi8(*x,*y);
+ n = _mm_unpackhi_epi8(*x,*y);
+ o = _mm_unpacklo_epi8(*z,*w);
+ p = _mm_unpackhi_epi8(*z,*w);
+
+ i = _mm_unpacklo_epi16(m,n);
+ j = _mm_unpackhi_epi16(m,n);
+ k = _mm_unpacklo_epi16(o,p);
+ l = _mm_unpackhi_epi16(o,p);
+
+ e = _mm_unpacklo_epi8(i,j);
+ f = _mm_unpackhi_epi8(i,j);
+ g = _mm_unpacklo_epi8(k,l);
+ h = _mm_unpackhi_epi8(k,l);
+
+ *a = _mm_unpacklo_epi64(e,g);
+ *b = _mm_unpackhi_epi64(e,g);
+ *c = _mm_unpacklo_epi64(f,h);
+ *d = _mm_unpackhi_epi64(f,h);
+}
+
+static ALWAYS_INLINE void
+unswz4( const __m128i * restrict a,
+ const __m128i * restrict b,
+ const __m128i * restrict c,
+ const __m128i * restrict d,
+ __m128i * restrict x,
+ __m128i * restrict y,
+ __m128i * restrict z,
+ __m128i * restrict w)
+{
+ __m128i i, j, k, l;
+ __m128i m, n, o, p;
+
+ i = _mm_unpacklo_epi8(*a,*b);
+ j = _mm_unpackhi_epi8(*a,*b);
+ k = _mm_unpacklo_epi8(*c,*d);
+ l = _mm_unpackhi_epi8(*c,*d);
+
+ m = _mm_unpacklo_epi16(i,k);
+ n = _mm_unpackhi_epi16(i,k);
+ o = _mm_unpacklo_epi16(j,l);
+ p = _mm_unpackhi_epi16(j,l);
+
+ *x = _mm_unpacklo_epi64(m,n);
+ *y = _mm_unpackhi_epi64(m,n);
+ *z = _mm_unpacklo_epi64(o,p);
+ *w = _mm_unpackhi_epi64(o,p);
+}
+static void
+lp_tile_b8g8r8a8_unorm_swizzle_4ub_sse2(uint8_t * restrict dst,
+ const uint8_t * restrict src, unsigned src_stride,
+ unsigned x0, unsigned y0)
+{
+ __m128i *dst128 = (__m128i *) dst;
unsigned x, y;
- __m128i *pdst = (__m128i*) dst;
- const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t);
- unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH;
- unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT;
-
- const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
- const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
- const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
- const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-
- const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
- const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
- const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
- const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-
- const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff);
- const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff);
- const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff);
- const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff);
-
- const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e);
- const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d);
- const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c);
- const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f);
-
- for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
- __m128i line0 = *(__m128i*)ysrc0;
- const uint8_t *ysrc = ysrc0 + src_stride;
- ysrc0 += tile_stridey;
-
- for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
- __m128i r, g, b, a, line1;
- line1 = *(__m128i*)ysrc;
- PIPE_READ_WRITE_BARRIER();
- ysrc += src_stride;
- r = _mm_shuffle_epi8(line0, shuffle00);
- g = _mm_shuffle_epi8(line0, shuffle01);
- b = _mm_shuffle_epi8(line0, shuffle02);
- a = _mm_shuffle_epi8(line0, shuffle03);
-
- line0 = *(__m128i*)ysrc;
- PIPE_READ_WRITE_BARRIER();
- ysrc += src_stride;
- r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10));
- g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11));
- b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12));
- a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13));
-
- line1 = *(__m128i*)ysrc;
- PIPE_READ_WRITE_BARRIER();
- ysrc -= tile_stridex;
- r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20));
- g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21));
- b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22));
- a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23));
-
- if (x + 1 < TILE_SIZE) {
- line0 = *(__m128i*)ysrc;
- ysrc += src_stride;
- }
-
- PIPE_READ_WRITE_BARRIER();
- r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30));
- g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31));
- b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32));
- a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33));
-
- *pdst++ = r;
- *pdst++ = g;
- *pdst++ = b;
- *pdst++ = a;
+
+ src += y0 * src_stride;
+ src += x0 * sizeof(uint32_t);
+
+ for (y = 0; y < TILE_SIZE; y += 4) {
+ const uint8_t *src_row = src;
+
+ for (x = 0; x < TILE_SIZE; x += 4) {
+ swz4((const __m128i *) (src_row + 0 * src_stride),
+ (const __m128i *) (src_row + 1 * src_stride),
+ (const __m128i *) (src_row + 2 * src_stride),
+ (const __m128i *) (src_row + 3 * src_stride),
+ dst128 + 2, /* b */
+ dst128 + 1, /* g */
+ dst128 + 0, /* r */
+ dst128 + 3); /* a */
+
+ dst128 += 4;
+ src_row += sizeof(__m128i);
}
- }
+ src += 4 * src_stride;
+ }
}
static void
-lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
- uint8_t *dst, unsigned dst_stride,
+lp_tile_b8g8r8a8_unorm_unswizzle_4ub_sse2(const uint8_t * restrict src,
+ uint8_t * restrict dst, unsigned dst_stride,
unsigned x0, unsigned y0)
{
unsigned int x, y;
- const __m128i *psrc = (__m128i*) src;
- const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t));
- uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t);
- __m128i c0 = *psrc++;
- __m128i c1;
-
- const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff);
- const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff);
- const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff);
- const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff);
-
- const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff);
- const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff);
- const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff);
- const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff);
-
- const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff);
- const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff);
- const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff);
- const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff);
-
- const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05);
- const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07);
- const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d);
- const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f);
-
- for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
- __m128i *tile = (__m128i*) pdst;
- pdst += dst_stride * TILE_VECTOR_HEIGHT;
- for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
- uint8_t *linep = (uint8_t*) (tile++);
- __m128i line0, line1, line2, line3;
-
- c1 = *psrc++; /* r */
- PIPE_READ_WRITE_BARRIER();
- line0 = _mm_shuffle_epi8(c0, shuffle00);
- line1 = _mm_shuffle_epi8(c0, shuffle01);
- line2 = _mm_shuffle_epi8(c0, shuffle02);
- line3 = _mm_shuffle_epi8(c0, shuffle03);
-
- c0 = *psrc++; /* g */
- PIPE_READ_WRITE_BARRIER();
- line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10));
- line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11));
- line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12));
- line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13));
-
- c1 = *psrc++; /* b */
- PIPE_READ_WRITE_BARRIER();
- line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20));
- line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21));
- line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22));
- line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23));
-
- if (psrc != end)
- c0 = *psrc++; /* a */
- PIPE_READ_WRITE_BARRIER();
- line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30));
- line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31));
- line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32));
- line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33));
-
- *(__m128i*) (linep) = line0;
- *(__m128i*) (((char*)linep) + dst_stride) = line1;
- *(__m128i*) (((char*)linep) + 2 * dst_stride) = line2;
- *(__m128i*) (((char*)linep) + 3 * dst_stride) = line3;
+ const __m128i *src128 = (const __m128i *) src;
+
+ dst += y0 * dst_stride;
+ dst += x0 * sizeof(uint32_t);
+
+ for (y = 0; y < TILE_SIZE; y += 4) {
+ const uint8_t *dst_row = dst;
+
+ for (x = 0; x < TILE_SIZE; x += 4) {
+ unswz4( &src128[2], /* b */
+ &src128[1], /* g */
+ &src128[0], /* r */
+ &src128[3], /* a */
+ (__m128i *) (dst_row + 0 * dst_stride),
+ (__m128i *) (dst_row + 1 * dst_stride),
+ (__m128i *) (dst_row + 2 * dst_stride),
+ (__m128i *) (dst_row + 3 * dst_stride));
+
+ src128 += 4;
+ dst_row += sizeof(__m128i);;
}
+
+ dst += 4 * dst_stride;
}
}
-#endif /* PIPE_ARCH_SSSE3 */
+#endif /* PIPE_ARCH_SSE */
'''
@@ -479,7 +448,7 @@ def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix):
func_name = 'lp_tile_%s_swizzle_%s' % (format.short_name(), dst_suffix)
if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
print '#ifdef PIPE_ARCH_SSE'
- print ' func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
+ print ' func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name)
print '#else'
print ' func = %s;' % (func_name,)
print '#endif'
@@ -517,7 +486,7 @@ def generate_unswizzle(formats, src_channel, src_native_type, src_suffix):
func_name = 'lp_tile_%s_unswizzle_%s' % (format.short_name(), src_suffix)
if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
print '#ifdef PIPE_ARCH_SSE'
- print ' func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
+ print ' func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name)
print '#else'
print ' func = %s;' % (func_name,)
print '#endif'
@@ -577,7 +546,7 @@ def main():
print '};'
print
- generate_ssse3()
+ generate_sse2()
channel = Channel(UNSIGNED, True, 8)
native_type = 'uint8_t'