From bc775066aa9bbbf0cf5f1d5ced81f81e4cb0b731 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 29 Jun 2008 17:20:52 +0200 Subject: r300: Fix wrap mode for 1D textures --- src/mesa/drivers/dri/r300/r300_reg.h | 4 +- src/mesa/drivers/dri/r300/r300_state.c | 8 +-- src/mesa/drivers/dri/r300/r300_tex.c | 127 ++++++++------------------------- 3 files changed, 36 insertions(+), 103 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index 8b00f9958c..58a19554c7 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -1366,8 +1366,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_TX_WRAP_S_MASK (7 << 0) # define R300_TX_WRAP_T_SHIFT 3 # define R300_TX_WRAP_T_MASK (7 << 3) -# define R300_TX_WRAP_Q_SHIFT 6 -# define R300_TX_WRAP_Q_MASK (7 << 6) +# define R300_TX_WRAP_R_SHIFT 6 +# define R300_TX_WRAP_R_MASK (7 << 6) # define R300_TX_MAG_FILTER_4 (0 << 9) # define R300_TX_MAG_FILTER_NEAREST (1 << 9) # define R300_TX_MAG_FILTER_LINEAR (2 << 9) diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index b1284647da..1d4472d59e 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -1256,8 +1256,8 @@ static unsigned long gen_fixed_filter(unsigned long f) (R300_TX_CLAMP << R300_TX_WRAP_T_SHIFT)) { needs_fixing |= 2; } - if ((f & ((7 - 1) << R300_TX_WRAP_Q_SHIFT)) == - (R300_TX_CLAMP << R300_TX_WRAP_Q_SHIFT)) { + if ((f & ((7 - 1) << R300_TX_WRAP_R_SHIFT)) == + (R300_TX_CLAMP << R300_TX_WRAP_R_SHIFT)) { needs_fixing |= 4; } @@ -1297,8 +1297,8 @@ static unsigned long gen_fixed_filter(unsigned long f) f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_T_SHIFT; } if (needs_fixing & 4) { - f &= ~((7 - 1) << R300_TX_WRAP_Q_SHIFT); - f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_Q_SHIFT; + f &= ~((7 - 1) << R300_TX_WRAP_R_SHIFT); + f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_R_SHIFT; } return f; } diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c index 5f54bcad9a..505951329f 100644 --- a/src/mesa/drivers/dri/r300/r300_tex.c +++ b/src/mesa/drivers/dri/r300/r300_tex.c @@ -52,112 +52,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "xmlpool.h" + +static unsigned int translate_wrap_mode(GLenum wrapmode) +{ + switch(wrapmode) { + case GL_REPEAT: return R300_TX_REPEAT; + case GL_CLAMP: return R300_TX_CLAMP; + case GL_CLAMP_TO_EDGE: return R300_TX_CLAMP_TO_EDGE; + case GL_CLAMP_TO_BORDER: return R300_TX_CLAMP_TO_BORDER; + case GL_MIRRORED_REPEAT: return R300_TX_REPEAT | R300_TX_MIRRORED; + case GL_MIRROR_CLAMP_EXT: return R300_TX_CLAMP | R300_TX_MIRRORED; + case GL_MIRROR_CLAMP_TO_EDGE_EXT: return R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED; + case GL_MIRROR_CLAMP_TO_BORDER_EXT: return R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED; + default: + _mesa_problem(NULL, "bad wrap mode in %s", __FUNCTION__); + return 0; + } +} + + /** - * Set the texture wrap modes. + * Update the cached hardware registers based on the current texture wrap modes. * * \param t Texture object whose wrap modes are to be set - * \param swrap Wrap mode for the \a s texture coordinate - * \param twrap Wrap mode for the \a t texture coordinate */ - -static void r300SetTexWrap(r300TexObjPtr t, GLenum swrap, GLenum twrap, - GLenum rwrap) +static void r300UpdateTexWrap(r300TexObjPtr t) { - unsigned long hw_swrap = 0, hw_twrap = 0, hw_qwrap = 0; + struct gl_texture_object *tObj = t->base.tObj; t->filter &= - ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_Q_MASK); + ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_R_MASK); - switch (swrap) { - case GL_REPEAT: - hw_swrap |= R300_TX_REPEAT; - break; - case GL_CLAMP: - hw_swrap |= R300_TX_CLAMP; - break; - case GL_CLAMP_TO_EDGE: - hw_swrap |= R300_TX_CLAMP_TO_EDGE; - break; - case GL_CLAMP_TO_BORDER: - hw_swrap |= R300_TX_CLAMP_TO_BORDER; - break; - case GL_MIRRORED_REPEAT: - hw_swrap |= R300_TX_REPEAT | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_EXT: - hw_swrap |= R300_TX_CLAMP | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_TO_EDGE_EXT: - hw_swrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_TO_BORDER_EXT: - hw_swrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED; - break; - default: - _mesa_problem(NULL, "bad S wrap mode in %s", __FUNCTION__); - } + t->filter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT; - switch (twrap) { - case GL_REPEAT: - hw_twrap |= R300_TX_REPEAT; - break; - case GL_CLAMP: - hw_twrap |= R300_TX_CLAMP; - break; - case GL_CLAMP_TO_EDGE: - hw_twrap |= R300_TX_CLAMP_TO_EDGE; - break; - case GL_CLAMP_TO_BORDER: - hw_twrap |= R300_TX_CLAMP_TO_BORDER; - break; - case GL_MIRRORED_REPEAT: - hw_twrap |= R300_TX_REPEAT | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_EXT: - hw_twrap |= R300_TX_CLAMP | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_TO_EDGE_EXT: - hw_twrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_TO_BORDER_EXT: - hw_twrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED; - break; - default: - _mesa_problem(NULL, "bad T wrap mode in %s", __FUNCTION__); - } + if (tObj->Target != GL_TEXTURE_1D) { + t->filter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT; - switch (rwrap) { - case GL_REPEAT: - hw_qwrap |= R300_TX_REPEAT; - break; - case GL_CLAMP: - hw_qwrap |= R300_TX_CLAMP; - break; - case GL_CLAMP_TO_EDGE: - hw_qwrap |= R300_TX_CLAMP_TO_EDGE; - break; - case GL_CLAMP_TO_BORDER: - hw_qwrap |= R300_TX_CLAMP_TO_BORDER; - break; - case GL_MIRRORED_REPEAT: - hw_qwrap |= R300_TX_REPEAT | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_EXT: - hw_qwrap |= R300_TX_CLAMP | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_TO_EDGE_EXT: - hw_qwrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED; - break; - case GL_MIRROR_CLAMP_TO_BORDER_EXT: - hw_qwrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED; - break; - default: - _mesa_problem(NULL, "bad R wrap mode in %s", __FUNCTION__); + if (tObj->Target == GL_TEXTURE_3D) + t->filter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT; } - - t->filter |= hw_swrap << R300_TX_WRAP_S_SHIFT; - t->filter |= hw_twrap << R300_TX_WRAP_T_SHIFT; - t->filter |= hw_qwrap << R300_TX_WRAP_Q_SHIFT; } static GLuint aniso_filter(GLfloat anisotropy) @@ -281,7 +214,7 @@ static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj) make_empty_list(&t->base); - r300SetTexWrap(t, texObj->WrapS, texObj->WrapT, texObj->WrapR); + r300UpdateTexWrap(t); r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy); r300SetTexBorderColor(t, texObj->_BorderChan); } @@ -1071,7 +1004,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target, case GL_TEXTURE_WRAP_S: case GL_TEXTURE_WRAP_T: case GL_TEXTURE_WRAP_R: - r300SetTexWrap(t, texObj->WrapS, texObj->WrapT, texObj->WrapR); + r300UpdateTexWrap(t); break; case GL_TEXTURE_BORDER_COLOR: -- cgit v1.2.3 From a74d22ba715da5e52efb15aebd15a74851f87d43 Mon Sep 17 00:00:00 2001 From: Corbin Simpson Date: Sun, 29 Jun 2008 10:30:47 -0700 Subject: r300: Change LOD bias emission to more closely follow per-tex rules. Okay, this time it's for real, and for good. This should be a perma-fix. --- src/mesa/drivers/dri/r300/r300_state.c | 6 ++++-- src/mesa/drivers/dri/r300/r300_tex.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index 1d4472d59e..00351014af 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -1456,8 +1456,10 @@ static void r300SetupTextures(GLcontext * ctx) r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 + hw_tmu] = gen_fixed_filter(t->filter) | (hw_tmu << 28); - r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = t->filter_1 - | r300CalculateTexLodBias(r300->LODBias); + /* Make LOD bias a bit more per-tex and less per-everything. */ + t->filter_1 &= ~R300_LOD_BIAS_MASK; + t->filter_1 |= r300CalculateTexLodBias(ctx->Texture.Unit[i].LodBias); + r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = t->filter_1; r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] = t->size; r300->hw.tex.format.cmd[R300_TEX_VALUE_0 + diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c index 505951329f..4ed71777d3 100644 --- a/src/mesa/drivers/dri/r300/r300_tex.c +++ b/src/mesa/drivers/dri/r300/r300_tex.c @@ -955,7 +955,6 @@ static void r300TexEnv(GLcontext * ctx, GLenum target, */ switch (pname) { case GL_TEXTURE_LOD_BIAS_EXT: { - /* Needs to be relocated in order to make sure we got the right tmu */ GLfloat bias, min; /* The R300's LOD bias is a signed 2's complement value with a @@ -968,7 +967,14 @@ static void r300TexEnv(GLcontext * ctx, GLenum target, "no_neg_lod_bias") ? 0.0 : -16.0; bias = CLAMP(bias, min, 16.0); - rmesa->LODBias = bias; + /* There's probably a magic Mesa method for finding the REAL + * texture unit. I don't know it, though. */ + if (!(ctx->Texture._EnabledUnits & (1 << ctx->Texture.CurrentUnit))) { + break; + } + + /* Save our newly clamped LOD bias. */ + ctx->Texture.Unit[ctx->Texture.CurrentUnit].LodBias = bias; break; } -- cgit v1.2.3 From 4002b75e6267ecd0f9e3093e221e34ed5c8485d4 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 30 Jun 2008 00:44:26 +0200 Subject: r300: Cleanup LodBias support . There is both a per-texture unit and a per-texture object (at least for OpenGL 1.4); this should now be supported properly. . The LOD bias calculation in r300_state has been simplified and corrected (need to multiply by 32 instead of 31, and ensure clamping) . do not clamp LOD bias in TexEnv, as that behaviour conflicts with what the spec says . set Const.MaxTextureLodBias properly . remove the no_neg_lod_bias property; if somebody can explain what it's good for, we can add it back in, but according to Google, nobody seems to use it . removed some dead code and unused variables --- src/mesa/drivers/dri/r300/r300_context.c | 1 + src/mesa/drivers/dri/r300/r300_context.h | 5 +- src/mesa/drivers/dri/r300/r300_state.c | 28 ++++----- src/mesa/drivers/dri/r300/r300_tex.c | 91 ----------------------------- src/mesa/drivers/dri/radeon/radeon_screen.c | 15 +++-- 5 files changed, 25 insertions(+), 115 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c index 063d4e575e..44c368030f 100644 --- a/src/mesa/drivers/dri/r300/r300_context.c +++ b/src/mesa/drivers/dri/r300/r300_context.c @@ -279,6 +279,7 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual, MIN2(ctx->Const.MaxTextureImageUnits, ctx->Const.MaxTextureCoordUnits); ctx->Const.MaxTextureMaxAnisotropy = 16.0; + ctx->Const.MaxTextureLodBias = 16.0; if (screen->chip_family >= CHIP_FAMILY_RV515) { ctx->Const.MaxTextureLevels = 13; diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index 285b2ad6fd..a24ab0cad7 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -806,10 +806,10 @@ struct r500_fragment_program { GLcontext *ctx; GLboolean translated; GLboolean error; - + struct r500_fragment_program_external_state state; struct r500_fragment_program_code code; - + GLboolean writes_depth; GLuint optimization; @@ -925,7 +925,6 @@ struct r300_context { driTextureObject swapped; int texture_depth; float initialMaxAnisotropy; - float LODBias; /* Clientdata textures; */ diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index 00351014af..9e4cc777a9 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -1381,16 +1381,14 @@ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings) } } -static GLuint r300CalculateTexLodBias(GLfloat bias) +static GLuint translate_lod_bias(GLfloat bias) { - GLuint b; - b = (unsigned int)fabsf(ceilf(bias*31)); - if (signbit(bias)) { - b ^= 0x3ff; /* 10 bits */ - } - b <<= 3; - b &= R300_LOD_BIAS_MASK; - return b; + GLint b = (int)(bias*32); + if (b >= (1 << 9)) + b = (1 << 9)-1; + else if (b < -(1 << 9)) + b = -(1 << 9); + return ((GLuint)b) << R300_LOD_BIAS_SHIFT; } static void r300SetupTextures(GLcontext * ctx) @@ -1456,10 +1454,14 @@ static void r300SetupTextures(GLcontext * ctx) r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 + hw_tmu] = gen_fixed_filter(t->filter) | (hw_tmu << 28); - /* Make LOD bias a bit more per-tex and less per-everything. */ - t->filter_1 &= ~R300_LOD_BIAS_MASK; - t->filter_1 |= r300CalculateTexLodBias(ctx->Texture.Unit[i].LodBias); - r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = t->filter_1; + /* Note: There is a LOD bias per texture unit and a LOD bias + * per texture object. We add them here to get the correct behaviour. + * (The per-texture object LOD bias was introduced in OpenGL 1.4 + * and is not present in the EXT_texture_object extension). + */ + r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = + t->filter_1 | + translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias); r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] = t->size; r300->hw.tex.format.cmd[R300_TEX_VALUE_0 + diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c index 4ed71777d3..c8f02c4ef5 100644 --- a/src/mesa/drivers/dri/r300/r300_tex.c +++ b/src/mesa/drivers/dri/r300/r300_tex.c @@ -175,20 +175,6 @@ static void r300SetTexBorderColor(r300TexObjPtr t, GLubyte c[4]) t->pp_border_color = PACK_COLOR_8888(c[3], c[0], c[1], c[2]); } -static void r300SetTexLodBias(r300TexObjPtr t, GLfloat bias) -{ - GLuint b; - b = (unsigned int)fabsf(ceilf(bias*31)); - if (signbit(bias)) { - b ^= 0x3ff; /* 10 bits */ - } - b <<= 3; - b &= R300_LOD_BIAS_MASK; - - t->filter_1 &= ~R300_LOD_BIAS_MASK; - t->filter_1 |= b; -} - /** * Allocate space for and load the mesa images into the texture memory block. * This will happen before drawing with a new texture, or drawing with a @@ -912,78 +898,6 @@ r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level, t->dirty_images[0] |= (1 << level); } -/* This feels like a prime target for code reuse, so I'm putting it here - * instead of inlining it in TexEnv. */ -static GLenum r300TexUnitTarget(struct gl_texture_unit *unit) { - if (unit->_ReallyEnabled & (TEXTURE_RECT_BIT)) { - return GL_TEXTURE_RECTANGLE_NV; - } else if (unit->_ReallyEnabled & (TEXTURE_1D_BIT)) { - return GL_TEXTURE_1D; - } else if (unit->_ReallyEnabled & (TEXTURE_2D_BIT)) { - return GL_TEXTURE_2D; - } else if (unit->_ReallyEnabled & (TEXTURE_3D_BIT)) { - return GL_TEXTURE_3D; - } else if (unit->_ReallyEnabled & (TEXTURE_CUBE_BIT)) { - return GL_TEXTURE_CUBE_MAP; - } - if (unit->Enabled & (TEXTURE_RECT_BIT)) { - return GL_TEXTURE_RECTANGLE_NV; - } else if (unit->Enabled & (TEXTURE_1D_BIT)) { - return GL_TEXTURE_1D; - } else if (unit->Enabled & (TEXTURE_2D_BIT)) { - return GL_TEXTURE_2D; - } else if (unit->Enabled & (TEXTURE_3D_BIT)) { - return GL_TEXTURE_3D; - } else if (unit->Enabled & (TEXTURE_CUBE_BIT)) { - return GL_TEXTURE_CUBE_MAP; - } - return 0; -} - -static void r300TexEnv(GLcontext * ctx, GLenum target, - GLenum pname, const GLfloat * param) -{ - r300ContextPtr rmesa = R300_CONTEXT(ctx); - if (RADEON_DEBUG & DEBUG_STATE) { - fprintf(stderr, "%s( %s )\n", - __FUNCTION__, _mesa_lookup_enum_by_nr(pname)); - } - - /* This is incorrect: Need to maintain this data for each of - * GL_TEXTURE_{123}D, GL_TEXTURE_RECTANGLE_NV, etc, and switch - * between them according to _ReallyEnabled. - */ - switch (pname) { - case GL_TEXTURE_LOD_BIAS_EXT: { - GLfloat bias, min; - - /* The R300's LOD bias is a signed 2's complement value with a - * range of -16.0 <= bias < 16.0. - * - * NOTE: Add a small bias to the bias for conform mipsel.c test. - */ - bias = *param + .01; - min = driQueryOptionb(&rmesa->radeon.optionCache, - "no_neg_lod_bias") ? 0.0 : -16.0; - bias = CLAMP(bias, min, 16.0); - - /* There's probably a magic Mesa method for finding the REAL - * texture unit. I don't know it, though. */ - if (!(ctx->Texture._EnabledUnits & (1 << ctx->Texture.CurrentUnit))) { - break; - } - - /* Save our newly clamped LOD bias. */ - ctx->Texture.Unit[ctx->Texture.CurrentUnit].LodBias = bias; - - break; - } - - default: - return; - } -} - /** * Changes variables and flags for a state update, which will happen at the * next UpdateTextureState @@ -1106,10 +1020,6 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx, return NULL; obj->MaxAnisotropy = rmesa->initialMaxAnisotropy; - /* Attempt to fill LOD bias, if previously set. - * Should start at 0.0, which won't affect the HW. */ - obj->LodBias = rmesa->LODBias; - r300AllocTexObj(obj); return obj; } @@ -1131,7 +1041,6 @@ void r300InitTextureFuncs(struct dd_function_table *functions) functions->DeleteTexture = r300DeleteTexture; functions->IsTextureResident = driIsTextureResident; - functions->TexEnv = r300TexEnv; functions->TexParameter = r300TexParameter; functions->CompressedTexImage2D = r300CompressedTexImage2D; diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c index b647cffa0e..02152499c4 100644 --- a/src/mesa/drivers/dri/radeon/radeon_screen.c +++ b/src/mesa/drivers/dri/radeon/radeon_screen.c @@ -194,8 +194,7 @@ DRI_CONF_BEGIN DRI_CONF_SECTION_QUALITY DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB) DRI_CONF_DEF_MAX_ANISOTROPY(1.0, "1.0,2.0,4.0,8.0,16.0") - DRI_CONF_NO_NEG_LOD_BIAS(false) - DRI_CONF_FORCE_S3TC_ENABLE(false) + DRI_CONF_FORCE_S3TC_ENABLE(false) DRI_CONF_DISABLE_S3TC(false) DRI_CONF_COLOR_REDUCTION(DRI_CONF_COLOR_REDUCTION_DITHER) DRI_CONF_ROUND_MODE(DRI_CONF_ROUND_TRUNC) @@ -206,7 +205,7 @@ DRI_CONF_BEGIN DRI_CONF_NO_RAST(false) DRI_CONF_SECTION_END DRI_CONF_END; -static const GLuint __driNConfigOptions = 18; +static const GLuint __driNConfigOptions = 17; #ifndef RADEON_DEBUG int RADEON_DEBUG = 0; @@ -244,10 +243,10 @@ radeonGetParam(int fd, int param, void *value) { int ret; drm_radeon_getparam_t gp; - + gp.param = param; gp.value = value; - + ret = drmCommandWriteRead( fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp)); return ret; } @@ -280,7 +279,7 @@ radeonFillInModes( __DRIscreenPrivate *psp, depth_bits_array[0] = depth_bits; depth_bits_array[1] = depth_bits; - + /* Just like with the accumulation buffer, always provide some modes * with a stencil buffer. It will be a sw fallback, but some apps won't * care about that. @@ -384,7 +383,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv ) int ret; ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BUFFER_OFFSET, &screen->gart_buffer_offset); - + if (ret) { FREE( screen ); fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BUFFER_OFFSET): %d\n", ret); @@ -1133,7 +1132,7 @@ static void radeonDestroyContext(__DRIcontextPrivate * driContextPriv) /** * This is the driver specific part of the createNewScreen entry point. - * + * * \todo maybe fold this into intelInitDriver * * \return the __GLcontextModes supported by this driver -- cgit v1.2.3 From 23e9b43ce4a6ad5875f69363dec7d2baa8afa2ea Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 30 Jun 2008 08:37:13 +0200 Subject: r300: Fix dumb mistake in LOD bias translation --- src/mesa/drivers/dri/r300/r300_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index 9e4cc777a9..f6f0c39066 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -1388,7 +1388,7 @@ static GLuint translate_lod_bias(GLfloat bias) b = (1 << 9)-1; else if (b < -(1 << 9)) b = -(1 << 9); - return ((GLuint)b) << R300_LOD_BIAS_SHIFT; + return (((GLuint)b) << R300_LOD_BIAS_SHIFT) & R300_LOD_BIAS_MASK; } static void r300SetupTextures(GLcontext * ctx) -- cgit v1.2.3 From bb1744970d74432692f2e109fca1dc31593605af Mon Sep 17 00:00:00 2001 From: Corbin Simpson Date: Mon, 30 Jun 2008 11:12:51 -0700 Subject: r3xx/r5xx: Enable ARB_point_parameters. This isn't complete yet. It does cover the two most common usage cases, though, and at least the third one (POINT_DISTANCE_ATTENUATION) is possible, so I'll do that later. --- src/mesa/drivers/dri/r300/r300_context.c | 2 ++ src/mesa/drivers/dri/r300/r300_state.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c index 44c368030f..a9c581b236 100644 --- a/src/mesa/drivers/dri/r300/r300_context.c +++ b/src/mesa/drivers/dri/r300/r300_context.c @@ -79,6 +79,7 @@ int hw_tcl_on = 1; #define need_GL_EXT_stencil_two_side #define need_GL_ARB_multisample +#define need_GL_ARB_point_parameters #define need_GL_ARB_texture_compression #define need_GL_ARB_vertex_buffer_object #define need_GL_ARB_vertex_program @@ -98,6 +99,7 @@ const struct dri_extension card_extensions[] = { {"GL_ARB_fragment_program", NULL}, {"GL_ARB_multisample", GL_ARB_multisample_functions}, {"GL_ARB_multitexture", NULL}, + {"GL_ARB_point_parameters", GL_ARB_point_parameters_functions}, {"GL_ARB_shadow", NULL}, {"GL_ARB_texture_border_clamp", NULL}, {"GL_ARB_texture_compression", GL_ARB_texture_compression_functions}, diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index f6f0c39066..592ee9ccc1 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -827,6 +827,31 @@ static void r300PointSize(GLcontext * ctx, GLfloat size) ((int)(size * 6) << R300_POINTSIZE_Y_SHIFT); } +static void r300PointParameter(GLcontext * ctx, GLenum pname, const GLfloat * param) +{ + r300ContextPtr r300 = R300_CONTEXT(ctx); + + switch (pname) { + case GL_POINT_SIZE_MIN: + R300_STATECHANGE(r300, ga_point_minmax); + r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MIN_MASK; + r300->hw.ga_point_minmax.cmd[1] |= (GLuint)(ctx->Point.MinSize * 16.0); + break; + case GL_POINT_SIZE_MAX: + R300_STATECHANGE(r300, ga_point_minmax); + r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MAX_MASK; + r300->hw.ga_point_minmax.cmd[1] |= (GLuint)(ctx->Point.MaxSize * 16.0) + << R300_GA_POINT_MINMAX_MAX_SHIFT; + break; + case GL_POINT_DISTANCE_ATTENUATION: + break; + case GL_POINT_FADE_THRESHOLD_SIZE: + break; + default: + break; + } +} + /* ============================================================= * Line state */ @@ -2706,6 +2731,9 @@ void r300InitStateFuncs(struct dd_function_table *functions) functions->FrontFace = r300FrontFace; functions->ShadeModel = r300ShadeModel; + /* ARB_point_parameters */ + functions->PointParameterfv = r300PointParameter; + /* Stencil related */ functions->StencilFuncSeparate = r300StencilFuncSeparate; functions->StencilMaskSeparate = r300StencilMaskSeparate; -- cgit v1.2.3 From 5ef4e4ffb8053db87f52df3c9b2ddb71d9c7d6e5 Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Wed, 2 Jul 2008 20:20:33 +0200 Subject: mesa: fix issues around multisample enable multisample enable is enabled by default, however gl mandates multisample rendering rules only apply if there's also a multisampled buffer. --- src/mesa/drivers/dri/r300/r300_render.c | 2 +- src/mesa/main/buffers.c | 2 +- src/mesa/main/mtypes.h | 1 + src/mesa/main/state.c | 17 +++++++++++++++++ src/mesa/swrast/s_points.c | 2 +- 5 files changed, 21 insertions(+), 3 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c index 8f74f9d785..69ff6d573e 100644 --- a/src/mesa/drivers/dri/r300/r300_render.c +++ b/src/mesa/drivers/dri/r300/r300_render.c @@ -373,7 +373,7 @@ static int r300Fallback(GLcontext * ctx) if (!r300->disable_lowimpact_fallback) { FALLBACK_IF(ctx->Polygon.StippleFlag); - FALLBACK_IF(ctx->Multisample.Enabled); + FALLBACK_IF(ctx->Multisample._Enabled); FALLBACK_IF(ctx->Line.StippleFlag); FALLBACK_IF(ctx->Line.SmoothFlag); FALLBACK_IF(ctx->Point.SmoothFlag); diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c index b08095465d..5ab969e0eb 100644 --- a/src/mesa/main/buffers.c +++ b/src/mesa/main/buffers.c @@ -829,7 +829,7 @@ _mesa_init_scissor(GLcontext *ctx) void _mesa_init_multisample(GLcontext *ctx) { - ctx->Multisample.Enabled = GL_FALSE; + ctx->Multisample.Enabled = GL_TRUE; ctx->Multisample.SampleAlphaToCoverage = GL_FALSE; ctx->Multisample.SampleAlphaToOne = GL_FALSE; ctx->Multisample.SampleCoverage = GL_FALSE; diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 04da767ec9..9339146e25 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -958,6 +958,7 @@ struct gl_list_extensions struct gl_multisample_attrib { GLboolean Enabled; + GLboolean _Enabled; /**< true if Enabled and multisample buffer */ GLboolean SampleAlphaToCoverage; GLboolean SampleAlphaToOne; GLboolean SampleCoverage; diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c index 1c73c5c462..7192bed4a3 100644 --- a/src/mesa/main/state.c +++ b/src/mesa/main/state.c @@ -1060,6 +1060,20 @@ update_viewport_matrix(GLcontext *ctx) } +/** + * Update derived multisample state. + */ +static void +update_multisample(GLcontext *ctx) +{ + ctx->Multisample._Enabled = GL_FALSE; + if (ctx->DrawBuffer) { + if (ctx->DrawBuffer->Visual.sampleBuffers) + ctx->Multisample._Enabled = GL_TRUE; + } +} + + /** * Update derived color/blend/logicop state. */ @@ -1223,6 +1237,9 @@ _mesa_update_state_locked( GLcontext *ctx ) if (new_state & (_NEW_BUFFERS | _NEW_VIEWPORT)) update_viewport_matrix(ctx); + if (new_state & _NEW_MULTISAMPLE) + update_multisample( ctx ); + if (new_state & _NEW_COLOR) update_color( ctx ); diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c index 350a0682d6..1a8fd7d6db 100644 --- a/src/mesa/swrast/s_points.c +++ b/src/mesa/swrast/s_points.c @@ -263,7 +263,7 @@ smooth_point(GLcontext *ctx, const SWvertex *vert) size = get_size(ctx, vert, GL_TRUE); /* alpha attenuation / fade factor */ - if (ctx->Multisample.Enabled) { + if (ctx->Multisample._Enabled) { if (vert->pointSize >= ctx->Point.Threshold) { alphaAtten = 1.0F; } -- cgit v1.2.3 From e187627c1d3b06e5d185989dd3d37f09a8953a1b Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Fri, 4 Jul 2008 18:18:19 +0200 Subject: r300: Fix depth texture in compare mode Missed the homogenous divide of R by Q before... --- src/mesa/drivers/dri/r300/r300_fragprog.c | 57 +++++++++++++++++------------ src/mesa/drivers/dri/r300/r500_fragprog.c | 59 ++++++++++++++++++------------- 2 files changed, 68 insertions(+), 48 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 6d24d266fe..0d1428f0bc 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -177,43 +177,54 @@ static GLboolean transform_TEX( compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode; + int rcptemp = radeonCompilerAllocateTemporary(context->compiler); tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 2); - - tgt[0].Opcode = OPCODE_ADD; - tgt[0].DstReg = inst.DstReg; - tgt[0].DstReg.WriteMask = orig_inst->DstReg.WriteMask; - tgt[0].SrcReg[0].File = PROGRAM_TEMPORARY; - tgt[0].SrcReg[0].Index = inst.DstReg.Index; + context->dest->NumInstructions, 3); + + tgt[0].Opcode = OPCODE_RCP; + tgt[0].DstReg.File = PROGRAM_TEMPORARY; + tgt[0].DstReg.Index = rcptemp; + tgt[0].DstReg.WriteMask = WRITEMASK_W; + tgt[0].SrcReg[0] = inst.SrcReg[0]; + tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW; + + tgt[1].Opcode = OPCODE_MAD; + tgt[1].DstReg = inst.DstReg; + tgt[1].DstReg.WriteMask = orig_inst->DstReg.WriteMask; + tgt[1].SrcReg[0] = inst.SrcReg[0]; + tgt[1].SrcReg[0].Swizzle = SWIZZLE_ZZZZ; + tgt[1].SrcReg[1].File = PROGRAM_TEMPORARY; + tgt[1].SrcReg[1].Index = rcptemp; + tgt[1].SrcReg[1].Swizzle = SWIZZLE_WWWW; + tgt[1].SrcReg[2].File = PROGRAM_TEMPORARY; + tgt[1].SrcReg[2].Index = inst.DstReg.Index; if (depthmode == 0) /* GL_LUMINANCE */ - tgt[0].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z); + tgt[1].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z); else if (depthmode == 2) /* GL_ALPHA */ - tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW; - tgt[0].SrcReg[1] = inst.SrcReg[0]; - tgt[0].SrcReg[1].Swizzle = SWIZZLE_ZZZZ; + tgt[1].SrcReg[2].Swizzle = SWIZZLE_WWWW; /* Recall that SrcReg[0] is tex, SrcReg[2] is r and: * r < tex <=> -tex+r < 0 * r >= tex <=> not (-tex+r < 0 */ if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL) - tgt[0].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW; + tgt[1].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW; else - tgt[0].SrcReg[1].NegateBase = tgt[0].SrcReg[1].NegateBase ^ NEGATE_XYZW; + tgt[1].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW; - tgt[1].Opcode = OPCODE_CMP; - tgt[1].DstReg = orig_inst->DstReg; - tgt[1].SrcReg[0].File = PROGRAM_TEMPORARY; - tgt[1].SrcReg[0].Index = tgt[0].DstReg.Index; - tgt[1].SrcReg[1].File = PROGRAM_BUILTIN; - tgt[1].SrcReg[2].File = PROGRAM_BUILTIN; + tgt[2].Opcode = OPCODE_CMP; + tgt[2].DstReg = orig_inst->DstReg; + tgt[2].SrcReg[0].File = PROGRAM_TEMPORARY; + tgt[2].SrcReg[0].Index = tgt[1].DstReg.Index; + tgt[2].SrcReg[1].File = PROGRAM_BUILTIN; + tgt[2].SrcReg[2].File = PROGRAM_BUILTIN; if (comparefunc == GL_LESS || comparefunc == GL_GREATER) { - tgt[1].SrcReg[1].Swizzle = SWIZZLE_1111; - tgt[1].SrcReg[2].Swizzle = SWIZZLE_0000; + tgt[2].SrcReg[1].Swizzle = SWIZZLE_1111; + tgt[2].SrcReg[2].Swizzle = SWIZZLE_0000; } else { - tgt[1].SrcReg[1].Swizzle = SWIZZLE_0000; - tgt[1].SrcReg[2].Swizzle = SWIZZLE_1111; + tgt[2].SrcReg[1].Swizzle = SWIZZLE_0000; + tgt[2].SrcReg[2].Swizzle = SWIZZLE_1111; } } else if (destredirect) { tgt = radeonClauseInsertInstructions(context->compiler, context->dest, diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index 5d72ec2784..41cb7c6ffc 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -87,45 +87,54 @@ static GLboolean transform_TEX( compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode; + int rcptemp = radeonCompilerAllocateTemporary(context->compiler); tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 2); - - tgt[0].Opcode = OPCODE_MAD; - tgt[0].DstReg = inst.DstReg; - tgt[0].DstReg.WriteMask = orig_inst->DstReg.WriteMask; - tgt[0].SrcReg[0].File = PROGRAM_TEMPORARY; - tgt[0].SrcReg[0].Index = inst.DstReg.Index; + context->dest->NumInstructions, 3); + + tgt[0].Opcode = OPCODE_RCP; + tgt[0].DstReg.File = PROGRAM_TEMPORARY; + tgt[0].DstReg.Index = rcptemp; + tgt[0].DstReg.WriteMask = WRITEMASK_W; + tgt[0].SrcReg[0] = inst.SrcReg[0]; + tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW; + + tgt[1].Opcode = OPCODE_MAD; + tgt[1].DstReg = inst.DstReg; + tgt[1].DstReg.WriteMask = orig_inst->DstReg.WriteMask; + tgt[1].SrcReg[0] = inst.SrcReg[0]; + tgt[1].SrcReg[0].Swizzle = SWIZZLE_ZZZZ; + tgt[1].SrcReg[1].File = PROGRAM_TEMPORARY; + tgt[1].SrcReg[1].Index = rcptemp; + tgt[1].SrcReg[1].Swizzle = SWIZZLE_WWWW; + tgt[1].SrcReg[2].File = PROGRAM_TEMPORARY; + tgt[1].SrcReg[2].Index = inst.DstReg.Index; if (depthmode == 0) /* GL_LUMINANCE */ - tgt[0].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z); + tgt[1].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z); else if (depthmode == 2) /* GL_ALPHA */ - tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW; - tgt[0].SrcReg[1].File = PROGRAM_BUILTIN; - tgt[0].SrcReg[1].Swizzle = SWIZZLE_1111; - tgt[0].SrcReg[2] = inst.SrcReg[0]; - tgt[0].SrcReg[2].Swizzle = SWIZZLE_ZZZZ; + tgt[1].SrcReg[2].Swizzle = SWIZZLE_WWWW; /* Recall that SrcReg[0] is tex, SrcReg[2] is r and: * r < tex <=> -tex+r < 0 * r >= tex <=> not (-tex+r < 0 */ if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL) - tgt[0].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW; + tgt[1].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW; else - tgt[0].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW; + tgt[1].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW; - tgt[1].Opcode = OPCODE_CMP; - tgt[1].DstReg = orig_inst->DstReg; - tgt[1].SrcReg[0].File = PROGRAM_TEMPORARY; - tgt[1].SrcReg[0].Index = tgt[0].DstReg.Index; - tgt[1].SrcReg[1].File = PROGRAM_BUILTIN; - tgt[1].SrcReg[2].File = PROGRAM_BUILTIN; + tgt[2].Opcode = OPCODE_CMP; + tgt[2].DstReg = orig_inst->DstReg; + tgt[2].SrcReg[0].File = PROGRAM_TEMPORARY; + tgt[2].SrcReg[0].Index = tgt[1].DstReg.Index; + tgt[2].SrcReg[1].File = PROGRAM_BUILTIN; + tgt[2].SrcReg[2].File = PROGRAM_BUILTIN; if (comparefunc == GL_LESS || comparefunc == GL_GREATER) { - tgt[1].SrcReg[1].Swizzle = SWIZZLE_1111; - tgt[1].SrcReg[2].Swizzle = SWIZZLE_0000; + tgt[2].SrcReg[1].Swizzle = SWIZZLE_1111; + tgt[2].SrcReg[2].Swizzle = SWIZZLE_0000; } else { - tgt[1].SrcReg[1].Swizzle = SWIZZLE_0000; - tgt[1].SrcReg[2].Swizzle = SWIZZLE_1111; + tgt[2].SrcReg[1].Swizzle = SWIZZLE_0000; + tgt[2].SrcReg[2].Swizzle = SWIZZLE_1111; } } else if (destredirect) { tgt = radeonClauseInsertInstructions(context->compiler, context->dest, -- cgit v1.2.3 From 09e587fcf3c77860eacf16a45c6cf1a9a54cc605 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 09:31:44 +0200 Subject: r500: Fix blend color. --- src/mesa/drivers/dri/r300/r300_cmdbuf.c | 11 ++++++++--- src/mesa/drivers/dri/r300/r300_state.c | 26 +++++++++++++++++--------- 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c index d378045b98..4dc3161449 100644 --- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c +++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c @@ -405,7 +405,7 @@ void r300InitCmdBuf(r300ContextPtr r300) ALLOC_STATE(ri, always, R500_RI_CMDSIZE, 0); r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R500_RS_IP_0, 16); for (i = 0; i < 8; i++) { - r300->hw.ri.cmd[R300_RI_CMD_0 + i +1] = + r300->hw.ri.cmd[R300_RI_CMD_0 + i +1] = (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) | (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) | (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) | @@ -470,8 +470,13 @@ void r300InitCmdBuf(r300ContextPtr r300) r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(R300_RB3D_CBLEND, 2); ALLOC_STATE(cmk, always, R300_CMK_CMDSIZE, 0); r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(RB3D_COLOR_CHANNEL_MASK, 1); - ALLOC_STATE(blend_color, always, 4, 0); - r300->hw.blend_color.cmd[0] = cmdpacket0(R300_RB3D_BLEND_COLOR, 3); + if (is_r500) { + ALLOC_STATE(blend_color, always, 3, 0); + r300->hw.blend_color.cmd[0] = cmdpacket0(R500_RB3D_CONSTANT_COLOR_AR, 2); + } else { + ALLOC_STATE(blend_color, always, 2, 0); + r300->hw.blend_color.cmd[0] = cmdpacket0(R300_RB3D_BLEND_COLOR, 1); + } ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0); r300->hw.cb.cmd[R300_CB_CMD_0] = cmdpacket0(R300_RB3D_COLOROFFSET0, 1); r300->hw.cb.cmd[R300_CB_CMD_1] = cmdpacket0(R300_RB3D_COLORPITCH0, 1); diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index 592ee9ccc1..0f7c179de8 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -70,20 +70,28 @@ extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx); static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4]) { - GLubyte color[4]; r300ContextPtr rmesa = R300_CONTEXT(ctx); R300_STATECHANGE(rmesa, blend_color); - CLAMPED_FLOAT_TO_UBYTE(color[0], cf[0]); - CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]); - CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]); - CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]); + if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) { + GLuint r = IROUND(cf[0]*1023.0f); + GLuint g = IROUND(cf[1]*1023.0f); + GLuint b = IROUND(cf[2]*1023.0f); + GLuint a = IROUND(cf[3]*1023.0f); - rmesa->hw.blend_color.cmd[1] = PACK_COLOR_8888(color[3], color[0], - color[1], color[2]); - rmesa->hw.blend_color.cmd[2] = 0; - rmesa->hw.blend_color.cmd[3] = 0; + rmesa->hw.blend_color.cmd[1] = r | (a << 16); + rmesa->hw.blend_color.cmd[2] = b | (g << 16); + } else { + GLubyte color[4]; + CLAMPED_FLOAT_TO_UBYTE(color[0], cf[0]); + CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]); + CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]); + CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]); + + rmesa->hw.blend_color.cmd[1] = PACK_COLOR_8888(color[3], color[0], + color[1], color[2]); + } } /** -- cgit v1.2.3 From 13c44679ad42886544036f49597dcfc0d4b7e925 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 10:24:27 +0200 Subject: r500: Fix a mixup in fragment program LRP instruction emit --- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 3dc72af87a..2266dc3dec 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -942,16 +942,15 @@ static int do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *f | R500_ALPHA_SRCP_OP_1_MINUS_A0; code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRCP | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB); + | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[2])); code->inst[counter].inst4 |= R500_ALPHA_OP_MAD | R500_ALPHA_ADDRD(dest) | R500_ALPHA_SEL_A_SRCP | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A; + | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[2])); code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD | R500_ALU_RGBA_ADDRD(dest) - | R500_ALU_RGBA_SEL_C_SRC2 | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[2])) - | R500_ALU_RGBA_ALPHA_SEL_C_SRC2 - | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[2])); + | R500_ALU_RGBA_SEL_C_SRC2 | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB) + | R500_ALU_RGBA_ALPHA_SEL_C_SRC2 | R500_ALU_RGBA_A_SWIZ_A; break; case OPCODE_MAD: emit_mad(cs, counter, fpi, 0, 1, 2); -- cgit v1.2.3 From 85b46fbe9cfc8de8871d6adb0b2287c5837d3028 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 10:58:54 +0200 Subject: r500_fragprog: Cleanup some unused variables and code. --- src/mesa/drivers/dri/r300/r500_fragprog.c | 5 --- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 42 +------------------------- 2 files changed, 1 insertion(+), 46 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index 41cb7c6ffc..fd9769d653 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -27,11 +27,6 @@ #include "r500_fragprog.h" -static void reset_srcreg(struct prog_src_register* reg) -{ - _mesa_bzero(reg, sizeof(*reg)); - reg->Swizzle = SWIZZLE_NOOP; -} /** * Transform TEX, TXP, TXB, and KIL instructions in the following way: diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 2266dc3dec..19a5cef084 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -89,27 +89,6 @@ struct reg_lifetime { int scalar_lastread; }; -/** - * Store usage information about an ALU instruction slot during the - * compilation of a fragment program. - */ -#define SLOT_SRC_VECTOR (1<<0) -#define SLOT_SRC_SCALAR (1<<3) -#define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR) -#define SLOT_OP_VECTOR (1<<16) -#define SLOT_OP_SCALAR (1<<17) -#define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR) - -struct r500_pfs_compile_slot { - /* Bitmask indicating which parts of the slot are used, using SLOT_ constants - defined above */ - unsigned int used; - - /* Selected sources */ - int vsrc[3]; - int ssrc[3]; -}; - /** * Store information during compilation of fragment programs. */ @@ -119,21 +98,9 @@ struct r500_pfs_compile_state { /* number of ALU slots used so far */ int nrslots; - /* Track which (parts of) slots are already filled with instructions */ - struct r500_pfs_compile_slot slot[PFS_MAX_ALU_INST]; - - /* Track the validity of R300 temporaries */ - struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS]; - /* Used to map Mesa's inputs/temps onto hardware temps */ int temp_in_use; - struct reg_acc temps[PFS_NUM_TEMP_REGS]; struct reg_acc inputs[32]; /* don't actually need 32... */ - - /* Track usage of hardware temps, for register allocation, - * indirection detection, etc. */ - GLuint used_in_node; - GLuint dest_in_node; }; /* @@ -1328,7 +1295,7 @@ static void init_program(struct r500_pfs_compile_state *cs) struct prog_instruction *fpi; GLuint InputsRead = mp->Base.InputsRead; GLuint temps_used = 0; - int i, j; + int i; /* New compile, reset tracking data */ cs->compiler->fp->optimization = @@ -1343,13 +1310,6 @@ static void init_program(struct r500_pfs_compile_state *cs) /* Whether or not we perform any depth writing. */ cs->compiler->fp->writes_depth = GL_FALSE; - for (i = 0; i < PFS_MAX_ALU_INST; i++) { - for (j = 0; j < 3; j++) { - cs->slot[i].vsrc[j] = SRC_CONST; - cs->slot[i].ssrc[j] = SRC_CONST; - } - } - /* Work out what temps the Mesa inputs correspond to, this must match * what setup_rs_unit does, which shouldn't be a problem as rs_unit * configures itself based on the fragprog's InputsRead -- cgit v1.2.3 From dea8719f00ad46ed66b5d4f5e6c0b71e2d1054e9 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 11:53:39 +0200 Subject: r300: Remove clause stuff for now in favour of a cloned generic gl_program --- src/mesa/drivers/dri/r300/r300_fragprog.c | 55 +++--- src/mesa/drivers/dri/r300/r300_fragprog.h | 3 +- src/mesa/drivers/dri/r300/r300_fragprog_emit.c | 19 +- src/mesa/drivers/dri/r300/r500_fragprog.c | 50 +++-- src/mesa/drivers/dri/r300/r500_fragprog.h | 2 +- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 40 ++-- src/mesa/drivers/dri/r300/radeon_program.c | 241 +++---------------------- src/mesa/drivers/dri/r300/radeon_program.h | 103 +---------- src/mesa/drivers/dri/r300/radeon_program_alu.c | 101 +++++------ src/mesa/drivers/dri/r300/radeon_program_alu.h | 3 +- 10 files changed, 157 insertions(+), 460 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 0d1428f0bc..8c49e8ada6 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -68,7 +68,7 @@ static void reset_srcreg(struct prog_src_register* reg) * be reused. */ static GLboolean transform_TEX( - struct radeon_program_transform_context* context, + GLcontext *ctx, struct gl_program *p, struct prog_instruction* orig_inst, void* data) { struct r300_fragment_program_compiler *compiler = @@ -84,12 +84,11 @@ static GLboolean transform_TEX( return GL_FALSE; if (inst.Opcode != OPCODE_KIL && - compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) { + p->ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) { - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 1); + tgt = radeonAppendInstructions(p, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg = inst.DstReg; @@ -99,7 +98,7 @@ static GLboolean transform_TEX( } inst.DstReg.File = PROGRAM_TEMPORARY; - inst.DstReg.Index = radeonCompilerAllocateTemporary(context->compiler); + inst.DstReg.Index = _mesa_find_free_register(p, PROGRAM_TEMPORARY); inst.DstReg.WriteMask = WRITEMASK_XYZW; } @@ -114,7 +113,7 @@ static GLboolean transform_TEX( 0 }; - int tempreg = radeonCompilerAllocateTemporary(context->compiler); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); int factor_index; tokens[2] = inst.TexSrcUnit; @@ -122,8 +121,7 @@ static GLboolean transform_TEX( _mesa_add_state_reference( compiler->fp->mesa_program.Base.Parameters, tokens); - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 1); + tgt = radeonAppendInstructions(p, 1); tgt->Opcode = OPCODE_MUL; tgt->DstReg.File = PROGRAM_TEMPORARY; @@ -142,10 +140,9 @@ static GLboolean transform_TEX( */ if (inst.SrcReg[0].Swizzle != SWIZZLE_NOOP || inst.SrcReg[0].Abs || inst.SrcReg[0].NegateBase || inst.SrcReg[0].NegateAbs) { - int tempreg = radeonCompilerAllocateTemporary(context->compiler); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 1); + tgt = radeonAppendInstructions(p, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg.File = PROGRAM_TEMPORARY; @@ -160,7 +157,7 @@ static GLboolean transform_TEX( if (inst.Opcode != OPCODE_KIL) { if (inst.DstReg.File != PROGRAM_TEMPORARY || inst.DstReg.WriteMask != WRITEMASK_XYZW) { - int tempreg = radeonCompilerAllocateTemporary(context->compiler); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); inst.DstReg.File = PROGRAM_TEMPORARY; inst.DstReg.Index = tempreg; @@ -169,18 +166,16 @@ static GLboolean transform_TEX( } } - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 1); + tgt = radeonAppendInstructions(p, 1); _mesa_copy_instructions(tgt, &inst, 1); if (inst.Opcode != OPCODE_KIL && - compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) { + p->ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode; - int rcptemp = radeonCompilerAllocateTemporary(context->compiler); + int rcptemp = _mesa_find_free_register(p, PROGRAM_TEMPORARY); - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 3); + tgt = radeonAppendInstructions(p, 3); tgt[0].Opcode = OPCODE_RCP; tgt[0].DstReg.File = PROGRAM_TEMPORARY; @@ -227,8 +222,7 @@ static GLboolean transform_TEX( tgt[2].SrcReg[2].Swizzle = SWIZZLE_1111; } } else if (destredirect) { - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 1); + tgt = radeonAppendInstructions(p, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg = orig_inst->DstReg; @@ -274,9 +268,10 @@ static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler) struct prog_instruction *fpi; GLuint window_index; int i = 0; - GLuint tempregi = radeonCompilerAllocateTemporary(&compiler->compiler); + GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY); - fpi = radeonClauseInsertInstructions(&compiler->compiler, &compiler->compiler.Clauses[0], 0, 3); + _mesa_insert_instructions(compiler->program, 0, 3); + fpi = compiler->program->Instructions; /* perspective divide */ fpi[i].Opcode = OPCODE_RCP; @@ -333,7 +328,7 @@ static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler) MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO); i++; - for (; i < compiler->compiler.Clauses[0].NumInstructions; ++i) { + for (; i < compiler->program->NumInstructions; ++i) { int reg; for (reg = 0; reg < 3; reg++) { if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT && @@ -404,8 +399,7 @@ void r300TranslateFragmentShader(r300ContextPtr r300, compiler.r300 = r300; compiler.fp = fp; compiler.code = &fp->code; - - radeonCompilerInit(&compiler.compiler, r300->radeon.glCtx, &fp->mesa_program.Base); + compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base); insert_WPOS_trailer(&compiler); @@ -413,19 +407,20 @@ void r300TranslateFragmentShader(r300ContextPtr r300, { &transform_TEX, &compiler }, { &radeonTransformALU, 0 } }; - radeonClauseLocalTransform(&compiler.compiler, - &compiler.compiler.Clauses[0], + radeonLocalTransform( + r300->radeon.glCtx, + compiler.program, 2, transformations); if (RADEON_DEBUG & DEBUG_PIXEL) { - _mesa_printf("Compiler state after transformations:\n"); - radeonCompilerDump(&compiler.compiler); + _mesa_printf("Program after transformations:\n"); + _mesa_print_program(compiler.program); } if (!r300FragmentProgramEmit(&compiler)) fp->error = GL_TRUE; - radeonCompilerCleanup(&compiler.compiler); + _mesa_reference_program(r300->radeon.glCtx, &compiler.program, NULL); if (!fp->error) fp->translated = GL_TRUE; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h index 7c1e210b04..c76ae62701 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.h +++ b/src/mesa/drivers/dri/r300/r300_fragprog.h @@ -146,10 +146,9 @@ struct r300_fragment_program_compiler { r300ContextPtr r300; struct r300_fragment_program *fp; struct r300_fragment_program_code *code; - struct radeon_compiler compiler; + struct gl_program *program; }; -extern void r300FPTransformTextures(struct r300_fragment_program_compiler *compiler); extern GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler); diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c index d72b92832c..889631f705 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c @@ -1887,18 +1887,13 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst static GLboolean parse_program(struct r300_pfs_compile_state *cs) { COMPILE_STATE; - int clauseidx; + struct prog_instruction* fpi; - for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) { - struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx]; - int ip; + for(fpi = cs->compiler->program->Instructions; fpi->Opcode != OPCODE_END; ++fpi) { + emit_instruction(cs, fpi); - for(ip = 0; ip < clause->NumInstructions; ++ip) { - emit_instruction(cs, clause->Instructions + ip); - - if (fp->error) - return GL_FALSE; - } + if (fp->error) + return GL_FALSE; } return GL_TRUE; @@ -1988,8 +1983,8 @@ static void init_program(struct r300_pfs_compile_state *cs) /* Pre-parse the program, grabbing refcounts on input/temp regs. * That way, we can free up the reg when it's no longer needed */ - for (i = 0; i < cs->compiler->compiler.Clauses[0].NumInstructions; ++i) { - struct prog_instruction *fpi = cs->compiler->compiler.Clauses[0].Instructions + i; + for (i = 0; i < cs->compiler->program->NumInstructions; ++i) { + struct prog_instruction *fpi = cs->compiler->program->Instructions + i; int idx; for (j = 0; j < 3; j++) { diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index fd9769d653..62e06ea52c 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -36,7 +36,7 @@ * */ static GLboolean transform_TEX( - struct radeon_program_transform_context* context, + GLcontext *ctx, struct gl_program *p, struct prog_instruction* orig_inst, void* data) { struct r500_fragment_program_compiler *compiler = @@ -53,12 +53,11 @@ static GLboolean transform_TEX( /* ARB_shadow & EXT_shadow_funcs */ if (inst.Opcode != OPCODE_KIL && - compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) { + p->ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) { - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 1); + tgt = radeonAppendInstructions(p, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg.File = inst.DstReg.File; @@ -70,22 +69,20 @@ static GLboolean transform_TEX( } inst.DstReg.File = PROGRAM_TEMPORARY; - inst.DstReg.Index = radeonCompilerAllocateTemporary(context->compiler); + inst.DstReg.Index = _mesa_find_free_register(p, PROGRAM_TEMPORARY); inst.DstReg.WriteMask = WRITEMASK_XYZW; } - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 1); + tgt = radeonAppendInstructions(p, 1); _mesa_copy_instructions(tgt, &inst, 1); if (inst.Opcode != OPCODE_KIL && - compiler->fp->mesa_program.Base.ShadowSamplers & (1 << inst.TexSrcUnit)) { + p->ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode; - int rcptemp = radeonCompilerAllocateTemporary(context->compiler); + int rcptemp = _mesa_find_free_register(p, PROGRAM_TEMPORARY); - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 3); + tgt = radeonAppendInstructions(p, 3); tgt[0].Opcode = OPCODE_RCP; tgt[0].DstReg.File = PROGRAM_TEMPORARY; @@ -132,17 +129,12 @@ static GLboolean transform_TEX( tgt[2].SrcReg[2].Swizzle = SWIZZLE_1111; } } else if (destredirect) { - tgt = radeonClauseInsertInstructions(context->compiler, context->dest, - context->dest->NumInstructions, 1); + tgt = radeonAppendInstructions(p, 1); - tgt->Opcode = OPCODE_MAD; + tgt->Opcode = OPCODE_MOV; tgt->DstReg = orig_inst->DstReg; tgt->SrcReg[0].File = PROGRAM_TEMPORARY; tgt->SrcReg[0].Index = inst.DstReg.Index; - tgt->SrcReg[1].File = PROGRAM_BUILTIN; - tgt->SrcReg[1].Swizzle = SWIZZLE_1111; - tgt->SrcReg[2].File = PROGRAM_BUILTIN; - tgt->SrcReg[2].Swizzle = SWIZZLE_0000; } return GL_TRUE; @@ -183,9 +175,10 @@ static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler) struct prog_instruction *fpi; GLuint window_index; int i = 0; - GLuint tempregi = radeonCompilerAllocateTemporary(&compiler->compiler); + GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY); - fpi = radeonClauseInsertInstructions(&compiler->compiler, &compiler->compiler.Clauses[0], 0, 3); + _mesa_insert_instructions(compiler->program, 0, 3); + fpi = compiler->program->Instructions; /* perspective divide */ fpi[i].Opcode = OPCODE_RCP; @@ -242,7 +235,7 @@ static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler) MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO); i++; - for (; i < compiler->compiler.Clauses[0].NumInstructions; ++i) { + for (; i < compiler->program->NumInstructions; ++i) { int reg; for (reg = 0; reg < 3; reg++) { if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT && @@ -314,26 +307,29 @@ void r500TranslateFragmentShader(r300ContextPtr r300, compiler.r300 = r300; compiler.fp = fp; compiler.code = &fp->code; + compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base); - radeonCompilerInit(&compiler.compiler, r300->radeon.glCtx, &fp->mesa_program.Base); + if (RADEON_DEBUG & DEBUG_PIXEL) { + _mesa_printf("Compiler: Initial program:\n"); + _mesa_print_program(compiler.program); + } insert_WPOS_trailer(&compiler); struct radeon_program_transformation transformations[1] = { { &transform_TEX, &compiler } }; - radeonClauseLocalTransform(&compiler.compiler, - &compiler.compiler.Clauses[0], + radeonLocalTransform(r300->radeon.glCtx, compiler.program, 1, transformations); if (RADEON_DEBUG & DEBUG_PIXEL) { - _mesa_printf("Compiler state after transformations:\n"); - radeonCompilerDump(&compiler.compiler); + _mesa_printf("Compiler: after all transformations:\n"); + _mesa_print_program(compiler.program); } fp->translated = r500FragmentProgramEmit(&compiler); - radeonCompilerCleanup(&compiler.compiler); + _mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0); r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM); diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/r500_fragprog.h index ff6a9002c1..ed8f7f41ca 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.h +++ b/src/mesa/drivers/dri/r300/r500_fragprog.h @@ -84,7 +84,7 @@ struct r500_fragment_program_compiler { r300ContextPtr r300; struct r500_fragment_program *fp; struct r500_fragment_program_code *code; - struct radeon_compiler compiler; + struct gl_program *program; }; extern GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler); diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 19a5cef084..67545cbb4f 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -282,7 +282,7 @@ static GLuint make_src(struct r500_pfs_compile_state *cs, struct prog_src_regist break; case PROGRAM_ENV_PARAM: reg = emit_const4fv(cs, - cs->compiler->compiler.Ctx->FragmentProgram.Parameters[src.Index]); + cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[src.Index]); break; case PROGRAM_STATE_VAR: case PROGRAM_NAMED_PARAM: @@ -1256,21 +1256,14 @@ static int do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *f static GLboolean parse_program(struct r500_pfs_compile_state *cs) { PROG_CODE; - int clauseidx, counter = 0; + int counter = 0; + struct prog_instruction* fpi; - for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; clauseidx++) { - struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx]; - struct prog_instruction* fpi; + for(fpi = cs->compiler->program->Instructions; fpi->Opcode != OPCODE_END; ++fpi) { + counter = do_inst(cs, fpi, counter); - int ip; - - for (ip = 0; ip < clause->NumInstructions; ip++) { - fpi = clause->Instructions + ip; - counter = do_inst(cs, fpi, counter); - - if (cs->compiler->fp->error) - return GL_FALSE; - } + if (cs->compiler->fp->error) + return GL_FALSE; } /* Finish him! (If it's an ALU/OUT instruction...) */ @@ -1365,19 +1358,14 @@ static void init_program(struct r500_pfs_compile_state *cs) cs->inputs[i].reg = 0; } - int clauseidx; - - for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) { - struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx]; - int ip; + int ip; - for (ip = 0; ip < clause->NumInstructions; ip++) { - fpi = clause->Instructions + ip; - for (i = 0; i < 3; i++) { - if (fpi->SrcReg[i].File == PROGRAM_TEMPORARY) { - if (fpi->SrcReg[i].Index >= temps_used) - temps_used = fpi->SrcReg[i].Index + 1; - } + for (ip = 0; ip < cs->compiler->program->NumInstructions; ip++) { + fpi = cs->compiler->program->Instructions + ip; + for (i = 0; i < 3; i++) { + if (fpi->SrcReg[i].File == PROGRAM_TEMPORARY) { + if (fpi->SrcReg[i].Index >= temps_used) + temps_used = fpi->SrcReg[i].Index + 1; } } } diff --git a/src/mesa/drivers/dri/r300/radeon_program.c b/src/mesa/drivers/dri/r300/radeon_program.c index c8f40e8189..3112339f81 100644 --- a/src/mesa/drivers/dri/r300/radeon_program.c +++ b/src/mesa/drivers/dri/r300/radeon_program.c @@ -29,201 +29,6 @@ #include "shader/prog_print.h" -/** - * Initialize a compiler structure with a single mixed clause - * containing all instructions from the source program. - */ -void radeonCompilerInit( - struct radeon_compiler *compiler, - GLcontext *ctx, - struct gl_program *source) -{ - struct radeon_clause* clause; - - _mesa_memset(compiler, 0, sizeof(*compiler)); - compiler->Source = source; - compiler->Ctx = ctx; - - compiler->NumTemporaries = source->NumTemporaries; - - clause = radeonCompilerInsertClause(compiler, 0, CLAUSE_MIXED); - clause->NumInstructions = 0; - while(source->Instructions[clause->NumInstructions].Opcode != OPCODE_END) - clause->NumInstructions++; - clause->ReservedInstructions = clause->NumInstructions; - clause->Instructions = _mesa_alloc_instructions(clause->NumInstructions); - _mesa_copy_instructions(clause->Instructions, source->Instructions, clause->NumInstructions); -} - - -/** - * Free all data that is referenced by the compiler structure. - * However, the compiler structure itself is not freed. - */ -void radeonCompilerCleanup(struct radeon_compiler *compiler) -{ - radeonCompilerEraseClauses(compiler, 0, compiler->NumClauses); -} - - -/** - * Allocate and return a unique temporary register. - */ -int radeonCompilerAllocateTemporary(struct radeon_compiler *compiler) -{ - if (compiler->NumTemporaries >= 256) { - _mesa_problem(compiler->Ctx, "radeonCompiler: Too many temporaries"); - return 0; - } - - return compiler->NumTemporaries++; -} - - -static const char* clausename(int type) -{ - switch(type) { - case CLAUSE_MIXED: return "CLAUSE_MIXED"; - case CLAUSE_ALU: return "CLAUSE_ALU"; - case CLAUSE_TEX: return "CLAUSE_TEX"; - default: return "CLAUSE_UNKNOWN"; - } -} - - -/** - * Dump the current compiler state to the console for debugging. - */ -void radeonCompilerDump(struct radeon_compiler *compiler) -{ - int i; - for(i = 0; i < compiler->NumClauses; ++i) { - struct radeon_clause *clause = &compiler->Clauses[i]; - int j; - - _mesa_printf("%2i: %s\n", i+1, clausename(clause->Type)); - - for(j = 0; j < clause->NumInstructions; ++j) { - _mesa_printf("%4i: ", j+1); - _mesa_print_instruction(&clause->Instructions[j]); - } - } -} - - -/** - * \p position index of the new clause; later clauses are moved - * \p type of the new clause; one of CLAUSE_XXX - * \return a pointer to the new clause - */ -struct radeon_clause* radeonCompilerInsertClause( - struct radeon_compiler *compiler, - int position, int type) -{ - struct radeon_clause* oldClauses = compiler->Clauses; - struct radeon_clause* clause; - - assert(position >= 0 && position <= compiler->NumClauses); - - compiler->Clauses = (struct radeon_clause *) - _mesa_malloc((compiler->NumClauses+1) * sizeof(struct radeon_clause)); - if (oldClauses) { - _mesa_memcpy(compiler->Clauses, oldClauses, - position*sizeof(struct radeon_clause)); - _mesa_memcpy(compiler->Clauses+position+1, oldClauses+position, - (compiler->NumClauses - position) * sizeof(struct radeon_clause)); - _mesa_free(oldClauses); - } - compiler->NumClauses++; - - clause = compiler->Clauses + position; - _mesa_memset(clause, 0, sizeof(*clause)); - clause->Type = type; - - return clause; -} - - -/** - * Remove clauses in the range [start, end) - */ -void radeonCompilerEraseClauses( - struct radeon_compiler *compiler, - int start, int end) -{ - struct radeon_clause* oldClauses = compiler->Clauses; - int i; - - assert(0 <= start); - assert(start <= end); - assert(end <= compiler->NumClauses); - - if (end == start) - return; - - for(i = start; i < end; ++i) { - struct radeon_clause* clause = oldClauses + i; - _mesa_free_instructions(clause->Instructions, clause->NumInstructions); - } - - if (start > 0 || end < compiler->NumClauses) { - compiler->Clauses = (struct radeon_clause*) - _mesa_malloc((compiler->NumClauses+start-end) * sizeof(struct radeon_clause)); - _mesa_memcpy(compiler->Clauses, oldClauses, - start * sizeof(struct radeon_clause)); - _mesa_memcpy(compiler->Clauses + start, oldClauses + end, - (compiler->NumClauses - end) * sizeof(struct radeon_clause)); - compiler->NumClauses -= end - start; - } else { - compiler->Clauses = 0; - compiler->NumClauses = 0; - } - - _mesa_free(oldClauses); -} - - -/** - * Insert new instructions at the given position, initialize them as NOPs - * and return a pointer to the first new instruction. - */ -struct prog_instruction* radeonClauseInsertInstructions( - struct radeon_compiler *compiler, - struct radeon_clause *clause, - int position, int count) -{ - int newNumInstructions = clause->NumInstructions + count; - - assert(position >= 0 && position <= clause->NumInstructions); - - if (newNumInstructions <= clause->ReservedInstructions) { - memmove(clause->Instructions + position + count, clause->Instructions + position, - (clause->NumInstructions - position) * sizeof(struct prog_instruction)); - } else { - struct prog_instruction *oldInstructions = clause->Instructions; - - clause->ReservedInstructions *= 2; - if (newNumInstructions > clause->ReservedInstructions) - clause->ReservedInstructions = newNumInstructions; - - clause->Instructions = (struct prog_instruction*) - _mesa_malloc(clause->ReservedInstructions * sizeof(struct prog_instruction)); - - if (oldInstructions) { - _mesa_memcpy(clause->Instructions, oldInstructions, - position * sizeof(struct prog_instruction)); - _mesa_memcpy(clause->Instructions + position + count, oldInstructions + position, - (clause->NumInstructions - position) * sizeof(struct prog_instruction)); - - _mesa_free(oldInstructions); - } - } - - clause->NumInstructions = newNumInstructions; - _mesa_init_instructions(clause->Instructions + position, count); - return clause->Instructions + position; -} - /** * Transform the given clause in the following way: @@ -240,42 +45,50 @@ struct prog_instruction* radeonClauseInsertInstructions( * \note The transform is called 'local' because it can only look at * one instruction at a time. */ -void radeonClauseLocalTransform( - struct radeon_compiler *compiler, - struct radeon_clause *clause, +void radeonLocalTransform( + GLcontext *ctx, + struct gl_program *program, int num_transformations, struct radeon_program_transformation* transformations) { - struct radeon_program_transform_context context; - struct radeon_clause source; + struct prog_instruction *source; + int numinstructions; int ip; - source = *clause; - clause->Instructions = 0; - clause->NumInstructions = 0; - clause->ReservedInstructions = 0; + source = program->Instructions; + numinstructions = program->NumInstructions; - context.compiler = compiler; - context.dest = clause; - context.src = &source; + program->Instructions = 0; + program->NumInstructions = 0; - for(ip = 0; ip < source.NumInstructions; ++ip) { - struct prog_instruction *instr = source.Instructions + ip; + for(ip = 0; ip < numinstructions; ++ip) { + struct prog_instruction *instr = source + ip; int i; for(i = 0; i < num_transformations; ++i) { struct radeon_program_transformation* t = transformations + i; - if (t->function(&context, instr, t->userData)) + if (t->function(ctx, program, instr, t->userData)) break; } if (i >= num_transformations) { - struct prog_instruction *tgt = - radeonClauseInsertInstructions(compiler, clause, clause->NumInstructions, 1); - _mesa_copy_instructions(tgt, instr, 1); + struct prog_instruction* dest = radeonAppendInstructions(program, 1); + _mesa_copy_instructions(dest, instr, 1); } } - _mesa_free_instructions(source.Instructions, source.NumInstructions); + _mesa_free_instructions(source, numinstructions); +} + + +/** + * Append the given number of instructions to the program and return a + * pointer to the first new instruction. + */ +struct prog_instruction *radeonAppendInstructions(struct gl_program *program, int count) +{ + int oldnum = program->NumInstructions; + _mesa_insert_instructions(program, oldnum, count); + return program->Instructions + oldnum; } diff --git a/src/mesa/drivers/dri/r300/radeon_program.h b/src/mesa/drivers/dri/r300/radeon_program.h index 25e70505b1..012104fa5a 100644 --- a/src/mesa/drivers/dri/r300/radeon_program.h +++ b/src/mesa/drivers/dri/r300/radeon_program.h @@ -49,96 +49,7 @@ enum { #define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE) /** - * A clause is simply a sequence of instructions that are executed - * in order. - */ -struct radeon_clause { - /** - * Type of this clause, one of CLAUSE_XXX. - */ - int Type : 2; - - /** - * Pointer to an array of instructions. - * The array is terminated by an OPCODE_END instruction. - */ - struct prog_instruction *Instructions; - - /** - * Number of instructions in this clause. - */ - int NumInstructions; - - /** - * Space reserved for instructions in this clause. - */ - int ReservedInstructions; -}; - -/** - * A compile object, holding the current intermediate state during compilation. - */ -struct radeon_compiler { - struct gl_program *Source; - GLcontext* Ctx; - - /** - * Number of clauses in this program. - */ - int NumClauses; - - /** - * Pointer to an array of NumClauses clauses. - */ - struct radeon_clause *Clauses; - - /** - * Number of registers in the PROGRAM_TEMPORARIES file. - */ - int NumTemporaries; -}; - -void radeonCompilerInit( - struct radeon_compiler *compiler, - GLcontext *ctx, - struct gl_program *source); -void radeonCompilerCleanup(struct radeon_compiler *compiler); -int radeonCompilerAllocateTemporary(struct radeon_compiler *compiler); -void radeonCompilerDump(struct radeon_compiler *compiler); - -struct radeon_clause *radeonCompilerInsertClause( - struct radeon_compiler *compiler, - int position, - int type); -void radeonCompilerEraseClauses( - struct radeon_compiler *compiler, - int start, - int end); - -struct prog_instruction* radeonClauseInsertInstructions( - struct radeon_compiler *compiler, - struct radeon_clause *clause, - int position, int count); - -/** - * - */ -struct radeon_program_transform_context { - struct radeon_compiler *compiler; - - /** - * Destination clause where new instructions must be written. - */ - struct radeon_clause *dest; - - /** - * Original clause that is currently being transformed. - */ - struct radeon_clause *src; -}; - -/** - * A transformation that can be passed to \ref radeonClauseLinearTransform. + * A transformation that can be passed to \ref radeonLocalTransform. * * The function will be called once for each instruction. * It has to either emit the appropriate transformed code for the instruction @@ -149,16 +60,20 @@ struct radeon_program_transform_context { */ struct radeon_program_transformation { GLboolean (*function)( - struct radeon_program_transform_context*, + GLcontext*, + struct gl_program*, struct prog_instruction*, void*); void *userData; }; -void radeonClauseLocalTransform( - struct radeon_compiler *compiler, - struct radeon_clause *clause, +void radeonLocalTransform( + GLcontext* ctx, + struct gl_program *program, int num_transformations, struct radeon_program_transformation* transformations); + +struct prog_instruction *radeonAppendInstructions(struct gl_program *program, int count); + #endif diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index 7fe940a7d7..3104d07fac 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -36,13 +36,11 @@ #include "radeon_program_alu.h" -static struct prog_instruction *emit1(struct radeon_program_transform_context* ctx, +static struct prog_instruction *emit1(struct gl_program* p, gl_inst_opcode Opcode, struct prog_dst_register DstReg, struct prog_src_register SrcReg) { - struct prog_instruction *fpi = - radeonClauseInsertInstructions(ctx->compiler, ctx->dest, - ctx->dest->NumInstructions, 1); + struct prog_instruction *fpi = radeonAppendInstructions(p, 1); fpi->Opcode = Opcode; fpi->DstReg = DstReg; @@ -50,13 +48,11 @@ static struct prog_instruction *emit1(struct radeon_program_transform_context* c return fpi; } -static struct prog_instruction *emit2(struct radeon_program_transform_context* ctx, +static struct prog_instruction *emit2(struct gl_program* p, gl_inst_opcode Opcode, struct prog_dst_register DstReg, struct prog_src_register SrcReg0, struct prog_src_register SrcReg1) { - struct prog_instruction *fpi = - radeonClauseInsertInstructions(ctx->compiler, ctx->dest, - ctx->dest->NumInstructions, 1); + struct prog_instruction *fpi = radeonAppendInstructions(p, 1); fpi->Opcode = Opcode; fpi->DstReg = DstReg; @@ -65,14 +61,12 @@ static struct prog_instruction *emit2(struct radeon_program_transform_context* c return fpi; } -static struct prog_instruction *emit3(struct radeon_program_transform_context* ctx, +static struct prog_instruction *emit3(struct gl_program* p, gl_inst_opcode Opcode, struct prog_dst_register DstReg, struct prog_src_register SrcReg0, struct prog_src_register SrcReg1, struct prog_src_register SrcReg2) { - struct prog_instruction *fpi = - radeonClauseInsertInstructions(ctx->compiler, ctx->dest, - ctx->dest->NumInstructions, 1); + struct prog_instruction *fpi = radeonAppendInstructions(p, 1); fpi->Opcode = Opcode; fpi->DstReg = DstReg; @@ -154,24 +148,24 @@ static struct prog_src_register scalar(struct prog_src_register reg) return swizzle(reg, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X); } -static void transform_ABS(struct radeon_program_transform_context* ctx, +static void transform_ABS(struct gl_program* p, struct prog_instruction* inst) { struct prog_src_register src = inst->SrcReg[0]; src.Abs = 1; src.NegateBase = 0; src.NegateAbs = 0; - emit1(ctx, OPCODE_MOV, inst->DstReg, src); + emit1(p, OPCODE_MOV, inst->DstReg, src); } -static void transform_DPH(struct radeon_program_transform_context* ctx, +static void transform_DPH(struct gl_program* p, struct prog_instruction* inst) { struct prog_src_register src0 = inst->SrcReg[0]; if (src0.NegateAbs) { if (src0.Abs) { - int tempreg = radeonCompilerAllocateTemporary(ctx->compiler); - emit1(ctx, OPCODE_MOV, dstreg(PROGRAM_TEMPORARY, tempreg), src0); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + emit1(p, OPCODE_MOV, dstreg(PROGRAM_TEMPORARY, tempreg), src0); src0 = srcreg(src0.File, src0.Index); } else { src0.NegateAbs = 0; @@ -180,70 +174,70 @@ static void transform_DPH(struct radeon_program_transform_context* ctx, } set_swizzle(&src0, 3, SWIZZLE_ONE); set_negate_base(&src0, 3, 0); - emit2(ctx, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]); + emit2(p, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]); } -static void transform_FLR(struct radeon_program_transform_context* ctx, +static void transform_FLR(struct gl_program* p, struct prog_instruction* inst) { - int tempreg = radeonCompilerAllocateTemporary(ctx->compiler); - emit1(ctx, OPCODE_FRC, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]); - emit2(ctx, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg))); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + emit1(p, OPCODE_FRC, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]); + emit2(p, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg))); } -static void transform_POW(struct radeon_program_transform_context* ctx, +static void transform_POW(struct gl_program* p, struct prog_instruction* inst) { - int tempreg = radeonCompilerAllocateTemporary(ctx->compiler); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); struct prog_dst_register tempdst = dstreg(PROGRAM_TEMPORARY, tempreg); struct prog_src_register tempsrc = srcreg(PROGRAM_TEMPORARY, tempreg); tempdst.WriteMask = WRITEMASK_W; tempsrc.Swizzle = SWIZZLE_WWWW; - emit1(ctx, OPCODE_LG2, tempdst, scalar(inst->SrcReg[0])); - emit2(ctx, OPCODE_MUL, tempdst, tempsrc, scalar(inst->SrcReg[1])); - emit1(ctx, OPCODE_EX2, inst->DstReg, tempsrc); + emit1(p, OPCODE_LG2, tempdst, scalar(inst->SrcReg[0])); + emit2(p, OPCODE_MUL, tempdst, tempsrc, scalar(inst->SrcReg[1])); + emit1(p, OPCODE_EX2, inst->DstReg, tempsrc); } -static void transform_SGE(struct radeon_program_transform_context* ctx, +static void transform_SGE(struct gl_program* p, struct prog_instruction* inst) { - int tempreg = radeonCompilerAllocateTemporary(ctx->compiler); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); - emit2(ctx, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); - emit3(ctx, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one); + emit2(p, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); + emit3(p, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one); } -static void transform_SLT(struct radeon_program_transform_context* ctx, +static void transform_SLT(struct gl_program* p, struct prog_instruction* inst) { - int tempreg = radeonCompilerAllocateTemporary(ctx->compiler); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); - emit2(ctx, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); - emit3(ctx, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero); + emit2(p, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); + emit3(p, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero); } -static void transform_SUB(struct radeon_program_transform_context* ctx, +static void transform_SUB(struct gl_program* p, struct prog_instruction* inst) { - emit2(ctx, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1])); + emit2(p, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1])); } -static void transform_SWZ(struct radeon_program_transform_context* ctx, +static void transform_SWZ(struct gl_program* p, struct prog_instruction* inst) { - emit1(ctx, OPCODE_MOV, inst->DstReg, inst->SrcReg[0]); + emit1(p, OPCODE_MOV, inst->DstReg, inst->SrcReg[0]); } -static void transform_XPD(struct radeon_program_transform_context* ctx, +static void transform_XPD(struct gl_program* p, struct prog_instruction* inst) { - int tempreg = radeonCompilerAllocateTemporary(ctx->compiler); + int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); - emit2(ctx, OPCODE_MUL, dstreg(PROGRAM_TEMPORARY, tempreg), + emit2(p, OPCODE_MUL, dstreg(PROGRAM_TEMPORARY, tempreg), swizzle(inst->SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W), swizzle(inst->SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W)); - emit3(ctx, OPCODE_MAD, inst->DstReg, + emit3(p, OPCODE_MAD, inst->DstReg, swizzle(inst->SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W), swizzle(inst->SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W), negate(srcreg(PROGRAM_TEMPORARY, tempreg))); @@ -264,20 +258,21 @@ static void transform_XPD(struct radeon_program_transform_context* ctx, * @todo add LIT here as well? */ GLboolean radeonTransformALU( - struct radeon_program_transform_context* ctx, + GLcontext* ctx, + struct gl_program* prog, struct prog_instruction* inst, void* unused) { switch(inst->Opcode) { - case OPCODE_ABS: transform_ABS(ctx, inst); return GL_TRUE; - case OPCODE_DPH: transform_DPH(ctx, inst); return GL_TRUE; - case OPCODE_FLR: transform_FLR(ctx, inst); return GL_TRUE; - case OPCODE_POW: transform_POW(ctx, inst); return GL_TRUE; - case OPCODE_SGE: transform_SGE(ctx, inst); return GL_TRUE; - case OPCODE_SLT: transform_SLT(ctx, inst); return GL_TRUE; - case OPCODE_SUB: transform_SUB(ctx, inst); return GL_TRUE; - case OPCODE_SWZ: transform_SWZ(ctx, inst); return GL_TRUE; - case OPCODE_XPD: transform_XPD(ctx, inst); return GL_TRUE; + case OPCODE_ABS: transform_ABS(prog, inst); return GL_TRUE; + case OPCODE_DPH: transform_DPH(prog, inst); return GL_TRUE; + case OPCODE_FLR: transform_FLR(prog, inst); return GL_TRUE; + case OPCODE_POW: transform_POW(prog, inst); return GL_TRUE; + case OPCODE_SGE: transform_SGE(prog, inst); return GL_TRUE; + case OPCODE_SLT: transform_SLT(prog, inst); return GL_TRUE; + case OPCODE_SUB: transform_SUB(prog, inst); return GL_TRUE; + case OPCODE_SWZ: transform_SWZ(prog, inst); return GL_TRUE; + case OPCODE_XPD: transform_XPD(prog, inst); return GL_TRUE; default: return GL_FALSE; } diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.h b/src/mesa/drivers/dri/r300/radeon_program_alu.h index 940459624f..f5beb9f8c3 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.h +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.h @@ -31,7 +31,8 @@ #include "radeon_program.h" GLboolean radeonTransformALU( - struct radeon_program_transform_context*, + GLcontext*, + struct gl_program*, struct prog_instruction*, void*); -- cgit v1.2.3 From 364d45a3e1629f32c6ab5407f92618a16c9d45e0 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 16:07:37 +0200 Subject: r500: Major refactoring of fragment program emit Use the common facilities to convert non-native instructions into native ones. Worked hard to make the code easier to read (hopefully), by using helper functions instead of direct manipulation of the machine code. Fixes two bugs related to FLR and XPD. --- src/mesa/drivers/dri/r300/r300_reg.h | 6 + src/mesa/drivers/dri/r300/r500_fragprog.c | 9 +- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 1218 +++++++++--------------- 3 files changed, 458 insertions(+), 775 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index 58a19554c7..cd232c5b7b 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -2705,6 +2705,7 @@ enum { # define R500_ALPHA_OP_MDV 15 # define R500_ALPHA_ADDRD(x) (x << 4) # define R500_ALPHA_ADDRD_REL (1 << 11) +# define R500_ALPHA_SEL_A_SHIFT 12 # define R500_ALPHA_SEL_A_SRC0 (0 << 12) # define R500_ALPHA_SEL_A_SRC1 (1 << 12) # define R500_ALPHA_SEL_A_SRC2 (2 << 12) @@ -2721,6 +2722,7 @@ enum { # define R500_ALPHA_MOD_A_NEG (1 << 17) # define R500_ALPHA_MOD_A_ABS (2 << 17) # define R500_ALPHA_MOD_A_NAB (3 << 17) +# define R500_ALPHA_SEL_B_SHIFT 19 # define R500_ALPHA_SEL_B_SRC0 (0 << 19) # define R500_ALPHA_SEL_B_SRC1 (1 << 19) # define R500_ALPHA_SEL_B_SRC2 (2 << 19) @@ -2777,6 +2779,7 @@ enum { # define R500_ALU_RGBA_OP_MDV (12 << 0) # define R500_ALU_RGBA_ADDRD(x) (x << 4) # define R500_ALU_RGBA_ADDRD_REL (1 << 11) +# define R500_ALU_RGBA_SEL_C_SHIFT 12 # define R500_ALU_RGBA_SEL_C_SRC0 (0 << 12) # define R500_ALU_RGBA_SEL_C_SRC1 (1 << 12) # define R500_ALU_RGBA_SEL_C_SRC2 (2 << 12) @@ -2809,6 +2812,7 @@ enum { # define R500_ALU_RGBA_MOD_C_NEG (1 << 23) # define R500_ALU_RGBA_MOD_C_ABS (2 << 23) # define R500_ALU_RGBA_MOD_C_NAB (3 << 23) +# define R500_ALU_RGBA_ALPHA_SEL_C_SHIFT 25 # define R500_ALU_RGBA_ALPHA_SEL_C_SRC0 (0 << 25) # define R500_ALU_RGBA_ALPHA_SEL_C_SRC1 (1 << 25) # define R500_ALU_RGBA_ALPHA_SEL_C_SRC2 (2 << 25) @@ -2826,6 +2830,7 @@ enum { # define R500_ALU_RGBA_ALPHA_MOD_C_ABS (2 << 30) # define R500_ALU_RGBA_ALPHA_MOD_C_NAB (3 << 30) #define R500_US_ALU_RGB_INST_0 0xa000 +# define R500_ALU_RGB_SEL_A_SHIFT 0 # define R500_ALU_RGB_SEL_A_SRC0 (0 << 0) # define R500_ALU_RGB_SEL_A_SRC1 (1 << 0) # define R500_ALU_RGB_SEL_A_SRC2 (2 << 0) @@ -2858,6 +2863,7 @@ enum { # define R500_ALU_RGB_MOD_A_NEG (1 << 11) # define R500_ALU_RGB_MOD_A_ABS (2 << 11) # define R500_ALU_RGB_MOD_A_NAB (3 << 11) +# define R500_ALU_RGB_SEL_B_SHIFT 13 # define R500_ALU_RGB_SEL_B_SRC0 (0 << 13) # define R500_ALU_RGB_SEL_B_SRC1 (1 << 13) # define R500_ALU_RGB_SEL_B_SRC2 (2 << 13) diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index 62e06ea52c..b46e924ac7 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -27,6 +27,8 @@ #include "r500_fragprog.h" +#include "radeon_program_alu.h" + /** * Transform TEX, TXP, TXB, and KIL instructions in the following way: @@ -316,11 +318,12 @@ void r500TranslateFragmentShader(r300ContextPtr r300, insert_WPOS_trailer(&compiler); - struct radeon_program_transformation transformations[1] = { - { &transform_TEX, &compiler } + struct radeon_program_transformation transformations[2] = { + { &transform_TEX, &compiler }, + { &radeonTransformALU, 0 } }; radeonLocalTransform(r300->radeon.glCtx, compiler.program, - 1, transformations); + 2, transformations); if (RADEON_DEBUG & DEBUG_PIXEL) { _mesa_printf("Compiler: after all transformations:\n"); diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 67545cbb4f..0e95c81e48 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -168,6 +168,12 @@ static const GLfloat LIT[] = {127.999999, 127.999999, -127.999999}; +static const struct prog_dst_register dstreg_template = { + .File = PROGRAM_TEMPORARY, + .Index = 0, + .WriteMask = WRITEMASK_XYZW +}; + static INLINE GLuint make_rgb_swizzle(struct prog_src_register src) { GLuint swiz = 0x0; GLuint temp; @@ -179,8 +185,14 @@ static INLINE GLuint make_rgb_swizzle(struct prog_src_register src) { if (temp == 5) temp++; swiz |= temp << i*3; } - if (src.NegateBase) - swiz |= (R500_SWIZ_MOD_NEG << 9); + if (src.Abs) { + swiz |= R500_SWIZ_MOD_ABS << 9; + } else if (src.NegateBase & 7) { + ASSERT((src.NegateBase & 7) == 7); + swiz |= R500_SWIZ_MOD_NEG << 9; + } + if (src.NegateAbs) + swiz ^= R500_SWIZ_MOD_NEG << 9; return swiz; } @@ -202,8 +214,13 @@ static INLINE GLuint make_alpha_swizzle(struct prog_src_register src) { if (swiz == 5) swiz++; - if (src.NegateBase) - swiz |= (R500_SWIZ_MOD_NEG << 3); + if (src.Abs) { + swiz |= R500_SWIZ_MOD_ABS << 3; + } else if (src.NegateBase & 8) { + swiz |= R500_SWIZ_MOD_NEG << 3; + } + if (src.NegateAbs) + swiz ^= R500_SWIZ_MOD_NEG << 3; return swiz; } @@ -212,6 +229,15 @@ static INLINE GLuint make_sop_swizzle(struct prog_src_register src) { GLuint swiz = GET_SWZ(src.Swizzle, 0); if (swiz == 5) swiz++; + + if (src.Abs) { + swiz |= R500_SWIZ_MOD_ABS << 3; + } else if (src.NegateBase & 1) { + swiz |= R500_SWIZ_MOD_NEG << 3; + } + if (src.NegateAbs) + swiz ^= R500_SWIZ_MOD_NEG << 3; + return swiz; } @@ -324,12 +350,23 @@ static GLuint make_dest(struct r500_pfs_compile_state *cs, struct prog_dst_regis return reg; } -static void emit_tex(struct r500_pfs_compile_state *cs, - struct prog_instruction *fpi, int dest, int counter) +static int emit_slot(struct r500_pfs_compile_state *cs) +{ + if (cs->nrslots >= 512) { + ERROR("Too many instructions"); + cs->nrslots = 1; + return 0; + } + return cs->nrslots++; +} + +static int emit_tex(struct r500_pfs_compile_state *cs, + struct prog_instruction *fpi, int dest) { PROG_CODE; int hwsrc, hwdest; GLuint mask; + int counter = emit_slot(cs); mask = fpi->DstReg.WriteMask << 11; hwsrc = make_src(cs, fpi->SrcReg[0]); @@ -399,844 +436,490 @@ static void emit_tex(struct r500_pfs_compile_state *cs, | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); } + + return counter; } -static void emit_alu(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi) { +/* Do not call directly */ +static int _helper_emit_alu(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alphaop, + int File, int Index, int WriteMask) +{ PROG_CODE; - /* Ideally, we shouldn't have to explicitly clear memory here! */ - code->inst[counter].inst0 = 0x0; - code->inst[counter].inst1 = 0x0; - code->inst[counter].inst2 = 0x0; - code->inst[counter].inst3 = 0x0; - code->inst[counter].inst4 = 0x0; - code->inst[counter].inst5 = 0x0; + int counter = emit_slot(cs); - if (fpi->DstReg.File == PROGRAM_OUTPUT) { - code->inst[counter].inst0 = R500_INST_TYPE_OUT; + code->inst[counter].inst4 = alphaop; + code->inst[counter].inst5 = rgbop; - if (fpi->DstReg.Index == FRAG_RESULT_COLR) - code->inst[counter].inst0 |= (fpi->DstReg.WriteMask << 15); + if (File == PROGRAM_OUTPUT) { + code->inst[counter].inst0 = R500_INST_TYPE_OUT; - if (fpi->DstReg.Index == FRAG_RESULT_DEPR) { + if (Index == FRAG_RESULT_COLR) { + code->inst[counter].inst0 |= WriteMask << 15; + } else if (Index == FRAG_RESULT_DEPR) { code->inst[counter].inst4 |= R500_ALPHA_W_OMASK; - /* Notify the state emission! */ cs->compiler->fp->writes_depth = GL_TRUE; } } else { + int dest = Index + code->temp_reg_offset; + code->inst[counter].inst0 = R500_INST_TYPE_ALU - /* pixel_mask */ - | (fpi->DstReg.WriteMask << 11); + | (WriteMask << 11); + code->inst[counter].inst4 |= R500_ALPHA_ADDRD(dest); + code->inst[counter].inst5 |= R500_ALU_RGBA_ADDRD(dest); } code->inst[counter].inst0 |= R500_INST_TEX_SEM_WAIT; + + return counter; } -static void emit_mov(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, GLuint src_reg, GLuint swizzle, GLuint dest) { +/** + * Prepare an ALU slot with the given RGB operation, ALPHA operation, and + * destination register. + */ +static int emit_alu(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alphaop, struct prog_dst_register dst) +{ + return _helper_emit_alu(cs, rgbop, alphaop, dst.File, dst.Index, dst.WriteMask); +} + +static int emit_alu_temp(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alphaop, int dst, int writemask) +{ + return _helper_emit_alu(cs, rgbop, alphaop, + PROGRAM_TEMPORARY, dst - cs->compiler->code->temp_reg_offset, writemask); +} + +/** + * Set an instruction's source 0 (both RGB and ALPHA) to the given hardware index. + */ +static void set_src0_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) +{ PROG_CODE; - /* The r3xx shader uses MAD to implement MOV. We are using CMP, since - * it is technically more accurate and recommended by ATI/AMD. */ - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src_reg); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src_reg); - /* (De)mangle the swizzle from Mesa to R500. */ - swizzle = make_rgba_swizzle(swizzle); - /* 0x1FF is 9 bits, size of an RGB swizzle. */ - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A((swizzle & 0x1ff)) - | R500_ALU_RGB_SEL_B_SRC0 - | MAKE_SWIZ_RGB_B((swizzle & 0x1ff)) - | R500_ALU_RGB_OMOD_DISABLE; - code->inst[counter].inst4 |= R500_ALPHA_OP_CMP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(GET_SWZ(swizzle, 3)) - | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(GET_SWZ(swizzle, 3)) - | R500_ALPHA_OMOD_DISABLE; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP - | R500_ALU_RGBA_ADDRD(dest) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); + code->inst[ip].inst1 |= R500_RGB_ADDR0(src); + code->inst[ip].inst2 |= R500_ALPHA_ADDR0(src); } -static void emit_mad(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, int one, int two, int three) { +/** + * Set an instruction's source 1 (both RGB and ALPHA) to the given hardware index. + */ +static void set_src1_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) +{ PROG_CODE; - /* Note: This code was all Corbin's. Corbin is a rather hackish coder. - * If you can make it pretty or fast, please do so! */ - emit_alu(cs, counter, fpi); - /* Common MAD stuff */ - code->inst[counter].inst4 |= R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(make_dest(cs, fpi->DstReg)); - code->inst[counter].inst5 |= R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(make_dest(cs, fpi->DstReg)); - switch (one) { - case 0: - case 1: - case 2: - code->inst[counter].inst1 |= R500_RGB_ADDR0(make_src(cs, fpi->SrcReg[one])); - code->inst[counter].inst2 |= R500_ALPHA_ADDR0(make_src(cs, fpi->SrcReg[one])); - code->inst[counter].inst3 |= R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[one])); - code->inst[counter].inst4 |= R500_ALPHA_SEL_A_SRC0 - | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[one])); - break; - case R500_SWIZZLE_ZERO: - code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO); - code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO); - break; - case R500_SWIZZLE_ONE: - code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE); - code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE); - break; - default: - ERROR("Bad src index in emit_mad: %d\n", one); - break; - } - switch (two) { - case 0: - case 1: - case 2: - code->inst[counter].inst1 |= R500_RGB_ADDR1(make_src(cs, fpi->SrcReg[two])); - code->inst[counter].inst2 |= R500_ALPHA_ADDR1(make_src(cs, fpi->SrcReg[two])); - code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1 - | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[two])); - code->inst[counter].inst4 |= R500_ALPHA_SEL_B_SRC1 - | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[two])); - break; - case R500_SWIZZLE_ZERO: - code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO); - code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO); - break; - case R500_SWIZZLE_ONE: - code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE); - code->inst[counter].inst4 |= MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE); - break; - default: - ERROR("Bad src index in emit_mad: %d\n", two); - break; - } - switch (three) { - case 0: - case 1: - case 2: - code->inst[counter].inst1 |= R500_RGB_ADDR2(make_src(cs, fpi->SrcReg[three])); - code->inst[counter].inst2 |= R500_ALPHA_ADDR2(make_src(cs, fpi->SrcReg[three])); - code->inst[counter].inst5 |= R500_ALU_RGBA_SEL_C_SRC2 - | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[three])) - | R500_ALU_RGBA_ALPHA_SEL_C_SRC2 - | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[three])); - break; - case R500_SWIZZLE_ZERO: - code->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - break; - case R500_SWIZZLE_ONE: - code->inst[counter].inst5 |= MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ONE) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ONE); - break; - default: - ERROR("Bad src index in emit_mad: %d\n", three); - break; - } + code->inst[ip].inst1 |= R500_RGB_ADDR1(src); + code->inst[ip].inst2 |= R500_ALPHA_ADDR1(src); } -static void emit_sop(struct r500_pfs_compile_state *cs, int counter, struct prog_instruction *fpi, int opcode, GLuint src, GLuint swiz, GLuint dest) { +/** + * Set an instruction's source 2 (both RGB and ALPHA) to the given hardware index. + */ +static void set_src2_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) +{ PROG_CODE; - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src); - code->inst[counter].inst4 |= R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(swiz); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_SOP - | R500_ALU_RGBA_ADDRD(dest); - switch (opcode) { - case OPCODE_COS: - code->inst[counter].inst4 |= R500_ALPHA_OP_COS; - break; - case OPCODE_EX2: - code->inst[counter].inst4 |= R500_ALPHA_OP_EX2; - break; - case OPCODE_LG2: - code->inst[counter].inst4 |= R500_ALPHA_OP_LN2; - break; - case OPCODE_RCP: - code->inst[counter].inst4 |= R500_ALPHA_OP_RCP; - break; - case OPCODE_RSQ: - code->inst[counter].inst4 |= R500_ALPHA_OP_RSQ; - break; - case OPCODE_SIN: - code->inst[counter].inst4 |= R500_ALPHA_OP_SIN; - break; - default: - ERROR("Bad opcode in emit_sop: %d\n", opcode); - break; + code->inst[ip].inst1 |= R500_RGB_ADDR2(src); + code->inst[ip].inst2 |= R500_ALPHA_ADDR2(src); +} + +/** + * Set an instruction's source 0 (both RGB and ALPHA) according to the given source register. + */ +static void set_src0(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) +{ + set_src0_direct(cs, ip, make_src(cs, srcreg)); +} + +/** + * Set an instruction's source 1 (both RGB and ALPHA) according to the given source register. + */ +static void set_src1(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) +{ + set_src1_direct(cs, ip, make_src(cs, srcreg)); +} + +/** + * Set an instruction's source 2 (both RGB and ALPHA) according to the given source register. + */ +static void set_src2(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) +{ + set_src2_direct(cs, ip, make_src(cs, srcreg)); +} + +/** + * Set an instruction's argument A (both RGB and ALPHA) from the given source, + * taking swizzles+neg+abs as specified (see also _reg version below). + */ +static void set_argA(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) +{ + PROG_CODE; + code->inst[ip].inst3 |= (source << R500_ALU_RGB_SEL_A_SHIFT) | MAKE_SWIZ_RGB_A(swizRGB); + code->inst[ip].inst4 |= (source << R500_ALPHA_SEL_A_SHIFT) | MAKE_SWIZ_ALPHA_A(swizA); +} + +/** + * Set an instruction's argument B (both RGB and ALPHA) from the given source, + * taking swizzles+neg+abs as specified (see also _reg version below). + */ +static void set_argB(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) +{ + PROG_CODE; + code->inst[ip].inst3 |= (source << R500_ALU_RGB_SEL_B_SHIFT) | MAKE_SWIZ_RGB_B(swizRGB); + code->inst[ip].inst4 |= (source << R500_ALPHA_SEL_B_SHIFT) | MAKE_SWIZ_ALPHA_B(swizA); +} + +/** + * Set an instruction's argument C (both RGB and ALPHA) from the given source, + * taking swizzles+neg+abs as specified (see also _reg version below). + */ +static void set_argC(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) +{ + PROG_CODE; + code->inst[ip].inst5 |= + (source << R500_ALU_RGBA_SEL_C_SHIFT) | + MAKE_SWIZ_RGBA_C(swizRGB) | + (source << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT) | + MAKE_SWIZ_ALPHA_C(swizA); +} + +/** + * Set an instruction's argument A (both RGB and ALPHA) from the given source, + * taking swizzles, negation and absolute value from the given source register. + */ +static void set_argA_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) +{ + set_argA(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); +} + +/** + * Set an instruction's argument B (both RGB and ALPHA) from the given source, + * taking swizzles, negation and absolute value from the given source register. + */ +static void set_argB_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) +{ + set_argB(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); +} + +/** + * Set an instruction's argument C (both RGB and ALPHA) from the given source, + * taking swizzles, negation and absolute value from the given source register. + */ +static void set_argC_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) +{ + set_argC(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); +} + +/** + * Emit a special scalar operation. + */ +static int emit_sop(struct r500_pfs_compile_state *cs, + int opcode, struct prog_dst_register dstreg, GLuint src, GLuint swiz) +{ + int ip = emit_alu(cs, R500_ALU_RGBA_OP_SOP, opcode, dstreg); + set_src0_direct(cs, ip, src); + set_argA(cs, ip, 0, R500_SWIZ_RGB_ZERO, swiz); + return ip; +} + + +/** + * Emit trigonometric function COS, SIN, SCS + */ +static void emit_trig(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) +{ + int ip; + struct prog_dst_register temp = dstreg_template; + temp.Index = get_temp(cs, 0); + temp.WriteMask = WRITEMASK_W; + + /* temp = Input*(1/2pi) */ + ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, temp); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1_direct(cs, ip, emit_const4fv(cs, RCP_2PI)); + set_argA(cs, ip, 0, R500_SWIZ_RGB_ZERO, make_sop_swizzle(fpi->SrcReg[0])); + set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, SWIZZLE_W); + set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); + + /* temp = frac(dst) */ + ip = emit_alu(cs, R500_ALU_RGBA_OP_FRC, R500_ALPHA_OP_FRC, temp); + set_src0_direct(cs, ip, temp.Index); + set_argA(cs, ip, 0, R500_SWIZ_RGB_RGB, SWIZZLE_W); + + /* Dest = trig(temp) */ + if (fpi->Opcode == OPCODE_COS) { + emit_sop(cs, R500_ALPHA_OP_COS, fpi->DstReg, temp.Index, SWIZZLE_W); + } else if (fpi->Opcode == OPCODE_SIN) { + emit_sop(cs, R500_ALPHA_OP_SIN, fpi->DstReg, temp.Index, SWIZZLE_W); + } else if (fpi->Opcode == OPCODE_SCS) { + struct prog_dst_register moddst = fpi->DstReg; + + if (fpi->DstReg.WriteMask & WRITEMASK_X) { + moddst.WriteMask = WRITEMASK_X; + emit_sop(cs, R500_ALPHA_OP_COS, fpi->DstReg, temp.Index, SWIZZLE_W); + } + if (fpi->DstReg.WriteMask & WRITEMASK_Y) { + moddst.WriteMask = WRITEMASK_Y; + emit_sop(cs, R500_ALPHA_OP_SIN, fpi->DstReg, temp.Index, SWIZZLE_W); + } } } -static int do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi, int counter) { +/** + * Emit a LIT instruction. + * + * Definition of LIT (from ARB_fragment_program): + * tmp = VectorLoad(op0); + * if (tmp.x < 0) tmp.x = 0; + * if (tmp.y < 0) tmp.y = 0; + * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon); + * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon; + * result.x = 1.0; + * result.y = tmp.x; + * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0; + * result.w = 1.0; + */ +static void emit_lit(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) +{ + GLuint cnst; + int needTemporary; + GLuint temp; + int ip; + + cnst = emit_const4fv(cs, LIT); + + needTemporary = 0; + if (fpi->DstReg.WriteMask != WRITEMASK_XYZW || fpi->DstReg.File == PROGRAM_OUTPUT) + needTemporary = 1; + + if (needTemporary) { + temp = get_temp(cs, 0); + } else { + temp = fpi->DstReg.Index; + } + + // MAX tmp.xyw, op0, { 0, 0, 0, -128+eps } + ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MAX, R500_ALPHA_OP_MAX, temp, WRITEMASK_XYW); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1_direct(cs, ip, cnst); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, SWIZZLE_W); + + // MIN tmp.z, tmp.w, { 128-eps } + // LG2 tmp.w, tmp.y + ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MIN, R500_ALPHA_OP_LN2, temp, WRITEMASK_ZW); + set_src0_direct(cs, ip, temp); + set_src1_direct(cs, ip, cnst); + set_argA(cs, ip, 0, SWIZZLE_W | (SWIZZLE_W<<3) | (SWIZZLE_W<<6), SWIZZLE_Y); + set_argB(cs, ip, 1, SWIZZLE_X | (SWIZZLE_X<<3) | (SWIZZLE_X<<6), SWIZZLE_X); + + // MOV tmp.y, tmp.x + // MUL tmp.w, tmp.z, tmp.w + ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, temp, WRITEMASK_YW); + set_src0_direct(cs, ip, temp); + set_argA(cs, ip, 0, SWIZZLE_X | (SWIZZLE_X<<3) | (SWIZZLE_X<<6), SWIZZLE_Z); + set_argB(cs, ip, 0, R500_SWIZ_RGB_ONE, SWIZZLE_W); + set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); + + // MOV tmp.x, 1.0 + // EX2 tmp.w, tmp.w + ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_EX2, temp, WRITEMASK_XW); + set_src0_direct(cs, ip, temp); + set_argA(cs, ip, 0, R500_SWIZ_RGB_ONE, SWIZZLE_W); + set_argB(cs, ip, 0, R500_SWIZ_RGB_ONE, R500_SWIZZLE_ZERO); + set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); + + // tmp.z := (-tmp.x >= 0) ? tmp.y : 0.0 + // MOV tmp.w, 1.0 + ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, temp, WRITEMASK_ZW); + set_src0_direct(cs, ip, temp); + set_argA(cs, ip, 0, R500_SWIZZLE_ZERO, R500_SWIZZLE_ONE); + set_argB(cs, ip, 0, SWIZZLE_W | (SWIZZLE_W<<3) | (SWIZZLE_W<<6), R500_SWIZZLE_ONE); + set_argC(cs, ip, 0, SWIZZLE_Y | (SWIZZLE_Y<<3) | (SWIZZLE_Y<<6) | (R500_SWIZ_MOD_NEG<<9), R500_SWIZZLE_ZERO); + + if (needTemporary) { + ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg); + set_src0_direct(cs, ip, temp); + set_argA(cs, ip, 0, R500_SWIZ_RGB_RGB, SWIZZLE_W); + set_argB(cs, ip, 1, R500_SWIZ_RGB_RGB, SWIZZLE_W); + set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); + } +} + +static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) { PROG_CODE; GLuint src[3], dest = 0; - int temp_swiz = 0; + int ip; if (fpi->Opcode != OPCODE_KIL) { dest = make_dest(cs, fpi->DstReg); } switch (fpi->Opcode) { - case OPCODE_ABS: - emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest); - code->inst[counter].inst3 |= R500_ALU_RGB_MOD_A_ABS - | R500_ALU_RGB_MOD_B_ABS; - code->inst[counter].inst4 |= R500_ALPHA_MOD_A_ABS - | R500_ALPHA_MOD_B_ABS; - break; case OPCODE_ADD: /* Variation on MAD: 1*src0+src1 */ - emit_mad(cs, counter, fpi, R500_SWIZZLE_ONE, 0, 1); + ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_argA(cs, ip, 0, R500_SWIZ_RGB_ONE, R500_SWIZZLE_ONE); + set_argB_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argC_reg(cs, ip, 1, fpi->SrcReg[1]); break; case OPCODE_CMP: /* This inst's selects need to be swapped as follows: * 0 -> C ; 1 -> B ; 2 -> A */ - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - src[2] = make_src(cs, fpi->SrcReg[2]); - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[2]) - | R500_RGB_ADDR1(src[1]) | R500_RGB_ADDR2(src[0]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[2]) - | R500_ALPHA_ADDR1(src[1]) | R500_ALPHA_ADDR2(src[0]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[2])) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst4 |= R500_ALPHA_OP_CMP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[2])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP - | R500_ALU_RGBA_ADDRD(dest) - | R500_ALU_RGBA_SEL_C_SRC2 - | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGBA_ALPHA_SEL_C_SRC2 - | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0])); + ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_src2(cs, ip, fpi->SrcReg[2]); + set_argA_reg(cs, ip, 2, fpi->SrcReg[2]); + set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); + set_argC_reg(cs, ip, 0, fpi->SrcReg[0]); break; case OPCODE_COS: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = emit_const4fv(cs, RCP_2PI); - code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT - | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A - | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - counter++; - code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB); - code->inst[counter].inst4 = R500_ALPHA_OP_FRC - | R500_ALPHA_ADDRD(get_temp(cs, 1)) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC - | R500_ALU_RGBA_ADDRD(get_temp(cs, 1)); - counter++; - emit_sop(cs, counter, fpi, OPCODE_COS, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest); + emit_trig(cs, fpi); break; case OPCODE_DP3: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst4 |= R500_ALPHA_OP_DP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP3 - | R500_ALU_RGBA_ADDRD(dest); + ip = emit_alu(cs, R500_ALU_RGBA_OP_DP3, R500_ALPHA_OP_DP, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); break; case OPCODE_DP4: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - /* Based on DP3 */ - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst4 |= R500_ALPHA_OP_DP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4 - | R500_ALU_RGBA_ADDRD(dest); - break; - case OPCODE_DPH: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - /* Based on DP3 */ - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst4 |= R500_ALPHA_OP_DP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_DP4 - | R500_ALU_RGBA_ADDRD(dest); + ip = emit_alu(cs, R500_ALU_RGBA_OP_DP4, R500_ALPHA_OP_DP, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); break; case OPCODE_DST: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); /* [1, src0.y*src1.y, src0.z, src1.w] - * So basically MUL with lotsa swizzling. */ - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | R500_ALU_RGB_SEL_B_SRC1; - /* Select [1, y, z, 1] */ - temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x7) | R500_SWIZZLE_ONE; - code->inst[counter].inst3 |= MAKE_SWIZ_RGB_A(temp_swiz); - /* Select [1, y, 1, w] */ - temp_swiz = (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x1c7) | R500_SWIZZLE_ONE | (R500_SWIZZLE_ONE << 6); - code->inst[counter].inst3 |= MAKE_SWIZ_RGB_B(temp_swiz); - code->inst[counter].inst4 |= R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(dest) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); + * So basically MUL with lotsa swizzling. */ + ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_argA(cs, ip, 0, + (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x7) | R500_SWIZZLE_ONE, + R500_SWIZZLE_ONE); + set_argB(cs, ip, 1, + (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x1c7) | R500_SWIZZLE_ONE | (R500_SWIZZLE_ONE << 6), + make_alpha_swizzle(fpi->SrcReg[1])); + set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); break; case OPCODE_EX2: src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, counter, fpi, OPCODE_EX2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest); + emit_sop(cs, R500_ALPHA_OP_EX2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; case OPCODE_FLR: - src[0] = make_src(cs, fpi->SrcReg[0]); - code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst4 |= R500_ALPHA_OP_FRC - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)); - counter++; - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(get_temp(cs, 0)); - code->inst[counter].inst3 = MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE) - | R500_ALU_RGB_SEL_B_SRC0 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SWIZ_A_A - | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(dest) - | R500_ALU_RGBA_SEL_C_SRC1 - | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGBA_ALPHA_SEL_C_SRC1 - | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGBA_MOD_C_NEG; + dest = get_temp(cs, 0); + ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_FRC, R500_ALPHA_OP_FRC, dest, WRITEMASK_XYZW); + set_src0(cs, ip, fpi->SrcReg[0]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + + ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1_direct(cs, ip, dest); + set_argA_reg(cs, ip, 0, fpi->SrcReg[1]); + set_argB(cs, ip, 0, R500_SWIZ_RGB_ONE, R500_SWIZZLE_ONE); + set_argC(cs, ip, 1, + R500_SWIZ_RGB_RGB|(R500_SWIZ_MOD_NEG<<9), + SWIZZLE_W|(R500_SWIZ_MOD_NEG<<3)); break; case OPCODE_FRC: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst4 |= R500_ALPHA_OP_FRC - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC - | R500_ALU_RGBA_ADDRD(dest); + ip = emit_alu(cs, R500_ALU_RGBA_OP_FRC, R500_ALPHA_OP_FRC, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); break; case OPCODE_LG2: src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest); + emit_sop(cs, R500_ALPHA_OP_LN2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; case OPCODE_LIT: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = emit_const4fv(cs, LIT); - /* First inst: MAX temp, input, [0, 0, 0, -128] - * Write: RG, A */ - code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT - | (R500_WRITEMASK_ARG << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO); - code->inst[counter].inst4 = R500_ALPHA_OP_MAX - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)); - counter++; - /* Second inst: MIN temp, temp, [x, x, x, 128] - * Write: A */ - code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_A << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)) | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)) | R500_ALPHA_ADDR1(src[1]); - /* code->inst[counter].inst3; */ - code->inst[counter].inst4 = R500_ALPHA_OP_MAX - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A - | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX - | R500_ALU_RGBA_ADDRD(dest); - counter++; - /* Third-fifth insts: POW temp, temp.y, temp.w - * Write: B */ - emit_sop(cs, counter, fpi, OPCODE_LG2, get_temp(cs, 0), SWIZZLE_Y, get_temp(cs, 1)); - code->inst[counter].inst0 |= (R500_WRITEMASK_ARGB << 11); - counter++; - code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 1)) - | R500_RGB_ADDR1(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 1)) - | R500_ALPHA_ADDR1(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 1)) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A - | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 1)) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - counter++; - emit_sop(cs, counter, fpi, OPCODE_EX2, get_temp(cs, 1), SWIZZLE_W, get_temp(cs, 0)); - code->inst[counter].inst0 |= (R500_WRITEMASK_B << 11); - counter++; - /* Sixth inst: CMP dest, temp.xxxx, temp.[1, x, z, 1], temp.[1, x, 0, 1]; - * Write: ARGB - * This inst's selects need to be swapped as follows: - * 0 -> C ; 1 -> B ; 2 -> A */ - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | R500_ALU_RGB_R_SWIZ_A_1 - | R500_ALU_RGB_G_SWIZ_A_R - | R500_ALU_RGB_B_SWIZ_A_B - | R500_ALU_RGB_SEL_B_SRC0 - | R500_ALU_RGB_R_SWIZ_B_1 - | R500_ALU_RGB_G_SWIZ_B_R - | R500_ALU_RGB_B_SWIZ_B_0; - code->inst[counter].inst4 |= R500_ALPHA_OP_CMP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_1 - | R500_ALPHA_SEL_B_SRC0 | R500_ALPHA_SWIZ_B_1; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP - | R500_ALU_RGBA_ADDRD(dest) - | R500_ALU_RGBA_SEL_C_SRC0 - | R500_ALU_RGBA_ALPHA_SEL_C_SRC0 - | R500_ALU_RGBA_R_SWIZ_R - | R500_ALU_RGBA_G_SWIZ_R - | R500_ALU_RGBA_B_SWIZ_R - | R500_ALU_RGBA_A_SWIZ_R; + emit_lit(cs, fpi); break; case OPCODE_LRP: - /* src0 * src1 + INV(src0) * src2 - * 1) MUL src0, src1, temp - * 2) PRE 1-src0; MAD srcp, src2, temp */ - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - src[2] = make_src(cs, fpi->SrcReg[2]); - code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT - | R500_INST_NOP | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - counter++; - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[2]) - | R500_RGB_ADDR2(get_temp(cs, 0)) - | R500_RGB_SRCP_OP_1_MINUS_RGB0; - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[2]) - | R500_ALPHA_ADDR2(get_temp(cs, 0)) - | R500_ALPHA_SRCP_OP_1_MINUS_A0; - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRCP - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[2])); - code->inst[counter].inst4 |= R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRCP | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[2])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(dest) - | R500_ALU_RGBA_SEL_C_SRC2 | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB) - | R500_ALU_RGBA_ALPHA_SEL_C_SRC2 | R500_ALU_RGBA_A_SWIZ_A; + /* result = src0*src1 + (1-src0)*src2 + * = src0*src1 + src2 + (-src0)*src2 + * + * Note: LRP without swizzling (or with only limited + * swizzling) could be done more efficiently using the + * presubtract hardware. + */ + dest = get_temp(cs, 0); + ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, dest, WRITEMASK_XYZW); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_src2(cs, ip, fpi->SrcReg[2]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); + set_argC_reg(cs, ip, 2, fpi->SrcReg[2]); + + ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[2]); + set_src2_direct(cs, ip, dest); + set_argA(cs, ip, 0, + make_rgb_swizzle(fpi->SrcReg[0]) ^ (R500_SWIZ_MOD_NEG<<9), + make_alpha_swizzle(fpi->SrcReg[0]) ^ (R500_SWIZ_MOD_NEG<<3)); + set_argB_reg(cs, ip, 1, fpi->SrcReg[2]); + set_argC(cs, ip, 2, R500_SWIZ_RGB_RGB, SWIZZLE_W); break; case OPCODE_MAD: - emit_mad(cs, counter, fpi, 0, 1, 2); + ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_src2(cs, ip, fpi->SrcReg[2]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); + set_argC_reg(cs, ip, 2, fpi->SrcReg[2]); break; case OPCODE_MAX: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 - | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst4 |= R500_ALPHA_OP_MAX - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAX - | R500_ALU_RGBA_ADDRD(dest); + ip = emit_alu(cs, R500_ALU_RGBA_OP_MAX, R500_ALPHA_OP_MAX, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); break; case OPCODE_MIN: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 - | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst4 |= R500_ALPHA_OP_MIN - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MIN - | R500_ALU_RGBA_ADDRD(dest); + ip = emit_alu(cs, R500_ALU_RGBA_OP_MIN, R500_ALPHA_OP_MIN, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); break; case OPCODE_MOV: - emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest); + ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); + code->inst[ip].inst3 |= R500_ALU_RGB_OMOD_DISABLE; + code->inst[ip].inst4 |= R500_ALPHA_OMOD_DISABLE; break; case OPCODE_MUL: /* Variation on MAD: src0*src1+0 */ - emit_mad(cs, counter, fpi, 0, 1, R500_SWIZZLE_ZERO); - break; - case OPCODE_POW: - /* POW(a,b) = EX2(LN2(a)*b) */ - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - emit_sop(cs, counter, fpi, OPCODE_LG2, src[0], make_sop_swizzle(fpi->SrcReg[0]), get_temp(cs, 0)); - code->inst[counter].inst0 |= (R500_WRITEMASK_ARGB << 11); - counter++; - code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(make_rgb_swizzle(fpi->SrcReg[0])) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 1)) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 1)) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - counter++; - emit_sop(cs, counter, fpi, OPCODE_EX2, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest); + ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); + set_src0(cs, ip, fpi->SrcReg[0]); + set_src1(cs, ip, fpi->SrcReg[1]); + set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); + set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); + set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); break; case OPCODE_RCP: src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, counter, fpi, OPCODE_RCP, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest); + emit_sop(cs, R500_ALPHA_OP_RCP, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; case OPCODE_RSQ: src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, counter, fpi, OPCODE_RSQ, src[0], make_sop_swizzle(fpi->SrcReg[0]), dest); + emit_sop(cs, R500_ALPHA_OP_RSQ, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; case OPCODE_SCS: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = emit_const4fv(cs, RCP_2PI); - code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT - | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A - | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - counter++; - code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB); - code->inst[counter].inst4 = R500_ALPHA_OP_FRC - | R500_ALPHA_ADDRD(get_temp(cs, 1)) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC - | R500_ALU_RGBA_ADDRD(get_temp(cs, 1)); - counter++; - /* Do a cosine, then a sine, masking out the channels we want to protect. */ - /* Cosine only goes in R (x) channel. */ - fpi->DstReg.WriteMask = 0x1; - emit_sop(cs, counter, fpi, OPCODE_COS, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest); - counter++; - /* Sine only goes in G (y) channel. */ - fpi->DstReg.WriteMask = 0x2; - emit_sop(cs, counter, fpi, OPCODE_SIN, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest); - break; - case OPCODE_SGE: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT - | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR1(src[0]) - | R500_RGB_ADDR2(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0]) - | R500_ALPHA_ADDR2(src[1]); - code->inst[counter].inst3 = /* 1 */ - MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)) - | R500_ALU_RGBA_SEL_C_SRC2 - | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1])) - | R500_ALU_RGBA_MOD_C_NEG - | R500_ALU_RGBA_ALPHA_SEL_C_SRC2 - | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1])) - | R500_ALU_RGBA_ALPHA_MOD_C_NEG; - counter++; - /* This inst's selects need to be swapped as follows: - * 0 -> C ; 1 -> B ; 2 -> A */ - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE) - | R500_ALU_RGB_SEL_B_SRC0 - | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ZERO); - code->inst[counter].inst4 |= R500_ALPHA_OP_CMP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE) - | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ZERO); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP - | R500_ALU_RGBA_ADDRD(dest) - | R500_ALU_RGBA_SEL_C_SRC0 - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB) - | R500_ALU_RGBA_ALPHA_SEL_C_SRC0 - | R500_ALU_RGBA_A_SWIZ_A; + emit_trig(cs, fpi); break; case OPCODE_SIN: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = emit_const4fv(cs, RCP_2PI); - code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT - | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A - | R500_ALPHA_SEL_B_SRC1 | R500_ALPHA_SWIZ_B_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - counter++; - code->inst[counter].inst0 = R500_INST_TYPE_ALU | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB); - code->inst[counter].inst4 = R500_ALPHA_OP_FRC - | R500_ALPHA_ADDRD(get_temp(cs, 1)) - | R500_ALPHA_SEL_A_SRC0 | R500_ALPHA_SWIZ_A_A; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_FRC - | R500_ALU_RGBA_ADDRD(get_temp(cs, 1)); - counter++; - emit_sop(cs, counter, fpi, OPCODE_SIN, get_temp(cs, 1), make_sop_swizzle(fpi->SrcReg[0]), dest); - break; - case OPCODE_SLT: - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT - | (R500_WRITEMASK_ARGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR1(src[0]) - | R500_RGB_ADDR2(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR1(src[0]) - | R500_ALPHA_ADDR2(src[1]); - code->inst[counter].inst3 = /* 1 */ - MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ONE) - | R500_ALU_RGB_SEL_B_SRC1 | MAKE_SWIZ_RGB_B(make_rgb_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ONE) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[0])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)) - | R500_ALU_RGBA_SEL_C_SRC2 - | MAKE_SWIZ_RGBA_C(make_rgb_swizzle(fpi->SrcReg[1])) - | R500_ALU_RGBA_MOD_C_NEG - | R500_ALU_RGBA_ALPHA_SEL_C_SRC2 - | MAKE_SWIZ_ALPHA_C(make_alpha_swizzle(fpi->SrcReg[1])) - | R500_ALU_RGBA_ALPHA_MOD_C_NEG; - counter++; - /* This inst's selects need to be swapped as follows: - * 0 -> C ; 1 -> B ; 2 -> A */ - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_ZERO) - | R500_ALU_RGB_SEL_B_SRC0 - | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_ONE); - code->inst[counter].inst4 |= R500_ALPHA_OP_CMP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_SWIZZLE_ZERO) - | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_SWIZZLE_ONE); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP - | R500_ALU_RGBA_ADDRD(dest) - | R500_ALU_RGBA_SEL_C_SRC0 - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB) - | R500_ALU_RGBA_ALPHA_SEL_C_SRC0 - | R500_ALU_RGBA_A_SWIZ_A; - break; - case OPCODE_SUB: - /* Variation on MAD: 1*src0-src1 */ - fpi->SrcReg[1].NegateBase = 0xF; /* NEG_XYZW */ - emit_mad(cs, counter, fpi, R500_SWIZZLE_ONE, 0, 1); - break; - case OPCODE_SWZ: - /* TODO: The rarer negation masks! */ - emit_mov(cs, counter, fpi, make_src(cs, fpi->SrcReg[0]), fpi->SrcReg[0].Swizzle, dest); - break; - case OPCODE_XPD: - /* src0 * src1 - src1 * src0 - * 1) MUL temp.xyz, src0.yzx, src1.zxy - * 2) MAD src0.zxy, src1.yzx, -temp.xyz */ - src[0] = make_src(cs, fpi->SrcReg[0]); - src[1] = make_src(cs, fpi->SrcReg[1]); - code->inst[counter].inst0 = R500_INST_TYPE_ALU | R500_INST_TEX_SEM_WAIT - | (R500_WRITEMASK_RGB << 11); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]); - /* Select [y, z, x] */ - temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]); - temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(temp_swiz); - /* Select [z, x, y] */ - temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]); - temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6); - code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1 - | MAKE_SWIZ_RGB_B(temp_swiz); - code->inst[counter].inst4 = R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(get_temp(cs, 0)) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(make_alpha_swizzle(fpi->SrcReg[0])) - | R500_ALPHA_SEL_B_SRC1 | MAKE_SWIZ_ALPHA_B(make_alpha_swizzle(fpi->SrcReg[1])); - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(get_temp(cs, 0)) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - counter++; - emit_alu(cs, counter, fpi); - code->inst[counter].inst1 = R500_RGB_ADDR0(src[0]) - | R500_RGB_ADDR1(src[1]) - | R500_RGB_ADDR2(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(src[0]) - | R500_ALPHA_ADDR1(src[1]) - | R500_ALPHA_ADDR2(get_temp(cs, 0)); - /* Select [z, x, y] */ - temp_swiz = make_rgb_swizzle(fpi->SrcReg[0]); - temp_swiz = (GET_SWZ(temp_swiz, 2) << 0) | (GET_SWZ(temp_swiz, 0) << 3) | (GET_SWZ(temp_swiz, 1) << 6); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(temp_swiz); - /* Select [y, z, x] */ - temp_swiz = make_rgb_swizzle(fpi->SrcReg[1]); - temp_swiz = (GET_SWZ(temp_swiz, 1) << 0) | (GET_SWZ(temp_swiz, 2) << 3) | (GET_SWZ(temp_swiz, 0) << 6); - code->inst[counter].inst3 |= R500_ALU_RGB_SEL_B_SRC1 - | MAKE_SWIZ_RGB_B(temp_swiz); - code->inst[counter].inst4 |= R500_ALPHA_OP_MAD - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SWIZ_A_1 - | R500_ALPHA_SWIZ_B_1; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_MAD - | R500_ALU_RGBA_ADDRD(dest) - | R500_ALU_RGBA_SEL_C_SRC2 - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_RGB) - | R500_ALU_RGBA_MOD_C_NEG - | R500_ALU_RGBA_A_SWIZ_0; + emit_trig(cs, fpi); break; case OPCODE_KIL: case OPCODE_TEX: case OPCODE_TXB: case OPCODE_TXP: - emit_tex(cs, fpi, dest, counter); - if (fpi->DstReg.File == PROGRAM_OUTPUT) - counter++; + emit_tex(cs, fpi, dest); break; default: ERROR("unknown fpi->Opcode %s\n", _mesa_opcode_string(fpi->Opcode)); @@ -1245,37 +928,30 @@ static int do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *f /* Finishing touches */ if (fpi->SaturateMode == SATURATE_ZERO_ONE) { - code->inst[counter].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP; + code->inst[cs->nrslots-1].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP; } - - counter++; - - return counter; } static GLboolean parse_program(struct r500_pfs_compile_state *cs) { PROG_CODE; - int counter = 0; struct prog_instruction* fpi; for(fpi = cs->compiler->program->Instructions; fpi->Opcode != OPCODE_END; ++fpi) { - counter = do_inst(cs, fpi, counter); + do_inst(cs, fpi); if (cs->compiler->fp->error) return GL_FALSE; } /* Finish him! (If it's an ALU/OUT instruction...) */ - if ((code->inst[counter-1].inst0 & 0x3) == 1) { - code->inst[counter-1].inst0 |= R500_INST_LAST; + if ((code->inst[cs->nrslots-1].inst0 & 0x3) == 1) { + code->inst[cs->nrslots-1].inst0 |= R500_INST_LAST; } else { /* We still need to put an output inst, right? */ WARN_ONCE("Final FP instruction is not an OUT.\n"); } - cs->nrslots = counter; - code->max_temp_idx++; return GL_TRUE; @@ -1295,12 +971,10 @@ static void init_program(struct r500_pfs_compile_state *cs) driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization"); cs->compiler->fp->translated = GL_FALSE; cs->compiler->fp->error = GL_FALSE; - code->const_nr = 0; - /* Size of pixel stack, plus 1. */ - code->max_temp_idx = 1; - /* Temp register offset. */ - code->temp_reg_offset = 0; - /* Whether or not we perform any depth writing. */ + + _mesa_bzero(code, sizeof(*code)); + code->max_temp_idx = 1; /* Size of pixel stack, plus 1. */ + cs->nrslots = 0; cs->compiler->fp->writes_depth = GL_FALSE; /* Work out what temps the Mesa inputs correspond to, this must match -- cgit v1.2.3 From 77fdfaa23adeaaf6a217ef1ee751410c6a5b0d21 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 20:01:20 +0200 Subject: r300: Correctly scan for used temporary registers This fixes a regression introduced by dea8719f0... --- src/mesa/drivers/dri/r300/r300_fragprog.c | 28 ++++----- src/mesa/drivers/dri/r300/r500_fragprog.c | 18 +++--- src/mesa/drivers/dri/r300/radeon_program.c | 52 +++++++++++++--- src/mesa/drivers/dri/r300/radeon_program.h | 20 +++++- src/mesa/drivers/dri/r300/radeon_program_alu.c | 84 +++++++++++++------------- src/mesa/drivers/dri/r300/radeon_program_alu.h | 3 +- 6 files changed, 126 insertions(+), 79 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 8c49e8ada6..6a8ef0ef5f 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -68,7 +68,7 @@ static void reset_srcreg(struct prog_src_register* reg) * be reused. */ static GLboolean transform_TEX( - GLcontext *ctx, struct gl_program *p, + struct radeon_transform_context *t, struct prog_instruction* orig_inst, void* data) { struct r300_fragment_program_compiler *compiler = @@ -84,11 +84,11 @@ static GLboolean transform_TEX( return GL_FALSE; if (inst.Opcode != OPCODE_KIL && - p->ShadowSamplers & (1 << inst.TexSrcUnit)) { + t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) { - tgt = radeonAppendInstructions(p, 1); + tgt = radeonAppendInstructions(t->Program, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg = inst.DstReg; @@ -98,7 +98,7 @@ static GLboolean transform_TEX( } inst.DstReg.File = PROGRAM_TEMPORARY; - inst.DstReg.Index = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + inst.DstReg.Index = radeonFindFreeTemporary(t); inst.DstReg.WriteMask = WRITEMASK_XYZW; } @@ -113,7 +113,7 @@ static GLboolean transform_TEX( 0 }; - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int tempreg = radeonFindFreeTemporary(t); int factor_index; tokens[2] = inst.TexSrcUnit; @@ -121,7 +121,7 @@ static GLboolean transform_TEX( _mesa_add_state_reference( compiler->fp->mesa_program.Base.Parameters, tokens); - tgt = radeonAppendInstructions(p, 1); + tgt = radeonAppendInstructions(t->Program, 1); tgt->Opcode = OPCODE_MUL; tgt->DstReg.File = PROGRAM_TEMPORARY; @@ -140,9 +140,9 @@ static GLboolean transform_TEX( */ if (inst.SrcReg[0].Swizzle != SWIZZLE_NOOP || inst.SrcReg[0].Abs || inst.SrcReg[0].NegateBase || inst.SrcReg[0].NegateAbs) { - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int tempreg = radeonFindFreeTemporary(t); - tgt = radeonAppendInstructions(p, 1); + tgt = radeonAppendInstructions(t->Program, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg.File = PROGRAM_TEMPORARY; @@ -157,7 +157,7 @@ static GLboolean transform_TEX( if (inst.Opcode != OPCODE_KIL) { if (inst.DstReg.File != PROGRAM_TEMPORARY || inst.DstReg.WriteMask != WRITEMASK_XYZW) { - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int tempreg = radeonFindFreeTemporary(t); inst.DstReg.File = PROGRAM_TEMPORARY; inst.DstReg.Index = tempreg; @@ -166,16 +166,16 @@ static GLboolean transform_TEX( } } - tgt = radeonAppendInstructions(p, 1); + tgt = radeonAppendInstructions(t->Program, 1); _mesa_copy_instructions(tgt, &inst, 1); if (inst.Opcode != OPCODE_KIL && - p->ShadowSamplers & (1 << inst.TexSrcUnit)) { + t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode; - int rcptemp = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int rcptemp = radeonFindFreeTemporary(t); - tgt = radeonAppendInstructions(p, 3); + tgt = radeonAppendInstructions(t->Program, 3); tgt[0].Opcode = OPCODE_RCP; tgt[0].DstReg.File = PROGRAM_TEMPORARY; @@ -222,7 +222,7 @@ static GLboolean transform_TEX( tgt[2].SrcReg[2].Swizzle = SWIZZLE_1111; } } else if (destredirect) { - tgt = radeonAppendInstructions(p, 1); + tgt = radeonAppendInstructions(t->Program, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg = orig_inst->DstReg; diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index b46e924ac7..7ee8494722 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -38,7 +38,7 @@ * */ static GLboolean transform_TEX( - GLcontext *ctx, struct gl_program *p, + struct radeon_transform_context *t, struct prog_instruction* orig_inst, void* data) { struct r500_fragment_program_compiler *compiler = @@ -55,11 +55,11 @@ static GLboolean transform_TEX( /* ARB_shadow & EXT_shadow_funcs */ if (inst.Opcode != OPCODE_KIL && - p->ShadowSamplers & (1 << inst.TexSrcUnit)) { + t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) { - tgt = radeonAppendInstructions(p, 1); + tgt = radeonAppendInstructions(t->Program, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg.File = inst.DstReg.File; @@ -71,20 +71,20 @@ static GLboolean transform_TEX( } inst.DstReg.File = PROGRAM_TEMPORARY; - inst.DstReg.Index = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + inst.DstReg.Index = radeonFindFreeTemporary(t); inst.DstReg.WriteMask = WRITEMASK_XYZW; } - tgt = radeonAppendInstructions(p, 1); + tgt = radeonAppendInstructions(t->Program, 1); _mesa_copy_instructions(tgt, &inst, 1); if (inst.Opcode != OPCODE_KIL && - p->ShadowSamplers & (1 << inst.TexSrcUnit)) { + t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) { GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func; GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode; - int rcptemp = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int rcptemp = radeonFindFreeTemporary(t); - tgt = radeonAppendInstructions(p, 3); + tgt = radeonAppendInstructions(t->Program, 3); tgt[0].Opcode = OPCODE_RCP; tgt[0].DstReg.File = PROGRAM_TEMPORARY; @@ -131,7 +131,7 @@ static GLboolean transform_TEX( tgt[2].SrcReg[2].Swizzle = SWIZZLE_1111; } } else if (destredirect) { - tgt = radeonAppendInstructions(p, 1); + tgt = radeonAppendInstructions(t->Program, 1); tgt->Opcode = OPCODE_MOV; tgt->DstReg = orig_inst->DstReg; diff --git a/src/mesa/drivers/dri/r300/radeon_program.c b/src/mesa/drivers/dri/r300/radeon_program.c index 3112339f81..da5e7aefce 100644 --- a/src/mesa/drivers/dri/r300/radeon_program.c +++ b/src/mesa/drivers/dri/r300/radeon_program.c @@ -46,29 +46,30 @@ * one instruction at a time. */ void radeonLocalTransform( - GLcontext *ctx, + GLcontext *Ctx, struct gl_program *program, int num_transformations, struct radeon_program_transformation* transformations) { - struct prog_instruction *source; - int numinstructions; + struct radeon_transform_context ctx; int ip; - source = program->Instructions; - numinstructions = program->NumInstructions; + ctx.Ctx = Ctx; + ctx.Program = program; + ctx.OldInstructions = program->Instructions; + ctx.OldNumInstructions = program->NumInstructions; program->Instructions = 0; program->NumInstructions = 0; - for(ip = 0; ip < numinstructions; ++ip) { - struct prog_instruction *instr = source + ip; + for(ip = 0; ip < ctx.OldNumInstructions; ++ip) { + struct prog_instruction *instr = ctx.OldInstructions + ip; int i; for(i = 0; i < num_transformations; ++i) { struct radeon_program_transformation* t = transformations + i; - if (t->function(ctx, program, instr, t->userData)) + if (t->function(&ctx, instr, t->userData)) break; } @@ -78,7 +79,40 @@ void radeonLocalTransform( } } - _mesa_free_instructions(source, numinstructions); + _mesa_free_instructions(ctx.OldInstructions, ctx.OldNumInstructions); +} + + +static void scan_instructions(GLboolean* used, const struct prog_instruction* insts, GLuint count) +{ + GLuint i; + for (i = 0; i < count; i++) { + const struct prog_instruction *inst = insts + i; + const GLuint n = _mesa_num_inst_src_regs(inst->Opcode); + GLuint k; + + for (k = 0; k < n; k++) { + if (inst->SrcReg[k].File == PROGRAM_TEMPORARY) + used[inst->SrcReg[k].Index] = GL_TRUE; + } + } +} + +GLint radeonFindFreeTemporary(struct radeon_transform_context *t) +{ + GLboolean used[MAX_PROGRAM_TEMPS]; + GLuint i; + + _mesa_memset(used, 0, sizeof(used)); + scan_instructions(used, t->Program->Instructions, t->Program->NumInstructions); + scan_instructions(used, t->OldInstructions, t->OldNumInstructions); + + for (i = 0; i < MAX_PROGRAM_TEMPS; i++) { + if (!used[i]) + return i; + } + + return -1; } diff --git a/src/mesa/drivers/dri/r300/radeon_program.h b/src/mesa/drivers/dri/r300/radeon_program.h index 012104fa5a..ba76bc47cf 100644 --- a/src/mesa/drivers/dri/r300/radeon_program.h +++ b/src/mesa/drivers/dri/r300/radeon_program.h @@ -48,6 +48,19 @@ enum { #define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO) #define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE) +/** + * Transformation context that is passed to local transformations. + * + * Care must be taken with some operations during transformation, + * e.g. finding new temporary registers must use @ref radeonFindFreeTemporary + */ +struct radeon_transform_context { + GLcontext *Ctx; + struct gl_program *Program; + struct prog_instruction *OldInstructions; + GLuint OldNumInstructions; +}; + /** * A transformation that can be passed to \ref radeonLocalTransform. * @@ -60,8 +73,7 @@ enum { */ struct radeon_program_transformation { GLboolean (*function)( - GLcontext*, - struct gl_program*, + struct radeon_transform_context*, struct prog_instruction*, void*); void *userData; @@ -73,6 +85,10 @@ void radeonLocalTransform( int num_transformations, struct radeon_program_transformation* transformations); +/** + * Find a usable free temporary register during program transformation + */ +GLint radeonFindFreeTemporary(struct radeon_transform_context *ctx); struct prog_instruction *radeonAppendInstructions(struct gl_program *program, int count); diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index 3104d07fac..d6d016d7c1 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -148,24 +148,24 @@ static struct prog_src_register scalar(struct prog_src_register reg) return swizzle(reg, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X); } -static void transform_ABS(struct gl_program* p, +static void transform_ABS(struct radeon_transform_context* t, struct prog_instruction* inst) { struct prog_src_register src = inst->SrcReg[0]; src.Abs = 1; src.NegateBase = 0; src.NegateAbs = 0; - emit1(p, OPCODE_MOV, inst->DstReg, src); + emit1(t->Program, OPCODE_MOV, inst->DstReg, src); } -static void transform_DPH(struct gl_program* p, +static void transform_DPH(struct radeon_transform_context* t, struct prog_instruction* inst) { struct prog_src_register src0 = inst->SrcReg[0]; if (src0.NegateAbs) { if (src0.Abs) { - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); - emit1(p, OPCODE_MOV, dstreg(PROGRAM_TEMPORARY, tempreg), src0); + int tempreg = radeonFindFreeTemporary(t); + emit1(t->Program, OPCODE_MOV, dstreg(PROGRAM_TEMPORARY, tempreg), src0); src0 = srcreg(src0.File, src0.Index); } else { src0.NegateAbs = 0; @@ -174,70 +174,70 @@ static void transform_DPH(struct gl_program* p, } set_swizzle(&src0, 3, SWIZZLE_ONE); set_negate_base(&src0, 3, 0); - emit2(p, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]); + emit2(t->Program, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]); } -static void transform_FLR(struct gl_program* p, +static void transform_FLR(struct radeon_transform_context* t, struct prog_instruction* inst) { - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); - emit1(p, OPCODE_FRC, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]); - emit2(p, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg))); + int tempreg = radeonFindFreeTemporary(t); + emit1(t->Program, OPCODE_FRC, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]); + emit2(t->Program, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg))); } -static void transform_POW(struct gl_program* p, +static void transform_POW(struct radeon_transform_context* t, struct prog_instruction* inst) { - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int tempreg = radeonFindFreeTemporary(t); struct prog_dst_register tempdst = dstreg(PROGRAM_TEMPORARY, tempreg); struct prog_src_register tempsrc = srcreg(PROGRAM_TEMPORARY, tempreg); tempdst.WriteMask = WRITEMASK_W; tempsrc.Swizzle = SWIZZLE_WWWW; - emit1(p, OPCODE_LG2, tempdst, scalar(inst->SrcReg[0])); - emit2(p, OPCODE_MUL, tempdst, tempsrc, scalar(inst->SrcReg[1])); - emit1(p, OPCODE_EX2, inst->DstReg, tempsrc); + emit1(t->Program, OPCODE_LG2, tempdst, scalar(inst->SrcReg[0])); + emit2(t->Program, OPCODE_MUL, tempdst, tempsrc, scalar(inst->SrcReg[1])); + emit1(t->Program, OPCODE_EX2, inst->DstReg, tempsrc); } -static void transform_SGE(struct gl_program* p, +static void transform_SGE(struct radeon_transform_context* t, struct prog_instruction* inst) { - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int tempreg = radeonFindFreeTemporary(t); - emit2(p, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); - emit3(p, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one); + emit2(t->Program, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); + emit3(t->Program, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one); } -static void transform_SLT(struct gl_program* p, +static void transform_SLT(struct radeon_transform_context* t, struct prog_instruction* inst) { - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int tempreg = radeonFindFreeTemporary(t); - emit2(p, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); - emit3(p, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero); + emit2(t->Program, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); + emit3(t->Program, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero); } -static void transform_SUB(struct gl_program* p, +static void transform_SUB(struct radeon_transform_context* t, struct prog_instruction* inst) { - emit2(p, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1])); + emit2(t->Program, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1])); } -static void transform_SWZ(struct gl_program* p, +static void transform_SWZ(struct radeon_transform_context* t, struct prog_instruction* inst) { - emit1(p, OPCODE_MOV, inst->DstReg, inst->SrcReg[0]); + emit1(t->Program, OPCODE_MOV, inst->DstReg, inst->SrcReg[0]); } -static void transform_XPD(struct gl_program* p, +static void transform_XPD(struct radeon_transform_context* t, struct prog_instruction* inst) { - int tempreg = _mesa_find_free_register(p, PROGRAM_TEMPORARY); + int tempreg = radeonFindFreeTemporary(t); - emit2(p, OPCODE_MUL, dstreg(PROGRAM_TEMPORARY, tempreg), + emit2(t->Program, OPCODE_MUL, dstreg(PROGRAM_TEMPORARY, tempreg), swizzle(inst->SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W), swizzle(inst->SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W)); - emit3(p, OPCODE_MAD, inst->DstReg, + emit3(t->Program, OPCODE_MAD, inst->DstReg, swizzle(inst->SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W), swizzle(inst->SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W), negate(srcreg(PROGRAM_TEMPORARY, tempreg))); @@ -257,22 +257,20 @@ static void transform_XPD(struct gl_program* p, * * @todo add LIT here as well? */ -GLboolean radeonTransformALU( - GLcontext* ctx, - struct gl_program* prog, +GLboolean radeonTransformALU(struct radeon_transform_context* t, struct prog_instruction* inst, void* unused) { switch(inst->Opcode) { - case OPCODE_ABS: transform_ABS(prog, inst); return GL_TRUE; - case OPCODE_DPH: transform_DPH(prog, inst); return GL_TRUE; - case OPCODE_FLR: transform_FLR(prog, inst); return GL_TRUE; - case OPCODE_POW: transform_POW(prog, inst); return GL_TRUE; - case OPCODE_SGE: transform_SGE(prog, inst); return GL_TRUE; - case OPCODE_SLT: transform_SLT(prog, inst); return GL_TRUE; - case OPCODE_SUB: transform_SUB(prog, inst); return GL_TRUE; - case OPCODE_SWZ: transform_SWZ(prog, inst); return GL_TRUE; - case OPCODE_XPD: transform_XPD(prog, inst); return GL_TRUE; + case OPCODE_ABS: transform_ABS(t, inst); return GL_TRUE; + case OPCODE_DPH: transform_DPH(t, inst); return GL_TRUE; + case OPCODE_FLR: transform_FLR(t, inst); return GL_TRUE; + case OPCODE_POW: transform_POW(t, inst); return GL_TRUE; + case OPCODE_SGE: transform_SGE(t, inst); return GL_TRUE; + case OPCODE_SLT: transform_SLT(t, inst); return GL_TRUE; + case OPCODE_SUB: transform_SUB(t, inst); return GL_TRUE; + case OPCODE_SWZ: transform_SWZ(t, inst); return GL_TRUE; + case OPCODE_XPD: transform_XPD(t, inst); return GL_TRUE; default: return GL_FALSE; } diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.h b/src/mesa/drivers/dri/r300/radeon_program_alu.h index f5beb9f8c3..858c5ed0b8 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.h +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.h @@ -31,8 +31,7 @@ #include "radeon_program.h" GLboolean radeonTransformALU( - GLcontext*, - struct gl_program*, + struct radeon_transform_context *t, struct prog_instruction*, void*); -- cgit v1.2.3 From 62bccd6df0c963a14e801bcac95dc8046b978a7f Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 22:21:24 +0200 Subject: r300: Allow adding parameters during fragprog transform, share LIT code --- src/mesa/drivers/dri/r300/r300_context.h | 24 ++--- src/mesa/drivers/dri/r300/r300_fragprog.c | 18 +++- src/mesa/drivers/dri/r300/r300_fragprog_emit.c | 143 ++++--------------------- src/mesa/drivers/dri/r300/r300_state.c | 41 +++++-- src/mesa/drivers/dri/r300/r500_fragprog.c | 12 ++- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 112 +++---------------- src/mesa/drivers/dri/r300/radeon_program_alu.c | 124 +++++++++++++++++++-- 7 files changed, 216 insertions(+), 258 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index a24ab0cad7..a69beba9a7 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -716,14 +716,11 @@ struct r300_fragment_program_code { int tex_offset; int tex_end; - /* Hardware constants. - * Contains a pointer to the value. The destination of the pointer - * is supposed to be updated when GL state changes. - * Typically, this is either a pointer into - * gl_program_parameter_list::ParameterValues, or a pointer to a - * global constant (e.g. for sin/cos-approximation) + /** + * Remember which program register a given hardware constant + * belongs to. */ - const GLfloat *constant[PFS_NUM_CONST_REGS]; + struct prog_src_register constant[PFS_NUM_CONST_REGS]; int const_nr; int max_temp_idx; @@ -787,14 +784,11 @@ struct r500_fragment_program_code { int inst_offset; int inst_end; - /* Hardware constants. - * Contains a pointer to the value. The destination of the pointer - * is supposed to be updated when GL state changes. - * Typically, this is either a pointer into - * gl_program_parameter_list::ParameterValues, or a pointer to a - * global constant (e.g. for sin/cos-approximation) - */ - const GLfloat *constant[PFS_NUM_CONST_REGS]; + /** + * Remember which program register a given hardware constant + * belongs to. + */ + struct prog_src_register constant[PFS_NUM_CONST_REGS]; int const_nr; int max_temp_idx; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 6a8ef0ef5f..57987f5d0f 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -117,9 +117,7 @@ static GLboolean transform_TEX( int factor_index; tokens[2] = inst.TexSrcUnit; - factor_index = - _mesa_add_state_reference( - compiler->fp->mesa_program.Base.Parameters, tokens); + factor_index = _mesa_add_state_reference(t->Program->Parameters, tokens); tgt = radeonAppendInstructions(t->Program, 1); @@ -303,7 +301,7 @@ static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler) i++; /* viewport transformation */ - window_index = _mesa_add_state_reference(compiler->fp->mesa_program.Base.Parameters, tokens); + window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens); fpi[i].Opcode = OPCODE_MAD; @@ -401,6 +399,11 @@ void r300TranslateFragmentShader(r300ContextPtr r300, compiler.code = &fp->code; compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base); + if (RADEON_DEBUG & DEBUG_PIXEL) { + _mesa_printf("Fragment Program: Initial program:\n"); + _mesa_print_program(compiler.program); + } + insert_WPOS_trailer(&compiler); struct radeon_program_transformation transformations[] = { @@ -413,13 +416,18 @@ void r300TranslateFragmentShader(r300ContextPtr r300, 2, transformations); if (RADEON_DEBUG & DEBUG_PIXEL) { - _mesa_printf("Program after transformations:\n"); + _mesa_printf("Fragment Program: After transformations:\n"); _mesa_print_program(compiler.program); } if (!r300FragmentProgramEmit(&compiler)) fp->error = GL_TRUE; + /* Subtle: Rescue any parameters that have been added during transformations */ + _mesa_free_parameter_list(fp->mesa_program.Base.Parameters); + fp->mesa_program.Base.Parameters = compiler.program->Parameters; + compiler.program->Parameters = 0; + _mesa_reference_program(r300->radeon.glCtx, &compiler.program, NULL); if (!fp->error) diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c index 889631f705..d95008edc0 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c @@ -549,22 +549,17 @@ static void free_temp(struct r300_pfs_compile_state *cs, GLuint r) /** * Emit a hardware constant/parameter. - * - * \p cp Stable pointer to an array of 4 floats. - * The pointer must be stable in the sense that it remains to be valid - * and hold the contents of the constant/parameter throughout the lifetime - * of the fragment program (actually, up until the next time the fragment - * program is translated). */ static GLuint emit_const4fv(struct r300_pfs_compile_state *cs, - const GLfloat * cp) + struct prog_src_register srcreg) { COMPILE_STATE; GLuint reg = undef; int index; for (index = 0; index < code->const_nr; ++index) { - if (code->constant[index] == cp) + if (code->constant[index].File == srcreg.File && + code->constant[index].Index == srcreg.Index) break; } @@ -575,7 +570,7 @@ static GLuint emit_const4fv(struct r300_pfs_compile_state *cs, } code->const_nr++; - code->constant[index] = cp; + code->constant[index] = srcreg; } REG_SET_TYPE(reg, REG_TYPE_CONST); @@ -806,20 +801,11 @@ static GLuint t_src(struct r300_pfs_compile_state *cs, REG_SET_TYPE(r, REG_TYPE_INPUT); break; case PROGRAM_LOCAL_PARAM: - r = emit_const4fv(cs, - fp->mesa_program.Base.LocalParams[fpsrc. - Index]); - break; case PROGRAM_ENV_PARAM: - r = emit_const4fv(cs, - cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]); - break; case PROGRAM_STATE_VAR: case PROGRAM_NAMED_PARAM: case PROGRAM_CONSTANT: - r = emit_const4fv(cs, - fp->mesa_program.Base.Parameters-> - ParameterValues[fpsrc.Index]); + r = emit_const4fv(cs, fpsrc); break; case PROGRAM_BUILTIN: switch(fpsrc.Swizzle) { @@ -1452,100 +1438,17 @@ static GLfloat SinCosConsts[2][4] = { } }; -/** - * Emit a LIT instruction. - * \p flags may be PFS_FLAG_SAT - * - * Definition of LIT (from ARB_fragment_program): - * tmp = VectorLoad(op0); - * if (tmp.x < 0) tmp.x = 0; - * if (tmp.y < 0) tmp.y = 0; - * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon); - * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon; - * result.x = 1.0; - * result.y = tmp.x; - * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0; - * result.w = 1.0; - * - * The longest path of computation is the one leading to result.z, - * consisting of 5 operations. This implementation of LIT takes - * 5 slots. So unless there's some special undocumented opcode, - * this implementation is potentially optimal. Unfortunately, - * emit_arith is a bit too conservative because it doesn't understand - * partial writes to the vector component. - */ -static const GLfloat LitConst[4] = - { 127.999999, 127.999999, 127.999999, -127.999999 }; - -static void emit_lit(struct r300_pfs_compile_state *cs, - GLuint dest, int mask, GLuint src, int flags) +static GLuint emit_sincosconsts(struct r300_pfs_compile_state *cs, int i) { - COMPILE_STATE; - GLuint cnst; - int needTemporary; - GLuint temp; - - cnst = emit_const4fv(cs, LitConst); - - needTemporary = 0; - if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) { - needTemporary = 1; - } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { - // LIT is typically followed by DP3/DP4, so there's no point - // in creating special code for this case - needTemporary = 1; - } + struct prog_src_register srcreg; + GLuint constant_swizzle; - if (needTemporary) { - temp = keep(get_temp_reg(cs)); - } else { - temp = keep(dest); - } + srcreg.File = PROGRAM_CONSTANT; + srcreg.Index = _mesa_add_unnamed_constant(cs->compiler->program->Parameters, + SinCosConsts[i], 4, &constant_swizzle); + srcreg.Swizzle = constant_swizzle; - // Note: The order of emit_arith inside the slots is relevant, - // because emit_arith only looks at scalar vs. vector when resolving - // dependencies, and it does not consider individual vector components, - // so swizzling between the two parts can create fake dependencies. - - // First slot - emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY, - keep(src), pfs_zero, undef, 0); - emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0); - - // Second slot - emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z, - swizzle(temp, W, W, W, W), cnst, undef, 0); - emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W, - swizzle(temp, Y, Y, Y, Y), undef, undef, 0); - - // Third slot - // If desired, we saturate the y result here. - // This does not affect the use as a condition variable in the CMP later - emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, - temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0); - emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y, - swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags); - - // Fourth slot - emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X, - pfs_one, pfs_one, pfs_zero, 0); - emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0); - - // Fifth slot - emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z, - pfs_zero, swizzle(temp, W, W, W, W), - negate(swizzle(temp, Y, Y, Y, Y)), flags); - emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one, - pfs_zero, 0); - - if (needTemporary) { - emit_arith(cs, PFS_OP_MAD, dest, mask, - temp, pfs_one, pfs_zero, flags); - free_temp(cs, temp); - } else { - // Decrease refcount of the destination - t_hw_dst(cs, dest, GL_FALSE, cs->nrslots); - } + return emit_const4fv(cs, srcreg); } static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi) @@ -1577,8 +1480,8 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst src[1] = t_src(cs, fpi->SrcReg[1]); src[2] = t_src(cs, fpi->SrcReg[2]); /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c - * r300 - if src2.c < 0.0 ? src1.c : src0.c - */ + * r300 - if src2.c < 0.0 ? src1.c : src0.c + */ emit_arith(cs, PFS_OP_CMP, dest, mask, src[2], src[1], src[0], flags); break; @@ -1592,8 +1495,8 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst * result = sin(x) */ temp[0] = get_temp_reg(cs); - const_sin[0] = emit_const4fv(cs, SinCosConsts[0]); - const_sin[1] = emit_const4fv(cs, SinCosConsts[1]); + const_sin[0] = emit_sincosconsts(cs, 0); + const_sin[1] = emit_sincosconsts(cs, 1); src[0] = t_scalar_src(cs, fpi->SrcReg[0]); /* add 0.5*PI and do range reduction */ @@ -1687,10 +1590,6 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst emit_arith(cs, PFS_OP_LG2, dest, mask, src[0], undef, undef, flags); break; - case OPCODE_LIT: - src[0] = t_src(cs, fpi->SrcReg[0]); - emit_lit(cs, dest, mask, src[0], flags); - break; case OPCODE_LRP: src[0] = t_src(cs, fpi->SrcReg[0]); src[1] = t_src(cs, fpi->SrcReg[1]); @@ -1758,8 +1657,8 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst */ temp[0] = get_temp_reg(cs); temp[1] = get_temp_reg(cs); - const_sin[0] = emit_const4fv(cs, SinCosConsts[0]); - const_sin[1] = emit_const4fv(cs, SinCosConsts[1]); + const_sin[0] = emit_sincosconsts(cs, 0); + const_sin[1] = emit_sincosconsts(cs, 1); src[0] = t_scalar_src(cs, fpi->SrcReg[0]); /* x = -abs(x)+0.5*PI */ @@ -1825,8 +1724,8 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst */ temp[0] = get_temp_reg(cs); - const_sin[0] = emit_const4fv(cs, SinCosConsts[0]); - const_sin[1] = emit_const4fv(cs, SinCosConsts[1]); + const_sin[0] = emit_sincosconsts(cs, 0); + const_sin[1] = emit_sincosconsts(cs, 1); src[0] = t_scalar_src(cs, fpi->SrcReg[0]); /* do range reduction */ diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index 0f7c179de8..d7a6962acc 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -2453,6 +2453,27 @@ void r300UpdateShaders(r300ContextPtr rmesa) r300UpdateStateParameters(ctx, _NEW_PROGRAM); } +static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx, + struct gl_program *program, struct prog_src_register srcreg) +{ + static const GLfloat dummy[4] = { 0, 0, 0, 0 }; + + switch(srcreg.File) { + case PROGRAM_LOCAL_PARAM: + return program->LocalParams[srcreg.Index]; + case PROGRAM_ENV_PARAM: + return ctx->FragmentProgram.Parameters[srcreg.Index]; + case PROGRAM_STATE_VAR: + case PROGRAM_NAMED_PARAM: + case PROGRAM_CONSTANT: + return program->Parameters->ParameterValues[srcreg.Index]; + default: + _mesa_problem(ctx, "get_fragmentprogram_constant: Unknown\n"); + return dummy; + } +} + + static void r300SetupPixelShader(r300ContextPtr rmesa) { GLcontext *ctx = rmesa->radeon.glCtx; @@ -2523,10 +2544,12 @@ static void r300SetupPixelShader(r300ContextPtr rmesa) R300_STATECHANGE(rmesa, fpp); rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4); for (i = 0; i < code->const_nr; i++) { - rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(code->constant[i][0]); - rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(code->constant[i][1]); - rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(code->constant[i][2]); - rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(code->constant[i][3]); + const GLfloat *constant = get_fragmentprogram_constant(ctx, + &fp->mesa_program.Base, code->constant[i]); + rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]); + rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]); + rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]); + rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(constant[3]); } } @@ -2595,10 +2618,12 @@ static void r500SetupPixelShader(r300ContextPtr rmesa) R300_STATECHANGE(rmesa, r500fp_const); for (i = 0; i < code->const_nr; i++) { - rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(code->constant[i][0]); - rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(code->constant[i][1]); - rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(code->constant[i][2]); - rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(code->constant[i][3]); + const GLfloat *constant = get_fragmentprogram_constant(ctx, + &fp->mesa_program.Base, code->constant[i]); + rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]); + rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]); + rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]); + rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]); } bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4); diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index 7ee8494722..1cdb065354 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -212,7 +212,7 @@ static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler) i++; /* viewport transformation */ - window_index = _mesa_add_state_reference(compiler->fp->mesa_program.Base.Parameters, tokens); + window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens); fpi[i].Opcode = OPCODE_MAD; @@ -332,6 +332,11 @@ void r500TranslateFragmentShader(r300ContextPtr r300, fp->translated = r500FragmentProgramEmit(&compiler); + /* Subtle: Rescue any parameters that have been added during transformations */ + _mesa_free_parameter_list(fp->mesa_program.Base.Parameters); + fp->mesa_program.Base.Parameters = compiler.program->Parameters; + compiler.program->Parameters = 0; + _mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0); r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM); @@ -461,9 +466,8 @@ static void dump_program(struct r500_fragment_program_code *code) if (code->const_nr) { fprintf(stderr, "--------\nConstants:\n"); for (n = 0; n < code->const_nr; n++) { - fprintf(stderr, "Constant %d: %f %f\n\t %f %f\n", n, - code->constant[n][0], code->constant[n][1], code->constant[n][2], - code->constant[n][3]); + fprintf(stderr, "Constant %d: %i[%i]\n", n, + code->constant[n].File, code->constant[n].Index); } fprintf(stderr, "--------\n"); } diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 0e95c81e48..c79bff96bd 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -266,7 +266,7 @@ static int get_temp(struct r500_pfs_compile_state *cs, int slot) { /* Borrowed verbatim from r300_fragprog since it hasn't changed. */ static GLuint emit_const4fv(struct r500_pfs_compile_state *cs, - const GLfloat * cp) + struct prog_src_register srcreg) { PROG_CODE; @@ -274,7 +274,8 @@ static GLuint emit_const4fv(struct r500_pfs_compile_state *cs, int index; for (index = 0; index < code->const_nr; ++index) { - if (code->constant[index] == cp) + if (code->constant[index].File == srcreg.File && + code->constant[index].Index == srcreg.Index) break; } @@ -285,7 +286,7 @@ static GLuint emit_const4fv(struct r500_pfs_compile_state *cs, } code->const_nr++; - code->constant[index] = cp; + code->constant[index] = srcreg; } reg = index | REG_CONSTANT; @@ -303,18 +304,11 @@ static GLuint make_src(struct r500_pfs_compile_state *cs, struct prog_src_regist reg = cs->inputs[src.Index].reg; break; case PROGRAM_LOCAL_PARAM: - reg = emit_const4fv(cs, - cs->compiler->fp->mesa_program.Base.LocalParams[src.Index]); - break; case PROGRAM_ENV_PARAM: - reg = emit_const4fv(cs, - cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[src.Index]); - break; case PROGRAM_STATE_VAR: case PROGRAM_NAMED_PARAM: case PROGRAM_CONSTANT: - reg = emit_const4fv(cs, - cs->compiler->fp->mesa_program.Base.Parameters->ParameterValues[src.Index]); + reg = emit_const4fv(cs, src); break; case PROGRAM_BUILTIN: reg = 0x0; @@ -628,12 +622,20 @@ static void emit_trig(struct r500_pfs_compile_state *cs, struct prog_instruction temp.Index = get_temp(cs, 0); temp.WriteMask = WRITEMASK_W; + struct prog_src_register srcreg; + GLuint constant_swizzle; + + srcreg.File = PROGRAM_CONSTANT; + srcreg.Index = _mesa_add_unnamed_constant(cs->compiler->program->Parameters, + RCP_2PI, 4, &constant_swizzle); + srcreg.Swizzle = constant_swizzle; + /* temp = Input*(1/2pi) */ ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, temp); set_src0(cs, ip, fpi->SrcReg[0]); - set_src1_direct(cs, ip, emit_const4fv(cs, RCP_2PI)); + set_src1(cs, ip, srcreg); set_argA(cs, ip, 0, R500_SWIZ_RGB_ZERO, make_sop_swizzle(fpi->SrcReg[0])); - set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, SWIZZLE_W); + set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, make_alpha_swizzle(srcreg)); set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); /* temp = frac(dst) */ @@ -660,87 +662,6 @@ static void emit_trig(struct r500_pfs_compile_state *cs, struct prog_instruction } } -/** - * Emit a LIT instruction. - * - * Definition of LIT (from ARB_fragment_program): - * tmp = VectorLoad(op0); - * if (tmp.x < 0) tmp.x = 0; - * if (tmp.y < 0) tmp.y = 0; - * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon); - * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon; - * result.x = 1.0; - * result.y = tmp.x; - * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0; - * result.w = 1.0; - */ -static void emit_lit(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) -{ - GLuint cnst; - int needTemporary; - GLuint temp; - int ip; - - cnst = emit_const4fv(cs, LIT); - - needTemporary = 0; - if (fpi->DstReg.WriteMask != WRITEMASK_XYZW || fpi->DstReg.File == PROGRAM_OUTPUT) - needTemporary = 1; - - if (needTemporary) { - temp = get_temp(cs, 0); - } else { - temp = fpi->DstReg.Index; - } - - // MAX tmp.xyw, op0, { 0, 0, 0, -128+eps } - ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MAX, R500_ALPHA_OP_MAX, temp, WRITEMASK_XYW); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1_direct(cs, ip, cnst); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, SWIZZLE_W); - - // MIN tmp.z, tmp.w, { 128-eps } - // LG2 tmp.w, tmp.y - ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MIN, R500_ALPHA_OP_LN2, temp, WRITEMASK_ZW); - set_src0_direct(cs, ip, temp); - set_src1_direct(cs, ip, cnst); - set_argA(cs, ip, 0, SWIZZLE_W | (SWIZZLE_W<<3) | (SWIZZLE_W<<6), SWIZZLE_Y); - set_argB(cs, ip, 1, SWIZZLE_X | (SWIZZLE_X<<3) | (SWIZZLE_X<<6), SWIZZLE_X); - - // MOV tmp.y, tmp.x - // MUL tmp.w, tmp.z, tmp.w - ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, temp, WRITEMASK_YW); - set_src0_direct(cs, ip, temp); - set_argA(cs, ip, 0, SWIZZLE_X | (SWIZZLE_X<<3) | (SWIZZLE_X<<6), SWIZZLE_Z); - set_argB(cs, ip, 0, R500_SWIZ_RGB_ONE, SWIZZLE_W); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - - // MOV tmp.x, 1.0 - // EX2 tmp.w, tmp.w - ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_EX2, temp, WRITEMASK_XW); - set_src0_direct(cs, ip, temp); - set_argA(cs, ip, 0, R500_SWIZ_RGB_ONE, SWIZZLE_W); - set_argB(cs, ip, 0, R500_SWIZ_RGB_ONE, R500_SWIZZLE_ZERO); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - - // tmp.z := (-tmp.x >= 0) ? tmp.y : 0.0 - // MOV tmp.w, 1.0 - ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, temp, WRITEMASK_ZW); - set_src0_direct(cs, ip, temp); - set_argA(cs, ip, 0, R500_SWIZZLE_ZERO, R500_SWIZZLE_ONE); - set_argB(cs, ip, 0, SWIZZLE_W | (SWIZZLE_W<<3) | (SWIZZLE_W<<6), R500_SWIZZLE_ONE); - set_argC(cs, ip, 0, SWIZZLE_Y | (SWIZZLE_Y<<3) | (SWIZZLE_Y<<6) | (R500_SWIZ_MOD_NEG<<9), R500_SWIZZLE_ZERO); - - if (needTemporary) { - ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg); - set_src0_direct(cs, ip, temp); - set_argA(cs, ip, 0, R500_SWIZ_RGB_RGB, SWIZZLE_W); - set_argB(cs, ip, 1, R500_SWIZ_RGB_RGB, SWIZZLE_W); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - } -} - static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) { PROG_CODE; GLuint src[3], dest = 0; @@ -830,9 +751,6 @@ static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction * src[0] = make_src(cs, fpi->SrcReg[0]); emit_sop(cs, R500_ALPHA_OP_LN2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; - case OPCODE_LIT: - emit_lit(cs, fpi); - break; case OPCODE_LRP: /* result = src0*src1 + (1-src0)*src2 * = src0*src1 + src2 + (-src0)*src2 diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index d6d016d7c1..85ea810523 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -35,6 +35,8 @@ #include "radeon_program_alu.h" +#include "shader/prog_parameter.h" + static struct prog_instruction *emit1(struct gl_program* p, gl_inst_opcode Opcode, struct prog_dst_register DstReg, @@ -101,6 +103,19 @@ static struct prog_dst_register dstreg(int file, int index) return dst; } +static struct prog_dst_register dstregtmpmask(int index, int mask) +{ + struct prog_dst_register dst; + dst.File = PROGRAM_TEMPORARY; + dst.Index = index; + dst.WriteMask = mask; + dst.CondMask = COND_TR; + dst.CondSwizzle = SWIZZLE_NOOP; + dst.CondSrc = 0; + dst.pad = 0; + return dst; +} + static const struct prog_src_register builtin_zero = { .File = PROGRAM_BUILTIN, .Index = 0, @@ -125,6 +140,15 @@ static struct prog_src_register srcreg(int file, int index) return src; } +static struct prog_src_register srcregswz(int file, int index, int swz) +{ + struct prog_src_register src = srcreg_undefined; + src.File = file; + src.Index = index; + src.Swizzle = swz; + return src; +} + static struct prog_src_register negate(struct prog_src_register reg) { struct prog_src_register newreg = reg; @@ -136,10 +160,10 @@ static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x, { struct prog_src_register swizzled = reg; swizzled.Swizzle = MAKE_SWIZZLE4( - GET_SWZ(reg.Swizzle, x), - GET_SWZ(reg.Swizzle, y), - GET_SWZ(reg.Swizzle, z), - GET_SWZ(reg.Swizzle, w)); + x >= 4 ? x : GET_SWZ(reg.Swizzle, x), + y >= 4 ? y : GET_SWZ(reg.Swizzle, y), + z >= 4 ? z : GET_SWZ(reg.Swizzle, z), + w >= 4 ? w : GET_SWZ(reg.Swizzle, w)); return swizzled; } @@ -185,6 +209,93 @@ static void transform_FLR(struct radeon_transform_context* t, emit2(t->Program, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg))); } +/** + * Definition of LIT (from ARB_fragment_program): + * + * tmp = VectorLoad(op0); + * if (tmp.x < 0) tmp.x = 0; + * if (tmp.y < 0) tmp.y = 0; + * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon); + * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon; + * result.x = 1.0; + * result.y = tmp.x; + * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0; + * result.w = 1.0; + * + * The longest path of computation is the one leading to result.z, + * consisting of 5 operations. This implementation of LIT takes + * 5 slots, if the subsequent optimization passes are clever enough + * to pair instructions correctly. + */ +static void transform_LIT(struct radeon_transform_context* t, + struct prog_instruction* inst) +{ + static const GLfloat LitConst[4] = { -127.999999 }; + + GLuint constant; + GLuint constant_swizzle; + GLuint temp; + int needTemporary = 0; + struct prog_src_register srctemp; + + constant = _mesa_add_unnamed_constant(t->Program->Parameters, LitConst, 1, &constant_swizzle); + + if (inst->DstReg.WriteMask != WRITEMASK_XYZW) { + needTemporary = 1; + } else if (inst->DstReg.File != PROGRAM_TEMPORARY) { + // LIT is typically followed by DP3/DP4, so there's no point + // in creating special code for this case + needTemporary = 1; + } + + if (needTemporary) { + temp = radeonFindFreeTemporary(t); + } else { + temp = inst->DstReg.Index; + } + srctemp = srcreg(PROGRAM_TEMPORARY, temp); + + // tmp.x = max(0.0, Src.x); + // tmp.y = max(0.0, Src.y); + // tmp.w = clamp(Src.z, -128+eps, 128-eps); + emit2(t->Program, OPCODE_MAX, + dstregtmpmask(temp, WRITEMASK_XYW), + inst->SrcReg[0], + swizzle(srcreg(PROGRAM_CONSTANT, constant), + SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, constant_swizzle&3)); + emit2(t->Program, OPCODE_MIN, + dstregtmpmask(temp, WRITEMASK_Z), + swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + negate(srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle))); + + // tmp.w = Pow(tmp.y, tmp.w) + emit1(t->Program, OPCODE_LG2, + dstregtmpmask(temp, WRITEMASK_W), + swizzle(srctemp, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y)); + emit2(t->Program, OPCODE_MUL, + dstregtmpmask(temp, WRITEMASK_W), + swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + swizzle(srctemp, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)); + emit1(t->Program, OPCODE_EX2, + dstregtmpmask(temp, WRITEMASK_W), + swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W)); + + // tmp.z = (tmp.x > 0) ? tmp.w : 0.0 + emit3(t->Program, OPCODE_CMP, + dstregtmpmask(temp, WRITEMASK_Z), + negate(swizzle(srctemp, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)), + swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + builtin_zero); + + // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 + emit1(t->Program, OPCODE_MOV, + dstregtmpmask(temp, WRITEMASK_XYW), + swizzle(srctemp, SWIZZLE_ONE, SWIZZLE_X, SWIZZLE_ONE, SWIZZLE_ONE)); + + if (needTemporary) + emit1(t->Program, OPCODE_MOV, inst->DstReg, srctemp); +} + static void transform_POW(struct radeon_transform_context* t, struct prog_instruction* inst) { @@ -249,13 +360,11 @@ static void transform_XPD(struct radeon_transform_context* t, * no userData necessary. * * Eliminates the following ALU instructions: - * ABS, DPH, FLR, POW, SGE, SLT, SUB, SWZ, XPD + * ABS, DPH, FLR, LIT, POW, SGE, SLT, SUB, SWZ, XPD * using: * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP * * @note should be applicable to R300 and R500 fragment programs. - * - * @todo add LIT here as well? */ GLboolean radeonTransformALU(struct radeon_transform_context* t, struct prog_instruction* inst, @@ -265,6 +374,7 @@ GLboolean radeonTransformALU(struct radeon_transform_context* t, case OPCODE_ABS: transform_ABS(t, inst); return GL_TRUE; case OPCODE_DPH: transform_DPH(t, inst); return GL_TRUE; case OPCODE_FLR: transform_FLR(t, inst); return GL_TRUE; + case OPCODE_LIT: transform_LIT(t, inst); return GL_TRUE; case OPCODE_POW: transform_POW(t, inst); return GL_TRUE; case OPCODE_SGE: transform_SGE(t, inst); return GL_TRUE; case OPCODE_SLT: transform_SLT(t, inst); return GL_TRUE; -- cgit v1.2.3 From 03abd021f2fa1d043682c9f1bbb1c080fba6b033 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 22:35:46 +0200 Subject: r300: Translate fragment program LRP in radeon_program_alu.c --- src/mesa/drivers/dri/r300/r300_fragprog_emit.c | 17 ---------------- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 27 -------------------------- src/mesa/drivers/dri/r300/radeon_program_alu.c | 16 ++++++++++++++- 3 files changed, 15 insertions(+), 45 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c index d95008edc0..446517405b 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c @@ -1590,23 +1590,6 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst emit_arith(cs, PFS_OP_LG2, dest, mask, src[0], undef, undef, flags); break; - case OPCODE_LRP: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - src[2] = t_src(cs, fpi->SrcReg[2]); - /* result = tmp0tmp1 + (1 - tmp0)tmp2 - * = tmp0tmp1 + tmp2 + (-tmp0)tmp2 - * MAD temp, -tmp0, tmp2, tmp2 - * MAD result, tmp0, tmp1, temp - */ - temp[0] = get_temp_reg(cs); - emit_arith(cs, PFS_OP_MAD, temp[0], mask, - negate(keep(src[0])), keep(src[2]), src[2], - 0); - emit_arith(cs, PFS_OP_MAD, dest, mask, - src[0], src[1], temp[0], flags); - free_temp(cs, temp[0]); - break; case OPCODE_MAD: src[0] = t_src(cs, fpi->SrcReg[0]); src[1] = t_src(cs, fpi->SrcReg[1]); diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index c79bff96bd..5b4d06ecf3 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -751,33 +751,6 @@ static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction * src[0] = make_src(cs, fpi->SrcReg[0]); emit_sop(cs, R500_ALPHA_OP_LN2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; - case OPCODE_LRP: - /* result = src0*src1 + (1-src0)*src2 - * = src0*src1 + src2 + (-src0)*src2 - * - * Note: LRP without swizzling (or with only limited - * swizzling) could be done more efficiently using the - * presubtract hardware. - */ - dest = get_temp(cs, 0); - ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, dest, WRITEMASK_XYZW); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_src2(cs, ip, fpi->SrcReg[2]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - set_argC_reg(cs, ip, 2, fpi->SrcReg[2]); - - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[2]); - set_src2_direct(cs, ip, dest); - set_argA(cs, ip, 0, - make_rgb_swizzle(fpi->SrcReg[0]) ^ (R500_SWIZ_MOD_NEG<<9), - make_alpha_swizzle(fpi->SrcReg[0]) ^ (R500_SWIZ_MOD_NEG<<3)); - set_argB_reg(cs, ip, 1, fpi->SrcReg[2]); - set_argC(cs, ip, 2, R500_SWIZ_RGB_RGB, SWIZZLE_W); - break; case OPCODE_MAD: ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); set_src0(cs, ip, fpi->SrcReg[0]); diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index 85ea810523..483dfa2cdc 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -296,6 +296,19 @@ static void transform_LIT(struct radeon_transform_context* t, emit1(t->Program, OPCODE_MOV, inst->DstReg, srctemp); } +static void transform_LRP(struct radeon_transform_context* t, + struct prog_instruction* inst) +{ + int tempreg = radeonFindFreeTemporary(t); + + emit2(t->Program, OPCODE_ADD, + dstreg(PROGRAM_TEMPORARY, tempreg), + inst->SrcReg[1], negate(inst->SrcReg[2])); + emit3(t->Program, OPCODE_MAD, + inst->DstReg, + inst->SrcReg[0], srcreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[2]); +} + static void transform_POW(struct radeon_transform_context* t, struct prog_instruction* inst) { @@ -360,7 +373,7 @@ static void transform_XPD(struct radeon_transform_context* t, * no userData necessary. * * Eliminates the following ALU instructions: - * ABS, DPH, FLR, LIT, POW, SGE, SLT, SUB, SWZ, XPD + * ABS, DPH, FLR, LIT, LRP, POW, SGE, SLT, SUB, SWZ, XPD * using: * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP * @@ -375,6 +388,7 @@ GLboolean radeonTransformALU(struct radeon_transform_context* t, case OPCODE_DPH: transform_DPH(t, inst); return GL_TRUE; case OPCODE_FLR: transform_FLR(t, inst); return GL_TRUE; case OPCODE_LIT: transform_LIT(t, inst); return GL_TRUE; + case OPCODE_LRP: transform_LRP(t, inst); return GL_TRUE; case OPCODE_POW: transform_POW(t, inst); return GL_TRUE; case OPCODE_SGE: transform_SGE(t, inst); return GL_TRUE; case OPCODE_SLT: transform_SLT(t, inst); return GL_TRUE; -- cgit v1.2.3 From 4746752f167c674722b46ab3840297d48e6d889d Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 22:44:37 +0200 Subject: r300: Translate fragment program DST in radeon_program_alu --- src/mesa/drivers/dri/r300/r300_fragprog_emit.c | 21 --------------------- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 14 -------------- src/mesa/drivers/dri/r300/radeon_program_alu.c | 15 ++++++++++++++- 3 files changed, 14 insertions(+), 36 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c index 446517405b..30f513b5a3 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c @@ -1551,27 +1551,6 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst emit_arith(cs, PFS_OP_DP4, dest, mask, src[0], src[1], undef, flags); break; - case OPCODE_DST: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - /* dest.y = src0.y * src1.y */ - if (mask & WRITEMASK_Y) - emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y, - keep(src[0]), keep(src[1]), - pfs_zero, flags); - /* dest.z = src0.z */ - if (mask & WRITEMASK_Z) - emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z, - src[0], pfs_one, pfs_zero, flags); - /* result.x = 1.0 - * result.w = src1.w */ - if (mask & WRITEMASK_XW) { - REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat */ - emit_arith(cs, PFS_OP_MAD, dest, - mask & WRITEMASK_XW, - src[1], pfs_one, pfs_zero, flags); - } - break; case OPCODE_EX2: src[0] = t_scalar_src(cs, fpi->SrcReg[0]); emit_arith(cs, PFS_OP_EX2, dest, mask, diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 5b4d06ecf3..678114b8c1 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -709,20 +709,6 @@ static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction * set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); break; - case OPCODE_DST: - /* [1, src0.y*src1.y, src0.z, src1.w] - * So basically MUL with lotsa swizzling. */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA(cs, ip, 0, - (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x7) | R500_SWIZZLE_ONE, - R500_SWIZZLE_ONE); - set_argB(cs, ip, 1, - (make_rgb_swizzle(fpi->SrcReg[0]) & ~0x1c7) | R500_SWIZZLE_ONE | (R500_SWIZZLE_ONE << 6), - make_alpha_swizzle(fpi->SrcReg[1])); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - break; case OPCODE_EX2: src[0] = make_src(cs, fpi->SrcReg[0]); emit_sop(cs, R500_ALPHA_OP_EX2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index 483dfa2cdc..4a40d3e44d 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -201,6 +201,18 @@ static void transform_DPH(struct radeon_transform_context* t, emit2(t->Program, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]); } +/** + * [1, src0.y*src1.y, src0.z, src1.w] + * So basically MUL with lotsa swizzling. + */ +static void transform_DST(struct radeon_transform_context* t, + struct prog_instruction* inst) +{ + emit2(t->Program, OPCODE_MUL, inst->DstReg, + swizzle(inst->SrcReg[0], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE), + swizzle(inst->SrcReg[1], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_ONE, SWIZZLE_W)); +} + static void transform_FLR(struct radeon_transform_context* t, struct prog_instruction* inst) { @@ -373,7 +385,7 @@ static void transform_XPD(struct radeon_transform_context* t, * no userData necessary. * * Eliminates the following ALU instructions: - * ABS, DPH, FLR, LIT, LRP, POW, SGE, SLT, SUB, SWZ, XPD + * ABS, DPH, DST, FLR, LIT, LRP, POW, SGE, SLT, SUB, SWZ, XPD * using: * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP * @@ -386,6 +398,7 @@ GLboolean radeonTransformALU(struct radeon_transform_context* t, switch(inst->Opcode) { case OPCODE_ABS: transform_ABS(t, inst); return GL_TRUE; case OPCODE_DPH: transform_DPH(t, inst); return GL_TRUE; + case OPCODE_DST: transform_DST(t, inst); return GL_TRUE; case OPCODE_FLR: transform_FLR(t, inst); return GL_TRUE; case OPCODE_LIT: transform_LIT(t, inst); return GL_TRUE; case OPCODE_LRP: transform_LRP(t, inst); return GL_TRUE; -- cgit v1.2.3 From 2b2cb566563b9f1f9739327ef9874143af838850 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 5 Jul 2008 23:54:31 +0200 Subject: r300_fragprog: Emulate trigonometric functions in radeon_program_alu --- src/mesa/drivers/dri/r300/r300_fragprog.c | 5 +- src/mesa/drivers/dri/r300/r300_fragprog_emit.c | 206 +------------------------ src/mesa/drivers/dri/r300/radeon_program_alu.c | 144 +++++++++++++++++ src/mesa/drivers/dri/r300/radeon_program_alu.h | 5 + 4 files changed, 153 insertions(+), 207 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 57987f5d0f..8a1d690ae4 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -408,12 +408,13 @@ void r300TranslateFragmentShader(r300ContextPtr r300, struct radeon_program_transformation transformations[] = { { &transform_TEX, &compiler }, - { &radeonTransformALU, 0 } + { &radeonTransformALU, 0 }, + { &radeonTransformTrigSimple, 0 } }; radeonLocalTransform( r300->radeon.glCtx, compiler.program, - 2, transformations); + 3, transformations); if (RADEON_DEBUG & DEBUG_PIXEL) { _mesa_printf("Fragment Program: After transformations:\n"); diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c index 30f513b5a3..4786b4554d 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c @@ -1423,40 +1423,11 @@ static void emit_arith(struct r300_pfs_compile_state *cs, return; } -static GLfloat SinCosConsts[2][4] = { - { - 1.273239545, // 4/PI - -0.405284735, // -4/(PI*PI) - 3.141592654, // PI - 0.2225 // weight - }, - { - 0.75, - 0.0, - 0.159154943, // 1/(2*PI) - 6.283185307 // 2*PI - } -}; - -static GLuint emit_sincosconsts(struct r300_pfs_compile_state *cs, int i) -{ - struct prog_src_register srcreg; - GLuint constant_swizzle; - - srcreg.File = PROGRAM_CONSTANT; - srcreg.Index = _mesa_add_unnamed_constant(cs->compiler->program->Parameters, - SinCosConsts[i], 4, &constant_swizzle); - srcreg.Swizzle = constant_swizzle; - - return emit_const4fv(cs, srcreg); -} - static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi) { COMPILE_STATE; - GLuint src[3], dest, temp[2]; + GLuint src[3], dest; int flags, mask = 0; - int const_sin[2]; if (fpi->SaturateMode == SATURATE_ZERO_ONE) flags = PFS_FLAG_SAT; @@ -1485,60 +1456,6 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst emit_arith(cs, PFS_OP_CMP, dest, mask, src[2], src[1], src[0], flags); break; - case OPCODE_COS: - /* - * cos using a parabola (see SIN): - * cos(x): - * x = (x/(2*PI))+0.75 - * x = frac(x) - * x = (x*2*PI)-PI - * result = sin(x) - */ - temp[0] = get_temp_reg(cs); - const_sin[0] = emit_sincosconsts(cs, 0); - const_sin[1] = emit_sincosconsts(cs, 1); - src[0] = t_scalar_src(cs, fpi->SrcReg[0]); - - /* add 0.5*PI and do range reduction */ - - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X, - swizzle(src[0], X, X, X, X), - swizzle(const_sin[1], Z, Z, Z, Z), - swizzle(const_sin[1], X, X, X, X), 0); - - emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X, - swizzle(temp[0], X, X, X, X), - undef, undef, 0); - - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI - negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI - 0); - - /* SIN */ - - emit_arith(cs, PFS_OP_MAD, temp[0], - WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0], - Z, Z, Z, - Z), - const_sin[0], pfs_zero, 0); - - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X, - swizzle(temp[0], Y, Y, Y, Y), - absolute(swizzle(temp[0], Z, Z, Z, Z)), - swizzle(temp[0], X, X, X, X), 0); - - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y, - swizzle(temp[0], X, X, X, X), - absolute(swizzle(temp[0], X, X, X, X)), - negate(swizzle(temp[0], X, X, X, X)), 0); - - emit_arith(cs, PFS_OP_MAD, dest, mask, - swizzle(temp[0], Y, Y, Y, Y), - swizzle(const_sin[0], W, W, W, W), - swizzle(temp[0], X, X, X, X), flags); - - free_temp(cs, temp[0]); - break; case OPCODE_DP3: src[0] = t_src(cs, fpi->SrcReg[0]); src[1] = t_src(cs, fpi->SrcReg[1]); @@ -1609,127 +1526,6 @@ static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_inst emit_arith(cs, PFS_OP_RSQ, dest, mask, absolute(src[0]), pfs_zero, pfs_zero, flags); break; - case OPCODE_SCS: - /* - * scs using a parabola : - * scs(x): - * result.x = sin(-abs(x)+0.5*PI) (cos) - * result.y = sin(x) (sin) - * - */ - temp[0] = get_temp_reg(cs); - temp[1] = get_temp_reg(cs); - const_sin[0] = emit_sincosconsts(cs, 0); - const_sin[1] = emit_sincosconsts(cs, 1); - src[0] = t_scalar_src(cs, fpi->SrcReg[0]); - - /* x = -abs(x)+0.5*PI */ - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z), //PI - pfs_half, - negate(abs - (swizzle(keep(src[0]), X, X, X, X))), - 0); - - /* C*x (sin) */ - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W, - swizzle(const_sin[0], Y, Y, Y, Y), - swizzle(keep(src[0]), X, X, X, X), - pfs_zero, 0); - - /* B*x, C*x (cos) */ - emit_arith(cs, PFS_OP_MAD, temp[0], - WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0], - Z, Z, Z, - Z), - const_sin[0], pfs_zero, 0); - - /* B*x (sin) */ - emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W, - swizzle(const_sin[0], X, X, X, X), - keep(src[0]), pfs_zero, 0); - - /* y = B*x + C*x*abs(x) (sin) */ - emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z, - absolute(src[0]), - swizzle(temp[0], W, W, W, W), - swizzle(temp[1], W, W, W, W), 0); - - /* y = B*x + C*x*abs(x) (cos) */ - emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W, - swizzle(temp[0], Y, Y, Y, Y), - absolute(swizzle(temp[0], Z, Z, Z, Z)), - swizzle(temp[0], X, X, X, X), 0); - - /* y*abs(y) - y (cos), y*abs(y) - y (sin) */ - emit_arith(cs, PFS_OP_MAD, temp[0], - WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1], - W, Z, Y, - X), - absolute(swizzle(temp[1], W, Z, Y, X)), - negate(swizzle(temp[1], W, Z, Y, X)), 0); - - /* dest.xy = mad(temp.xy, P, temp2.wz) */ - emit_arith(cs, PFS_OP_MAD, dest, - mask & (WRITEMASK_X | WRITEMASK_Y), temp[0], - swizzle(const_sin[0], W, W, W, W), - swizzle(temp[1], W, Z, Y, X), flags); - - free_temp(cs, temp[0]); - free_temp(cs, temp[1]); - break; - case OPCODE_SIN: - /* - * using a parabola: - * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x) - * extra precision is obtained by weighting against - * itself squared. - */ - - temp[0] = get_temp_reg(cs); - const_sin[0] = emit_sincosconsts(cs, 0); - const_sin[1] = emit_sincosconsts(cs, 1); - src[0] = t_scalar_src(cs, fpi->SrcReg[0]); - - /* do range reduction */ - - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X, - swizzle(keep(src[0]), X, X, X, X), - swizzle(const_sin[1], Z, Z, Z, Z), - pfs_half, 0); - - emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X, - swizzle(temp[0], X, X, X, X), - undef, undef, 0); - - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI - negate(swizzle(const_sin[0], Z, Z, Z, Z)), //PI - 0); - - /* SIN */ - - emit_arith(cs, PFS_OP_MAD, temp[0], - WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0], - Z, Z, Z, - Z), - const_sin[0], pfs_zero, 0); - - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X, - swizzle(temp[0], Y, Y, Y, Y), - absolute(swizzle(temp[0], Z, Z, Z, Z)), - swizzle(temp[0], X, X, X, X), 0); - - emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y, - swizzle(temp[0], X, X, X, X), - absolute(swizzle(temp[0], X, X, X, X)), - negate(swizzle(temp[0], X, X, X, X)), 0); - - emit_arith(cs, PFS_OP_MAD, dest, mask, - swizzle(temp[0], Y, Y, Y, Y), - swizzle(const_sin[0], W, W, W, W), - swizzle(temp[0], X, X, X, X), flags); - - free_temp(cs, temp[0]); - break; case OPCODE_TEX: emit_tex(cs, fpi, R300_TEX_OP_LD); break; diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index 4a40d3e44d..fa6a67f0c1 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -149,6 +149,14 @@ static struct prog_src_register srcregswz(int file, int index, int swz) return src; } +static struct prog_src_register absolute(struct prog_src_register reg) +{ + struct prog_src_register newreg = reg; + newreg.Abs = 1; + newreg.NegateAbs = 0; + return newreg; +} + static struct prog_src_register negate(struct prog_src_register reg) { struct prog_src_register newreg = reg; @@ -412,3 +420,139 @@ GLboolean radeonTransformALU(struct radeon_transform_context* t, return GL_FALSE; } } + + +static void sincos_constants(struct radeon_transform_context* t, GLuint *constants) +{ + static const GLfloat SinCosConsts[2][4] = { + { + 1.273239545, // 4/PI + -0.405284735, // -4/(PI*PI) + 3.141592654, // PI + 0.2225 // weight + }, + { + 0.75, + 0.5, + 0.159154943, // 1/(2*PI) + 6.283185307 // 2*PI + } + }; + int i; + + for(i = 0; i < 2; ++i) { + GLuint swz; + constants[i] = _mesa_add_unnamed_constant(t->Program->Parameters, SinCosConsts[i], 4, &swz); + ASSERT(swz == SWIZZLE_NOOP); + } +} + +/** + * Approximate sin(x), where x is clamped to (-pi/2, pi/2). + * + * MUL tmp.xy, src, { 4/PI, -4/(PI^2) } + * MAD tmp.x, tmp.y, |src|, tmp.x + * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x + * MAD dest, tmp.y, weight, tmp.x + */ +static void sin_approx(struct radeon_transform_context* t, + struct prog_dst_register dst, struct prog_src_register src, const GLuint* constants) +{ + GLuint tempreg = radeonFindFreeTemporary(t); + + emit2(t->Program, OPCODE_MUL, dstregtmpmask(tempreg, WRITEMASK_XY), + swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), + srcreg(PROGRAM_CONSTANT, constants[0])); + emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_X), + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y), + absolute(swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)), + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)); + emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_Y), + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), + absolute(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)), + negate(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X))); + emit3(t->Program, OPCODE_MAD, dst, + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y), + swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)); +} + +/** + * Translate the trigonometric functions COS, SIN, and SCS + * using only the basic instructions + * MOV, ADD, MUL, MAD, FRC + */ +GLboolean radeonTransformTrigSimple(struct radeon_transform_context* t, + struct prog_instruction* inst, + void* unused) +{ + if (inst->Opcode != OPCODE_COS && + inst->Opcode != OPCODE_SIN && + inst->Opcode != OPCODE_SCS) + return GL_FALSE; + + GLuint constants[2]; + GLuint tempreg = radeonFindFreeTemporary(t); + + sincos_constants(t, constants); + + if (inst->Opcode == OPCODE_COS) { + // MAD tmp.x, src, 1/(2*PI), 0.75 + // FRC tmp.x, tmp.x + // MAD tmp.z, tmp.x, 2*PI, -PI + emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_W), + swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)); + emit1(t->Program, OPCODE_FRC, dstregtmpmask(tempreg, WRITEMASK_W), + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W)); + emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_W), + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z))); + + sin_approx(t, inst->DstReg, + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + constants); + } else if (inst->Opcode == OPCODE_SIN) { + emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_W), + swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y)); + emit1(t->Program, OPCODE_FRC, dstregtmpmask(tempreg, WRITEMASK_W), + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W)); + emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_W), + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z))); + + sin_approx(t, inst->DstReg, + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + constants); + } else { + emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_XY), + swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W)); + emit1(t->Program, OPCODE_FRC, dstregtmpmask(tempreg, WRITEMASK_XY), + srcreg(PROGRAM_TEMPORARY, tempreg)); + emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_XY), + srcreg(PROGRAM_TEMPORARY, tempreg), + swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), + negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z))); + + struct prog_dst_register dst = inst->DstReg; + + dst.WriteMask = inst->DstReg.WriteMask & WRITEMASK_X; + sin_approx(t, dst, + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), + constants); + + dst.WriteMask = inst->DstReg.WriteMask & WRITEMASK_Y; + sin_approx(t, dst, + swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y), + constants); + } + + return GL_TRUE; +} diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.h b/src/mesa/drivers/dri/r300/radeon_program_alu.h index 858c5ed0b8..3fe6153fd8 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.h +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.h @@ -35,4 +35,9 @@ GLboolean radeonTransformALU( struct prog_instruction*, void*); +GLboolean radeonTransformTrigSimple( + struct radeon_transform_context *t, + struct prog_instruction*, + void*); + #endif /* __RADEON_PROGRAM_ALU_H_ */ -- cgit v1.2.3 From 056689d457924ba4d8a0723d0c5f24383d1757d9 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 6 Jul 2008 16:39:31 +0200 Subject: r500_fragprog: Fix RSQ with negative parameters --- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 678114b8c1..8c900941c4 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -784,7 +784,8 @@ static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction * break; case OPCODE_RSQ: src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_RSQ, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); + emit_sop(cs, R500_ALPHA_OP_RSQ, fpi->DstReg, src[0], + (make_sop_swizzle(fpi->SrcReg[0]) | (R500_SWIZ_MOD_ABS<<3)) & ~(R500_SWIZ_MOD_NEG<<3)); break; case OPCODE_SCS: emit_trig(cs, fpi); -- cgit v1.2.3 From b0ef353b4696672ecaea4b370b612bbb482880ca Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 11 Jul 2008 19:23:06 -0400 Subject: R300: update vap_cntl values for NUM_FPUS based on info from hw team --- src/mesa/drivers/dri/r300/r300_ioctl.c | 9 +++++---- src/mesa/drivers/dri/r300/r300_state.c | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c index 71821a01ea..bd7f060435 100644 --- a/src/mesa/drivers/dri/r300/r300_ioctl.c +++ b/src/mesa/drivers/dri/r300/r300_ioctl.c @@ -419,13 +419,14 @@ static void r300EmitClearState(GLcontext * ctx) if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515) vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT); else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) || - (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560)) + (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) || + (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570)) vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT); - else if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420) + else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) || + (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420)) vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT); else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) || - (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580) || - (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570)) + (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580)) vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT); else vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT); diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index d7a6962acc..fbe5f66418 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -1971,13 +1971,14 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count, if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515) rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (2 << R300_PVS_NUM_FPUS_SHIFT); else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) || - (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560)) + (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) || + (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570)) rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (5 << R300_PVS_NUM_FPUS_SHIFT); - else if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420) + else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) || + (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420)) rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (6 << R300_PVS_NUM_FPUS_SHIFT); else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) || - (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580) || - (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570)) + (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580)) rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (8 << R300_PVS_NUM_FPUS_SHIFT); else rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (4 << R300_PVS_NUM_FPUS_SHIFT); -- cgit v1.2.3 From 7904c9fad4c2cb2a4153258a9e86e530a0330a78 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 6 Jul 2008 16:58:51 +0200 Subject: r500_fragprog: Transform trigonometric functions in first pass --- src/mesa/drivers/dri/r300/r500_fragprog.c | 7 +- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 91 ++------------------------ src/mesa/drivers/dri/r300/radeon_program_alu.c | 52 +++++++++++++++ src/mesa/drivers/dri/r300/radeon_program_alu.h | 5 ++ 4 files changed, 65 insertions(+), 90 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index 1cdb065354..9bb92d3ba4 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -318,12 +318,13 @@ void r500TranslateFragmentShader(r300ContextPtr r300, insert_WPOS_trailer(&compiler); - struct radeon_program_transformation transformations[2] = { + struct radeon_program_transformation transformations[3] = { { &transform_TEX, &compiler }, - { &radeonTransformALU, 0 } + { &radeonTransformALU, 0 }, + { &radeonTransformTrigScale, 0 } }; radeonLocalTransform(r300->radeon.glCtx, compiler.program, - 2, transformations); + 3, transformations); if (RADEON_DEBUG & DEBUG_PIXEL) { _mesa_printf("Compiler: after all transformations:\n"); diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 8c900941c4..4f65803953 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -156,17 +156,6 @@ struct r500_pfs_compile_state { #define R500_WRITEMASK_AB 0xC #define R500_WRITEMASK_ARGB 0xF -/* 1/(2pi), needed for quick modulus in trig insts - * Thanks to glisse for pointing out how to do it! */ -static const GLfloat RCP_2PI[] = {0.15915494309189535, - 0.15915494309189535, - 0.15915494309189535, - 0.15915494309189535}; - -static const GLfloat LIT[] = {127.999999, - 127.999999, - 127.999999, - -127.999999}; static const struct prog_dst_register dstreg_template = { .File = PROGRAM_TEMPORARY, @@ -476,12 +465,6 @@ static int emit_alu(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alph return _helper_emit_alu(cs, rgbop, alphaop, dst.File, dst.Index, dst.WriteMask); } -static int emit_alu_temp(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alphaop, int dst, int writemask) -{ - return _helper_emit_alu(cs, rgbop, alphaop, - PROGRAM_TEMPORARY, dst - cs->compiler->code->temp_reg_offset, writemask); -} - /** * Set an instruction's source 0 (both RGB and ALPHA) to the given hardware index. */ @@ -612,56 +595,6 @@ static int emit_sop(struct r500_pfs_compile_state *cs, } -/** - * Emit trigonometric function COS, SIN, SCS - */ -static void emit_trig(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) -{ - int ip; - struct prog_dst_register temp = dstreg_template; - temp.Index = get_temp(cs, 0); - temp.WriteMask = WRITEMASK_W; - - struct prog_src_register srcreg; - GLuint constant_swizzle; - - srcreg.File = PROGRAM_CONSTANT; - srcreg.Index = _mesa_add_unnamed_constant(cs->compiler->program->Parameters, - RCP_2PI, 4, &constant_swizzle); - srcreg.Swizzle = constant_swizzle; - - /* temp = Input*(1/2pi) */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, temp); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, srcreg); - set_argA(cs, ip, 0, R500_SWIZ_RGB_ZERO, make_sop_swizzle(fpi->SrcReg[0])); - set_argB(cs, ip, 1, R500_SWIZ_RGB_ZERO, make_alpha_swizzle(srcreg)); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - - /* temp = frac(dst) */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_FRC, R500_ALPHA_OP_FRC, temp); - set_src0_direct(cs, ip, temp.Index); - set_argA(cs, ip, 0, R500_SWIZ_RGB_RGB, SWIZZLE_W); - - /* Dest = trig(temp) */ - if (fpi->Opcode == OPCODE_COS) { - emit_sop(cs, R500_ALPHA_OP_COS, fpi->DstReg, temp.Index, SWIZZLE_W); - } else if (fpi->Opcode == OPCODE_SIN) { - emit_sop(cs, R500_ALPHA_OP_SIN, fpi->DstReg, temp.Index, SWIZZLE_W); - } else if (fpi->Opcode == OPCODE_SCS) { - struct prog_dst_register moddst = fpi->DstReg; - - if (fpi->DstReg.WriteMask & WRITEMASK_X) { - moddst.WriteMask = WRITEMASK_X; - emit_sop(cs, R500_ALPHA_OP_COS, fpi->DstReg, temp.Index, SWIZZLE_W); - } - if (fpi->DstReg.WriteMask & WRITEMASK_Y) { - moddst.WriteMask = WRITEMASK_Y; - emit_sop(cs, R500_ALPHA_OP_SIN, fpi->DstReg, temp.Index, SWIZZLE_W); - } - } -} - static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) { PROG_CODE; GLuint src[3], dest = 0; @@ -693,7 +626,8 @@ static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction * set_argC_reg(cs, ip, 0, fpi->SrcReg[0]); break; case OPCODE_COS: - emit_trig(cs, fpi); + src[0] = make_src(cs, fpi->SrcReg[0]); + emit_sop(cs, R500_ALPHA_OP_COS, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; case OPCODE_DP3: ip = emit_alu(cs, R500_ALU_RGBA_OP_DP3, R500_ALPHA_OP_DP, fpi->DstReg); @@ -713,21 +647,6 @@ static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction * src[0] = make_src(cs, fpi->SrcReg[0]); emit_sop(cs, R500_ALPHA_OP_EX2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; - case OPCODE_FLR: - dest = get_temp(cs, 0); - ip = emit_alu_temp(cs, R500_ALU_RGBA_OP_FRC, R500_ALPHA_OP_FRC, dest, WRITEMASK_XYZW); - set_src0(cs, ip, fpi->SrcReg[0]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1_direct(cs, ip, dest); - set_argA_reg(cs, ip, 0, fpi->SrcReg[1]); - set_argB(cs, ip, 0, R500_SWIZ_RGB_ONE, R500_SWIZZLE_ONE); - set_argC(cs, ip, 1, - R500_SWIZ_RGB_RGB|(R500_SWIZ_MOD_NEG<<9), - SWIZZLE_W|(R500_SWIZ_MOD_NEG<<3)); - break; case OPCODE_FRC: ip = emit_alu(cs, R500_ALU_RGBA_OP_FRC, R500_ALPHA_OP_FRC, fpi->DstReg); set_src0(cs, ip, fpi->SrcReg[0]); @@ -787,11 +706,9 @@ static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction * emit_sop(cs, R500_ALPHA_OP_RSQ, fpi->DstReg, src[0], (make_sop_swizzle(fpi->SrcReg[0]) | (R500_SWIZ_MOD_ABS<<3)) & ~(R500_SWIZ_MOD_NEG<<3)); break; - case OPCODE_SCS: - emit_trig(cs, fpi); - break; case OPCODE_SIN: - emit_trig(cs, fpi); + src[0] = make_src(cs, fpi->SrcReg[0]); + emit_sop(cs, R500_ALPHA_OP_SIN, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); break; case OPCODE_KIL: case OPCODE_TEX: diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index fa6a67f0c1..8daa94c726 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -556,3 +556,55 @@ GLboolean radeonTransformTrigSimple(struct radeon_transform_context* t, return GL_TRUE; } + + +/** + * Transform the trigonometric functions COS, SIN, and SCS + * to include pre-scaling by 1/(2*PI) and taking the fractional + * part, so that the input to COS and SIN is always in the range [0,1). + * SCS is replaced by one COS and one SIN instruction. + * + * @warning This transformation implicitly changes the semantics of SIN and COS! + */ +GLboolean radeonTransformTrigScale(struct radeon_transform_context* t, + struct prog_instruction* inst, + void* unused) +{ + if (inst->Opcode != OPCODE_COS && + inst->Opcode != OPCODE_SIN && + inst->Opcode != OPCODE_SCS) + return GL_FALSE; + + static const GLfloat RCP_2PI[] = { 0.15915494309189535 }; + GLuint temp; + GLuint constant; + GLuint constant_swizzle; + + temp = radeonFindFreeTemporary(t); + constant = _mesa_add_unnamed_constant(t->Program->Parameters, RCP_2PI, 1, &constant_swizzle); + + emit2(t->Program, OPCODE_MUL, dstregtmpmask(temp, WRITEMASK_W), + swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), + srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle)); + emit1(t->Program, OPCODE_FRC, dstregtmpmask(temp, WRITEMASK_W), + srcreg(PROGRAM_TEMPORARY, temp)); + + if (inst->Opcode == OPCODE_COS) { + emit1(t->Program, OPCODE_COS, inst->DstReg, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); + } else if (inst->Opcode == OPCODE_SIN) { + emit1(t->Program, OPCODE_SIN, inst->DstReg, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); + } else if (inst->Opcode == OPCODE_SCS) { + struct prog_dst_register moddst = inst->DstReg; + + if (inst->DstReg.WriteMask & WRITEMASK_X) { + moddst.WriteMask = WRITEMASK_X; + emit1(t->Program, OPCODE_COS, moddst, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); + } + if (inst->DstReg.WriteMask & WRITEMASK_Y) { + moddst.WriteMask = WRITEMASK_Y; + emit1(t->Program, OPCODE_SIN, moddst, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); + } + } + + return GL_TRUE; +} diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.h b/src/mesa/drivers/dri/r300/radeon_program_alu.h index 3fe6153fd8..ea9d5bb669 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.h +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.h @@ -40,4 +40,9 @@ GLboolean radeonTransformTrigSimple( struct prog_instruction*, void*); +GLboolean radeonTransformTrigScale( + struct radeon_transform_context *t, + struct prog_instruction*, + void*); + #endif /* __RADEON_PROGRAM_ALU_H_ */ -- cgit v1.2.3 From d8d086c20b5a43353c4980cf234d8329900585f5 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sun, 6 Jul 2008 19:48:50 +0200 Subject: r500: Add "Not quite SSA" and dead code elimination pass In addition, this pass fixes non-native swizzles. --- src/mesa/drivers/dri/r300/Makefile | 1 + src/mesa/drivers/dri/r300/r500_fragprog.c | 67 +++++- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 28 ++- src/mesa/drivers/dri/r300/radeon_nqssadce.c | 282 +++++++++++++++++++++++++ src/mesa/drivers/dri/r300/radeon_nqssadce.h | 96 +++++++++ src/mesa/shader/program.c | 61 +++++- src/mesa/shader/program.h | 13 +- 7 files changed, 524 insertions(+), 24 deletions(-) create mode 100644 src/mesa/drivers/dri/r300/radeon_nqssadce.c create mode 100644 src/mesa/drivers/dri/r300/radeon_nqssadce.h (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile index d52b2b4c36..1dc75a3062 100644 --- a/src/mesa/drivers/dri/r300/Makefile +++ b/src/mesa/drivers/dri/r300/Makefile @@ -38,6 +38,7 @@ DRIVER_SOURCES = \ r300_texstate.c \ radeon_program.c \ radeon_program_alu.c \ + radeon_nqssadce.c \ r300_vertprog.c \ r300_fragprog.c \ r300_fragprog_emit.c \ diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index 9bb92d3ba4..c92ea8f5e6 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -27,6 +27,7 @@ #include "r500_fragprog.h" +#include "radeon_nqssadce.h" #include "radeon_program_alu.h" @@ -250,6 +251,57 @@ static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler) } +static void nqssadce_init(struct nqssadce_state* s) +{ + s->Outputs[FRAG_RESULT_COLR].Sourced = WRITEMASK_XYZW; + s->Outputs[FRAG_RESULT_DEPR].Sourced = WRITEMASK_W; +} + +static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg) +{ + GLuint relevant; + int i; + + if (reg.Abs) + return GL_TRUE; + + relevant = 0; + for(i = 0; i < 3; ++i) { + GLuint swz = GET_SWZ(reg.Swizzle, i); + if (swz != SWIZZLE_NIL && swz != SWIZZLE_ZERO) + relevant |= 1 << i; + } + if ((reg.NegateBase & relevant) && ((reg.NegateBase & relevant) != relevant)) + return GL_FALSE; + + return GL_TRUE; +} + +/** + * Implement a non-native swizzle. This function assumes that + * is_native_swizzle returned true. + */ +static void nqssadce_build_swizzle(struct nqssadce_state *s, + struct prog_dst_register dst, struct prog_src_register src) +{ + struct prog_instruction *inst; + + _mesa_insert_instructions(s->Program, s->IP, 2); + inst = s->Program->Instructions + s->IP; + + inst[0].Opcode = OPCODE_MOV; + inst[0].DstReg = dst; + inst[0].DstReg.WriteMask &= src.NegateBase; + inst[0].SrcReg[0] = src; + + inst[1].Opcode = OPCODE_MOV; + inst[1].DstReg = dst; + inst[1].DstReg.WriteMask &= ~src.NegateBase; + inst[1].SrcReg[0] = src; + + s->IP += 2; +} + static GLuint build_dtm(GLuint depthmode) { switch(depthmode) { @@ -327,7 +379,20 @@ void r500TranslateFragmentShader(r300ContextPtr r300, 3, transformations); if (RADEON_DEBUG & DEBUG_PIXEL) { - _mesa_printf("Compiler: after all transformations:\n"); + _mesa_printf("Compiler: after native rewrite:\n"); + _mesa_print_program(compiler.program); + } + + struct radeon_nqssadce_descr nqssadce = { + .Init = &nqssadce_init, + .IsNativeSwizzle = &is_native_swizzle, + .BuildSwizzle = &nqssadce_build_swizzle, + .RewriteDepthOut = GL_TRUE + }; + radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce); + + if (RADEON_DEBUG & DEBUG_PIXEL) { + _mesa_printf("Compiler: after NqSSA-DCE:\n"); _mesa_print_program(compiler.program); } diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 4f65803953..275911679d 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -163,23 +163,30 @@ static const struct prog_dst_register dstreg_template = { .WriteMask = WRITEMASK_XYZW }; +static INLINE GLuint fix_hw_swizzle(GLuint swz) +{ + if (swz == 5) swz = 6; + if (swz == SWIZZLE_NIL) swz = 4; + return swz; +} + static INLINE GLuint make_rgb_swizzle(struct prog_src_register src) { GLuint swiz = 0x0; GLuint temp; /* This could be optimized, but it should be plenty fast already. */ int i; + int negatebase = 0; for (i = 0; i < 3; i++) { - temp = GET_SWZ(src.Swizzle, i); - /* Fix SWIZZLE_ONE */ - if (temp == 5) temp++; + temp = GET_SWZ(src.Swizzle, i); + if (temp != SWIZZLE_NIL && GET_BIT(src.NegateBase, i)) + negatebase = 1; + temp = fix_hw_swizzle(temp); swiz |= temp << i*3; } - if (src.Abs) { + if (src.Abs) swiz |= R500_SWIZ_MOD_ABS << 9; - } else if (src.NegateBase & 7) { - ASSERT((src.NegateBase & 7) == 7); + else if (negatebase) swiz |= R500_SWIZ_MOD_NEG << 9; - } if (src.NegateAbs) swiz ^= R500_SWIZ_MOD_NEG << 9; return swiz; @@ -191,8 +198,7 @@ static INLINE GLuint make_rgba_swizzle(GLuint src) { int i; for (i = 0; i < 4; i++) { temp = GET_SWZ(src, i); - /* Fix SWIZZLE_ONE */ - if (temp == 5) temp++; + temp = fix_hw_swizzle(temp); swiz |= temp << i*3; } return swiz; @@ -201,7 +207,7 @@ static INLINE GLuint make_rgba_swizzle(GLuint src) { static INLINE GLuint make_alpha_swizzle(struct prog_src_register src) { GLuint swiz = GET_SWZ(src.Swizzle, 3); - if (swiz == 5) swiz++; + swiz = fix_hw_swizzle(swiz); if (src.Abs) { swiz |= R500_SWIZ_MOD_ABS << 3; @@ -217,7 +223,7 @@ static INLINE GLuint make_alpha_swizzle(struct prog_src_register src) { static INLINE GLuint make_sop_swizzle(struct prog_src_register src) { GLuint swiz = GET_SWZ(src.Swizzle, 0); - if (swiz == 5) swiz++; + swiz = fix_hw_swizzle(swiz); if (src.Abs) { swiz |= R500_SWIZ_MOD_ABS << 3; diff --git a/src/mesa/drivers/dri/r300/radeon_nqssadce.c b/src/mesa/drivers/dri/r300/radeon_nqssadce.c new file mode 100644 index 0000000000..f10ba4004a --- /dev/null +++ b/src/mesa/drivers/dri/r300/radeon_nqssadce.c @@ -0,0 +1,282 @@ +/* + * Copyright (C) 2008 Nicolai Haehnle. + * + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/** + * @file + * + * "Not-quite SSA" and Dead-Code Elimination. + * + * @note This code uses SWIZZLE_NIL in a source register to indicate that + * the corresponding component is ignored by the corresponding instruction. + */ + +#include "radeon_nqssadce.h" + + +/** + * Return the @ref register_state for the given register (or 0 for untracked + * registers, i.e. constants). + */ +static struct register_state *get_reg_state(struct nqssadce_state* s, GLuint file, GLuint index) +{ + switch(file) { + case PROGRAM_TEMPORARY: return &s->Temps[index]; + case PROGRAM_OUTPUT: return &s->Outputs[index]; + default: return 0; + } +} + + +/** + * Left multiplication of a register with a swizzle + * + * @note Works correctly only for X, Y, Z, W swizzles, not for constant swizzles. + */ +static struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg) +{ + struct prog_src_register tmp = srcreg; + int i; + tmp.Swizzle = 0; + tmp.NegateBase = 0; + for(i = 0; i < 4; ++i) { + GLuint swz = GET_SWZ(swizzle, i); + if (swz < 4) { + tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3); + tmp.NegateBase |= GET_BIT(srcreg.NegateBase, swz) << i; + } else { + tmp.Swizzle |= swz << (i*3); + } + } + return tmp; +} + + +static struct prog_instruction* track_used_srcreg(struct nqssadce_state* s, + struct prog_instruction *inst, GLint src, GLuint sourced) +{ + int i; + GLuint deswz_source = 0; + + for(i = 0; i < 4; ++i) { + if (GET_BIT(sourced, i)) { + GLuint swz = GET_SWZ(inst->SrcReg[src].Swizzle, i); + deswz_source |= 1 << swz; + } else { + inst->SrcReg[src].Swizzle &= ~(7 << (3*i)); + inst->SrcReg[src].Swizzle |= SWIZZLE_NIL << (3*i); + } + } + + if (!s->Descr->IsNativeSwizzle(inst->Opcode, inst->SrcReg[src])) { + struct prog_dst_register dstreg = inst->DstReg; + dstreg.File = PROGRAM_TEMPORARY; + dstreg.Index = _mesa_find_free_register(s->Program, PROGRAM_TEMPORARY); + dstreg.WriteMask = sourced; + + s->Descr->BuildSwizzle(s, dstreg, inst->SrcReg[src]); + + inst = s->Program->Instructions + s->IP; + inst->SrcReg[src].File = PROGRAM_TEMPORARY; + inst->SrcReg[src].Index = dstreg.Index; + inst->SrcReg[src].Swizzle = 0; + inst->SrcReg[src].NegateBase = 0; + inst->SrcReg[src].Abs = 0; + inst->SrcReg[src].NegateAbs = 0; + for(i = 0; i < 4; ++i) { + if (GET_BIT(sourced, i)) + inst->SrcReg[src].Swizzle |= i << (3*i); + else + inst->SrcReg[src].Swizzle |= SWIZZLE_NIL << (3*i); + } + deswz_source = sourced; + } + + struct register_state *regstate = get_reg_state(s, inst->SrcReg[src].File, inst->SrcReg[src].Index); + if (regstate) + regstate->Sourced |= deswz_source & 0xf; + + return inst; +} + + +static void rewrite_depth_out(struct prog_instruction *inst) +{ + if (inst->DstReg.WriteMask & WRITEMASK_Z) { + inst->DstReg.WriteMask = WRITEMASK_W; + } else { + inst->DstReg.WriteMask = 0; + return; + } + + switch (inst->Opcode) { + case OPCODE_FRC: + case OPCODE_MOV: + inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]); + break; + case OPCODE_ADD: + case OPCODE_MAX: + case OPCODE_MIN: + case OPCODE_MUL: + inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]); + inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]); + break; + case OPCODE_CMP: + case OPCODE_MAD: + inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]); + inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]); + inst->SrcReg[2] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[2]); + break; + default: + // Scalar instructions needn't be reswizzled + break; + } +} + +static void unalias_srcregs(struct prog_instruction *inst, GLuint oldindex, GLuint newindex) +{ + int nsrc = _mesa_num_inst_src_regs(inst->Opcode); + int i; + for(i = 0; i < nsrc; ++i) + if (inst->SrcReg[i].File == PROGRAM_TEMPORARY && inst->SrcReg[i].Index == oldindex) + inst->SrcReg[i].Index = newindex; +} + +static void unalias_temporary(struct nqssadce_state* s, GLuint oldindex) +{ + GLuint newindex = _mesa_find_free_register(s->Program, PROGRAM_TEMPORARY); + int ip; + for(ip = 0; ip < s->IP; ++ip) { + struct prog_instruction* inst = s->Program->Instructions + ip; + if (inst->DstReg.File == PROGRAM_TEMPORARY && inst->DstReg.Index == oldindex) + inst->DstReg.Index = newindex; + unalias_srcregs(inst, oldindex, newindex); + } + unalias_srcregs(s->Program->Instructions + s->IP, oldindex, newindex); +} + + +/** + * Handle one instruction. + */ +static void process_instruction(struct nqssadce_state* s) +{ + struct prog_instruction *inst = s->Program->Instructions + s->IP; + + if (inst->Opcode == OPCODE_END) + return; + + if (inst->Opcode != OPCODE_KIL) { + if (s->Descr->RewriteDepthOut) { + if (inst->DstReg.File == PROGRAM_OUTPUT && inst->DstReg.Index == FRAG_RESULT_DEPR) + rewrite_depth_out(inst); + } + + struct register_state *regstate = get_reg_state(s, inst->DstReg.File, inst->DstReg.Index); + if (!regstate) { + _mesa_problem(s->Ctx, "NqssaDce: bad destination register (%i[%i])\n", + inst->DstReg.File, inst->DstReg.Index); + return; + } + + inst->DstReg.WriteMask &= regstate->Sourced; + regstate->Sourced &= ~inst->DstReg.WriteMask; + + if (inst->DstReg.WriteMask == 0) { + _mesa_delete_instructions(s->Program, s->IP, 1); + return; + } + + if (inst->DstReg.File == PROGRAM_TEMPORARY && !regstate->Sourced) + unalias_temporary(s, inst->DstReg.Index); + } + + /* Attention: Due to swizzle emulation code, the following + * might change the instruction stream under us, so we have + * to be careful with the inst pointer. */ + switch (inst->Opcode) { + case OPCODE_FRC: + case OPCODE_MOV: + inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask); + break; + case OPCODE_ADD: + case OPCODE_MAX: + case OPCODE_MIN: + case OPCODE_MUL: + inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask); + inst = track_used_srcreg(s, inst, 1, inst->DstReg.WriteMask); + break; + case OPCODE_CMP: + case OPCODE_MAD: + inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask); + inst = track_used_srcreg(s, inst, 1, inst->DstReg.WriteMask); + inst = track_used_srcreg(s, inst, 2, inst->DstReg.WriteMask); + break; + case OPCODE_COS: + case OPCODE_EX2: + case OPCODE_LG2: + case OPCODE_RCP: + case OPCODE_RSQ: + case OPCODE_SIN: + inst = track_used_srcreg(s, inst, 0, 0x1); + break; + case OPCODE_DP3: + inst = track_used_srcreg(s, inst, 0, 0x7); + inst = track_used_srcreg(s, inst, 1, 0x7); + break; + case OPCODE_DP4: + inst = track_used_srcreg(s, inst, 0, 0xf); + inst = track_used_srcreg(s, inst, 1, 0xf); + break; + case OPCODE_KIL: + case OPCODE_TEX: + case OPCODE_TXB: + case OPCODE_TXP: + inst = track_used_srcreg(s, inst, 0, 0xf); + break; + default: + _mesa_problem(s->Ctx, "NqssaDce: Unknown opcode %d\n", inst->Opcode); + return; + } +} + + +void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce_descr* descr) +{ + struct nqssadce_state s; + + _mesa_bzero(&s, sizeof(s)); + s.Ctx = ctx; + s.Program = p; + s.Descr = descr; + s.Descr->Init(&s); + s.IP = p->NumInstructions; + + while(s.IP > 0) { + s.IP--; + process_instruction(&s); + } +} diff --git a/src/mesa/drivers/dri/r300/radeon_nqssadce.h b/src/mesa/drivers/dri/r300/radeon_nqssadce.h new file mode 100644 index 0000000000..a4f94abcb6 --- /dev/null +++ b/src/mesa/drivers/dri/r300/radeon_nqssadce.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2008 Nicolai Haehnle. + * + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RADEON_PROGRAM_NQSSADCE_H_ +#define __RADEON_PROGRAM_NQSSADCE_H_ + +#include "radeon_program.h" + + +struct register_state { + /** + * Bitmask indicating which components of the register are sourced + * by later instructions. + */ + GLuint Sourced : 4; +}; + +/** + * Maintain state such as which registers are used, which registers are + * read from, etc. + */ +struct nqssadce_state { + GLcontext *Ctx; + struct gl_program *Program; + struct radeon_nqssadce_descr *Descr; + + /** + * All instructions after this instruction pointer have been dealt with. + */ + int IP; + + /** + * Which registers are read by subsequent instructions? + */ + struct register_state Temps[MAX_PROGRAM_TEMPS]; + struct register_state Outputs[VERT_RESULT_MAX]; +}; + + +/** + * This structure contains a description of the hardware in-so-far as + * it is required for the NqSSA-DCE pass. + */ +struct radeon_nqssadce_descr { + /** + * Fill in which outputs + */ + void (*Init)(struct nqssadce_state *); + + /** + * Check whether the given swizzle, absolute and negate combination + * can be implemented natively by the hardware for this opcode. + */ + GLboolean (*IsNativeSwizzle)(GLuint opcode, struct prog_src_register reg); + + /** + * Emit (at the current IP) the instruction MOV dst, src; + * The transformation will work recursively on the emitted instruction(s). + */ + void (*BuildSwizzle)(struct nqssadce_state*, struct prog_dst_register dst, struct prog_src_register src); + + /** + * Rewrite instructions that write to DEPR.z to write to DEPR.w + * instead (rewriting is done *before* the WriteMask test). + */ + GLboolean RewriteDepthOut; + void *Data; +}; + +void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce_descr* descr); + +#endif /* __RADEON_PROGRAM_NQSSADCE_H_ */ diff --git a/src/mesa/shader/program.c b/src/mesa/shader/program.c index 376d7ee60d..a80e6e9154 100644 --- a/src/mesa/shader/program.c +++ b/src/mesa/shader/program.c @@ -112,7 +112,7 @@ _mesa_free_program_data(GLcontext *ctx) /** * Update the default program objects in the given context to reference those - * specified in the shared state and release those referencing the old + * specified in the shared state and release those referencing the old * shared state. */ void @@ -238,7 +238,7 @@ struct gl_program * _mesa_init_fragment_program( GLcontext *ctx, struct gl_fragment_program *prog, GLenum target, GLuint id) { - if (prog) + if (prog) return _mesa_init_program_struct( ctx, &prog->Base, target, id ); else return NULL; @@ -252,7 +252,7 @@ struct gl_program * _mesa_init_vertex_program( GLcontext *ctx, struct gl_vertex_program *prog, GLenum target, GLuint id) { - if (prog) + if (prog) return _mesa_init_program_struct( ctx, &prog->Base, target, id ); else return NULL; @@ -265,7 +265,7 @@ _mesa_init_vertex_program( GLcontext *ctx, struct gl_vertex_program *prog, * ctx->Driver.NewProgram. May be overridden (ie. replaced) by a * device driver function to implement OO deriviation with additional * types not understood by this function. - * + * * \param ctx context * \param id program id/number * \param target program target/type @@ -309,7 +309,7 @@ _mesa_delete_program(GLcontext *ctx, struct gl_program *prog) if (prog == &_mesa_DummyProgram) return; - + if (prog->String) _mesa_free(prog->String); @@ -382,7 +382,7 @@ _mesa_reference_program(GLcontext *ctx, deleteFlag = ((*ptr)->RefCount == 0); /*_glthread_UNLOCK_MUTEX((*ptr)->Mutex);*/ - + if (deleteFlag) { ASSERT(ctx); ctx->Driver.DeleteProgram(ctx, *ptr); @@ -541,6 +541,53 @@ _mesa_insert_instructions(struct gl_program *prog, GLuint start, GLuint count) } +/** + * Delete 'count' instructions at 'start' in the given program. + * Adjust branch targets accordingly. + */ +GLboolean +_mesa_delete_instructions(struct gl_program *prog, GLuint start, GLuint count) +{ + const GLuint origLen = prog->NumInstructions; + const GLuint newLen = origLen - count; + struct prog_instruction *newInst; + GLuint i; + + /* adjust branches */ + for (i = 0; i < prog->NumInstructions; i++) { + struct prog_instruction *inst = prog->Instructions + i; + if (inst->BranchTarget > 0) { + if (inst->BranchTarget >= start) { + inst->BranchTarget -= count; + } + } + } + + /* Alloc storage for new instructions */ + newInst = _mesa_alloc_instructions(newLen); + if (!newInst) { + return GL_FALSE; + } + + /* Copy 'start' instructions into new instruction buffer */ + _mesa_copy_instructions(newInst, prog->Instructions, start); + + /* Copy the remaining/tail instructions to new inst buffer */ + _mesa_copy_instructions(newInst + start, + prog->Instructions + start + count, + newLen - start); + + /* free old instructions */ + _mesa_free_instructions(prog->Instructions, origLen); + + /* install new instructions */ + prog->Instructions = newInst; + prog->NumInstructions = newLen; + + return GL_TRUE; +} + + /** * Search instructions for registers that match (oldFile, oldIndex), * replacing them with (newFile, newIndex). @@ -844,7 +891,7 @@ _mesa_BindProgram(GLenum target, GLuint id) * \note Not compiled into display lists. * \note Called by both glDeleteProgramsNV and glDeleteProgramsARB. */ -void GLAPIENTRY +void GLAPIENTRY _mesa_DeletePrograms(GLsizei n, const GLuint *ids) { GLint i; diff --git a/src/mesa/shader/program.h b/src/mesa/shader/program.h index f1a69a2c01..48fe06ab7f 100644 --- a/src/mesa/shader/program.h +++ b/src/mesa/shader/program.h @@ -67,13 +67,13 @@ _mesa_find_line_column(const GLubyte *string, const GLubyte *pos, GLint *line, GLint *col); -extern struct gl_program * -_mesa_init_vertex_program(GLcontext *ctx, - struct gl_vertex_program *prog, +extern struct gl_program * +_mesa_init_vertex_program(GLcontext *ctx, + struct gl_vertex_program *prog, GLenum target, GLuint id); -extern struct gl_program * -_mesa_init_fragment_program(GLcontext *ctx, +extern struct gl_program * +_mesa_init_fragment_program(GLcontext *ctx, struct gl_fragment_program *prog, GLenum target, GLuint id); @@ -115,6 +115,9 @@ _mesa_clone_program(GLcontext *ctx, const struct gl_program *prog); extern GLboolean _mesa_insert_instructions(struct gl_program *prog, GLuint start, GLuint count); +extern GLboolean +_mesa_delete_instructions(struct gl_program *prog, GLuint start, GLuint count); + extern struct gl_program * _mesa_combine_programs(GLcontext *ctx, const struct gl_program *progA, -- cgit v1.2.3 From b6765c34993b08bba4acf20738c8938413ed4daf Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 12 Jul 2008 01:14:35 +0200 Subject: r500_fragprog: Major refactoring of final emit Use an abstracted instruction scheduling and register allocation algorithm that we will be able to share with r300_fragprog. Unlike the original emit code, this code tries to pair instructions that only use the RGB part of the ALU with instructions that only use the alpha part. However, the pairing algorithm still has some shortcomings; for example, it doesn't generate optimal code for the emulation of LIT. --- src/mesa/drivers/dri/r300/Makefile | 1 + src/mesa/drivers/dri/r300/r300_context.h | 3 - src/mesa/drivers/dri/r300/r300_reg.h | 1 + src/mesa/drivers/dri/r300/r500_fragprog.c | 8 +- src/mesa/drivers/dri/r300/r500_fragprog_emit.c | 1002 +++++------------------ src/mesa/drivers/dri/r300/radeon_program.h | 4 + src/mesa/drivers/dri/r300/radeon_program_pair.c | 970 ++++++++++++++++++++++ src/mesa/drivers/dri/r300/radeon_program_pair.h | 126 +++ 8 files changed, 1290 insertions(+), 825 deletions(-) create mode 100644 src/mesa/drivers/dri/r300/radeon_program_pair.c create mode 100644 src/mesa/drivers/dri/r300/radeon_program_pair.h (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile index 1dc75a3062..9baa1e7131 100644 --- a/src/mesa/drivers/dri/r300/Makefile +++ b/src/mesa/drivers/dri/r300/Makefile @@ -38,6 +38,7 @@ DRIVER_SOURCES = \ r300_texstate.c \ radeon_program.c \ radeon_program_alu.c \ + radeon_program_pair.c \ radeon_nqssadce.c \ r300_vertprog.c \ r300_fragprog.c \ diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index a69beba9a7..8e9c5cee5f 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -777,9 +777,6 @@ struct r500_fragment_program_code { GLuint inst4; GLuint inst5; } inst[512]; - /* TODO: This is magic! */ - - int temp_reg_offset; int inst_offset; int inst_end; diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index cd232c5b7b..ec2b58377c 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -2921,6 +2921,7 @@ enum { # define R500_RGB_SRCP_OP_RGB1_PLUS_RGB0 (2 << 30) # define R500_RGB_SRCP_OP_1_MINUS_RGB0 (3 << 30) #define R500_US_CMN_INST_0 0xb800 +# define R500_INST_TYPE_MASK (3 << 0) # define R500_INST_TYPE_ALU (0 << 0) # define R500_INST_TYPE_OUT (1 << 0) # define R500_INST_TYPE_FC (2 << 0) diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index c92ea8f5e6..7b18efa69d 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -408,12 +408,10 @@ void r500TranslateFragmentShader(r300ContextPtr r300, r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM); if (RADEON_DEBUG & DEBUG_PIXEL) { - fprintf(stderr, "Mesa program:\n"); - fprintf(stderr, "-------------\n"); - _mesa_print_program(&fp->mesa_program.Base); - fflush(stdout); - if (fp->translated) + if (fp->translated) { + _mesa_printf("Machine-readable code:\n"); dump_program(&fp->code); + } } } diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 275911679d..b6f52474e2 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -43,913 +43,281 @@ * */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" -#include "shader/prog_instruction.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" - -#include "r300_context.h" #include "r500_fragprog.h" -#include "r300_reg.h" -#include "r300_state.h" -/* Mapping Mesa registers to R500 temporaries */ -struct reg_acc { - int reg; /* Assigned hw temp */ - unsigned int refcount; /* Number of uses by mesa program */ -}; - -/** - * Describe the current lifetime information for an R300 temporary - */ -struct reg_lifetime { - /* Index of the first slot where this register is free in the sense - that it can be used as a new destination register. - This is -1 if the register has been assigned to a Mesa register - and the last access to the register has not yet been emitted */ - int free; - - /* Index of the first slot where this register is currently reserved. - This is used to stop e.g. a scalar operation from being moved - before the allocation time of a register that was first allocated - for a vector operation. */ - int reserved; - - /* Index of the first slot in which the register can be used as a - source without losing the value that is written by the last - emitted instruction that writes to the register */ - int vector_valid; - int scalar_valid; - - /* Index to the slot where the register was last read. - This is also the first slot in which the register may be written again */ - int vector_lastread; - int scalar_lastread; -}; - -/** - * Store information during compilation of fragment programs. - */ -struct r500_pfs_compile_state { - struct r500_fragment_program_compiler *compiler; +#include "radeon_program_pair.h" - /* number of ALU slots used so far */ - int nrslots; - /* Used to map Mesa's inputs/temps onto hardware temps */ - int temp_in_use; - struct reg_acc inputs[32]; /* don't actually need 32... */ -}; +#define PROG_CODE \ + struct r500_fragment_program_compiler *c = (struct r500_fragment_program_compiler*)data; \ + struct r500_fragment_program_code *code = c->code -/* - * Useful macros and values - */ -#define ERROR(fmt, args...) do { \ +#define error(fmt, args...) do { \ fprintf(stderr, "%s::%s(): " fmt "\n", \ __FILE__, __FUNCTION__, ##args); \ - cs->compiler->fp->error = GL_TRUE; \ } while(0) -#define PROG_CODE struct r500_fragment_program_code *code = cs->compiler->code - -#define R500_US_NUM_TEMP_REGS 128 -#define R500_US_NUM_CONST_REGS 256 - -/* "Register" flags */ -#define REG_CONSTANT (1 << 8) -#define REG_SRC_REL (1 << 9) -#define REG_DEST_REL (1 << 7) - -/* Swizzle tools */ -#define R500_SWIZZLE_ZERO 4 -#define R500_SWIZZLE_HALF 5 -#define R500_SWIZZLE_ONE 6 -#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6)) -#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6)) -#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6)) -#define R500_SWIZ_MOD_NEG 1 -#define R500_SWIZ_MOD_ABS 2 -#define R500_SWIZ_MOD_NEG_ABS 3 -/* Swizzles for inst2 */ -#define MAKE_SWIZ_TEX_STRQ(x) (x << 8) -#define MAKE_SWIZ_TEX_RGBA(x) (x << 24) -/* Swizzles for inst3 */ -#define MAKE_SWIZ_RGB_A(x) (x << 2) -#define MAKE_SWIZ_RGB_B(x) (x << 15) -/* Swizzles for inst4 */ -#define MAKE_SWIZ_ALPHA_A(x) (x << 14) -#define MAKE_SWIZ_ALPHA_B(x) (x << 21) -/* Swizzle for inst5 */ -#define MAKE_SWIZ_RGBA_C(x) (x << 14) -#define MAKE_SWIZ_ALPHA_C(x) (x << 27) - -/* Writemasks */ -#define R500_WRITEMASK_G 0x2 -#define R500_WRITEMASK_B 0x4 -#define R500_WRITEMASK_RGB 0x7 -#define R500_WRITEMASK_A 0x8 -#define R500_WRITEMASK_AR 0x9 -#define R500_WRITEMASK_AG 0xA -#define R500_WRITEMASK_ARG 0xB -#define R500_WRITEMASK_AB 0xC -#define R500_WRITEMASK_ARGB 0xF - - -static const struct prog_dst_register dstreg_template = { - .File = PROGRAM_TEMPORARY, - .Index = 0, - .WriteMask = WRITEMASK_XYZW -}; -static INLINE GLuint fix_hw_swizzle(GLuint swz) -{ - if (swz == 5) swz = 6; - if (swz == SWIZZLE_NIL) swz = 4; - return swz; -} - -static INLINE GLuint make_rgb_swizzle(struct prog_src_register src) { - GLuint swiz = 0x0; - GLuint temp; - /* This could be optimized, but it should be plenty fast already. */ - int i; - int negatebase = 0; - for (i = 0; i < 3; i++) { - temp = GET_SWZ(src.Swizzle, i); - if (temp != SWIZZLE_NIL && GET_BIT(src.NegateBase, i)) - negatebase = 1; - temp = fix_hw_swizzle(temp); - swiz |= temp << i*3; - } - if (src.Abs) - swiz |= R500_SWIZ_MOD_ABS << 9; - else if (negatebase) - swiz |= R500_SWIZ_MOD_NEG << 9; - if (src.NegateAbs) - swiz ^= R500_SWIZ_MOD_NEG << 9; - return swiz; -} - -static INLINE GLuint make_rgba_swizzle(GLuint src) { - GLuint swiz = 0x0; - GLuint temp; - int i; - for (i = 0; i < 4; i++) { - temp = GET_SWZ(src, i); - temp = fix_hw_swizzle(temp); - swiz |= temp << i*3; - } - return swiz; -} - -static INLINE GLuint make_alpha_swizzle(struct prog_src_register src) { - GLuint swiz = GET_SWZ(src.Swizzle, 3); - - swiz = fix_hw_swizzle(swiz); - - if (src.Abs) { - swiz |= R500_SWIZ_MOD_ABS << 3; - } else if (src.NegateBase & 8) { - swiz |= R500_SWIZ_MOD_NEG << 3; - } - if (src.NegateAbs) - swiz ^= R500_SWIZ_MOD_NEG << 3; - - return swiz; -} - -static INLINE GLuint make_sop_swizzle(struct prog_src_register src) { - GLuint swiz = GET_SWZ(src.Swizzle, 0); - - swiz = fix_hw_swizzle(swiz); - - if (src.Abs) { - swiz |= R500_SWIZ_MOD_ABS << 3; - } else if (src.NegateBase & 1) { - swiz |= R500_SWIZ_MOD_NEG << 3; - } - if (src.NegateAbs) - swiz ^= R500_SWIZ_MOD_NEG << 3; - - return swiz; -} - -static INLINE GLuint make_strq_swizzle(struct prog_src_register src) { - GLuint swiz = 0x0, temp = 0x0; - int i; - for (i = 0; i < 4; i++) { - temp = GET_SWZ(src.Swizzle, i) & 0x3; - swiz |= temp << i*2; - } - return swiz; -} - -static int get_temp(struct r500_pfs_compile_state *cs, int slot) { - - PROG_CODE; - - int r = code->temp_reg_offset + cs->temp_in_use + slot; - - if (r > R500_US_NUM_TEMP_REGS) { - ERROR("Too many temporary registers requested, can't compile!\n"); - } - - return r; -} - -/* Borrowed verbatim from r300_fragprog since it hasn't changed. */ -static GLuint emit_const4fv(struct r500_pfs_compile_state *cs, - struct prog_src_register srcreg) +/** + * Callback to register hardware constants. + */ +static GLboolean emit_const(void *data, GLuint file, GLuint idx, GLuint *hwindex) { PROG_CODE; - GLuint reg = 0x0; - int index; - - for (index = 0; index < code->const_nr; ++index) { - if (code->constant[index].File == srcreg.File && - code->constant[index].Index == srcreg.Index) + for (*hwindex = 0; *hwindex < code->const_nr; ++*hwindex) { + if (code->constant[*hwindex].File == file && + code->constant[*hwindex].Index == idx) break; } - if (index >= code->const_nr) { - if (index >= R500_US_NUM_CONST_REGS) { - ERROR("Out of hw constants!\n"); - return reg; + if (*hwindex >= code->const_nr) { + if (*hwindex >= PFS_NUM_CONST_REGS) { + error("Out of hw constants!\n"); + return GL_FALSE; } code->const_nr++; - code->constant[index] = srcreg; - } - - reg = index | REG_CONSTANT; - return reg; -} - -static GLuint make_src(struct r500_pfs_compile_state *cs, struct prog_src_register src) { - PROG_CODE; - GLuint reg; - switch (src.File) { - case PROGRAM_TEMPORARY: - reg = src.Index + code->temp_reg_offset; - break; - case PROGRAM_INPUT: - reg = cs->inputs[src.Index].reg; - break; - case PROGRAM_LOCAL_PARAM: - case PROGRAM_ENV_PARAM: - case PROGRAM_STATE_VAR: - case PROGRAM_NAMED_PARAM: - case PROGRAM_CONSTANT: - reg = emit_const4fv(cs, src); - break; - case PROGRAM_BUILTIN: - reg = 0x0; - break; - default: - ERROR("Can't handle src.File %x\n", src.File); - reg = 0x0; - break; + code->constant[*hwindex].File = file; + code->constant[*hwindex].Index = idx; } - return reg; -} -static GLuint make_dest(struct r500_pfs_compile_state *cs, struct prog_dst_register dest) { - PROG_CODE; - GLuint reg; - switch (dest.File) { - case PROGRAM_TEMPORARY: - reg = dest.Index + code->temp_reg_offset; - break; - case PROGRAM_OUTPUT: - /* Eventually we may need to handle multiple - * rendering targets... */ - reg = dest.Index; - break; - case PROGRAM_BUILTIN: - reg = 0x0; - break; - default: - ERROR("Can't handle dest.File %x\n", dest.File); - reg = 0x0; - break; - } - return reg; -} - -static int emit_slot(struct r500_pfs_compile_state *cs) -{ - if (cs->nrslots >= 512) { - ERROR("Too many instructions"); - cs->nrslots = 1; - return 0; - } - return cs->nrslots++; + return GL_TRUE; } -static int emit_tex(struct r500_pfs_compile_state *cs, - struct prog_instruction *fpi, int dest) +static GLuint translate_rgb_op(GLuint opcode) { - PROG_CODE; - int hwsrc, hwdest; - GLuint mask; - int counter = emit_slot(cs); - - mask = fpi->DstReg.WriteMask << 11; - hwsrc = make_src(cs, fpi->SrcReg[0]); - - if (fpi->DstReg.File == PROGRAM_OUTPUT) { - hwdest = get_temp(cs, 0); - } else { - hwdest = dest; - } - - code->inst[counter].inst0 = R500_INST_TYPE_TEX | mask - | R500_INST_TEX_SEM_WAIT; - - code->inst[counter].inst1 = R500_TEX_ID(fpi->TexSrcUnit) - | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED; - - if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX) - code->inst[counter].inst1 |= R500_TEX_UNSCALED; - - switch (fpi->Opcode) { - case OPCODE_KIL: - code->inst[counter].inst1 |= R500_TEX_INST_TEXKILL; - break; - case OPCODE_TEX: - code->inst[counter].inst1 |= R500_TEX_INST_LD; - break; - case OPCODE_TXB: - code->inst[counter].inst1 |= R500_TEX_INST_LODBIAS; - break; - case OPCODE_TXP: - code->inst[counter].inst1 |= R500_TEX_INST_PROJ; - break; + switch(opcode) { + case OPCODE_CMP: return R500_ALU_RGBA_OP_CMP; + case OPCODE_DP3: return R500_ALU_RGBA_OP_DP3; + case OPCODE_DP4: return R500_ALU_RGBA_OP_DP4; + case OPCODE_FRC: return R500_ALU_RGBA_OP_FRC; default: - ERROR("emit_tex can't handle opcode %x\n", fpi->Opcode); + error("translate_rgb_op(%d): unknown opcode\n", opcode); + /* fall through */ + case OPCODE_NOP: + /* fall through */ + case OPCODE_MAD: return R500_ALU_RGBA_OP_MAD; + case OPCODE_MAX: return R500_ALU_RGBA_OP_MAX; + case OPCODE_MIN: return R500_ALU_RGBA_OP_MIN; + case OPCODE_REPL_ALPHA: return R500_ALU_RGBA_OP_SOP; } - - code->inst[counter].inst2 = R500_TEX_SRC_ADDR(hwsrc) - | MAKE_SWIZ_TEX_STRQ(make_strq_swizzle(fpi->SrcReg[0])) - /* | R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G - | R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A */ - | R500_TEX_DST_ADDR(hwdest) - | R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G - | R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A; - - code->inst[counter].inst3 = 0x0; - code->inst[counter].inst4 = 0x0; - code->inst[counter].inst5 = 0x0; - - if (fpi->DstReg.File == PROGRAM_OUTPUT) { - counter++; - code->inst[counter].inst0 = R500_INST_TYPE_OUT - | R500_INST_TEX_SEM_WAIT | (mask << 4); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB) - | R500_ALU_RGB_SEL_B_SRC0 - | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB) - | R500_ALU_RGB_OMOD_DISABLE; - code->inst[counter].inst4 = R500_ALPHA_OP_CMP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_ALPHA_SWIZ_A_A) - | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_ALPHA_SWIZ_A_A) - | R500_ALPHA_OMOD_DISABLE; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP - | R500_ALU_RGBA_ADDRD(dest) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - } - - return counter; } -/* Do not call directly */ -static int _helper_emit_alu(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alphaop, - int File, int Index, int WriteMask) +static GLuint translate_alpha_op(GLuint opcode) { - PROG_CODE; - int counter = emit_slot(cs); - - code->inst[counter].inst4 = alphaop; - code->inst[counter].inst5 = rgbop; - - if (File == PROGRAM_OUTPUT) { - code->inst[counter].inst0 = R500_INST_TYPE_OUT; - - if (Index == FRAG_RESULT_COLR) { - code->inst[counter].inst0 |= WriteMask << 15; - } else if (Index == FRAG_RESULT_DEPR) { - code->inst[counter].inst4 |= R500_ALPHA_W_OMASK; - cs->compiler->fp->writes_depth = GL_TRUE; - } - } else { - int dest = Index + code->temp_reg_offset; - - code->inst[counter].inst0 = R500_INST_TYPE_ALU - | (WriteMask << 11); - code->inst[counter].inst4 |= R500_ALPHA_ADDRD(dest); - code->inst[counter].inst5 |= R500_ALU_RGBA_ADDRD(dest); + switch(opcode) { + case OPCODE_CMP: return R500_ALPHA_OP_CMP; + case OPCODE_COS: return R500_ALPHA_OP_COS; + case OPCODE_DP3: return R500_ALPHA_OP_DP; + case OPCODE_DP4: return R500_ALPHA_OP_DP; + case OPCODE_EX2: return R500_ALPHA_OP_EX2; + case OPCODE_FRC: return R500_ALPHA_OP_FRC; + case OPCODE_LG2: return R500_ALPHA_OP_LN2; + default: + error("translate_alpha_op(%d): unknown opcode\n", opcode); + /* fall through */ + case OPCODE_NOP: + /* fall through */ + case OPCODE_MAD: return R500_ALPHA_OP_MAD; + case OPCODE_MAX: return R500_ALPHA_OP_MAX; + case OPCODE_MIN: return R500_ALPHA_OP_MIN; + case OPCODE_RCP: return R500_ALPHA_OP_RCP; + case OPCODE_RSQ: return R500_ALPHA_OP_RSQ; + case OPCODE_SIN: return R500_ALPHA_OP_SIN; } - - code->inst[counter].inst0 |= R500_INST_TEX_SEM_WAIT; - - return counter; } -/** - * Prepare an ALU slot with the given RGB operation, ALPHA operation, and - * destination register. - */ -static int emit_alu(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alphaop, struct prog_dst_register dst) -{ - return _helper_emit_alu(cs, rgbop, alphaop, dst.File, dst.Index, dst.WriteMask); -} - -/** - * Set an instruction's source 0 (both RGB and ALPHA) to the given hardware index. - */ -static void set_src0_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) +static GLuint fix_hw_swizzle(GLuint swz) { - PROG_CODE; - code->inst[ip].inst1 |= R500_RGB_ADDR0(src); - code->inst[ip].inst2 |= R500_ALPHA_ADDR0(src); + if (swz == 5) swz = 6; + if (swz == SWIZZLE_NIL) swz = 4; + return swz; } -/** - * Set an instruction's source 1 (both RGB and ALPHA) to the given hardware index. - */ -static void set_src1_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) +static GLuint translate_arg_rgb(struct radeon_pair_instruction *inst, int arg) { - PROG_CODE; - code->inst[ip].inst1 |= R500_RGB_ADDR1(src); - code->inst[ip].inst2 |= R500_ALPHA_ADDR1(src); -} + GLuint t = inst->RGB.Arg[arg].Source; + int comp; + t |= inst->RGB.Arg[arg].Negate << 11; + t |= inst->RGB.Arg[arg].Abs << 12; -/** - * Set an instruction's source 2 (both RGB and ALPHA) to the given hardware index. - */ -static void set_src2_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) -{ - PROG_CODE; - code->inst[ip].inst1 |= R500_RGB_ADDR2(src); - code->inst[ip].inst2 |= R500_ALPHA_ADDR2(src); -} + for(comp = 0; comp < 3; ++comp) + t |= fix_hw_swizzle(GET_SWZ(inst->RGB.Arg[arg].Swizzle, comp)) << (3*comp + 2); -/** - * Set an instruction's source 0 (both RGB and ALPHA) according to the given source register. - */ -static void set_src0(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) -{ - set_src0_direct(cs, ip, make_src(cs, srcreg)); + return t; } -/** - * Set an instruction's source 1 (both RGB and ALPHA) according to the given source register. - */ -static void set_src1(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) +static GLuint translate_arg_alpha(struct radeon_pair_instruction *inst, int i) { - set_src1_direct(cs, ip, make_src(cs, srcreg)); + GLuint t = inst->Alpha.Arg[i].Source; + t |= fix_hw_swizzle(inst->Alpha.Arg[i].Swizzle) << 2; + t |= inst->Alpha.Arg[i].Negate << 5; + t |= inst->Alpha.Arg[i].Abs << 6; + return t; } -/** - * Set an instruction's source 2 (both RGB and ALPHA) according to the given source register. - */ -static void set_src2(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) +static void use_temporary(struct r500_fragment_program_code* code, GLuint index) { - set_src2_direct(cs, ip, make_src(cs, srcreg)); + if (index > code->max_temp_idx) + code->max_temp_idx = index; } -/** - * Set an instruction's argument A (both RGB and ALPHA) from the given source, - * taking swizzles+neg+abs as specified (see also _reg version below). - */ -static void set_argA(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) +static GLuint use_source(struct r500_fragment_program_code* code, struct radeon_pair_instruction_source src) { - PROG_CODE; - code->inst[ip].inst3 |= (source << R500_ALU_RGB_SEL_A_SHIFT) | MAKE_SWIZ_RGB_A(swizRGB); - code->inst[ip].inst4 |= (source << R500_ALPHA_SEL_A_SHIFT) | MAKE_SWIZ_ALPHA_A(swizA); + if (!src.Constant) + use_temporary(code, src.Index); + return src.Index | src.Constant << 8; } -/** - * Set an instruction's argument B (both RGB and ALPHA) from the given source, - * taking swizzles+neg+abs as specified (see also _reg version below). - */ -static void set_argB(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) -{ - PROG_CODE; - code->inst[ip].inst3 |= (source << R500_ALU_RGB_SEL_B_SHIFT) | MAKE_SWIZ_RGB_B(swizRGB); - code->inst[ip].inst4 |= (source << R500_ALPHA_SEL_B_SHIFT) | MAKE_SWIZ_ALPHA_B(swizA); -} /** - * Set an instruction's argument C (both RGB and ALPHA) from the given source, - * taking swizzles+neg+abs as specified (see also _reg version below). + * Emit a paired ALU instruction. */ -static void set_argC(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) +static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst) { PROG_CODE; - code->inst[ip].inst5 |= - (source << R500_ALU_RGBA_SEL_C_SHIFT) | - MAKE_SWIZ_RGBA_C(swizRGB) | - (source << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT) | - MAKE_SWIZ_ALPHA_C(swizA); -} -/** - * Set an instruction's argument A (both RGB and ALPHA) from the given source, - * taking swizzles, negation and absolute value from the given source register. - */ -static void set_argA_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) -{ - set_argA(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); -} - -/** - * Set an instruction's argument B (both RGB and ALPHA) from the given source, - * taking swizzles, negation and absolute value from the given source register. - */ -static void set_argB_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) -{ - set_argB(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); -} - -/** - * Set an instruction's argument C (both RGB and ALPHA) from the given source, - * taking swizzles, negation and absolute value from the given source register. - */ -static void set_argC_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) -{ - set_argC(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); -} - -/** - * Emit a special scalar operation. - */ -static int emit_sop(struct r500_pfs_compile_state *cs, - int opcode, struct prog_dst_register dstreg, GLuint src, GLuint swiz) -{ - int ip = emit_alu(cs, R500_ALU_RGBA_OP_SOP, opcode, dstreg); - set_src0_direct(cs, ip, src); - set_argA(cs, ip, 0, R500_SWIZ_RGB_ZERO, swiz); - return ip; -} + if (code->inst_end >= 511) { + error("emit_alu: Too many instructions"); + return GL_FALSE; + } + int ip = ++code->inst_end; -static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) { - PROG_CODE; - GLuint src[3], dest = 0; - int ip; + code->inst[ip].inst5 = translate_rgb_op(inst->RGB.Opcode); + code->inst[ip].inst4 = translate_alpha_op(inst->Alpha.Opcode); - if (fpi->Opcode != OPCODE_KIL) { - dest = make_dest(cs, fpi->DstReg); - } + if (inst->RGB.OutputWriteMask || inst->Alpha.OutputWriteMask || inst->Alpha.DepthWriteMask) + code->inst[ip].inst0 = R500_INST_TYPE_OUT; + else + code->inst[ip].inst0 = R500_INST_TYPE_ALU; + code->inst[ip].inst0 |= R500_INST_TEX_SEM_WAIT; - switch (fpi->Opcode) { - case OPCODE_ADD: - /* Variation on MAD: 1*src0+src1 */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA(cs, ip, 0, R500_SWIZ_RGB_ONE, R500_SWIZZLE_ONE); - set_argB_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argC_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_CMP: - /* This inst's selects need to be swapped as follows: - * 0 -> C ; 1 -> B ; 2 -> A */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_src2(cs, ip, fpi->SrcReg[2]); - set_argA_reg(cs, ip, 2, fpi->SrcReg[2]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - set_argC_reg(cs, ip, 0, fpi->SrcReg[0]); - break; - case OPCODE_COS: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_COS, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_DP3: - ip = emit_alu(cs, R500_ALU_RGBA_OP_DP3, R500_ALPHA_OP_DP, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_DP4: - ip = emit_alu(cs, R500_ALU_RGBA_OP_DP4, R500_ALPHA_OP_DP, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_EX2: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_EX2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_FRC: - ip = emit_alu(cs, R500_ALU_RGBA_OP_FRC, R500_ALPHA_OP_FRC, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - break; - case OPCODE_LG2: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_LN2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_MAD: - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_src2(cs, ip, fpi->SrcReg[2]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - set_argC_reg(cs, ip, 2, fpi->SrcReg[2]); - break; - case OPCODE_MAX: - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAX, R500_ALPHA_OP_MAX, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_MIN: - ip = emit_alu(cs, R500_ALU_RGBA_OP_MIN, R500_ALPHA_OP_MIN, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_MOV: - ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - code->inst[ip].inst3 |= R500_ALU_RGB_OMOD_DISABLE; - code->inst[ip].inst4 |= R500_ALPHA_OMOD_DISABLE; - break; - case OPCODE_MUL: - /* Variation on MAD: src0*src1+0 */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - break; - case OPCODE_RCP: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_RCP, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_RSQ: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_RSQ, fpi->DstReg, src[0], - (make_sop_swizzle(fpi->SrcReg[0]) | (R500_SWIZ_MOD_ABS<<3)) & ~(R500_SWIZ_MOD_NEG<<3)); - break; - case OPCODE_SIN: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_SIN, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_KIL: - case OPCODE_TEX: - case OPCODE_TXB: - case OPCODE_TXP: - emit_tex(cs, fpi, dest); - break; - default: - ERROR("unknown fpi->Opcode %s\n", _mesa_opcode_string(fpi->Opcode)); - break; + code->inst[ip].inst0 |= (inst->RGB.WriteMask << 11) | (inst->Alpha.WriteMask << 14); + code->inst[ip].inst0 |= (inst->RGB.OutputWriteMask << 15) | (inst->Alpha.OutputWriteMask << 18); + if (inst->Alpha.DepthWriteMask) { + code->inst[ip].inst4 |= R500_ALPHA_W_OMASK; + c->fp->writes_depth = GL_TRUE; } - /* Finishing touches */ - if (fpi->SaturateMode == SATURATE_ZERO_ONE) { - code->inst[cs->nrslots-1].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP; - } -} + code->inst[ip].inst4 |= R500_ALPHA_ADDRD(inst->Alpha.DestIndex); + code->inst[ip].inst5 |= R500_ALU_RGBA_ADDRD(inst->RGB.DestIndex); + use_temporary(code, inst->Alpha.DestIndex); + use_temporary(code, inst->RGB.DestIndex); -static GLboolean parse_program(struct r500_pfs_compile_state *cs) -{ - PROG_CODE; - struct prog_instruction* fpi; + if (inst->RGB.Saturate) + code->inst[ip].inst0 |= R500_INST_RGB_CLAMP; + if (inst->Alpha.Saturate) + code->inst[ip].inst0 |= R500_INST_ALPHA_CLAMP; - for(fpi = cs->compiler->program->Instructions; fpi->Opcode != OPCODE_END; ++fpi) { - do_inst(cs, fpi); + code->inst[ip].inst1 |= R500_RGB_ADDR0(use_source(code, inst->RGB.Src[0])); + code->inst[ip].inst1 |= R500_RGB_ADDR1(use_source(code, inst->RGB.Src[1])); + code->inst[ip].inst1 |= R500_RGB_ADDR2(use_source(code, inst->RGB.Src[2])); - if (cs->compiler->fp->error) - return GL_FALSE; - } + code->inst[ip].inst2 |= R500_ALPHA_ADDR0(use_source(code, inst->Alpha.Src[0])); + code->inst[ip].inst2 |= R500_ALPHA_ADDR1(use_source(code, inst->Alpha.Src[1])); + code->inst[ip].inst2 |= R500_ALPHA_ADDR2(use_source(code, inst->Alpha.Src[2])); - /* Finish him! (If it's an ALU/OUT instruction...) */ - if ((code->inst[cs->nrslots-1].inst0 & 0x3) == 1) { - code->inst[cs->nrslots-1].inst0 |= R500_INST_LAST; - } else { - /* We still need to put an output inst, right? */ - WARN_ONCE("Final FP instruction is not an OUT.\n"); - } + code->inst[ip].inst3 |= translate_arg_rgb(inst, 0) << R500_ALU_RGB_SEL_A_SHIFT; + code->inst[ip].inst3 |= translate_arg_rgb(inst, 1) << R500_ALU_RGB_SEL_B_SHIFT; + code->inst[ip].inst5 |= translate_arg_rgb(inst, 2) << R500_ALU_RGBA_SEL_C_SHIFT; - code->max_temp_idx++; + code->inst[ip].inst4 |= translate_arg_alpha(inst, 0) << R500_ALPHA_SEL_A_SHIFT; + code->inst[ip].inst4 |= translate_arg_alpha(inst, 1) << R500_ALPHA_SEL_B_SHIFT; + code->inst[ip].inst5 |= translate_arg_alpha(inst, 2) << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT; return GL_TRUE; } -static void init_program(struct r500_pfs_compile_state *cs) +static GLuint translate_strq_swizzle(struct prog_src_register src) { - PROG_CODE; - struct gl_fragment_program *mp = &cs->compiler->fp->mesa_program; - struct prog_instruction *fpi; - GLuint InputsRead = mp->Base.InputsRead; - GLuint temps_used = 0; + GLuint swiz = 0; int i; + for (i = 0; i < 4; i++) + swiz |= (GET_SWZ(src.Swizzle, i) & 0x3) << i*2; + return swiz; +} - /* New compile, reset tracking data */ - cs->compiler->fp->optimization = - driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization"); - cs->compiler->fp->translated = GL_FALSE; - cs->compiler->fp->error = GL_FALSE; +/** + * Emit a single TEX instruction + */ +static GLboolean emit_tex(void *data, struct prog_instruction *inst) +{ + PROG_CODE; - _mesa_bzero(code, sizeof(*code)); - code->max_temp_idx = 1; /* Size of pixel stack, plus 1. */ - cs->nrslots = 0; - cs->compiler->fp->writes_depth = GL_FALSE; - - /* Work out what temps the Mesa inputs correspond to, this must match - * what setup_rs_unit does, which shouldn't be a problem as rs_unit - * configures itself based on the fragprog's InputsRead - * - * NOTE: this depends on get_hw_temp() allocating registers in order, - * starting from register 0, so we're just going to do that instead. - */ - - /* Texcoords come first */ - for (i = 0; i < cs->compiler->fp->ctx->Const.MaxTextureUnits; i++) { - if (InputsRead & (FRAG_BIT_TEX0 << i)) { - cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0; - cs->inputs[FRAG_ATTRIB_TEX0 + i].reg = - code->temp_reg_offset; - code->temp_reg_offset++; - } - } - InputsRead &= ~FRAG_BITS_TEX_ANY; - - /* fragment position treated as a texcoord */ - if (InputsRead & FRAG_BIT_WPOS) { - cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0; - cs->inputs[FRAG_ATTRIB_WPOS].reg = - code->temp_reg_offset; - code->temp_reg_offset++; - } - InputsRead &= ~FRAG_BIT_WPOS; - - /* Then primary colour */ - if (InputsRead & FRAG_BIT_COL0) { - cs->inputs[FRAG_ATTRIB_COL0].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL0].reg = - code->temp_reg_offset; - code->temp_reg_offset++; - } - InputsRead &= ~FRAG_BIT_COL0; - - /* Secondary color */ - if (InputsRead & FRAG_BIT_COL1) { - cs->inputs[FRAG_ATTRIB_COL1].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL1].reg = - code->temp_reg_offset; - code->temp_reg_offset++; - } - InputsRead &= ~FRAG_BIT_COL1; - - /* Anything else */ - if (InputsRead) { - WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead); - /* force read from hwreg 0 for now */ - for (i = 0; i < 32; i++) - if (InputsRead & (1 << i)) - cs->inputs[i].reg = 0; + if (code->inst_end >= 511) { + error("emit_tex: Too many instructions"); + return GL_FALSE; } - int ip; + int ip = ++code->inst_end; - for (ip = 0; ip < cs->compiler->program->NumInstructions; ip++) { - fpi = cs->compiler->program->Instructions + ip; - for (i = 0; i < 3; i++) { - if (fpi->SrcReg[i].File == PROGRAM_TEMPORARY) { - if (fpi->SrcReg[i].Index >= temps_used) - temps_used = fpi->SrcReg[i].Index + 1; - } - } - } + code->inst[ip].inst0 = R500_INST_TYPE_TEX + | (inst->DstReg.WriteMask << 11) + | R500_INST_TEX_SEM_WAIT; + code->inst[ip].inst1 = R500_TEX_ID(inst->TexSrcUnit) + | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED; + if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) + code->inst[ip].inst1 |= R500_TEX_UNSCALED; - cs->temp_in_use = temps_used + 1; + switch (inst->Opcode) { + case OPCODE_KIL: + code->inst[ip].inst1 |= R500_TEX_INST_TEXKILL; + break; + case OPCODE_TEX: + code->inst[ip].inst1 |= R500_TEX_INST_LD; + break; + case OPCODE_TXB: + code->inst[ip].inst1 |= R500_TEX_INST_LODBIAS; + break; + case OPCODE_TXP: + code->inst[ip].inst1 |= R500_TEX_INST_PROJ; + break; + default: + error("emit_tex can't handle opcode %x\n", inst->Opcode); + } - code->max_temp_idx = code->temp_reg_offset + cs->temp_in_use; + code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcReg[0].Index) + | (translate_strq_swizzle(inst->SrcReg[0]) << 8) + | R500_TEX_DST_ADDR(inst->DstReg.Index) + | R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G + | R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A; - if (RADEON_DEBUG & DEBUG_PIXEL) - fprintf(stderr, "FP temp indices: code->max_temp_idx: %d cs->temp_in_use: %d\n", code->max_temp_idx, cs->temp_in_use); + return GL_TRUE; } -static void dumb_shader(struct r500_pfs_compile_state *cs) -{ - PROG_CODE; - code->inst[0].inst0 = R500_INST_TYPE_TEX - | R500_INST_TEX_SEM_WAIT - | R500_INST_RGB_WMASK_R - | R500_INST_RGB_WMASK_G - | R500_INST_RGB_WMASK_B - | R500_INST_ALPHA_WMASK - | R500_INST_RGB_CLAMP - | R500_INST_ALPHA_CLAMP; - code->inst[0].inst1 = R500_TEX_ID(0) - | R500_TEX_INST_LD - | R500_TEX_SEM_ACQUIRE - | R500_TEX_IGNORE_UNCOVERED; - code->inst[0].inst2 = R500_TEX_SRC_ADDR(0) - | R500_TEX_SRC_S_SWIZ_R - | R500_TEX_SRC_T_SWIZ_G - | R500_TEX_DST_ADDR(0) - | R500_TEX_DST_R_SWIZ_R - | R500_TEX_DST_G_SWIZ_G - | R500_TEX_DST_B_SWIZ_B - | R500_TEX_DST_A_SWIZ_A; - code->inst[0].inst3 = R500_DX_ADDR(0) - | R500_DX_S_SWIZ_R - | R500_DX_T_SWIZ_R - | R500_DX_R_SWIZ_R - | R500_DX_Q_SWIZ_R - | R500_DY_ADDR(0) - | R500_DY_S_SWIZ_R - | R500_DY_T_SWIZ_R - | R500_DY_R_SWIZ_R - | R500_DY_Q_SWIZ_R; - code->inst[0].inst4 = 0x0; - code->inst[0].inst5 = 0x0; - - code->inst[1].inst0 = R500_INST_TYPE_OUT | - R500_INST_TEX_SEM_WAIT | - R500_INST_LAST | - R500_INST_RGB_OMASK_R | - R500_INST_RGB_OMASK_G | - R500_INST_RGB_OMASK_B | - R500_INST_ALPHA_OMASK; - code->inst[1].inst1 = R500_RGB_ADDR0(0) | - R500_RGB_ADDR1(0) | - R500_RGB_ADDR1_CONST | - R500_RGB_ADDR2(0) | - R500_RGB_ADDR2_CONST | - R500_RGB_SRCP_OP_1_MINUS_2RGB0; - code->inst[1].inst2 = R500_ALPHA_ADDR0(0) | - R500_ALPHA_ADDR1(0) | - R500_ALPHA_ADDR1_CONST | - R500_ALPHA_ADDR2(0) | - R500_ALPHA_ADDR2_CONST | - R500_ALPHA_SRCP_OP_1_MINUS_2A0; - code->inst[1].inst3 = R500_ALU_RGB_SEL_A_SRC0 | - R500_ALU_RGB_R_SWIZ_A_R | - R500_ALU_RGB_G_SWIZ_A_G | - R500_ALU_RGB_B_SWIZ_A_B | - R500_ALU_RGB_SEL_B_SRC0 | - R500_ALU_RGB_R_SWIZ_B_1 | - R500_ALU_RGB_B_SWIZ_B_1 | - R500_ALU_RGB_G_SWIZ_B_1; - code->inst[1].inst4 = R500_ALPHA_OP_MAD | - R500_ALPHA_SWIZ_A_A | - R500_ALPHA_SWIZ_B_1; - code->inst[1].inst5 = R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_R_SWIZ_0 | - R500_ALU_RGBA_G_SWIZ_0 | - R500_ALU_RGBA_B_SWIZ_0 | - R500_ALU_RGBA_A_SWIZ_0; - - cs->nrslots = 2; -} +static const struct radeon_pair_handler pair_handler = { + .EmitConst = emit_const, + .EmitPaired = emit_paired, + .EmitTex = emit_tex, + .MaxHwTemps = 128 +}; GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler) { - struct r500_pfs_compile_state cs; struct r500_fragment_program_code *code = compiler->code; - _mesa_memset(&cs, 0, sizeof(cs)); - cs.compiler = compiler; - init_program(&cs); - - if (!parse_program(&cs)) { -#if 0 - ERROR("Huh. Couldn't parse program. There should be additional errors explaining why.\nUsing dumb shader...\n"); - dumb_shader(fp); - code->inst_offset = 0; - code->inst_end = cs.nrslots - 1; -#endif + _mesa_bzero(code, sizeof(*code)); + code->max_temp_idx = 1; + code->inst_offset = 0; + code->inst_end = -1; + + if (!radeonPairProgram(compiler->r300->radeon.glCtx, compiler->program, &pair_handler, compiler)) return GL_FALSE; - } - code->inst_offset = 0; - code->inst_end = cs.nrslots - 1; + if ((code->inst[code->inst_end].inst0 & R500_INST_TYPE_MASK) != R500_INST_TYPE_OUT) { + /* This may happen when dead-code elimination is disabled or + * when most of the fragment program logic is leading to a KIL */ + if (code->inst_end >= 511) { + error("Introducing fake OUT: Too many instructions"); + return GL_FALSE; + } + + int ip = ++code->inst_end; + code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT; + } return GL_TRUE; } diff --git a/src/mesa/drivers/dri/r300/radeon_program.h b/src/mesa/drivers/dri/r300/radeon_program.h index ba76bc47cf..2e01dd496b 100644 --- a/src/mesa/drivers/dri/r300/radeon_program.h +++ b/src/mesa/drivers/dri/r300/radeon_program.h @@ -45,6 +45,10 @@ enum { PROGRAM_BUILTIN = PROGRAM_FILE_MAX /**< not a real register, but a special swizzle constant */ }; +enum { + OPCODE_REPL_ALPHA = MAX_OPCODE /**< used in paired instructions */ +}; + #define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO) #define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE) diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c new file mode 100644 index 0000000000..86180edcb5 --- /dev/null +++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c @@ -0,0 +1,970 @@ +/* + * Copyright (C) 2008 Nicolai Haehnle. + * + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/** + * @file + * + * Perform temporary register allocation and attempt to pair off instructions + * in RGB and Alpha pairs. Also attempts to optimize the TEX instruction + * vs. ALU instruction scheduling. + */ + +#include "radeon_program_pair.h" + +#include "radeon_context.h" + +#include "shader/prog_print.h" + +#define error(fmt, args...) do { \ + _mesa_problem(s->Ctx, "%s::%s(): " fmt "\n", \ + __FILE__, __FUNCTION__, ##args); \ + s->Error = GL_TRUE; \ +} while(0) + +struct pair_state_instruction { + GLuint IsTex:1; /**< Is a texture instruction */ + GLuint NeedRGB:1; /**< Needs the RGB ALU */ + GLuint NeedAlpha:1; /**< Needs the Alpha ALU */ + GLuint IsTranscendent:1; /**< Is a special transcendent instruction */ + + /** + * Number of (read and write) dependencies that must be resolved before + * this instruction can be scheduled. + */ + GLuint NumDependencies:5; + + /** + * Next instruction in the linked list of ready instructions. + */ + struct pair_state_instruction *NextReady; + + /** + * Values that this instruction writes + */ + struct reg_value *Values[4]; +}; + + +/** + * Used to keep track of which instructions read a value. + */ +struct reg_value_reader { + GLuint IP; /**< IP of the instruction that performs this access */ + struct reg_value_reader *Next; +}; + +/** + * Used to keep track which values are stored in each component of a + * PROGRAM_TEMPORARY. + */ +struct reg_value { + GLuint IP; /**< IP of the instruction that writes this value */ + struct reg_value *Next; /**< Pointer to the next value to be written to the same PROGRAM_TEMPORARY component */ + + /** + * Unordered linked list of instructions that read from this value. + */ + struct reg_value_reader *Readers; + + /** + * Number of readers of this value. This is calculated during @ref scan_instructions + * and continually decremented during code emission. + * When this count reaches zero, the instruction that writes the @ref Next value + * can be scheduled. + */ + GLuint NumReaders; +}; + +/** + * Used to translate a PROGRAM_INPUT or PROGRAM_TEMPORARY Mesa register + * to the proper hardware temporary. + */ +struct pair_register_translation { + GLuint Allocated:1; + GLuint HwIndex:8; + GLuint RefCount:23; /**< # of times this occurs in an unscheduled instruction SrcReg or DstReg */ + + /** + * Notes the value that is currently contained in each component + * (only used for PROGRAM_TEMPORARY registers). + */ + struct reg_value *Value[4]; +}; + +struct pair_state { + GLcontext *Ctx; + struct gl_program *Program; + const struct radeon_pair_handler *Handler; + GLboolean Error; + GLboolean Debug; + GLboolean Verbose; + void *UserData; + + /** + * Translate Mesa registers to hardware registers + */ + struct pair_register_translation Inputs[FRAG_ATTRIB_MAX]; + struct pair_register_translation Temps[MAX_PROGRAM_TEMPS]; + + /** + * Derived information about program instructions. + */ + struct pair_state_instruction *Instructions; + + struct { + GLuint RefCount; /**< # of times this occurs in an unscheduled SrcReg or DstReg */ + } HwTemps[128]; + + /** + * Linked list of instructions that can be scheduled right now, + * based on which ALU/TEX resources they require. + */ + struct pair_state_instruction *ReadyFullALU; + struct pair_state_instruction *ReadyRGB; + struct pair_state_instruction *ReadyAlpha; + struct pair_state_instruction *ReadyTEX; + + /** + * Pool of @ref reg_value structures for fast allocation. + */ + struct reg_value *ValuePool; + GLuint ValuePoolUsed; + struct reg_value_reader *ReaderPool; + GLuint ReaderPoolUsed; +}; + + +static struct pair_register_translation *get_register(struct pair_state *s, GLuint file, GLuint index) +{ + switch(file) { + case PROGRAM_TEMPORARY: return &s->Temps[index]; + case PROGRAM_INPUT: return &s->Inputs[index]; + default: return 0; + } +} + + +static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index) +{ + GLuint hwindex; + + struct pair_register_translation *t = get_register(s, file, index); + if (!t) { + _mesa_problem(s->Ctx, "get_hw_reg: %i[%i]\n", file, index); + return 0; + } + + if (t->Allocated) + return t->HwIndex; + + for(hwindex = 0; hwindex < s->Handler->MaxHwTemps; ++hwindex) + if (!s->HwTemps[hwindex].RefCount) + break; + + if (hwindex >= s->Handler->MaxHwTemps) { + error("Ran out of hardware temporaries"); + return 0; + } + + s->HwTemps[hwindex].RefCount = t->RefCount; + t->Allocated = 1; + t->HwIndex = hwindex; + return hwindex; +} + + +static void deref_hw_reg(struct pair_state *s, GLuint hwindex) +{ + if (!s->HwTemps[hwindex].RefCount) { + error("Hwindex %i refcount error", hwindex); + return; + } + + s->HwTemps[hwindex].RefCount--; +} + +static void add_pairinst_to_list(struct pair_state_instruction **list, struct pair_state_instruction *pairinst) +{ + pairinst->NextReady = *list; + *list = pairinst; +} + +/** + * The instruction at the given IP has become ready. Link it into the ready + * instructions. + */ +static void instruction_ready(struct pair_state *s, int ip) +{ + struct pair_state_instruction *pairinst = s->Instructions + ip; + + if (s->Verbose) + _mesa_printf("instruction_ready(%i)\n", ip); + + if (pairinst->IsTex) + add_pairinst_to_list(&s->ReadyTEX, pairinst); + else if (!pairinst->NeedAlpha) + add_pairinst_to_list(&s->ReadyRGB, pairinst); + else if (!pairinst->NeedRGB) + add_pairinst_to_list(&s->ReadyAlpha, pairinst); + else + add_pairinst_to_list(&s->ReadyFullALU, pairinst); +} + + +/** + * Finally rewrite ADD, MOV, MUL as the appropriate native instruction + * and reverse the order of arguments for CMP. + */ +static void final_rewrite(struct pair_state *s, struct prog_instruction *inst) +{ + struct prog_src_register tmp; + + switch(inst->Opcode) { + case OPCODE_ADD: + inst->SrcReg[2] = inst->SrcReg[1]; + inst->SrcReg[1].File = PROGRAM_BUILTIN; + inst->SrcReg[1].Swizzle = SWIZZLE_1111; + inst->SrcReg[1].NegateBase = 0; + inst->SrcReg[1].NegateAbs = 0; + inst->Opcode = OPCODE_MAD; + break; + case OPCODE_CMP: + tmp = inst->SrcReg[2]; + inst->SrcReg[2] = inst->SrcReg[0]; + inst->SrcReg[0] = tmp; + break; + case OPCODE_MOV: + inst->SrcReg[1] = inst->SrcReg[0]; + inst->SrcReg[2].File = PROGRAM_BUILTIN; + inst->SrcReg[2].Swizzle = SWIZZLE_0000; + inst->Opcode = OPCODE_CMP; + // TODO: disable output modifiers on R500 + break; + case OPCODE_MUL: + inst->SrcReg[2].File = PROGRAM_BUILTIN; + inst->SrcReg[2].Swizzle = SWIZZLE_0000; + inst->Opcode = OPCODE_MAD; + break; + default: + /* nothing to do */ + break; + } +} + + +/** + * Classify an instruction according to which ALUs etc. it needs + */ +static void classify_instruction(struct pair_state *s, + struct prog_instruction *inst, struct pair_state_instruction *pairinst) +{ + pairinst->NeedRGB = (inst->DstReg.WriteMask & WRITEMASK_XYZ) ? 1 : 0; + pairinst->NeedAlpha = (inst->DstReg.WriteMask & WRITEMASK_W) ? 1 : 0; + + switch(inst->Opcode) { + case OPCODE_ADD: + case OPCODE_CMP: + case OPCODE_FRC: + case OPCODE_MAD: + case OPCODE_MAX: + case OPCODE_MIN: + case OPCODE_MOV: + case OPCODE_MUL: + break; + case OPCODE_COS: + case OPCODE_EX2: + case OPCODE_LG2: + case OPCODE_RCP: + case OPCODE_RSQ: + case OPCODE_SIN: + pairinst->IsTranscendent = 1; + pairinst->NeedAlpha = 1; + break; + case OPCODE_DP4: + pairinst->NeedAlpha = 1; + /* fall through */ + case OPCODE_DP3: + pairinst->NeedRGB = 1; + break; + case OPCODE_KIL: + case OPCODE_TEX: + case OPCODE_TXB: + case OPCODE_TXP: + case OPCODE_END: + pairinst->IsTex = 1; + break; + default: + error("Unknown opcode %d\n", inst->Opcode); + break; + } +} + + +/** + * Count which (input, temporary) register is read and written how often, + * and scan the instruction stream to find dependencies. + */ +static void scan_instructions(struct pair_state *s) +{ + struct prog_instruction *inst; + struct pair_state_instruction *pairinst; + GLuint ip; + + for(inst = s->Program->Instructions, pairinst = s->Instructions, ip = 0; + inst->Opcode != OPCODE_END; + ++inst, ++pairinst, ++ip) { + final_rewrite(s, inst); + classify_instruction(s, inst, pairinst); + + int nsrc = _mesa_num_inst_src_regs(inst->Opcode); + int j; + for(j = 0; j < nsrc; j++) { + struct pair_register_translation *t = + get_register(s, inst->SrcReg[j].File, inst->SrcReg[j].Index); + if (!t) + continue; + + t->RefCount++; + + if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) { + int i; + for(i = 0; i < 4; ++i) { + GLuint swz = GET_SWZ(inst->SrcReg[j].Swizzle, i); + if (swz >= 4) + continue; /* constant or NIL swizzle */ + if (!t->Value[swz]) + continue; /* this is an undefined read */ + + /* Do not add a dependency if this instruction + * also rewrites the value. The code below adds + * a dependency for the DstReg, which is a superset + * of the SrcReg dependency. */ + if (inst->DstReg.File == PROGRAM_TEMPORARY && + inst->DstReg.Index == inst->SrcReg[j].Index && + GET_BIT(inst->DstReg.WriteMask, swz)) + continue; + + struct reg_value_reader* r = &s->ReaderPool[s->ReaderPoolUsed++]; + pairinst->NumDependencies++; + t->Value[swz]->NumReaders++; + r->IP = ip; + r->Next = t->Value[swz]->Readers; + t->Value[swz]->Readers = r; + } + } + } + + int ndst = _mesa_num_inst_dst_regs(inst->Opcode); + if (ndst) { + struct pair_register_translation *t = + get_register(s, inst->DstReg.File, inst->DstReg.Index); + if (t) { + t->RefCount++; + + if (inst->DstReg.File == PROGRAM_TEMPORARY) { + int j; + for(j = 0; j < 4; ++j) { + if (!GET_BIT(inst->DstReg.WriteMask, j)) + continue; + + struct reg_value* v = &s->ValuePool[s->ValuePoolUsed++]; + v->IP = ip; + if (t->Value[j]) { + pairinst->NumDependencies++; + t->Value[j]->Next = v; + } + t->Value[j] = v; + pairinst->Values[j] = v; + } + } + } + } + + if (s->Verbose) + _mesa_printf("scan(%i): NumDeps = %i\n", ip, pairinst->NumDependencies); + + if (!pairinst->NumDependencies) + instruction_ready(s, ip); + } + + /* Clear the PROGRAM_TEMPORARY state */ + int i, j; + for(i = 0; i < MAX_PROGRAM_TEMPS; ++i) { + for(j = 0; j < 4; ++j) + s->Temps[i].Value[j] = 0; + } +} + + +/** + * Reserve hardware temporary registers for the program inputs. + * + * @note This allocation is performed explicitly, because the order of inputs + * is determined by the RS hardware. + */ +static void allocate_input_registers(struct pair_state *s) +{ + GLuint InputsRead = s->Program->InputsRead; + int i; + + /* Texcoords come first */ + for (i = 0; i < s->Ctx->Const.MaxTextureUnits; i++) { + if (InputsRead & (FRAG_BIT_TEX0 << i)) + get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_TEX0+i); + } + InputsRead &= ~FRAG_BITS_TEX_ANY; + + /* fragment position treated as a texcoord */ + if (InputsRead & FRAG_BIT_WPOS) + get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_WPOS); + InputsRead &= ~FRAG_BIT_WPOS; + + /* Then primary colour */ + if (InputsRead & FRAG_BIT_COL0) + get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL0); + InputsRead &= ~FRAG_BIT_COL0; + + /* Secondary color */ + if (InputsRead & FRAG_BIT_COL1) + get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL1); + InputsRead &= ~FRAG_BIT_COL1; + + /* Anything else */ + if (InputsRead) + error("Don't know how to handle inputs 0x%x\n", InputsRead); +} + + +static void decrement_dependencies(struct pair_state *s, int ip) +{ + struct pair_state_instruction *pairinst = s->Instructions + ip; + ASSERT(pairinst->NumDependencies > 0); + if (!--pairinst->NumDependencies) + instruction_ready(s, ip); +} + +/** + * Update the dependency tracking state based on what the instruction + * at the given IP does. + */ +static void commit_instruction(struct pair_state *s, int ip) +{ + struct prog_instruction *inst = s->Program->Instructions + ip; + struct pair_state_instruction *pairinst = s->Instructions + ip; + + if (s->Verbose) + _mesa_printf("commit_instruction(%i)\n", ip); + + if (inst->DstReg.File == PROGRAM_TEMPORARY) { + struct pair_register_translation *t = &s->Temps[inst->DstReg.Index]; + deref_hw_reg(s, t->HwIndex); + + int i; + for(i = 0; i < 4; ++i) { + if (!GET_BIT(inst->DstReg.WriteMask, i)) + continue; + + t->Value[i] = pairinst->Values[i]; + if (t->Value[i]->NumReaders) { + struct reg_value_reader *r; + for(r = pairinst->Values[i]->Readers; r; r = r->Next) + decrement_dependencies(s, r->IP); + } else if (t->Value[i]->Next) { + /* This happens when the only reader writes + * the register at the same time */ + decrement_dependencies(s, t->Value[i]->Next->IP); + } + } + } + + int nsrc = _mesa_num_inst_src_regs(inst->Opcode); + int i; + for(i = 0; i < nsrc; i++) { + struct pair_register_translation *t = get_register(s, inst->SrcReg[i].File, inst->SrcReg[i].Index); + if (!t) + continue; + + deref_hw_reg(s, get_hw_reg(s, inst->SrcReg[i].File, inst->SrcReg[i].Index)); + + if (inst->SrcReg[i].File != PROGRAM_TEMPORARY) + continue; + + int j; + for(j = 0; j < 4; ++j) { + GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j); + if (swz >= 4) + continue; + if (!t->Value[swz]) + continue; + + /* Do not free a dependency if this instruction + * also rewrites the value. See scan_instructions. */ + if (inst->DstReg.File == PROGRAM_TEMPORARY && + inst->DstReg.Index == inst->SrcReg[i].Index && + GET_BIT(inst->DstReg.WriteMask, swz)) + continue; + + if (!--t->Value[swz]->NumReaders) { + if (t->Value[swz]->Next) + decrement_dependencies(s, t->Value[swz]->Next->IP); + } + } + } +} + + +/** + * Emit all ready texture instructions in a single block. + * + * Emit as a single block to (hopefully) sample many textures in parallel, + * and to avoid hardware indirections on R300. + * + * In R500, we don't really know when the result of a texture instruction + * arrives. So allocate all destinations first, to make sure they do not + * arrive early and overwrite a texture coordinate we're going to use later + * in the block. + */ +static void emit_all_tex(struct pair_state *s) +{ + struct pair_state_instruction *readytex; + struct pair_state_instruction *pairinst; + + ASSERT(s->ReadyTEX); + + // Don't let the ready list change under us! + readytex = s->ReadyTEX; + s->ReadyTEX = 0; + + // Allocate destination hardware registers in one block to avoid conflicts. + for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) { + int ip = pairinst - s->Instructions; + struct prog_instruction *inst = s->Program->Instructions + ip; + if (inst->Opcode != OPCODE_KIL) + get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index); + } + + if (s->Debug) + _mesa_printf(" BEGIN_TEX\n"); + + for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) { + int ip = pairinst - s->Instructions; + struct prog_instruction *inst = s->Program->Instructions + ip; + commit_instruction(s, ip); + + if (inst->Opcode != OPCODE_KIL) + inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index); + inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index); + + if (s->Debug) { + _mesa_printf(" "); + _mesa_print_instruction(inst); + } + s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst); + } + + if (s->Handler->EndTexBlock) + s->Handler->EndTexBlock(s->UserData); + + if (s->Debug) + _mesa_printf(" END_TEX\n"); +} + + +static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instruction *pair, + struct prog_src_register src, GLboolean rgb, GLboolean alpha) +{ + int candidate = -1; + int candidate_quality = -1; + int i; + + if (!rgb && !alpha) + return 0; + + GLuint constant; + GLuint index; + + if (src.File == PROGRAM_TEMPORARY || src.File == PROGRAM_INPUT) { + constant = 0; + index = get_hw_reg(s, src.File, src.Index); + } else { + constant = 1; + s->Error |= !s->Handler->EmitConst(s->UserData, src.File, src.Index, &index); + } + + for(i = 0; i < 3; ++i) { + int q = 0; + if (rgb) { + if (pair->RGB.Src[i].Used) { + if (pair->RGB.Src[i].Constant != constant || + pair->RGB.Src[i].Index != index) + continue; + q++; + } + } + if (alpha) { + if (pair->Alpha.Src[i].Used) { + if (pair->Alpha.Src[i].Constant != constant || + pair->Alpha.Src[i].Index != index) + continue; + q++; + } + } + if (q > candidate_quality) { + candidate_quality = q; + candidate = i; + } + } + + if (candidate >= 0) { + if (rgb) { + pair->RGB.Src[candidate].Used = 1; + pair->RGB.Src[candidate].Constant = constant; + pair->RGB.Src[candidate].Index = index; + } + if (alpha) { + pair->Alpha.Src[candidate].Used = 1; + pair->Alpha.Src[candidate].Constant = constant; + pair->Alpha.Src[candidate].Index = index; + } + } + + return candidate; +} + + + +/** + * Fill the given ALU instruction's opcodes and source operands into the given pair, + * if possible. + */ +static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip) +{ + struct pair_state_instruction *pairinst = s->Instructions + ip; + struct prog_instruction *inst = s->Program->Instructions + ip; + + ASSERT(!pairinst->NeedRGB || pair->RGB.Opcode == OPCODE_NOP); + ASSERT(!pairinst->NeedAlpha || pair->Alpha.Opcode == OPCODE_NOP); + + if (pairinst->NeedRGB) { + if (pairinst->IsTranscendent) + pair->RGB.Opcode = OPCODE_REPL_ALPHA; + else + pair->RGB.Opcode = inst->Opcode; + } + if (pairinst->NeedAlpha) + pair->Alpha.Opcode = inst->Opcode; + + int nargs = _mesa_num_inst_src_regs(inst->Opcode); + int i; + + for(i = 0; i < nargs; ++i) { + int source; + if (pairinst->NeedRGB && !pairinst->IsTranscendent) { + GLboolean srcrgb = GL_FALSE; + GLboolean srcalpha = GL_FALSE; + GLuint negatebase = 0; + int j; + for(j = 0; j < 3; ++j) { + GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j); + if (swz < 3) + srcrgb = GL_TRUE; + else if (swz < 4) + srcalpha = GL_TRUE; + if (swz != SWIZZLE_NIL && GET_BIT(inst->SrcReg[i].NegateBase, j)) + negatebase = 1; + } + source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha); + if (source < 0) + return GL_FALSE; + pair->RGB.Arg[i].Source = source; + pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff; + pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs; + pair->RGB.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs; + } + if (pairinst->NeedAlpha) { + GLboolean srcrgb = GL_FALSE; + GLboolean srcalpha = GL_FALSE; + GLuint negatebase = GET_BIT(inst->SrcReg[i].NegateBase, pairinst->IsTranscendent ? 0 : 3); + GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, pairinst->IsTranscendent ? 0 : 3); + if (swz < 3) + srcrgb = GL_TRUE; + else if (swz < 4) + srcalpha = GL_TRUE; + source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha); + if (source < 0) + return GL_FALSE; + pair->Alpha.Arg[i].Source = source; + pair->Alpha.Arg[i].Swizzle = swz; + pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs; + pair->Alpha.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs; + } + } + + return GL_TRUE; +} + + +/** + * Fill in the destination register information. + * + * This is split from filling in source registers because we want + * to avoid allocating hardware temporaries for destinations until + * we are absolutely certain that we're going to emit a certain + * instruction pairing. + */ +static void fill_dest_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip) +{ + struct pair_state_instruction *pairinst = s->Instructions + ip; + struct prog_instruction *inst = s->Program->Instructions + ip; + + if (inst->DstReg.File == PROGRAM_OUTPUT) { + if (inst->DstReg.Index == FRAG_RESULT_COLR) { + pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ; + pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3); + } else if (inst->DstReg.Index == FRAG_RESULT_DEPR) { + pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3); + } + } else { + GLuint hwindex = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index); + if (pairinst->NeedRGB) { + pair->RGB.DestIndex = hwindex; + pair->RGB.WriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ; + } + if (pairinst->NeedAlpha) { + pair->Alpha.DestIndex = hwindex; + pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3); + } + } +} + + +/** + * Find a good ALU instruction or pair of ALU instruction and emit it. + * + * Prefer emitting full ALU instructions, so that when we reach a point + * where no full ALU instruction can be emitted, we have more candidates + * for RGB/Alpha pairing. + */ +static void emit_alu(struct pair_state *s) +{ + struct radeon_pair_instruction pair; + + if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) { + int ip; + if (s->ReadyFullALU) { + ip = s->ReadyFullALU - s->Instructions; + s->ReadyFullALU = s->ReadyFullALU->NextReady; + } else if (s->ReadyRGB) { + ip = s->ReadyRGB - s->Instructions; + s->ReadyRGB = s->ReadyRGB->NextReady; + } else { + ip = s->ReadyAlpha - s->Instructions; + s->ReadyAlpha = s->ReadyAlpha->NextReady; + } + + _mesa_bzero(&pair, sizeof(pair)); + fill_instruction_into_pair(s, &pair, ip); + fill_dest_into_pair(s, &pair, ip); + commit_instruction(s, ip); + } else { + struct pair_state_instruction **prgb; + struct pair_state_instruction **palpha; + + /* Some pairings might fail because they require too + * many source slots; try all possible pairings if necessary */ + for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) { + for(palpha = &s->ReadyAlpha; *palpha; palpha = &(*palpha)->NextReady) { + int rgbip = *prgb - s->Instructions; + int alphaip = *palpha - s->Instructions; + _mesa_bzero(&pair, sizeof(pair)); + fill_instruction_into_pair(s, &pair, rgbip); + if (!fill_instruction_into_pair(s, &pair, alphaip)) + continue; + *prgb = (*prgb)->NextReady; + *palpha = (*palpha)->NextReady; + fill_dest_into_pair(s, &pair, rgbip); + fill_dest_into_pair(s, &pair, alphaip); + commit_instruction(s, rgbip); + commit_instruction(s, alphaip); + goto success; + } + } + + /* No success in pairing; just take the first RGB instruction */ + int ip = s->ReadyRGB - s->Instructions; + s->ReadyRGB = s->ReadyRGB->NextReady; + _mesa_bzero(&pair, sizeof(pair)); + fill_instruction_into_pair(s, &pair, ip); + fill_dest_into_pair(s, &pair, ip); + commit_instruction(s, ip); + success: ; + } + + if (s->Debug) + radeonPrintPairInstruction(&pair); + + s->Error = s->Error || !s->Handler->EmitPaired(s->UserData, &pair); +} + + +GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program, + const struct radeon_pair_handler* handler, void *userdata) +{ + struct pair_state s; + + _mesa_bzero(&s, sizeof(s)); + s.Ctx = ctx; + s.Program = program; + s.Handler = handler; + s.UserData = userdata; + s.Debug = (RADEON_DEBUG & DEBUG_PIXEL) ? GL_TRUE : GL_FALSE; + s.Verbose = GL_FALSE && s.Debug; + + s.Instructions = (struct pair_state_instruction*)_mesa_calloc( + sizeof(struct pair_state_instruction)*s.Program->NumInstructions); + s.ValuePool = (struct reg_value*)_mesa_calloc(sizeof(struct reg_value)*s.Program->NumInstructions*4); + s.ReaderPool = (struct reg_value_reader*)_mesa_calloc( + sizeof(struct reg_value_reader)*s.Program->NumInstructions*12); + + if (s.Debug) + _mesa_printf("Emit paired program\n"); + + scan_instructions(&s); + allocate_input_registers(&s); + + while(!s.Error && + (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) { + if (s.ReadyTEX) + emit_all_tex(&s); + + while(s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha) + emit_alu(&s); + } + + if (s.Debug) + _mesa_printf(" END\n"); + + _mesa_free(s.Instructions); + _mesa_free(s.ValuePool); + _mesa_free(s.ReaderPool); + + return !s.Error; +} + + +static void print_pair_src(int i, struct radeon_pair_instruction_source* src) +{ + _mesa_printf(" Src%i = %s[%i]", i, src->Constant ? "CNST" : "TEMP", src->Index); +} + +static const char* opcode_string(GLuint opcode) +{ + if (opcode == OPCODE_REPL_ALPHA) + return "SOP"; + else + return _mesa_opcode_string(opcode); +} + +static int num_pairinst_args(GLuint opcode) +{ + if (opcode == OPCODE_REPL_ALPHA) + return 0; + else + return _mesa_num_inst_src_regs(opcode); +} + +static char swizzle_char(GLuint swz) +{ + switch(swz) { + case SWIZZLE_X: return 'x'; + case SWIZZLE_Y: return 'y'; + case SWIZZLE_Z: return 'z'; + case SWIZZLE_W: return 'w'; + case SWIZZLE_ZERO: return '0'; + case SWIZZLE_ONE: return '1'; + case SWIZZLE_NIL: return '_'; + default: return '?'; + } +} + +void radeonPrintPairInstruction(struct radeon_pair_instruction *inst) +{ + int nargs; + int i; + + _mesa_printf(" RGB: "); + for(i = 0; i < 3; ++i) { + if (inst->RGB.Src[i].Used) + print_pair_src(i, inst->RGB.Src + i); + } + _mesa_printf("\n"); + _mesa_printf(" Alpha:"); + for(i = 0; i < 3; ++i) { + if (inst->Alpha.Src[i].Used) + print_pair_src(i, inst->Alpha.Src + i); + } + _mesa_printf("\n"); + + _mesa_printf(" %s%s", opcode_string(inst->RGB.Opcode), inst->RGB.Saturate ? "_SAT" : ""); + if (inst->RGB.WriteMask) + _mesa_printf(" TEMP[%i].%s%s%s", inst->RGB.DestIndex, + (inst->RGB.WriteMask & 1) ? "x" : "", + (inst->RGB.WriteMask & 2) ? "y" : "", + (inst->RGB.WriteMask & 4) ? "z" : ""); + if (inst->RGB.OutputWriteMask) + _mesa_printf(" COLOR.%s%s%s", + (inst->RGB.OutputWriteMask & 1) ? "x" : "", + (inst->RGB.OutputWriteMask & 2) ? "y" : "", + (inst->RGB.OutputWriteMask & 4) ? "z" : ""); + nargs = num_pairinst_args(inst->RGB.Opcode); + for(i = 0; i < nargs; ++i) { + const char* abs = inst->RGB.Arg[i].Abs ? "|" : ""; + const char* neg = inst->RGB.Arg[i].Negate ? "-" : ""; + _mesa_printf(", %s%sSrc%i.%c%c%c%s", neg, abs, inst->RGB.Arg[i].Source, + swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 0)), + swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 1)), + swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 2)), + abs); + } + _mesa_printf("\n"); + + _mesa_printf(" %s%s", opcode_string(inst->Alpha.Opcode), inst->Alpha.Saturate ? "_SAT" : ""); + if (inst->Alpha.WriteMask) + _mesa_printf(" TEMP[%i].w", inst->Alpha.DestIndex); + if (inst->Alpha.OutputWriteMask) + _mesa_printf(" COLOR.w"); + if (inst->Alpha.DepthWriteMask) + _mesa_printf(" DEPTH.w"); + nargs = num_pairinst_args(inst->Alpha.Opcode); + for(i = 0; i < nargs; ++i) { + const char* abs = inst->Alpha.Arg[i].Abs ? "|" : ""; + const char* neg = inst->Alpha.Arg[i].Negate ? "-" : ""; + _mesa_printf(", %s%sSrc%i.%c%s", neg, abs, inst->Alpha.Arg[i].Source, + swizzle_char(inst->Alpha.Arg[i].Swizzle), abs); + } + _mesa_printf("\n"); +} diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.h b/src/mesa/drivers/dri/r300/radeon_program_pair.h new file mode 100644 index 0000000000..b2bdd08d27 --- /dev/null +++ b/src/mesa/drivers/dri/r300/radeon_program_pair.h @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2008 Nicolai Haehnle. + * + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RADEON_PROGRAM_PAIR_H_ +#define __RADEON_PROGRAM_PAIR_H_ + +#include "radeon_program.h" + + +/** + * Represents a paired instruction, as found in R300 and R500 + * fragment programs. + */ +struct radeon_pair_instruction_source { + GLuint Index:8; + GLuint Constant:1; + GLuint Used:1; +}; + +struct radeon_pair_instruction_rgb { + GLuint Opcode:8; + GLuint DestIndex:8; + GLuint WriteMask:3; + GLuint OutputWriteMask:3; + GLuint Saturate:1; + + struct radeon_pair_instruction_source Src[3]; + + struct { + GLuint Source:2; + GLuint Swizzle:9; + GLuint Abs:1; + GLuint Negate:1; + } Arg[3]; +}; + +struct radeon_pair_instruction_alpha { + GLuint Opcode:8; + GLuint DestIndex:8; + GLuint WriteMask:1; + GLuint OutputWriteMask:1; + GLuint DepthWriteMask:1; + GLuint Saturate:1; + + struct radeon_pair_instruction_source Src[3]; + + struct { + GLuint Source:2; + GLuint Swizzle:3; + GLuint Abs:1; + GLuint Negate:1; + } Arg[3]; +}; + +struct radeon_pair_instruction { + struct radeon_pair_instruction_rgb RGB; + struct radeon_pair_instruction_alpha Alpha; +}; + + +/** + * + */ +struct radeon_pair_handler { + /** + * Fill in the proper hardware index for the given constant register. + * + * @return GL_FALSE on error. + */ + GLboolean (*EmitConst)(void*, GLuint file, GLuint index, GLuint *hwindex); + + /** + * Write a paired instruction to the hardware. + * + * @return GL_FALSE on error. + */ + GLboolean (*EmitPaired)(void*, struct radeon_pair_instruction*); + + /** + * Write a texture instruction to the hardware. + * Register indices have already been rewritten to the allocated + * hardware register numbers. + * + * @return GL_FALSE on error. + */ + GLboolean (*EmitTex)(void*, struct prog_instruction*); + + /** + * Called after a block of contiguous, independent texture + * instructions has been emitted. + */ + void (*EndTexBlock)(void*); + + GLuint MaxHwTemps; +}; + +GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program, + const struct radeon_pair_handler*, void *userdata); + +void radeonPrintPairInstruction(struct radeon_pair_instruction *inst); + +#endif /* __RADEON_PROGRAM_PAIR_H_ */ -- cgit v1.2.3 From 11d711df360265f25dc5a96cc3a4c5a2d34f5b64 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 12 Jul 2008 01:19:19 +0200 Subject: r300: Explicitly set absolute value for the argument of RSQ This fixes the last r500 bug related to glean/fragProg1. --- src/mesa/drivers/dri/r300/radeon_program_alu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index 8daa94c726..f06ebfcdbf 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -153,6 +153,7 @@ static struct prog_src_register absolute(struct prog_src_register reg) { struct prog_src_register newreg = reg; newreg.Abs = 1; + newreg.NegateBase = 0; newreg.NegateAbs = 0; return newreg; } @@ -343,6 +344,12 @@ static void transform_POW(struct radeon_transform_context* t, emit1(t->Program, OPCODE_EX2, inst->DstReg, tempsrc); } +static void transform_RSQ(struct radeon_transform_context* t, + struct prog_instruction* inst) +{ + emit1(t->Program, OPCODE_RSQ, inst->DstReg, absolute(inst->SrcReg[0])); +} + static void transform_SGE(struct radeon_transform_context* t, struct prog_instruction* inst) { @@ -397,6 +404,9 @@ static void transform_XPD(struct radeon_transform_context* t, * using: * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP * + * Transforms RSQ to Radeon's native RSQ by explicitly setting + * absolute value. + * * @note should be applicable to R300 and R500 fragment programs. */ GLboolean radeonTransformALU(struct radeon_transform_context* t, @@ -411,6 +421,7 @@ GLboolean radeonTransformALU(struct radeon_transform_context* t, case OPCODE_LIT: transform_LIT(t, inst); return GL_TRUE; case OPCODE_LRP: transform_LRP(t, inst); return GL_TRUE; case OPCODE_POW: transform_POW(t, inst); return GL_TRUE; + case OPCODE_RSQ: transform_RSQ(t, inst); return GL_TRUE; case OPCODE_SGE: transform_SGE(t, inst); return GL_TRUE; case OPCODE_SLT: transform_SLT(t, inst); return GL_TRUE; case OPCODE_SUB: transform_SUB(t, inst); return GL_TRUE; -- cgit v1.2.3 From 8774fcd89acc9e180e0cb135bd62646f58cb623e Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 12 Jul 2008 11:11:59 +0200 Subject: r300: Fix input register allocation in radeon_program_pair When an input is marked in gl_program.InputsRead but is not actually read in the final program (due to dead-code elimination or whatever), the order of input registers must still match gl_program.InputsRead. This is done even more explicitly now. --- src/mesa/drivers/dri/r300/radeon_program_pair.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c index 86180edcb5..4eaac50412 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_pair.c +++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c @@ -167,6 +167,15 @@ static struct pair_register_translation *get_register(struct pair_state *s, GLui } } +static void alloc_hw_reg(struct pair_state *s, GLuint file, GLuint index, GLuint hwindex) +{ + struct pair_register_translation *t = get_register(s, file, index); + ASSERT(!s->HwTemps[hwindex].RefCount); + ASSERT(!t->Allocated); + s->HwTemps[hwindex].RefCount = t->RefCount; + t->Allocated = 1; + t->HwIndex = hwindex; +} static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index) { @@ -190,9 +199,7 @@ static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index) return 0; } - s->HwTemps[hwindex].RefCount = t->RefCount; - t->Allocated = 1; - t->HwIndex = hwindex; + alloc_hw_reg(s, file, index, hwindex); return hwindex; } @@ -430,27 +437,28 @@ static void allocate_input_registers(struct pair_state *s) { GLuint InputsRead = s->Program->InputsRead; int i; + GLuint hwindex = 0; /* Texcoords come first */ for (i = 0; i < s->Ctx->Const.MaxTextureUnits; i++) { if (InputsRead & (FRAG_BIT_TEX0 << i)) - get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_TEX0+i); + alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_TEX0+i, hwindex++); } InputsRead &= ~FRAG_BITS_TEX_ANY; /* fragment position treated as a texcoord */ if (InputsRead & FRAG_BIT_WPOS) - get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_WPOS); + alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_WPOS, hwindex++); InputsRead &= ~FRAG_BIT_WPOS; /* Then primary colour */ if (InputsRead & FRAG_BIT_COL0) - get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL0); + alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL0, hwindex++); InputsRead &= ~FRAG_BIT_COL0; /* Secondary color */ if (InputsRead & FRAG_BIT_COL1) - get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL1); + alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL1, hwindex++); InputsRead &= ~FRAG_BIT_COL1; /* Anything else */ -- cgit v1.2.3 From cf0ae102dbc34bf75e853c2ece630fe18dd4d41e Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 12 Jul 2008 12:04:28 +0200 Subject: r500: Set Saturate correctly in radeon_program_pair --- src/mesa/drivers/dri/r300/radeon_program_pair.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c index 4eaac50412..85ddf1dc50 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_pair.c +++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c @@ -682,9 +682,14 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_ pair->RGB.Opcode = OPCODE_REPL_ALPHA; else pair->RGB.Opcode = inst->Opcode; + if (inst->SaturateMode == SATURATE_ZERO_ONE) + pair->RGB.Saturate = 1; } - if (pairinst->NeedAlpha) + if (pairinst->NeedAlpha) { pair->Alpha.Opcode = inst->Opcode; + if (inst->SaturateMode == SATURATE_ZERO_ONE) + pair->Alpha.Saturate = 1; + } int nargs = _mesa_num_inst_src_regs(inst->Opcode); int i; -- cgit v1.2.3 From 2d766923c45b544cca17c7fefe625715cf1fd1fe Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 12 Jul 2008 12:20:28 +0200 Subject: r300: Fix saturate mode handling in radeon_program_alu --- src/mesa/drivers/dri/r300/radeon_program_alu.c | 110 ++++++++++++++----------- 1 file changed, 60 insertions(+), 50 deletions(-) (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c index f06ebfcdbf..e0a2bd0e93 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_alu.c +++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c @@ -39,24 +39,26 @@ static struct prog_instruction *emit1(struct gl_program* p, - gl_inst_opcode Opcode, struct prog_dst_register DstReg, + gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg, struct prog_src_register SrcReg) { struct prog_instruction *fpi = radeonAppendInstructions(p, 1); fpi->Opcode = Opcode; + fpi->SaturateMode = Saturate; fpi->DstReg = DstReg; fpi->SrcReg[0] = SrcReg; return fpi; } static struct prog_instruction *emit2(struct gl_program* p, - gl_inst_opcode Opcode, struct prog_dst_register DstReg, + gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg, struct prog_src_register SrcReg0, struct prog_src_register SrcReg1) { struct prog_instruction *fpi = radeonAppendInstructions(p, 1); fpi->Opcode = Opcode; + fpi->SaturateMode = Saturate; fpi->DstReg = DstReg; fpi->SrcReg[0] = SrcReg0; fpi->SrcReg[1] = SrcReg1; @@ -64,13 +66,14 @@ static struct prog_instruction *emit2(struct gl_program* p, } static struct prog_instruction *emit3(struct gl_program* p, - gl_inst_opcode Opcode, struct prog_dst_register DstReg, + gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg, struct prog_src_register SrcReg0, struct prog_src_register SrcReg1, struct prog_src_register SrcReg2) { struct prog_instruction *fpi = radeonAppendInstructions(p, 1); fpi->Opcode = Opcode; + fpi->SaturateMode = Saturate; fpi->DstReg = DstReg; fpi->SrcReg[0] = SrcReg0; fpi->SrcReg[1] = SrcReg1; @@ -188,7 +191,7 @@ static void transform_ABS(struct radeon_transform_context* t, src.Abs = 1; src.NegateBase = 0; src.NegateAbs = 0; - emit1(t->Program, OPCODE_MOV, inst->DstReg, src); + emit1(t->Program, OPCODE_MOV, inst->SaturateMode, inst->DstReg, src); } static void transform_DPH(struct radeon_transform_context* t, @@ -198,7 +201,7 @@ static void transform_DPH(struct radeon_transform_context* t, if (src0.NegateAbs) { if (src0.Abs) { int tempreg = radeonFindFreeTemporary(t); - emit1(t->Program, OPCODE_MOV, dstreg(PROGRAM_TEMPORARY, tempreg), src0); + emit1(t->Program, OPCODE_MOV, 0, dstreg(PROGRAM_TEMPORARY, tempreg), src0); src0 = srcreg(src0.File, src0.Index); } else { src0.NegateAbs = 0; @@ -207,7 +210,7 @@ static void transform_DPH(struct radeon_transform_context* t, } set_swizzle(&src0, 3, SWIZZLE_ONE); set_negate_base(&src0, 3, 0); - emit2(t->Program, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]); + emit2(t->Program, OPCODE_DP4, inst->SaturateMode, inst->DstReg, src0, inst->SrcReg[1]); } /** @@ -217,7 +220,7 @@ static void transform_DPH(struct radeon_transform_context* t, static void transform_DST(struct radeon_transform_context* t, struct prog_instruction* inst) { - emit2(t->Program, OPCODE_MUL, inst->DstReg, + emit2(t->Program, OPCODE_MUL, inst->SaturateMode, inst->DstReg, swizzle(inst->SrcReg[0], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE), swizzle(inst->SrcReg[1], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_ONE, SWIZZLE_W)); } @@ -226,8 +229,9 @@ static void transform_FLR(struct radeon_transform_context* t, struct prog_instruction* inst) { int tempreg = radeonFindFreeTemporary(t); - emit1(t->Program, OPCODE_FRC, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]); - emit2(t->Program, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg))); + emit1(t->Program, OPCODE_FRC, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]); + emit2(t->Program, OPCODE_ADD, inst->SaturateMode, inst->DstReg, + inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg))); } /** @@ -279,42 +283,42 @@ static void transform_LIT(struct radeon_transform_context* t, // tmp.x = max(0.0, Src.x); // tmp.y = max(0.0, Src.y); // tmp.w = clamp(Src.z, -128+eps, 128-eps); - emit2(t->Program, OPCODE_MAX, + emit2(t->Program, OPCODE_MAX, 0, dstregtmpmask(temp, WRITEMASK_XYW), inst->SrcReg[0], swizzle(srcreg(PROGRAM_CONSTANT, constant), SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, constant_swizzle&3)); - emit2(t->Program, OPCODE_MIN, + emit2(t->Program, OPCODE_MIN, 0, dstregtmpmask(temp, WRITEMASK_Z), swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), negate(srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle))); // tmp.w = Pow(tmp.y, tmp.w) - emit1(t->Program, OPCODE_LG2, + emit1(t->Program, OPCODE_LG2, 0, dstregtmpmask(temp, WRITEMASK_W), swizzle(srctemp, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y)); - emit2(t->Program, OPCODE_MUL, + emit2(t->Program, OPCODE_MUL, 0, dstregtmpmask(temp, WRITEMASK_W), swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), swizzle(srctemp, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)); - emit1(t->Program, OPCODE_EX2, + emit1(t->Program, OPCODE_EX2, 0, dstregtmpmask(temp, WRITEMASK_W), swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W)); // tmp.z = (tmp.x > 0) ? tmp.w : 0.0 - emit3(t->Program, OPCODE_CMP, + emit3(t->Program, OPCODE_CMP, inst->SaturateMode, dstregtmpmask(temp, WRITEMASK_Z), negate(swizzle(srctemp, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)), swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), builtin_zero); // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 - emit1(t->Program, OPCODE_MOV, + emit1(t->Program, OPCODE_MOV, inst->SaturateMode, dstregtmpmask(temp, WRITEMASK_XYW), swizzle(srctemp, SWIZZLE_ONE, SWIZZLE_X, SWIZZLE_ONE, SWIZZLE_ONE)); if (needTemporary) - emit1(t->Program, OPCODE_MOV, inst->DstReg, srctemp); + emit1(t->Program, OPCODE_MOV, 0, inst->DstReg, srctemp); } static void transform_LRP(struct radeon_transform_context* t, @@ -322,10 +326,10 @@ static void transform_LRP(struct radeon_transform_context* t, { int tempreg = radeonFindFreeTemporary(t); - emit2(t->Program, OPCODE_ADD, + emit2(t->Program, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[1], negate(inst->SrcReg[2])); - emit3(t->Program, OPCODE_MAD, + emit3(t->Program, OPCODE_MAD, inst->SaturateMode, inst->DstReg, inst->SrcReg[0], srcreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[2]); } @@ -339,15 +343,15 @@ static void transform_POW(struct radeon_transform_context* t, tempdst.WriteMask = WRITEMASK_W; tempsrc.Swizzle = SWIZZLE_WWWW; - emit1(t->Program, OPCODE_LG2, tempdst, scalar(inst->SrcReg[0])); - emit2(t->Program, OPCODE_MUL, tempdst, tempsrc, scalar(inst->SrcReg[1])); - emit1(t->Program, OPCODE_EX2, inst->DstReg, tempsrc); + emit1(t->Program, OPCODE_LG2, 0, tempdst, scalar(inst->SrcReg[0])); + emit2(t->Program, OPCODE_MUL, 0, tempdst, tempsrc, scalar(inst->SrcReg[1])); + emit1(t->Program, OPCODE_EX2, inst->SaturateMode, inst->DstReg, tempsrc); } static void transform_RSQ(struct radeon_transform_context* t, struct prog_instruction* inst) { - emit1(t->Program, OPCODE_RSQ, inst->DstReg, absolute(inst->SrcReg[0])); + emit1(t->Program, OPCODE_RSQ, inst->SaturateMode, inst->DstReg, absolute(inst->SrcReg[0])); } static void transform_SGE(struct radeon_transform_context* t, @@ -355,8 +359,9 @@ static void transform_SGE(struct radeon_transform_context* t, { int tempreg = radeonFindFreeTemporary(t); - emit2(t->Program, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); - emit3(t->Program, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one); + emit2(t->Program, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); + emit3(t->Program, OPCODE_CMP, inst->SaturateMode, inst->DstReg, + srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one); } static void transform_SLT(struct radeon_transform_context* t, @@ -364,20 +369,21 @@ static void transform_SLT(struct radeon_transform_context* t, { int tempreg = radeonFindFreeTemporary(t); - emit2(t->Program, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); - emit3(t->Program, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero); + emit2(t->Program, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1])); + emit3(t->Program, OPCODE_CMP, inst->SaturateMode, inst->DstReg, + srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero); } static void transform_SUB(struct radeon_transform_context* t, struct prog_instruction* inst) { - emit2(t->Program, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1])); + emit2(t->Program, OPCODE_ADD, inst->SaturateMode, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1])); } static void transform_SWZ(struct radeon_transform_context* t, struct prog_instruction* inst) { - emit1(t->Program, OPCODE_MOV, inst->DstReg, inst->SrcReg[0]); + emit1(t->Program, OPCODE_MOV, inst->SaturateMode, inst->DstReg, inst->SrcReg[0]); } static void transform_XPD(struct radeon_transform_context* t, @@ -385,10 +391,10 @@ static void transform_XPD(struct radeon_transform_context* t, { int tempreg = radeonFindFreeTemporary(t); - emit2(t->Program, OPCODE_MUL, dstreg(PROGRAM_TEMPORARY, tempreg), + emit2(t->Program, OPCODE_MUL, 0, dstreg(PROGRAM_TEMPORARY, tempreg), swizzle(inst->SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W), swizzle(inst->SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W)); - emit3(t->Program, OPCODE_MAD, inst->DstReg, + emit3(t->Program, OPCODE_MAD, inst->SaturateMode, inst->DstReg, swizzle(inst->SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W), swizzle(inst->SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W), negate(srcreg(PROGRAM_TEMPORARY, tempreg))); @@ -471,18 +477,18 @@ static void sin_approx(struct radeon_transform_context* t, { GLuint tempreg = radeonFindFreeTemporary(t); - emit2(t->Program, OPCODE_MUL, dstregtmpmask(tempreg, WRITEMASK_XY), + emit2(t->Program, OPCODE_MUL, 0, dstregtmpmask(tempreg, WRITEMASK_XY), swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), srcreg(PROGRAM_CONSTANT, constants[0])); - emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_X), + emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_X), swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y), absolute(swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)), swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)); - emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_Y), + emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_Y), swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), absolute(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)), negate(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X))); - emit3(t->Program, OPCODE_MAD, dst, + emit3(t->Program, OPCODE_MAD, 0, dst, swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y), swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)); @@ -511,13 +517,13 @@ GLboolean radeonTransformTrigSimple(struct radeon_transform_context* t, // MAD tmp.x, src, 1/(2*PI), 0.75 // FRC tmp.x, tmp.x // MAD tmp.z, tmp.x, 2*PI, -PI - emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_W), + emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W), swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)); - emit1(t->Program, OPCODE_FRC, dstregtmpmask(tempreg, WRITEMASK_W), + emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W), swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W)); - emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_W), + emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W), swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z))); @@ -526,13 +532,13 @@ GLboolean radeonTransformTrigSimple(struct radeon_transform_context* t, swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), constants); } else if (inst->Opcode == OPCODE_SIN) { - emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_W), + emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W), swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y)); - emit1(t->Program, OPCODE_FRC, dstregtmpmask(tempreg, WRITEMASK_W), + emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W), swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W)); - emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_W), + emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W), swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z))); @@ -541,13 +547,13 @@ GLboolean radeonTransformTrigSimple(struct radeon_transform_context* t, swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), constants); } else { - emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_XY), + emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY), swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W)); - emit1(t->Program, OPCODE_FRC, dstregtmpmask(tempreg, WRITEMASK_XY), + emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_XY), srcreg(PROGRAM_TEMPORARY, tempreg)); - emit3(t->Program, OPCODE_MAD, dstregtmpmask(tempreg, WRITEMASK_XY), + emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY), srcreg(PROGRAM_TEMPORARY, tempreg), swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W), negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z))); @@ -594,26 +600,30 @@ GLboolean radeonTransformTrigScale(struct radeon_transform_context* t, temp = radeonFindFreeTemporary(t); constant = _mesa_add_unnamed_constant(t->Program->Parameters, RCP_2PI, 1, &constant_swizzle); - emit2(t->Program, OPCODE_MUL, dstregtmpmask(temp, WRITEMASK_W), + emit2(t->Program, OPCODE_MUL, 0, dstregtmpmask(temp, WRITEMASK_W), swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle)); - emit1(t->Program, OPCODE_FRC, dstregtmpmask(temp, WRITEMASK_W), + emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(temp, WRITEMASK_W), srcreg(PROGRAM_TEMPORARY, temp)); if (inst->Opcode == OPCODE_COS) { - emit1(t->Program, OPCODE_COS, inst->DstReg, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); + emit1(t->Program, OPCODE_COS, inst->SaturateMode, inst->DstReg, + srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); } else if (inst->Opcode == OPCODE_SIN) { - emit1(t->Program, OPCODE_SIN, inst->DstReg, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); + emit1(t->Program, OPCODE_SIN, inst->SaturateMode, + inst->DstReg, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); } else if (inst->Opcode == OPCODE_SCS) { struct prog_dst_register moddst = inst->DstReg; if (inst->DstReg.WriteMask & WRITEMASK_X) { moddst.WriteMask = WRITEMASK_X; - emit1(t->Program, OPCODE_COS, moddst, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); + emit1(t->Program, OPCODE_COS, inst->SaturateMode, moddst, + srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); } if (inst->DstReg.WriteMask & WRITEMASK_Y) { moddst.WriteMask = WRITEMASK_Y; - emit1(t->Program, OPCODE_SIN, moddst, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); + emit1(t->Program, OPCODE_SIN, inst->SaturateMode, moddst, + srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW)); } } -- cgit v1.2.3 From e81ba58bf4c20229677cdf89b5970b55cefb2199 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Sat, 12 Jul 2008 21:13:03 +0200 Subject: r300_fragprog: Use nqssa+dce and program_pair for emit Share almost all code with r500_fragprog now. This also fixes Piglit's texrect-many test, which means that the compiz bicubic plugin should work with hardware acceleration now. --- src/mesa/drivers/dri/r300/Makefile | 1 + src/mesa/drivers/dri/r300/r300_context.h | 24 +- src/mesa/drivers/dri/r300/r300_fragprog.c | 61 +- src/mesa/drivers/dri/r300/r300_fragprog.h | 27 - src/mesa/drivers/dri/r300/r300_fragprog_emit.c | 1752 +++------------------ src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c | 227 +++ src/mesa/drivers/dri/r300/r300_fragprog_swizzle.h | 42 + src/mesa/drivers/dri/r300/r300_state.c | 33 +- src/mesa/drivers/dri/r300/r500_fragprog.h | 30 - src/mesa/drivers/dri/r300/radeon_program_pair.c | 6 +- src/mesa/drivers/dri/r300/radeon_program_pair.h | 6 +- 11 files changed, 521 insertions(+), 1688 deletions(-) create mode 100644 src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c create mode 100644 src/mesa/drivers/dri/r300/r300_fragprog_swizzle.h (limited to 'src/mesa/drivers/dri/r300') diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile index 9baa1e7131..6ca934204f 100644 --- a/src/mesa/drivers/dri/r300/Makefile +++ b/src/mesa/drivers/dri/r300/Makefile @@ -42,6 +42,7 @@ DRIVER_SOURCES = \ radeon_nqssadce.c \ r300_vertprog.c \ r300_fragprog.c \ + r300_fragprog_swizzle.c \ r300_fragprog_emit.c \ r500_fragprog.c \ r500_fragprog_emit.c \ diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index 8e9c5cee5f..98af6d8f10 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -683,16 +683,25 @@ struct r300_fragment_program_external_state { }; +struct r300_fragment_program_node { + int tex_offset; /**< first tex instruction */ + int tex_end; /**< last tex instruction, relative to tex_offset */ + int alu_offset; /**< first ALU instruction */ + int alu_end; /**< last ALU instruction, relative to alu_offset */ + int flags; +}; + /** * Stores an R300 fragment program in its compiled-to-hardware form. */ struct r300_fragment_program_code { struct { - int length; + int length; /**< total # of texture instructions used */ GLuint inst[PFS_MAX_TEX_INST]; } tex; struct { + int length; /**< total # of ALU instructions used */ struct { GLuint inst0; GLuint inst1; @@ -701,21 +710,10 @@ struct r300_fragment_program_code { } inst[PFS_MAX_ALU_INST]; } alu; - struct { - int tex_offset; - int tex_end; - int alu_offset; - int alu_end; - int flags; - } node[4]; + struct r300_fragment_program_node node[4]; int cur_node; int first_node_has_tex; - int alu_offset; - int alu_end; - int tex_offset; - int tex_end; - /** * Remember which program register a given hardware constant * belongs to. diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index 8a1d690ae4..d390de54b8 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -29,10 +29,8 @@ * \file * * Fragment program compiler. Perform transformations on the intermediate - * \ref radeon_program representation (which is essentially the Mesa - * program representation plus the notion of clauses) until the program - * is in a form where we can translate it more or less directly into - * machine-readable form. + * representation until the program is in a form where we can translate + * it more or less directly into machine-readable form. * * \author Ben Skeggs * \author Jerome Glisse @@ -47,8 +45,10 @@ #include "r300_context.h" #include "r300_fragprog.h" +#include "r300_fragprog_swizzle.h" #include "r300_state.h" +#include "radeon_nqssadce.h" #include "radeon_program_alu.h" @@ -133,25 +133,6 @@ static GLboolean transform_TEX( inst.SrcReg[0].Index = tempreg; } - /* Texture operations do not support swizzles etc. in hardware, - * so emit an additional arithmetic operation if necessary. - */ - if (inst.SrcReg[0].Swizzle != SWIZZLE_NOOP || - inst.SrcReg[0].Abs || inst.SrcReg[0].NegateBase || inst.SrcReg[0].NegateAbs) { - int tempreg = radeonFindFreeTemporary(t); - - tgt = radeonAppendInstructions(t->Program, 1); - - tgt->Opcode = OPCODE_MOV; - tgt->DstReg.File = PROGRAM_TEMPORARY; - tgt->DstReg.Index = tempreg; - tgt->SrcReg[0] = inst.SrcReg[0]; - - reset_srcreg(&inst.SrcReg[0]); - inst.SrcReg[0].File = PROGRAM_TEMPORARY; - inst.SrcReg[0].Index = tempreg; - } - if (inst.Opcode != OPCODE_KIL) { if (inst.DstReg.File != PROGRAM_TEMPORARY || inst.DstReg.WriteMask != WRITEMASK_XYZW) { @@ -339,6 +320,13 @@ static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler) } +static void nqssadce_init(struct nqssadce_state* s) +{ + s->Outputs[FRAG_RESULT_COLR].Sourced = WRITEMASK_XYZW; + s->Outputs[FRAG_RESULT_DEPR].Sourced = WRITEMASK_W; +} + + static GLuint build_dtm(GLuint depthmode) { switch(depthmode) { @@ -417,7 +405,20 @@ void r300TranslateFragmentShader(r300ContextPtr r300, 3, transformations); if (RADEON_DEBUG & DEBUG_PIXEL) { - _mesa_printf("Fragment Program: After transformations:\n"); + _mesa_printf("Fragment Program: After native rewrite:\n"); + _mesa_print_program(compiler.program); + } + + struct radeon_nqssadce_descr nqssadce = { + .Init = &nqssadce_init, + .IsNativeSwizzle = &r300FPIsNativeSwizzle, + .BuildSwizzle = &r300FPBuildSwizzle, + .RewriteDepthOut = GL_TRUE + }; + radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce); + + if (RADEON_DEBUG & DEBUG_PIXEL) { + _mesa_printf("Compiler: after NqSSA-DCE:\n"); _mesa_print_program(compiler.program); } @@ -451,22 +452,18 @@ void r300FragmentProgramDump( fprintf(stderr, "pc=%d*************************************\n", pc++); - fprintf(stderr, "Mesa program:\n"); - fprintf(stderr, "-------------\n"); - _mesa_print_program(&fp->mesa_program.Base); - fflush(stdout); - fprintf(stderr, "Hardware program\n"); fprintf(stderr, "----------------\n"); for (n = 0; n < (code->cur_node + 1); n++) { fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, " - "alu_end: %d, tex_end: %d\n", n, + "alu_end: %d, tex_end: %d, flags: %08x\n", n, code->node[n].alu_offset, code->node[n].tex_offset, - code->node[n].alu_end, code->node[n].tex_end); + code->node[n].alu_end, code->node[n].tex_end, + code->node[n].flags); - if (code->tex.length) { + if (n > 0 || code->first_node_has_tex) { fprintf(stderr, " TEX:\n"); for (i = code->node[n].tex_offset; i <= code->node[n].tex_offset + code->node[n].tex_end; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h index c76ae62701..b3a3cd2e04 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.h +++ b/src/mesa/drivers/dri/r300/r300_fragprog.h @@ -42,33 +42,6 @@ #include "r300_context.h" #include "radeon_program.h" -/* supported hw opcodes */ -#define PFS_OP_MAD 0 -#define PFS_OP_DP3 1 -#define PFS_OP_DP4 2 -#define PFS_OP_MIN 3 -#define PFS_OP_MAX 4 -#define PFS_OP_CMP 5 -#define PFS_OP_FRC 6 -#define PFS_OP_EX2 7 -#define PFS_OP_LG2 8 -#define PFS_OP_RCP 9 -#define PFS_OP_RSQ 10 -#define PFS_OP_REPL_ALPHA 11 -#define PFS_OP_CMPH 12 -#define MAX_PFS_OP 12 - -#define PFS_FLAG_SAT (1 << 0) -#define PFS_FLAG_ABS (1 << 1) - -#define ARG_NEG (1 << 5) -#define ARG_ABS (1 << 6) -#define ARG_MASK (127 << 0) -#define ARG_STRIDE 7 -#define SRC_CONST (1 << 5) -#define SRC_MASK (63 << 0) -#define SRC_STRIDE 6 - #define DRI_CONF_FP_OPTIMIZATION_SPEED 0 #define DRI_CONF_FP_OPTIMIZATION_QUALITY 1 diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c index 4786b4554d..9f0b7e3534 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c @@ -36,1674 +36,308 @@ * \author Jerome Glisse * * \todo FogOption - * - * \todo Verify results of opcodes for accuracy, I've only checked them in - * specific cases. */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" -#include "shader/prog_instruction.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" - -#include "r300_context.h" #include "r300_fragprog.h" -#include "r300_reg.h" -#include "r300_state.h" - -/* Mapping Mesa registers to R300 temporaries */ -struct reg_acc { - int reg; /* Assigned hw temp */ - unsigned int refcount; /* Number of uses by mesa program */ -}; - -/** - * Describe the current lifetime information for an R300 temporary - */ -struct reg_lifetime { - /* Index of the first slot where this register is free in the sense - that it can be used as a new destination register. - This is -1 if the register has been assigned to a Mesa register - and the last access to the register has not yet been emitted */ - int free; - - /* Index of the first slot where this register is currently reserved. - This is used to stop e.g. a scalar operation from being moved - before the allocation time of a register that was first allocated - for a vector operation. */ - int reserved; - - /* Index of the first slot in which the register can be used as a - source without losing the value that is written by the last - emitted instruction that writes to the register */ - int vector_valid; - int scalar_valid; - - /* Index to the slot where the register was last read. - This is also the first slot in which the register may be written again */ - int vector_lastread; - int scalar_lastread; -}; - -/** - * Store usage information about an ALU instruction slot during the - * compilation of a fragment program. - */ -#define SLOT_SRC_VECTOR (1<<0) -#define SLOT_SRC_SCALAR (1<<3) -#define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR) -#define SLOT_OP_VECTOR (1<<16) -#define SLOT_OP_SCALAR (1<<17) -#define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR) - -struct r300_pfs_compile_slot { - /* Bitmask indicating which parts of the slot are used, using SLOT_ constants - defined above */ - unsigned int used; - - /* Selected sources */ - int vsrc[3]; - int ssrc[3]; -}; - -/** - * Store information during compilation of fragment programs. - */ -struct r300_pfs_compile_state { - struct r300_fragment_program_compiler *compiler; - int nrslots; /* number of ALU slots used so far */ - - /* Track which (parts of) slots are already filled with instructions */ - struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST]; - - /* Track the validity of R300 temporaries */ - struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS]; - - /* Used to map Mesa's inputs/temps onto hardware temps */ - int temp_in_use; - struct reg_acc temps[PFS_NUM_TEMP_REGS]; - struct reg_acc inputs[32]; /* don't actually need 32... */ +#include "radeon_program_pair.h" +#include "r300_fragprog_swizzle.h" +#include "r300_reg.h" - /* Track usage of hardware temps, for register allocation, - * indirection detection, etc. */ - GLuint used_in_node; - GLuint dest_in_node; -}; +#define PROG_CODE \ + struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \ + struct r300_fragment_program_code *code = c->code -/* - * Usefull macros and values - */ -#define ERROR(fmt, args...) do { \ +#define error(fmt, args...) do { \ fprintf(stderr, "%s::%s(): " fmt "\n", \ __FILE__, __FUNCTION__, ##args); \ - fp->error = GL_TRUE; \ } while(0) -#define PFS_INVAL 0xFFFFFFFF -#define COMPILE_STATE \ - struct r300_fragment_program *fp = cs->compiler->fp; \ - struct r300_fragment_program_code *code = cs->compiler->code; \ - (void)code; (void)fp - -#define SWIZZLE_XYZ 0 -#define SWIZZLE_XXX 1 -#define SWIZZLE_YYY 2 -#define SWIZZLE_ZZZ 3 -#define SWIZZLE_WWW 4 -#define SWIZZLE_YZX 5 -#define SWIZZLE_ZXY 6 -#define SWIZZLE_WZY 7 -#define SWIZZLE_111 8 -#define SWIZZLE_000 9 -#define SWIZZLE_HHH 10 - -#define swizzle(r, x, y, z, w) do_swizzle(cs, r, \ - ((SWIZZLE_##x<<0)| \ - (SWIZZLE_##y<<3)| \ - (SWIZZLE_##z<<6)| \ - (SWIZZLE_##w<<9)), \ - 0) - -#define REG_TYPE_INPUT 0 -#define REG_TYPE_OUTPUT 1 -#define REG_TYPE_TEMP 2 -#define REG_TYPE_CONST 3 - -#define REG_TYPE_SHIFT 0 -#define REG_INDEX_SHIFT 2 -#define REG_VSWZ_SHIFT 8 -#define REG_SSWZ_SHIFT 13 -#define REG_NEGV_SHIFT 18 -#define REG_NEGS_SHIFT 19 -#define REG_ABS_SHIFT 20 -#define REG_NO_USE_SHIFT 21 // Hack for refcounting -#define REG_VALID_SHIFT 22 // Does the register contain a defined value? -#define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)? - -#define REG_TYPE_MASK (0x03 << REG_TYPE_SHIFT) -#define REG_INDEX_MASK (0x3F << REG_INDEX_SHIFT) -#define REG_VSWZ_MASK (0x1F << REG_VSWZ_SHIFT) -#define REG_SSWZ_MASK (0x1F << REG_SSWZ_SHIFT) -#define REG_NEGV_MASK (0x01 << REG_NEGV_SHIFT) -#define REG_NEGS_MASK (0x01 << REG_NEGS_SHIFT) -#define REG_ABS_MASK (0x01 << REG_ABS_SHIFT) -#define REG_NO_USE_MASK (0x01 << REG_NO_USE_SHIFT) -#define REG_VALID_MASK (0x01 << REG_VALID_SHIFT) -#define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT) - -#define REG(type, index, vswz, sswz, nouse, valid, builtin) \ - (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) | \ - ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) | \ - ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) | \ - ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) | \ - ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) | \ - ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) | \ - ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)) -#define REG_GET_TYPE(reg) \ - ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT) -#define REG_GET_INDEX(reg) \ - ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT) -#define REG_GET_VSWZ(reg) \ - ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT) -#define REG_GET_SSWZ(reg) \ - ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT) -#define REG_GET_NO_USE(reg) \ - ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT) -#define REG_GET_VALID(reg) \ - ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT) -#define REG_GET_BUILTIN(reg) \ - ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT) -#define REG_SET_TYPE(reg, type) \ - reg = ((reg & ~REG_TYPE_MASK) | \ - ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK)) -#define REG_SET_INDEX(reg, index) \ - reg = ((reg & ~REG_INDEX_MASK) | \ - ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK)) -#define REG_SET_VSWZ(reg, vswz) \ - reg = ((reg & ~REG_VSWZ_MASK) | \ - ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)) -#define REG_SET_SSWZ(reg, sswz) \ - reg = ((reg & ~REG_SSWZ_MASK) | \ - ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)) -#define REG_SET_NO_USE(reg, nouse) \ - reg = ((reg & ~REG_NO_USE_MASK) | \ - ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)) -#define REG_SET_VALID(reg, valid) \ - reg = ((reg & ~REG_VALID_MASK) | \ - ((valid << REG_VALID_SHIFT) & REG_VALID_MASK)) -#define REG_SET_BUILTIN(reg, builtin) \ - reg = ((reg & ~REG_BUILTIN_MASK) | \ - ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)) -#define REG_ABS(reg) \ - reg = (reg | REG_ABS_MASK) -#define REG_NEGV(reg) \ - reg = (reg | REG_NEGV_MASK) -#define REG_NEGS(reg) \ - reg = (reg | REG_NEGS_MASK) - -#define NOP_INST0 ( \ - (R300_ALU_OUTC_MAD) | \ - (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \ - (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \ - (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT)) -#define NOP_INST1 ( \ - ((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \ - ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \ - ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT)) -#define NOP_INST2 ( \ - (R300_ALU_OUTA_MAD) | \ - (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \ - (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \ - (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT)) -#define NOP_INST3 ( \ - ((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \ - ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \ - ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT)) - - -/* - * Datas structures for fragment program generation - */ - -/* description of r300 native hw instructions */ -static const struct { - const char *name; - int argc; - int v_op; - int s_op; -} r300_fpop[] = { - /* *INDENT-OFF* */ - {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD}, - {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4}, - {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4}, - {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN}, - {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX}, - {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP}, - {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC}, - {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2}, - {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2}, - {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP}, - {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ}, - {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL}, - {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL}, - /* *INDENT-ON* */ -}; - -/* vector swizzles r300 can support natively, with a couple of - * cases we handle specially - * - * REG_VSWZ/REG_SSWZ is an index into this table - */ - -/* mapping from SWIZZLE_* to r300 native values for scalar insns */ -#define SWIZZLE_HALF 6 - -#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \ - SWIZZLE_##y, \ - SWIZZLE_##z, \ - SWIZZLE_ZERO)) -/* native swizzles */ -static const struct r300_pfs_swizzle { - GLuint hash; /* swizzle value this matches */ - GLuint base; /* base value for hw swizzle */ - GLuint stride; /* difference in base between arg0/1/2 */ - GLuint flags; -} v_swiz[] = { - /* *INDENT-OFF* */ - {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR}, - {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR}, - {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR}, - {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR}, - {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR}, - {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR}, - {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR}, - {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH}, - {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0}, - {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0}, - {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0}, - {PFS_INVAL, 0, 0, 0}, - /* *INDENT-ON* */ -}; - -/* used during matching of non-native swizzles */ -#define SWZ_X_MASK (7 << 0) -#define SWZ_Y_MASK (7 << 3) -#define SWZ_Z_MASK (7 << 6) -#define SWZ_W_MASK (7 << 9) -static const struct { - GLuint hash; /* used to mask matching swizzle components */ - int mask; /* actual outmask */ - int count; /* count of components matched */ -} s_mask[] = { - /* *INDENT-OFF* */ - {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3}, - {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2}, - {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2}, - {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2}, - {SWZ_X_MASK, 1, 1}, - {SWZ_Y_MASK, 2, 1}, - {SWZ_Z_MASK, 4, 1}, - {PFS_INVAL, PFS_INVAL, PFS_INVAL} - /* *INDENT-ON* */ -}; - -static const struct { - int base; /* hw value of swizzle */ - int stride; /* difference between SRC0/1/2 */ - GLuint flags; -} s_swiz[] = { - /* *INDENT-OFF* */ - {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR}, - {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR}, - {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR}, - {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR}, - {R300_ALU_ARGA_ZERO, 0, 0}, - {R300_ALU_ARGA_ONE, 0, 0}, - {R300_ALU_ARGA_HALF, 0, 0} - /* *INDENT-ON* */ -}; - -/* boiler-plate reg, for convenience */ -static const GLuint undef = REG(REG_TYPE_TEMP, - 0, - SWIZZLE_XYZ, - SWIZZLE_W, - GL_FALSE, - GL_FALSE, - GL_FALSE); - -/* constant one source */ -static const GLuint pfs_one = REG(REG_TYPE_CONST, - 0, - SWIZZLE_111, - SWIZZLE_ONE, - GL_FALSE, - GL_TRUE, - GL_TRUE); - -/* constant half source */ -static const GLuint pfs_half = REG(REG_TYPE_CONST, - 0, - SWIZZLE_HHH, - SWIZZLE_HALF, - GL_FALSE, - GL_TRUE, - GL_TRUE); - -/* constant zero source */ -static const GLuint pfs_zero = REG(REG_TYPE_CONST, - 0, - SWIZZLE_000, - SWIZZLE_ZERO, - GL_FALSE, - GL_TRUE, - GL_TRUE); - -/* - * Common functions prototypes - */ -static void emit_arith(struct r300_pfs_compile_state *cs, int op, - GLuint dest, int mask, - GLuint src0, GLuint src1, GLuint src2, int flags); - -/** - * Get an R300 temporary that can be written to in the given slot. - */ -static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot) -{ - COMPILE_STATE; - int r; - - for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) { - if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot) - break; - } - - if (r >= PFS_NUM_TEMP_REGS) { - ERROR("Out of hardware temps\n"); - return 0; - } - // Reserved is used to avoid the following scenario: - // R300 temporary X is first assigned to Mesa temporary Y during vector ops - // R300 temporary X is then assigned to Mesa temporary Z for further vector ops - // Then scalar ops on Mesa temporary Z are emitted and move back in time - // to overwrite the value of temporary Y. - // End scenario. - cs->hwtemps[r].reserved = cs->hwtemps[r].free; - cs->hwtemps[r].free = -1; - - // Reset to some value that won't mess things up when the user - // tries to read from a temporary that hasn't been assigned a value yet. - // In the normal case, vector_valid and scalar_valid should be set to - // a sane value by the first emit that writes to this temporary. - cs->hwtemps[r].vector_valid = 0; - cs->hwtemps[r].scalar_valid = 0; - - if (r > code->max_temp_idx) - code->max_temp_idx = r; - - return r; -} - -/** - * Get an R300 temporary that will act as a TEX destination register. - */ -static int get_hw_temp_tex(struct r300_pfs_compile_state *cs) -{ - COMPILE_STATE; - int r; - - for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) { - if (cs->used_in_node & (1 << r)) - continue; - - // Note: Be very careful here - if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0) - break; - } - - if (r >= PFS_NUM_TEMP_REGS) - return get_hw_temp(cs, 0); /* Will cause an indirection */ - - cs->hwtemps[r].reserved = cs->hwtemps[r].free; - cs->hwtemps[r].free = -1; - - // Reset to some value that won't mess things up when the user - // tries to read from a temporary that hasn't been assigned a value yet. - // In the normal case, vector_valid and scalar_valid should be set to - // a sane value by the first emit that writes to this temporary. - cs->hwtemps[r].vector_valid = cs->nrslots; - cs->hwtemps[r].scalar_valid = cs->nrslots; - if (r > code->max_temp_idx) - code->max_temp_idx = r; - - return r; -} - -/** - * Mark the given hardware register as free. - */ -static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx) +static GLboolean emit_const(void* data, GLuint file, GLuint index, GLuint *hwindex) { - // Be very careful here. Consider sequences like - // MAD r0, r1,r2,r3 - // TEX r4, ... - // The TEX instruction may be moved in front of the MAD instruction - // due to the way nodes work. We don't want to alias r1 and r4 in - // this case. - // I'm certain the register allocation could be further sanitized, - // but it's tricky because of stuff that can happen inside emit_tex - // and emit_arith. - cs->hwtemps[idx].free = cs->nrslots + 1; -} + PROG_CODE; -/** - * Create a new Mesa temporary register. - */ -static GLuint get_temp_reg(struct r300_pfs_compile_state *cs) -{ - COMPILE_STATE; - GLuint r = undef; - GLuint index; - - index = ffs(~cs->temp_in_use); - if (!index) { - ERROR("Out of program temps\n"); - return r; - } - - cs->temp_in_use |= (1 << --index); - cs->temps[index].refcount = 0xFFFFFFFF; - cs->temps[index].reg = -1; - - REG_SET_TYPE(r, REG_TYPE_TEMP); - REG_SET_INDEX(r, index); - REG_SET_VALID(r, GL_TRUE); - return r; -} - -/** - * Free a Mesa temporary and the associated R300 temporary. - */ -static void free_temp(struct r300_pfs_compile_state *cs, GLuint r) -{ - GLuint index = REG_GET_INDEX(r); - - if (!(cs->temp_in_use & (1 << index))) - return; - - if (REG_GET_TYPE(r) == REG_TYPE_TEMP) { - free_hw_temp(cs, cs->temps[index].reg); - cs->temps[index].reg = -1; - cs->temp_in_use &= ~(1 << index); - } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) { - free_hw_temp(cs, cs->inputs[index].reg); - cs->inputs[index].reg = -1; - } -} - -/** - * Emit a hardware constant/parameter. - */ -static GLuint emit_const4fv(struct r300_pfs_compile_state *cs, - struct prog_src_register srcreg) -{ - COMPILE_STATE; - GLuint reg = undef; - int index; - - for (index = 0; index < code->const_nr; ++index) { - if (code->constant[index].File == srcreg.File && - code->constant[index].Index == srcreg.Index) + for (*hwindex = 0; *hwindex < code->const_nr; ++*hwindex) { + if (code->constant[*hwindex].File == file && + code->constant[*hwindex].Index == index) break; } - if (index >= code->const_nr) { - if (index >= PFS_NUM_CONST_REGS) { - ERROR("Out of hw constants!\n"); - return reg; + if (*hwindex >= code->const_nr) { + if (*hwindex >= PFS_NUM_CONST_REGS) { + error("Out of hw constants!\n"); + return GL_FALSE; } code->const_nr++; - code->constant[index] = srcreg; + code->constant[*hwindex].File = file; + code->constant[*hwindex].Index = index; } - REG_SET_TYPE(reg, REG_TYPE_CONST); - REG_SET_INDEX(reg, index); - REG_SET_VALID(reg, GL_TRUE); - return reg; + return GL_TRUE; } -static INLINE GLuint negate(GLuint r) -{ - REG_NEGS(r); - REG_NEGV(r); - return r; -} -/* Hack, to prevent clobbering sources used multiple times when - * emulating non-native instructions +/** + * Mark a temporary register as used. */ -static INLINE GLuint keep(GLuint r) -{ - REG_SET_NO_USE(r, GL_TRUE); - return r; -} - -static INLINE GLuint absolute(GLuint r) -{ - REG_ABS(r); - return r; -} - -static int swz_native(struct r300_pfs_compile_state *cs, - GLuint src, GLuint * r, GLuint arbneg) -{ - COMPILE_STATE; - - /* Native swizzle, handle negation */ - src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT); - - if ((arbneg & 0x7) == 0x0) { - src = src & ~REG_NEGV_MASK; - *r = src; - } else if ((arbneg & 0x7) == 0x7) { - src |= REG_NEGV_MASK; - *r = src; - } else { - if (!REG_GET_VALID(*r)) - *r = get_temp_reg(cs); - src |= REG_NEGV_MASK; - emit_arith(cs, - PFS_OP_MAD, - *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0); - src = src & ~REG_NEGV_MASK; - emit_arith(cs, - PFS_OP_MAD, - *r, - (arbneg ^ 0x7) | WRITEMASK_W, - src, pfs_one, pfs_zero, 0); - } - - return 3; -} - -static int swz_emit_partial(struct r300_pfs_compile_state *cs, - GLuint src, - GLuint * r, int mask, int mc, GLuint arbneg) -{ - COMPILE_STATE; - GLuint tmp; - GLuint wmask = 0; - - if (!REG_GET_VALID(*r)) - *r = get_temp_reg(cs); - - /* A partial match, VSWZ/mask define what parts of the - * desired swizzle we match - */ - if (mc + s_mask[mask].count == 3) { - wmask = WRITEMASK_W; - src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT; - } - - tmp = arbneg & s_mask[mask].mask; - if (tmp) { - tmp = tmp ^ s_mask[mask].mask; - if (tmp) { - emit_arith(cs, - PFS_OP_MAD, - *r, - arbneg & s_mask[mask].mask, - keep(src) | REG_NEGV_MASK, - pfs_one, pfs_zero, 0); - if (!wmask) { - REG_SET_NO_USE(src, GL_TRUE); - } else { - REG_SET_NO_USE(src, GL_FALSE); - } - emit_arith(cs, - PFS_OP_MAD, - *r, tmp | wmask, src, pfs_one, pfs_zero, 0); - } else { - if (!wmask) { - REG_SET_NO_USE(src, GL_TRUE); - } else { - REG_SET_NO_USE(src, GL_FALSE); - } - emit_arith(cs, - PFS_OP_MAD, - *r, - (arbneg & s_mask[mask].mask) | wmask, - src | REG_NEGV_MASK, pfs_one, pfs_zero, 0); - } - } else { - if (!wmask) { - REG_SET_NO_USE(src, GL_TRUE); - } else { - REG_SET_NO_USE(src, GL_FALSE); - } - emit_arith(cs, PFS_OP_MAD, - *r, - s_mask[mask].mask | wmask, - src, pfs_one, pfs_zero, 0); - } - - return s_mask[mask].count; -} - -static GLuint do_swizzle(struct r300_pfs_compile_state *cs, - GLuint src, GLuint arbswz, GLuint arbneg) -{ - COMPILE_STATE; - GLuint r = undef; - GLuint vswz; - int c_mask = 0; - int v_match = 0; - - /* If swizzling from something without an XYZW native swizzle, - * emit result to a temp, and do new swizzle from the temp. - */ -#if 0 - if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) { - GLuint temp = get_temp_reg(fp); - emit_arith(fp, - PFS_OP_MAD, - temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0); - src = temp; - } -#endif - - if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) { - GLuint vsrcswz = - (v_swiz[REG_GET_VSWZ(src)]. - hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) | - REG_GET_SSWZ(src) << 9; - GLint i; - - GLuint newswz = 0; - GLuint offset; - for (i = 0; i < 4; ++i) { - offset = GET_SWZ(arbswz, i); - - newswz |= - (offset <= 3) ? GET_SWZ(vsrcswz, - offset) << i * - 3 : offset << i * 3; - } - - arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK); - REG_SET_SSWZ(src, GET_SWZ(newswz, 3)); - } else { - /* set scalar swizzling */ - REG_SET_SSWZ(src, GET_SWZ(arbswz, 3)); - - } - do { - vswz = REG_GET_VSWZ(src); - do { - int chash; - - REG_SET_VSWZ(src, vswz); - chash = v_swiz[REG_GET_VSWZ(src)].hash & - s_mask[c_mask].hash; - - if (chash == (arbswz & s_mask[c_mask].hash)) { - if (s_mask[c_mask].count == 3) { - v_match += swz_native(cs, - src, &r, arbneg); - } else { - v_match += swz_emit_partial(cs, - src, - &r, - c_mask, - v_match, - arbneg); - } - - if (v_match == 3) - return r; - - /* Fill with something invalid.. all 0's was - * wrong before, matched SWIZZLE_X. So all - * 1's will be okay for now - */ - arbswz |= (PFS_INVAL & s_mask[c_mask].hash); - } - } while (v_swiz[++vswz].hash != PFS_INVAL); - REG_SET_VSWZ(src, SWIZZLE_XYZ); - } while (s_mask[++c_mask].hash != PFS_INVAL); - - ERROR("should NEVER get here\n"); - return r; -} - -static GLuint t_src(struct r300_pfs_compile_state *cs, - struct prog_src_register fpsrc) -{ - COMPILE_STATE; - GLuint r = undef; - - switch (fpsrc.File) { - case PROGRAM_TEMPORARY: - REG_SET_INDEX(r, fpsrc.Index); - REG_SET_VALID(r, GL_TRUE); - REG_SET_TYPE(r, REG_TYPE_TEMP); - break; - case PROGRAM_INPUT: - REG_SET_INDEX(r, fpsrc.Index); - REG_SET_VALID(r, GL_TRUE); - REG_SET_TYPE(r, REG_TYPE_INPUT); - break; - case PROGRAM_LOCAL_PARAM: - case PROGRAM_ENV_PARAM: - case PROGRAM_STATE_VAR: - case PROGRAM_NAMED_PARAM: - case PROGRAM_CONSTANT: - r = emit_const4fv(cs, fpsrc); - break; - case PROGRAM_BUILTIN: - switch(fpsrc.Swizzle) { - case SWIZZLE_1111: r = pfs_one; break; - case SWIZZLE_0000: r = pfs_zero; break; - default: - ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc.Swizzle); - break; - } - break; - default: - ERROR("unknown SrcReg->File %x\n", fpsrc.File); - return r; - } - - /* no point swizzling ONE/ZERO/HALF constants... */ - if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO) - r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase); - if (fpsrc.Abs) - r = absolute(r); - if (fpsrc.NegateAbs) - r = negate(r); - return r; -} - -static GLuint t_scalar_src(struct r300_pfs_compile_state *cs, - struct prog_src_register fpsrc) +static void use_temporary(struct r300_fragment_program_code *code, GLuint index) { - struct prog_src_register src = fpsrc; - int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */ - - src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9)); - - return t_src(cs, src); + if (index > code->max_temp_idx) + code->max_temp_idx = index; } -static GLuint t_dst(struct r300_pfs_compile_state *cs, - struct prog_dst_register dest) -{ - COMPILE_STATE; - GLuint r = undef; - switch (dest.File) { - case PROGRAM_TEMPORARY: - REG_SET_INDEX(r, dest.Index); - REG_SET_VALID(r, GL_TRUE); - REG_SET_TYPE(r, REG_TYPE_TEMP); - return r; - case PROGRAM_OUTPUT: - REG_SET_TYPE(r, REG_TYPE_OUTPUT); - switch (dest.Index) { - case FRAG_RESULT_COLR: - case FRAG_RESULT_DEPR: - REG_SET_INDEX(r, dest.Index); - REG_SET_VALID(r, GL_TRUE); - return r; - default: - ERROR("Bad DstReg->Index 0x%x\n", dest.Index); - return r; - } - default: - ERROR("Bad DstReg->File 0x%x\n", dest.File); - return r; - } -} - -static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex) +static GLuint translate_rgb_opcode(GLuint opcode) { - COMPILE_STATE; - int idx; - int index = REG_GET_INDEX(src); - - switch (REG_GET_TYPE(src)) { - case REG_TYPE_TEMP: - /* NOTE: if reg==-1 here, a source is being read that - * hasn't been written to. Undefined results. - */ - if (cs->temps[index].reg == -1) - cs->temps[index].reg = get_hw_temp(cs, cs->nrslots); - - idx = cs->temps[index].reg; - - if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0)) - free_temp(cs, src); - break; - case REG_TYPE_INPUT: - idx = cs->inputs[index].reg; - - if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0)) - free_hw_temp(cs, cs->inputs[index].reg); - break; - case REG_TYPE_CONST: - return (index | SRC_CONST); + switch(opcode) { + case OPCODE_CMP: return R300_ALU_OUTC_CMP; + case OPCODE_DP3: return R300_ALU_OUTC_DP3; + case OPCODE_DP4: return R300_ALU_OUTC_DP4; + case OPCODE_FRC: return R300_ALU_OUTC_FRC; default: - ERROR("Invalid type for source reg\n"); - return (0 | SRC_CONST); + error("translate_rgb_opcode(%i): Unknown opcode", opcode); + /* fall through */ + case OPCODE_NOP: + /* fall through */ + case OPCODE_MAD: return R300_ALU_OUTC_MAD; + case OPCODE_MAX: return R300_ALU_OUTC_MAX; + case OPCODE_MIN: return R300_ALU_OUTC_MIN; + case OPCODE_REPL_ALPHA: return R300_ALU_OUTC_REPL_ALPHA; } - - if (!tex) - cs->used_in_node |= (1 << idx); - - return idx; } -static int t_hw_dst(struct r300_pfs_compile_state *cs, - GLuint dest, GLboolean tex, int slot) +static GLuint translate_alpha_opcode(GLuint opcode) { - COMPILE_STATE; - int idx; - GLuint index = REG_GET_INDEX(dest); - assert(REG_GET_VALID(dest)); - - switch (REG_GET_TYPE(dest)) { - case REG_TYPE_TEMP: - if (cs->temps[REG_GET_INDEX(dest)].reg == -1) { - if (!tex) { - cs->temps[index].reg = get_hw_temp(cs, slot); - } else { - cs->temps[index].reg = get_hw_temp_tex(cs); - } - } - idx = cs->temps[index].reg; - - if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0)) - free_temp(cs, dest); - - cs->dest_in_node |= (1 << idx); - cs->used_in_node |= (1 << idx); - break; - case REG_TYPE_OUTPUT: - switch (index) { - case FRAG_RESULT_COLR: - code->node[code->cur_node].flags |= R300_RGBA_OUT; - break; - case FRAG_RESULT_DEPR: - fp->WritesDepth = GL_TRUE; - code->node[code->cur_node].flags |= R300_W_OUT; - break; - } - return index; - break; + switch(opcode) { + case OPCODE_CMP: return R300_ALU_OUTA_CMP; + case OPCODE_DP3: return R300_ALU_OUTA_DP4; + case OPCODE_DP4: return R300_ALU_OUTA_DP4; + case OPCODE_EX2: return R300_ALU_OUTA_EX2; + case OPCODE_FRC: return R300_ALU_OUTA_FRC; + case OPCODE_LG2: return R300_ALU_OUTA_LG2; default: - ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest)); - return 0; - } - - return idx; -} - -static void emit_nop(struct r300_pfs_compile_state *cs) -{ - COMPILE_STATE; - - if (cs->nrslots >= PFS_MAX_ALU_INST) { - ERROR("Out of ALU instruction slots\n"); - return; - } - - code->alu.inst[cs->nrslots].inst0 = NOP_INST0; - code->alu.inst[cs->nrslots].inst1 = NOP_INST1; - code->alu.inst[cs->nrslots].inst2 = NOP_INST2; - code->alu.inst[cs->nrslots].inst3 = NOP_INST3; - cs->nrslots++; -} - -static void emit_tex(struct r300_pfs_compile_state *cs, - struct prog_instruction *fpi, int opcode) -{ - COMPILE_STATE; - GLuint coord = t_src(cs, fpi->SrcReg[0]); - GLuint dest = undef; - GLuint din, uin; - int unit = fpi->TexSrcUnit; - int hwsrc, hwdest; - - /* Ensure correct node indirection */ - uin = cs->used_in_node; - din = cs->dest_in_node; - - /* Resolve source/dest to hardware registers */ - hwsrc = t_hw_src(cs, coord, GL_TRUE); - - if (opcode != R300_TEX_OP_KIL) { - dest = t_dst(cs, fpi->DstReg); - - hwdest = - t_hw_dst(cs, dest, GL_TRUE, - code->node[code->cur_node].alu_offset); - - /* Use a temp that hasn't been used in this node, rather - * than causing an indirection - */ - if (uin & (1 << hwdest)) { - free_hw_temp(cs, hwdest); - hwdest = get_hw_temp_tex(cs); - cs->temps[REG_GET_INDEX(dest)].reg = hwdest; - } - } else { - hwdest = 0; - unit = 0; - } - - /* Indirection if source has been written in this node, or if the - * dest has been read/written in this node - */ - if ((REG_GET_TYPE(coord) != REG_TYPE_CONST && - (din & (1 << hwsrc))) || (uin & (1 << hwdest))) { - - /* Finish off current node */ - if (code->node[code->cur_node].alu_offset == cs->nrslots) - emit_nop(cs); - - code->node[code->cur_node].alu_end = - cs->nrslots - code->node[code->cur_node].alu_offset - 1; - assert(code->node[code->cur_node].alu_end >= 0); - - if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) { - ERROR("too many levels of texture indirection\n"); - return; - } - - /* Start new node */ - code->node[code->cur_node].tex_offset = code->tex.length; - code->node[code->cur_node].alu_offset = cs->nrslots; - code->node[code->cur_node].tex_end = -1; - code->node[code->cur_node].alu_end = -1; - code->node[code->cur_node].flags = 0; - cs->used_in_node = 0; - cs->dest_in_node = 0; + error("translate_rgb_opcode(%i): Unknown opcode", opcode); + /* fall through */ + case OPCODE_NOP: + /* fall through */ + case OPCODE_MAD: return R300_ALU_OUTA_MAD; + case OPCODE_MAX: return R300_ALU_OUTA_MAX; + case OPCODE_MIN: return R300_ALU_OUTA_MIN; + case OPCODE_RCP: return R300_ALU_OUTA_RCP; + case OPCODE_RSQ: return R300_ALU_OUTA_RSQ; } - - if (code->cur_node == 0) - code->first_node_has_tex = 1; - - code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT) - | (hwdest << R300_DST_ADDR_SHIFT) - | (unit << R300_TEX_ID_SHIFT) - | (opcode << R300_TEX_INST_SHIFT); - - cs->dest_in_node |= (1 << hwdest); - if (REG_GET_TYPE(coord) != REG_TYPE_CONST) - cs->used_in_node |= (1 << hwsrc); - - code->node[code->cur_node].tex_end++; } /** - * Returns the first slot where we could possibly allow writing to dest, - * according to register allocation. + * Emit one paired ALU instruction. */ -static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs, - GLuint dest, int mask) +static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst) { - COMPILE_STATE; - int idx; - int pos; - GLuint index = REG_GET_INDEX(dest); - assert(REG_GET_VALID(dest)); + PROG_CODE; - switch (REG_GET_TYPE(dest)) { - case REG_TYPE_TEMP: - if (cs->temps[index].reg == -1) - return 0; - - idx = cs->temps[index].reg; - break; - case REG_TYPE_OUTPUT: - return 0; - default: - ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest)); - return 0; - } - - pos = cs->hwtemps[idx].reserved; - if (mask & WRITEMASK_XYZ) { - if (pos < cs->hwtemps[idx].vector_lastread) - pos = cs->hwtemps[idx].vector_lastread; - } - if (mask & WRITEMASK_W) { - if (pos < cs->hwtemps[idx].scalar_lastread) - pos = cs->hwtemps[idx].scalar_lastread; + if (code->alu.length >= PFS_MAX_ALU_INST) { + error("Too many ALU instructions"); + return GL_FALSE; } - return pos; -} - -/** - * Allocates a slot for an ALU instruction that can consist of - * a vertex part or a scalar part or both. - * - * Sources from src (src[0] to src[argc-1]) are added to the slot in the - * appropriate position (vector and/or scalar), and their positions are - * recorded in the srcpos array. - * - * This function emits instruction code for the source fetch and the - * argument selection. It does not emit instruction code for the - * opcode or the destination selection. - * - * @return the index of the slot - */ -static int find_and_prepare_slot(struct r300_pfs_compile_state *cs, - GLboolean emit_vop, - GLboolean emit_sop, - int argc, GLuint * src, GLuint dest, int mask) -{ - COMPILE_STATE; - int hwsrc[3]; - int srcpos[3]; - unsigned int used; - int tempused; - int tempvsrc[3]; - int tempssrc[3]; - int pos; - int regnr; - int i, j; + int ip = code->alu.length++; + int j; + code->node[code->cur_node].alu_end++; - // Determine instruction slots, whether sources are required on - // vector or scalar side, and the smallest slot number where - // all source registers are available - used = 0; - if (emit_vop) - used |= SLOT_OP_VECTOR; - if (emit_sop) - used |= SLOT_OP_SCALAR; + code->alu.inst[ip].inst0 = translate_rgb_opcode(inst->RGB.Opcode); + code->alu.inst[ip].inst2 = translate_alpha_opcode(inst->Alpha.Opcode); - pos = get_earliest_allowed_write(cs, dest, mask); + for(j = 0; j < 3; ++j) { + GLuint src = inst->RGB.Src[j].Index | (inst->RGB.Src[j].Constant << 5); + if (!inst->RGB.Src[j].Constant) + use_temporary(code, inst->RGB.Src[j].Index); + code->alu.inst[ip].inst1 |= src << (6*j); - if (code->node[code->cur_node].alu_offset > pos) - pos = code->node[code->cur_node].alu_offset; - for (i = 0; i < argc; ++i) { - if (!REG_GET_BUILTIN(src[i])) { - if (emit_vop) - used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i; - if (emit_sop) - used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i; - } + src = inst->Alpha.Src[j].Index | (inst->Alpha.Src[j].Constant << 5); + if (!inst->Alpha.Src[j].Constant) + use_temporary(code, inst->Alpha.Src[j].Index); + code->alu.inst[ip].inst3 |= src << (6*j); - hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */ - regnr = hwsrc[i] & 31; + GLuint arg = r300FPTranslateRGBSwizzle(inst->RGB.Arg[j].Source, inst->RGB.Arg[j].Swizzle); + arg |= inst->RGB.Arg[j].Abs << 6; + arg |= inst->RGB.Arg[j].Negate << 5; + code->alu.inst[ip].inst0 |= arg << (7*j); - if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) { - if (used & (SLOT_SRC_VECTOR << i)) { - if (cs->hwtemps[regnr].vector_valid > pos) - pos = cs->hwtemps[regnr].vector_valid; - } - if (used & (SLOT_SRC_SCALAR << i)) { - if (cs->hwtemps[regnr].scalar_valid > pos) - pos = cs->hwtemps[regnr].scalar_valid; - } - } + arg = r300FPTranslateAlphaSwizzle(inst->Alpha.Arg[j].Source, inst->Alpha.Arg[j].Swizzle); + arg |= inst->Alpha.Arg[j].Abs << 6; + arg |= inst->Alpha.Arg[j].Negate << 5; + code->alu.inst[ip].inst2 |= arg << (7*j); } - // Find a slot that fits - for (;; ++pos) { - if (cs->slot[pos].used & used & SLOT_OP_BOTH) - continue; - - if (pos >= cs->nrslots) { - if (cs->nrslots >= PFS_MAX_ALU_INST) { - ERROR("Out of ALU instruction slots\n"); - return -1; - } - - code->alu.inst[pos].inst0 = NOP_INST0; - code->alu.inst[pos].inst1 = NOP_INST1; - code->alu.inst[pos].inst2 = NOP_INST2; - code->alu.inst[pos].inst3 = NOP_INST3; - - cs->nrslots++; - } - // Note: When we need both parts (vector and scalar) of a source, - // we always try to put them into the same position. This makes the - // code easier to read, and it is optimal (i.e. one doesn't gain - // anything by splitting the parts). - // It also avoids headaches with swizzles that access both parts (i.e WXY) - tempused = cs->slot[pos].used; - for (i = 0; i < 3; ++i) { - tempvsrc[i] = cs->slot[pos].vsrc[i]; - tempssrc[i] = cs->slot[pos].ssrc[i]; - } - - for (i = 0; i < argc; ++i) { - int flags = (used >> i) & SLOT_SRC_BOTH; - - if (!flags) { - srcpos[i] = 0; - continue; - } - - for (j = 0; j < 3; ++j) { - if ((tempused >> j) & flags & SLOT_SRC_VECTOR) { - if (tempvsrc[j] != hwsrc[i]) - continue; - } - - if ((tempused >> j) & flags & SLOT_SRC_SCALAR) { - if (tempssrc[j] != hwsrc[i]) - continue; - } + if (inst->RGB.Saturate) + code->alu.inst[ip].inst0 |= R300_ALU_OUTC_CLAMP; + if (inst->Alpha.Saturate) + code->alu.inst[ip].inst2 |= R300_ALU_OUTA_CLAMP; - break; - } - - if (j == 3) - break; - - srcpos[i] = j; - tempused |= flags << j; - if (flags & SLOT_SRC_VECTOR) - tempvsrc[j] = hwsrc[i]; - if (flags & SLOT_SRC_SCALAR) - tempssrc[j] = hwsrc[i]; - } - - if (i == argc) - break; + if (inst->RGB.WriteMask) { + use_temporary(code, inst->RGB.DestIndex); + code->alu.inst[ip].inst1 |= + (inst->RGB.DestIndex << R300_ALU_DSTC_SHIFT) | + (inst->RGB.WriteMask << R300_ALU_DSTC_REG_MASK_SHIFT); } - - // Found a slot, reserve it - cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH); - for (i = 0; i < 3; ++i) { - cs->slot[pos].vsrc[i] = tempvsrc[i]; - cs->slot[pos].ssrc[i] = tempssrc[i]; + if (inst->RGB.OutputWriteMask) { + code->alu.inst[ip].inst1 |= (inst->RGB.OutputWriteMask << R300_ALU_DSTC_OUTPUT_MASK_SHIFT); + code->node[code->cur_node].flags |= R300_RGBA_OUT; } - for (i = 0; i < argc; ++i) { - if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) { - int regnr = hwsrc[i] & 31; - - if (used & (SLOT_SRC_VECTOR << i)) { - if (cs->hwtemps[regnr].vector_lastread < pos) - cs->hwtemps[regnr].vector_lastread = - pos; - } - if (used & (SLOT_SRC_SCALAR << i)) { - if (cs->hwtemps[regnr].scalar_lastread < pos) - cs->hwtemps[regnr].scalar_lastread = - pos; - } - } + if (inst->Alpha.WriteMask) { + use_temporary(code, inst->Alpha.DestIndex); + code->alu.inst[ip].inst3 |= + (inst->Alpha.DestIndex << R300_ALU_DSTA_SHIFT) | + R300_ALU_DSTA_REG; } - - // Emit the source fetch code - code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK; - code->alu.inst[pos].inst1 |= - ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) | - (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) | - (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT)); - - code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK; - code->alu.inst[pos].inst3 |= - ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) | - (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) | - (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT)); - - // Emit the argument selection code - if (emit_vop) { - int swz[3]; - - for (i = 0; i < 3; ++i) { - if (i < argc) { - swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base + - (srcpos[i] * - v_swiz[REG_GET_VSWZ(src[i])]. - stride)) | ((src[i] & REG_NEGV_MASK) - ? ARG_NEG : 0) | ((src[i] - & - REG_ABS_MASK) - ? - ARG_ABS - : 0); - } else { - swz[i] = R300_ALU_ARGC_ZERO; - } - } - - code->alu.inst[pos].inst0 &= - ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK | - R300_ALU_ARG2C_MASK); - code->alu.inst[pos].inst0 |= - (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] << - R300_ALU_ARG1C_SHIFT) - | (swz[2] << R300_ALU_ARG2C_SHIFT); + if (inst->Alpha.OutputWriteMask) { + code->alu.inst[ip].inst3 |= R300_ALU_DSTA_OUTPUT; + code->node[code->cur_node].flags |= R300_RGBA_OUT; } - - if (emit_sop) { - int swz[3]; - - for (i = 0; i < 3; ++i) { - if (i < argc) { - swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base + - (srcpos[i] * - s_swiz[REG_GET_SSWZ(src[i])]. - stride)) | ((src[i] & REG_NEGS_MASK) - ? ARG_NEG : 0) | ((src[i] - & - REG_ABS_MASK) - ? - ARG_ABS - : 0); - } else { - swz[i] = R300_ALU_ARGA_ZERO; - } - } - - code->alu.inst[pos].inst2 &= - ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK | - R300_ALU_ARG2A_MASK); - code->alu.inst[pos].inst2 |= - (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] << - R300_ALU_ARG1A_SHIFT) - | (swz[2] << R300_ALU_ARG2A_SHIFT); + if (inst->Alpha.DepthWriteMask) { + code->alu.inst[ip].inst3 |= R300_ALU_DSTA_DEPTH; + code->node[code->cur_node].flags |= R300_W_OUT; + c->fp->WritesDepth = GL_TRUE; } - return pos; + return GL_TRUE; } + /** - * Append an ALU instruction to the instruction list. + * Finish the current node without advancing to the next one. */ -static void emit_arith(struct r300_pfs_compile_state *cs, - int op, - GLuint dest, - int mask, - GLuint src0, GLuint src1, GLuint src2, int flags) +static GLboolean finish_node(struct r300_fragment_program_compiler *c) { - COMPILE_STATE; - GLuint src[3] = { src0, src1, src2 }; - int hwdest; - GLboolean emit_vop, emit_sop; - int vop, sop, argc; - int pos; - - vop = r300_fpop[op].v_op; - sop = r300_fpop[op].s_op; - argc = r300_fpop[op].argc; - - if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT && - REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) { - if (mask & WRITEMASK_Z) { - mask = WRITEMASK_W; - } else { - return; - } - } - - emit_vop = GL_FALSE; - emit_sop = GL_FALSE; - if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3) - emit_vop = GL_TRUE; - if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA) - emit_sop = GL_TRUE; - - pos = - find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest, - mask); - if (pos < 0) - return; - - hwdest = t_hw_dst(cs, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */ - - if (flags & PFS_FLAG_SAT) { - vop |= R300_ALU_OUTC_CLAMP; - sop |= R300_ALU_OUTA_CLAMP; + struct r300_fragment_program_code *code = c->code; + struct r300_fragment_program_node *node = &code->node[code->cur_node]; + + if (node->alu_end < 0) { + /* Generate a single NOP for this node */ + struct radeon_pair_instruction inst; + _mesa_bzero(&inst, sizeof(inst)); + if (!emit_alu(c, &inst)) + return GL_FALSE; } - /* Throw the pieces together and get ALU/1 */ - if (emit_vop) { - code->alu.inst[pos].inst0 |= vop; - - code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT; - - if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { - if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) { - code->alu.inst[pos].inst1 |= - (mask & WRITEMASK_XYZ) << - R300_ALU_DSTC_OUTPUT_MASK_SHIFT; - } else - assert(0); + if (node->tex_end < 0) { + if (code->cur_node == 0) { + node->tex_end = 0; } else { - code->alu.inst[pos].inst1 |= - (mask & WRITEMASK_XYZ) << - R300_ALU_DSTC_REG_MASK_SHIFT; - - cs->hwtemps[hwdest].vector_valid = pos + 1; - } - } - - /* And now ALU/3 */ - if (emit_sop) { - code->alu.inst[pos].inst2 |= sop; - - if (mask & WRITEMASK_W) { - if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) { - if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) { - code->alu.inst[pos].inst3 |= - (hwdest << R300_ALU_DSTA_SHIFT) | - R300_ALU_DSTA_OUTPUT; - } else if (REG_GET_INDEX(dest) == - FRAG_RESULT_DEPR) { - code->alu.inst[pos].inst3 |= - R300_ALU_DSTA_DEPTH; - } else - assert(0); - } else { - code->alu.inst[pos].inst3 |= - (hwdest << R300_ALU_DSTA_SHIFT) | - R300_ALU_DSTA_REG; - - cs->hwtemps[hwdest].scalar_valid = pos + 1; - } + error("Node %i has no TEX instructions", code->cur_node); + return GL_FALSE; } + } else { + if (code->cur_node == 0) + code->first_node_has_tex = 1; } - return; + return GL_TRUE; } -static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi) -{ - COMPILE_STATE; - GLuint src[3], dest; - int flags, mask = 0; - if (fpi->SaturateMode == SATURATE_ZERO_ONE) - flags = PFS_FLAG_SAT; - else - flags = 0; +/** + * Begin a block of texture instructions. + * Create the necessary indirection. + */ +static GLboolean begin_tex(void* data) +{ + PROG_CODE; - if (fpi->Opcode != OPCODE_KIL) { - dest = t_dst(cs, fpi->DstReg); - mask = fpi->DstReg.WriteMask; + if (code->cur_node == 0) { + if (code->node[0].alu_end < 0 && + code->node[0].tex_end < 0) + return GL_TRUE; } - switch (fpi->Opcode) { - case OPCODE_ADD: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - emit_arith(cs, PFS_OP_MAD, dest, mask, - src[0], pfs_one, src[1], flags); - break; - case OPCODE_CMP: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - src[2] = t_src(cs, fpi->SrcReg[2]); - /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c - * r300 - if src2.c < 0.0 ? src1.c : src0.c - */ - emit_arith(cs, PFS_OP_CMP, dest, mask, - src[2], src[1], src[0], flags); - break; - case OPCODE_DP3: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - emit_arith(cs, PFS_OP_DP3, dest, mask, - src[0], src[1], undef, flags); - break; - case OPCODE_DP4: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - emit_arith(cs, PFS_OP_DP4, dest, mask, - src[0], src[1], undef, flags); - break; - case OPCODE_EX2: - src[0] = t_scalar_src(cs, fpi->SrcReg[0]); - emit_arith(cs, PFS_OP_EX2, dest, mask, - src[0], undef, undef, flags); - break; - case OPCODE_FRC: - src[0] = t_src(cs, fpi->SrcReg[0]); - emit_arith(cs, PFS_OP_FRC, dest, mask, - src[0], undef, undef, flags); - break; - case OPCODE_KIL: - emit_tex(cs, fpi, R300_TEX_OP_KIL); - break; - case OPCODE_LG2: - src[0] = t_scalar_src(cs, fpi->SrcReg[0]); - emit_arith(cs, PFS_OP_LG2, dest, mask, - src[0], undef, undef, flags); - break; - case OPCODE_MAD: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - src[2] = t_src(cs, fpi->SrcReg[2]); - emit_arith(cs, PFS_OP_MAD, dest, mask, - src[0], src[1], src[2], flags); - break; - case OPCODE_MAX: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - emit_arith(cs, PFS_OP_MAX, dest, mask, - src[0], src[1], undef, flags); - break; - case OPCODE_MIN: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - emit_arith(cs, PFS_OP_MIN, dest, mask, - src[0], src[1], undef, flags); - break; - case OPCODE_MOV: - src[0] = t_src(cs, fpi->SrcReg[0]); - emit_arith(cs, PFS_OP_MAD, dest, mask, - src[0], pfs_one, pfs_zero, flags); - break; - case OPCODE_MUL: - src[0] = t_src(cs, fpi->SrcReg[0]); - src[1] = t_src(cs, fpi->SrcReg[1]); - emit_arith(cs, PFS_OP_MAD, dest, mask, - src[0], src[1], pfs_zero, flags); - break; - case OPCODE_RCP: - src[0] = t_scalar_src(cs, fpi->SrcReg[0]); - emit_arith(cs, PFS_OP_RCP, dest, mask, - src[0], undef, undef, flags); - break; - case OPCODE_RSQ: - src[0] = t_scalar_src(cs, fpi->SrcReg[0]); - emit_arith(cs, PFS_OP_RSQ, dest, mask, - absolute(src[0]), pfs_zero, pfs_zero, flags); - break; - case OPCODE_TEX: - emit_tex(cs, fpi, R300_TEX_OP_LD); - break; - case OPCODE_TXB: - emit_tex(cs, fpi, R300_TEX_OP_TXB); - break; - case OPCODE_TXP: - emit_tex(cs, fpi, R300_TEX_OP_TXP); - break; - default: - ERROR("unknown fpi->Opcode %d\n", fpi->Opcode); - break; + if (code->cur_node == 3) { + error("Too many texture indirections"); + return GL_FALSE; } -} - -static GLboolean parse_program(struct r300_pfs_compile_state *cs) -{ - COMPILE_STATE; - struct prog_instruction* fpi; - for(fpi = cs->compiler->program->Instructions; fpi->Opcode != OPCODE_END; ++fpi) { - emit_instruction(cs, fpi); - - if (fp->error) - return GL_FALSE; - } + if (!finish_node(c)) + return GL_FALSE; + struct r300_fragment_program_node *node = &code->node[++code->cur_node]; + node->alu_offset = code->alu.length; + node->alu_end = -1; + node->tex_offset = code->tex.length; + node->tex_end = -1; return GL_TRUE; } -/* - Init structures - * - Determine what hwregs each input corresponds to - */ -static void init_program(struct r300_pfs_compile_state *cs) +static GLboolean emit_tex(void* data, struct prog_instruction* inst) { - COMPILE_STATE; - struct gl_fragment_program *mp = &fp->mesa_program; - GLuint InputsRead = mp->Base.InputsRead; - GLuint temps_used = 0; /* for fp->temps[] */ - int i, j; - - /* New compile, reset tracking data */ - fp->optimization = - driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization"); - fp->translated = GL_FALSE; - fp->error = GL_FALSE; - fp->WritesDepth = GL_FALSE; - code->tex.length = 0; - code->cur_node = 0; - code->first_node_has_tex = 0; - code->const_nr = 0; - code->max_temp_idx = 0; - code->node[0].alu_end = -1; - code->node[0].tex_end = -1; - - for (i = 0; i < PFS_MAX_ALU_INST; i++) { - for (j = 0; j < 3; j++) { - cs->slot[i].vsrc[j] = SRC_CONST; - cs->slot[i].ssrc[j] = SRC_CONST; - } - } - - /* Work out what temps the Mesa inputs correspond to, this must match - * what setup_rs_unit does, which shouldn't be a problem as rs_unit - * configures itself based on the fragprog's InputsRead - * - * NOTE: this depends on get_hw_temp() allocating registers in order, - * starting from register 0. - */ + PROG_CODE; - /* Texcoords come first */ - for (i = 0; i < cs->compiler->r300->radeon.glCtx->Const.MaxTextureUnits; i++) { - if (InputsRead & (FRAG_BIT_TEX0 << i)) { - cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0; - cs->inputs[FRAG_ATTRIB_TEX0 + i].reg = - get_hw_temp(cs, 0); - } - } - InputsRead &= ~FRAG_BITS_TEX_ANY; - - /* fragment position treated as a texcoord */ - if (InputsRead & FRAG_BIT_WPOS) { - cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0; - cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0); + if (code->tex.length >= PFS_MAX_TEX_INST) { + error("Too many TEX instructions"); + return GL_FALSE; } - InputsRead &= ~FRAG_BIT_WPOS; - /* Then primary colour */ - if (InputsRead & FRAG_BIT_COL0) { - cs->inputs[FRAG_ATTRIB_COL0].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0); - } - InputsRead &= ~FRAG_BIT_COL0; + GLuint unit = inst->TexSrcUnit; + GLuint dest = inst->DstReg.Index; + GLuint opcode; - /* Secondary color */ - if (InputsRead & FRAG_BIT_COL1) { - cs->inputs[FRAG_ATTRIB_COL1].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0); + switch(inst->Opcode) { + case OPCODE_KIL: opcode = R300_TEX_OP_KIL; break; + case OPCODE_TEX: opcode = R300_TEX_OP_LD; break; + case OPCODE_TXB: opcode = R300_TEX_OP_TXB; break; + case OPCODE_TXP: opcode = R300_TEX_OP_TXP; break; + default: + error("Unknown texture opcode %i", inst->Opcode); + return GL_FALSE; } - InputsRead &= ~FRAG_BIT_COL1; - /* Anything else */ - if (InputsRead) { - WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead); - /* force read from hwreg 0 for now */ - for (i = 0; i < 32; i++) - if (InputsRead & (1 << i)) - cs->inputs[i].reg = 0; + if (inst->Opcode == OPCODE_KIL) { + unit = 0; + dest = 0; + } else { + use_temporary(code, dest); } - /* Pre-parse the program, grabbing refcounts on input/temp regs. - * That way, we can free up the reg when it's no longer needed - */ - for (i = 0; i < cs->compiler->program->NumInstructions; ++i) { - struct prog_instruction *fpi = cs->compiler->program->Instructions + i; - int idx; - - for (j = 0; j < 3; j++) { - idx = fpi->SrcReg[j].Index; - switch (fpi->SrcReg[j].File) { - case PROGRAM_TEMPORARY: - if (!(temps_used & (1 << idx))) { - cs->temps[idx].reg = -1; - cs->temps[idx].refcount = 1; - temps_used |= (1 << idx); - } else - cs->temps[idx].refcount++; - break; - case PROGRAM_INPUT: - cs->inputs[idx].refcount++; - break; - default: - break; - } - } + use_temporary(code, inst->SrcReg[0].Index); - idx = fpi->DstReg.Index; - if (fpi->DstReg.File == PROGRAM_TEMPORARY) { - if (!(temps_used & (1 << idx))) { - cs->temps[idx].reg = -1; - cs->temps[idx].refcount = 1; - temps_used |= (1 << idx); - } else - cs->temps[idx].refcount++; - } - } - cs->temp_in_use = temps_used; + code->node[code->cur_node].tex_end++; + code->tex.inst[code->tex.length++] = + (inst->SrcReg[0].Index << R300_SRC_ADDR_SHIFT) | + (dest << R300_DST_ADDR_SHIFT) | + (unit << R300_TEX_ID_SHIFT) | + (opcode << R300_TEX_INST_SHIFT); + return GL_TRUE; } +static const struct radeon_pair_handler pair_handler = { + .EmitConst = &emit_const, + .EmitPaired = &emit_alu, + .EmitTex = &emit_tex, + .BeginTexBlock = &begin_tex, + .MaxHwTemps = PFS_NUM_TEMP_REGS +}; + /** * Final compilation step: Turn the intermediate radeon_program into * machine-readable instructions. */ GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler) { - struct r300_pfs_compile_state cs; struct r300_fragment_program_code *code = compiler->code; - _mesa_memset(&cs, 0, sizeof(cs)); - cs.compiler = compiler; - init_program(&cs); + _mesa_bzero(code, sizeof(struct r300_fragment_program_code)); + code->node[0].alu_end = -1; + code->node[0].tex_end = -1; - if (!parse_program(&cs)) + if (!radeonPairProgram(compiler->r300->radeon.glCtx, compiler->program, &pair_handler, compiler)) return GL_FALSE; - /* Finish off */ - code->node[code->cur_node].alu_end = - cs.nrslots - code->node[code->cur_node].alu_offset - 1; - if (code->node[code->cur_node].tex_end < 0) - code->node[code->cur_node].tex_end = 0; - code->alu_offset = 0; - code->alu_end = cs.nrslots - 1; - code->tex_offset = 0; - code->tex_end = code->tex.length ? code->tex.length - 1 : 0; - assert(code->node[code->cur_node].alu_end >= 0); - assert(code->alu_end >= 0); + if (!finish_node(compiler)) + return GL_FALSE; return GL_TRUE; } diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c b/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c new file mode 100644 index 0000000000..a86d2bd471 --- /dev/null +++ b/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2008 Nicolai Haehnle. + * + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/** + * @file + * Utilities to deal with the somewhat odd restriction on R300 fragment + * program swizzles. + */ + +#include "r300_fragprog_swizzle.h" + +#include "r300_reg.h" +#include "radeon_nqssadce.h" + +#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, SWIZZLE_##y, SWIZZLE_##z, SWIZZLE_ZERO)) + +struct swizzle_data { + GLuint hash; /**< swizzle value this matches */ + GLuint base; /**< base value for hw swizzle */ + GLuint stride; /**< difference in base between arg0/1/2 */ +}; + +static const struct swizzle_data native_swizzles[] = { + {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4}, + {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4}, + {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4}, + {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4}, + {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1}, + {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1}, + {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1}, + {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1}, + {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0}, + {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0} +}; + +static const int num_native_swizzles = sizeof(native_swizzles)/sizeof(native_swizzles[0]); + + +/** + * Find a native RGB swizzle that matches the given swizzle. + * Returns 0 if none found. + */ +static const struct swizzle_data* lookup_native_swizzle(GLuint swizzle) +{ + int i, comp; + + for(i = 0; i < num_native_swizzles; ++i) { + const struct swizzle_data* sd = &native_swizzles[i]; + for(comp = 0; comp < 3; ++comp) { + GLuint swz = GET_SWZ(swizzle, comp); + if (swz == SWIZZLE_NIL) + continue; + if (swz != GET_SWZ(sd->hash, comp)) + break; + } + if (comp == 3) + return sd; + } + + return 0; +} + + +/** + * Check whether the given instruction supports the swizzle and negate + * combinations in the given source register. + */ +GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg) +{ + if (reg.Abs) + reg.NegateBase = 0; + + if (opcode == OPCODE_KIL || + opcode == OPCODE_TEX || + opcode == OPCODE_TXB || + opcode == OPCODE_TXP) { + int j; + + if (reg.Abs || reg.NegateBase != (15*reg.NegateAbs)) + return GL_FALSE; + + for(j = 0; j < 4; ++j) { + GLuint swz = GET_SWZ(reg.Swizzle, j); + if (swz == SWIZZLE_NIL) + continue; + if (swz != j) + return GL_FALSE; + } + + return GL_TRUE; + } + + GLuint relevant = 0; + int j; + + for(j = 0; j < 3; ++j) + if (GET_SWZ(reg.Swizzle, j) != SWIZZLE_NIL) + relevant |= 1 << j; + + if ((reg.NegateBase & relevant) && (reg.NegateBase & relevant) != relevant) + return GL_FALSE; + + if (!lookup_native_swizzle(reg.Swizzle)) + return GL_FALSE; + + return GL_TRUE; +} + + +/** + * Generate MOV dst, src using only native swizzles. + */ +void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src) +{ + if (src.Abs) + src.NegateBase = 0; + + while(dst.WriteMask) { + const struct swizzle_data *best_swizzle = 0; + GLuint best_matchcount = 0; + GLuint best_matchmask = 0; + GLboolean rgbnegate; + int i, comp; + + for(i = 0; i < num_native_swizzles; ++i) { + const struct swizzle_data *sd = &native_swizzles[i]; + GLuint matchcount = 0; + GLuint matchmask = 0; + for(comp = 0; comp < 3; ++comp) { + if (!GET_BIT(dst.WriteMask, comp)) + continue; + GLuint swz = GET_SWZ(src.Swizzle, comp); + if (swz == SWIZZLE_NIL) + continue; + if (swz == GET_SWZ(sd->hash, comp)) { + matchcount++; + matchmask |= 1 << comp; + } + } + if (matchcount > best_matchcount) { + best_swizzle = sd; + best_matchcount = matchcount; + best_matchmask = matchmask; + if (matchmask == (dst.WriteMask & WRITEMASK_XYZ)) + break; + } + } + + if ((src.NegateBase & best_matchmask) != 0) { + best_matchmask &= src.NegateBase; + rgbnegate = !src.NegateAbs; + } else { + rgbnegate = src.NegateAbs; + } + + struct prog_instruction *inst; + + _mesa_insert_instructions(s->Program, s->IP, 1); + inst = s->Program->Instructions + s->IP++; + inst->Opcode = OPCODE_MOV; + inst->DstReg = dst; + inst->DstReg.WriteMask &= (best_matchmask | WRITEMASK_W); + inst->SrcReg[0] = src; + /* Note: We rely on NqSSA/DCE to set unused swizzle components to NIL */ + + dst.WriteMask &= ~inst->DstReg.WriteMask; + } +} + + +/** + * Translate an RGB (XYZ) swizzle into the hardware code for the given + * instruction source. + */ +GLuint r300FPTranslateRGBSwizzle(GLuint src, GLuint swizzle) +{ + const struct swizzle_data* sd = lookup_native_swizzle(swizzle); + + if (!sd) { + _mesa_printf("Not a native swizzle: %08x\n", swizzle); + return 0; + } + + return sd->base + src*sd->stride; +} + + +/** + * Translate an Alpha (W) swizzle into the hardware code for the given + * instruction source. + */ +GLuint r300FPTranslateAlphaSwizzle(GLuint src, GLuint swizzle) +{ + if (swizzle < 3) + return swizzle + 3*src; + + switch(swizzle) { + case SWIZZLE_W: return R300_ALU_ARGA_SRC0A + src; + case SWIZZLE_ONE: return R300_ALU_ARGA_ONE; + case SWIZZLE_ZERO: return R300_ALU_ARGA_ZERO; + default: return R300_ALU_ARGA_ONE; + } +} diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.h b/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.h new file mode 100644 index 0000000000..3da99a9dbe --- /dev/null +++ b/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2008 Nicolai Haehnle. + * + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __R300_FRAGPROG_SWIZZLE_H_ +#define __R300_FRAGPROG_SWIZZLE_H_ + +#include "glheader.h" +#include "shader/prog_instruction.h" + +struct nqssadce_state; + +GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg); +void r300FPBuildSwizzle(struct nqssadce_state*, struct prog_dst_register dst, struct prog_src_register src); + +GLuint r300FPTranslateRGBSwizzle(GLuint src, GLuint swizzle); +GLuint r300FPTranslateAlphaSwizzle(GLuint src, GLuint swizzle); + +#endif /* __R300_FRAGPROG_SWIZZLE_H_ */ diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c index fbe5f66418..6931de4421 100644 --- a/src/mesa/drivers/dri/r300/r300_state.c +++ b/src/mesa/drivers/dri/r300/r300_state.c @@ -2497,26 +2497,17 @@ static void r300SetupPixelShader(r300ContextPtr rmesa) r300SetupTextures(ctx); R300_STATECHANGE(rmesa, fpi[0]); - rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu_end + 1); - for (i = 0; i <= code->alu_end; i++) { - rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0; - } - R300_STATECHANGE(rmesa, fpi[1]); - rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu_end + 1); - for (i = 0; i <= code->alu_end; i++) { - rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1; - } - R300_STATECHANGE(rmesa, fpi[2]); - rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu_end + 1); - for (i = 0; i <= code->alu_end; i++) { - rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2; - } - R300_STATECHANGE(rmesa, fpi[3]); - rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu_end + 1); - for (i = 0; i <= code->alu_end; i++) { + rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu.length); + rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu.length); + rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu.length); + rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu.length); + for (i = 0; i < code->alu.length; i++) { + rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0; + rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1; + rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2; rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst3; } @@ -2524,10 +2515,10 @@ static void r300SetupPixelShader(r300ContextPtr rmesa) rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->cur_node | (code->first_node_has_tex << 3); rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->max_temp_idx; rmesa->hw.fp.cmd[R300_FP_CNTL2] = - (code->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT) | - (code->alu_end << R300_PFS_CNTL_ALU_END_SHIFT) | - (code->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT) | - (code->tex_end << R300_PFS_CNTL_TEX_END_SHIFT); + (0 << R300_PFS_CNTL_ALU_OFFSET_SHIFT) | + ((code->alu.length-1) << R300_PFS_CNTL_ALU_END_SHIFT) | + (0 << R300_PFS_CNTL_TEX_OFFSET_SHIFT) | + ((code->tex.length ? code->tex.length-1 : 0) << R300_PFS_CNTL_TEX_END_SHIFT); /* I just want to say, the way these nodes are stored.. weird.. */ for (i = 0, k = (4 - (code->cur_node + 1)); i < 4; i++, k++) { if (i < (code->cur_node + 1)) { diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/r500_fragprog.h index ed8f7f41ca..8641ceeb8f 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.h +++ b/src/mesa/drivers/dri/r300/r500_fragprog.h @@ -45,36 +45,6 @@ #include "r300_state.h" #include "radeon_program.h" -/* supported hw opcodes */ -#define PFS_OP_MAD 0 -#define PFS_OP_DP3 1 -#define PFS_OP_DP4 2 -#define PFS_OP_MIN 3 -#define PFS_OP_MAX 4 -#define PFS_OP_CMP 5 -#define PFS_OP_FRC 6 -#define PFS_OP_EX2 7 -#define PFS_OP_LG2 8 -#define PFS_OP_RCP 9 -#define PFS_OP_RSQ 10 -#define PFS_OP_REPL_ALPHA 11 -#define PFS_OP_CMPH 12 -#define MAX_PFS_OP 12 - -#define PFS_FLAG_SAT (1 << 0) -#define PFS_FLAG_ABS (1 << 1) - -#define ARG_NEG (1 << 5) -#define ARG_ABS (1 << 6) -#define ARG_MASK (127 << 0) -#define ARG_STRIDE 7 -#define SRC_CONST (1 << 5) -#define SRC_MASK (63 << 0) -#define SRC_STRIDE 6 - -#define DRI_CONF_FP_OPTIMIZATION_SPEED 0 -#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1 - struct r500_fragment_program; extern void r500TranslateFragmentShader(r300ContextPtr r300, diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c index 85ddf1dc50..8762422801 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_pair.c +++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c @@ -578,6 +578,9 @@ static void emit_all_tex(struct pair_state *s) if (s->Debug) _mesa_printf(" BEGIN_TEX\n"); + if (s->Handler->BeginTexBlock) + s->Error = s->Error || !s->Handler->BeginTexBlock(s->UserData); + for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) { int ip = pairinst - s->Instructions; struct prog_instruction *inst = s->Program->Instructions + ip; @@ -594,9 +597,6 @@ static void emit_all_tex(struct pair_state *s) s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst); } - if (s->Handler->EndTexBlock) - s->Handler->EndTexBlock(s->UserData); - if (s->Debug) _mesa_printf(" END_TEX\n"); } diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.h b/src/mesa/drivers/dri/r300/radeon_program_pair.h index b2bdd08d27..4624a24629 100644 --- a/src/mesa/drivers/dri/r300/radeon_program_pair.h +++ b/src/mesa/drivers/dri/r300/radeon_program_pair.h @@ -110,10 +110,10 @@ struct radeon_pair_handler { GLboolean (*EmitTex)(void*, struct prog_instruction*); /** - * Called after a block of contiguous, independent texture - * instructions has been emitted. + * Called before a block of contiguous, independent texture + * instructions is emitted. */ - void (*EndTexBlock)(void*); + GLboolean (*BeginTexBlock)(void*); GLuint MaxHwTemps; }; -- cgit v1.2.3