/*
 * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * Authors:
 *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
 */

#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include "main/mtypes.h"
#include "main/imports.h"
#include "shader/prog_parameter.h"

#include "radeon_debug.h"
#include "r600_context.h"

#include "r700_assembler.h"

/* Compile-time switches, both enabled (1).  The code that consults them is
 * not visible in this part of the file. */
#define USE_CF_FOR_CONTINUE_BREAK 1
#define USE_CF_FOR_POP_AFTER      1

/*
 * Pre-built table of 12 prog_instruction records, starting with
 * OPCODE_BGNSUB and ending with OPCODE_ENDSUB.  It is referenced by
 * noise1_presub.  All fields use positional initializers, so the meaning of
 * each number depends on the layout of struct prog_instruction, which is
 * declared outside this file.
 */
struct prog_instruction noise1_insts[12] = { 
    {OPCODE_BGNSUB , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_MOV , {{0, 0, 0, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 2, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_MOV , {{8, 0, 0, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 4, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_MOV , {{8, 0, 585, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 8, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_SGT , {{0, 0, 585, 0, 0, 0}, {8, 0, 1170, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 1, 1, 0, 8, 1672, 0}, 1, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_IF , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 7, 0, 0}, 0, 0, 0, 1, 0, 0, 0, 15, 0, 0, 0}, 
    {OPCODE_MOV , {{0, 0, 1755, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 1, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_RET , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_ENDIF , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_MOV , {{0, 0, 1170, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 1, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_RET , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
    {OPCODE_ENDSUB , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}
};
/*
 * Constant data referenced by noise1_presub.  The array is declared with two
 * rows of four floats but only the first row has an initializer; the second
 * row is therefore zero-initialized.
 * NOTE(review): confirm whether a second row of constants was intended.
 */
float noise1_const[2][4] = {
    {0.300000f, 0.900000f, 0.500000f, 0.300000f}
};

/*
 * COMPILED_SUB descriptor that bundles the noise1 instruction table
 * (noise1_insts, 12 entries) with its constant data (noise1_const).
 * The remaining members are positional initializers; their meaning is given
 * by the COMPILED_SUB declaration, which is not visible in this file.
 */
COMPILED_SUB noise1_presub = {
    &(noise1_insts[0]),
    12, 
    2, 
    1, 
    0, 
    &(noise1_const[0]), 
    SWIZZLE_X, 
    SWIZZLE_X, 
    SWIZZLE_X, 
    SWIZZLE_X,
    {0,0,0},
    0 
};

/* Return the address mode of a destination operand, rebuilt from its two
 * split fields: addrmode0 supplies bit 0 and addrmode1 supplies bit 1. */
BITS addrmode_PVSDST(PVSDST * pPVSDST)
{
    BITS lowBit  = pPVSDST->addrmode0;
    BITS highBit = (BITS)pPVSDST->addrmode1 << 1;

    return lowBit | highBit;
}

/* Store a two-bit address mode into a destination operand: bit 0 goes to
 * addrmode0 and bit 1 goes to addrmode1.  Higher bits are discarded. */
void setaddrmode_PVSDST(PVSDST * pPVSDST, BITS addrmode)
{
    const BITS bit0 = addrmode & 1;
    const BITS bit1 = (addrmode >> 1) & 1;

    pPVSDST->addrmode0 = bit0;
    pPVSDST->addrmode1 = bit1;
}

/* Enable the write flag of every component (x, y, z, w) of a destination. */
void nomask_PVSDST(PVSDST * pPVSDST)
{
    pPVSDST->writew = 1;
    pPVSDST->writez = 1;
    pPVSDST->writey = 1;
    pPVSDST->writex = 1;
}

/* Return the address mode of a source operand, rebuilt from its two split
 * fields: addrmode0 supplies bit 0 and addrmode1 supplies bit 1. */
BITS addrmode_PVSSRC(PVSSRC* pPVSSRC)
{
    BITS lowBit  = pPVSSRC->addrmode0;
    BITS highBit = (BITS)pPVSSRC->addrmode1 << 1;

    return lowBit | highBit;
}

/* Store a two-bit address mode into a source operand: bit 0 goes to
 * addrmode0 and bit 1 goes to addrmode1.  Higher bits are discarded. */
void setaddrmode_PVSSRC(PVSSRC* pPVSSRC, BITS addrmode)
{
    const BITS bit0 = addrmode & 1;
    const BITS bit1 = (addrmode >> 1) & 1;

    pPVSSRC->addrmode0 = bit0;
    pPVSSRC->addrmode1 = bit1;
}


/* Broadcast one swizzle selector to all four components of a source.
 * The w field is written first and the others copy from it, which is the
 * order a chained assignment evaluates in. */
void setswizzle_PVSSRC(PVSSRC* pPVSSRC, BITS swz)
{
    pPVSSRC->swizzlew = swz;
    pPVSSRC->swizzlez = pPVSSRC->swizzlew;
    pPVSSRC->swizzley = pPVSSRC->swizzlez;
    pPVSSRC->swizzlex = pPVSSRC->swizzley;
}

/* Reset a source operand to the identity swizzle: each component selects
 * itself (x<-X, y<-Y, z<-Z, w<-W). */
void noswizzle_PVSSRC(PVSSRC* pPVSSRC)
{
    PVSSRC *src = pPVSSRC;

    src->swizzlex = SQ_SEL_X;
    src->swizzley = SQ_SEL_Y;
    src->swizzlez = SQ_SEL_Z;
    src->swizzlew = SQ_SEL_W;
}

/* Map one selector through the swizzle currently stored in *src.
 * SQ_SEL_X/Y/Z/W are replaced by the matching stored swizzle field; any
 * other selector value is returned unchanged. */
static BITS remap_selector_PVSSRC(const PVSSRC *src, BITS sel)
{
    if (sel == SQ_SEL_X)
    {
        return src->swizzlex;
    }
    if (sel == SQ_SEL_Y)
    {
        return src->swizzley;
    }
    if (sel == SQ_SEL_Z)
    {
        return src->swizzlez;
    }
    if (sel == SQ_SEL_W)
    {
        return src->swizzlew;
    }
    return sel;
}

/* Apply a second swizzle (x, y, z, w) on top of the swizzle already stored
 * in the source operand.  All four new selectors are resolved against the
 * old swizzle before any field is overwritten. */
void
swizzleagain_PVSSRC(PVSSRC * pPVSSRC, BITS x, BITS y, BITS z, BITS w)
{
    const BITS newX = remap_selector_PVSSRC(pPVSSRC, x);
    const BITS newY = remap_selector_PVSSRC(pPVSSRC, y);
    const BITS newZ = remap_selector_PVSSRC(pPVSSRC, z);
    const BITS newW = remap_selector_PVSSRC(pPVSSRC, w);

    pPVSSRC->swizzlex = newX;
    pPVSSRC->swizzley = newY;
    pPVSSRC->swizzlez = newZ;
    pPVSSRC->swizzlew = newW;
}

/* Set the negate flag on all four components of a source operand. */
void neg_PVSSRC(PVSSRC* pPVSSRC)
{
    PVSSRC *src = pPVSSRC;

    src->negx = 1;
    src->negy = 1;
    src->negz = 1;
    src->negw = 1;
}

/* Clear the negate flag on all four components of a source operand. */
void noneg_PVSSRC(PVSSRC* pPVSSRC)
{
    PVSSRC *src = pPVSSRC;

    src->negx = 0;
    src->negy = 0;
    src->negz = 0;
    src->negw = 0;
}

/* Invert the negate flag of every component of a source operand.  Used to
 * negate an argument, e.g. to express SUB through ADD and similar cases. */
void flipneg_PVSSRC(PVSSRC* pPVSSRC)
{
    pPVSSRC->negx = pPVSSRC->negx ? 0 : 1;
    pPVSSRC->negy = pPVSSRC->negy ? 0 : 1;
    pPVSSRC->negz = pPVSSRC->negz ? 0 : 1;
    pPVSSRC->negw = pPVSSRC->negw ? 0 : 1;
}

/* Force component c (0 = x, 1 = y, 2 = z, 3 = w) of a source operand to
 * select SQ_SEL_0 and clear its negate flag.  Any other value of c leaves
 * the operand untouched. */
void zerocomp_PVSSRC(PVSSRC* pPVSSRC, int c)
{
    if (c == 0)
    {
        pPVSSRC->swizzlex = SQ_SEL_0;
        pPVSSRC->negx = 0;
    }
    else if (c == 1)
    {
        pPVSSRC->swizzley = SQ_SEL_0;
        pPVSSRC->negy = 0;
    }
    else if (c == 2)
    {
        pPVSSRC->swizzlez = SQ_SEL_0;
        pPVSSRC->negz = 0;
    }
    else if (c == 3)
    {
        pPVSSRC->swizzlew = SQ_SEL_0;
        pPVSSRC->negw = 0;
    }
}

/* Force component c (0 = x, 1 = y, 2 = z, 3 = w) of a source operand to
 * select SQ_SEL_1 and clear its negate flag.  Any other value of c leaves
 * the operand untouched. */
void onecomp_PVSSRC(PVSSRC* pPVSSRC, int c)
{
    if (c == 0)
    {
        pPVSSRC->swizzlex = SQ_SEL_1;
        pPVSSRC->negx = 0;
    }
    else if (c == 1)
    {
        pPVSSRC->swizzley = SQ_SEL_1;
        pPVSSRC->negy = 0;
    }
    else if (c == 2)
    {
        pPVSSRC->swizzlez = SQ_SEL_1;
        pPVSSRC->negz = 0;
    }
    else if (c == 3)
    {
        pPVSSRC->swizzlew = SQ_SEL_1;
        pPVSSRC->negw = 0;
    }
}

/* Return the bitwise OR of the point_size, edge_flag, rta_index, kill_flag
 * and viewport_index fields of the vertex output format; the result is
 * non-zero when any of them is set. */
BITS is_misc_component_exported(VAP_OUT_VTX_FMT_0* pOutVTXFmt0)
{
    BITS anySet = pOutVTXFmt0->point_size;

    anySet |= pOutVTXFmt0->edge_flag;
    anySet |= pOutVTXFmt0->rta_index;
    anySet |= pOutVTXFmt0->kill_flag;
    anySet |= pOutVTXFmt0->viewport_index;

    return anySet;
}

/* Return the bitwise OR of the depth, stencil_ref, mask and
 * coverage_to_mask fields of the fragment output format; the result is
 * non-zero when any of them is set. */
BITS is_depth_component_exported(OUT_FRAGMENT_FMT_0* pFPOutFmt)
{
    BITS anySet = pFPOutFmt->depth;

    anySet |= pFPOutFmt->stencil_ref;
    anySet |= pFPOutFmt->mask;
    anySet |= pFPOutFmt->coverage_to_mask;

    return anySet;
}

/* Return GL_TRUE when the destination word describes a non-op3 instruction
 * whose opcode is DOT4, DOT4_IEEE or CUBE; GL_FALSE otherwise. */
GLboolean is_reduction_opcode(PVSDWORD* dest)
{
    if (dest->dst.op3 != 0)
    {
        return GL_FALSE;
    }

    switch (dest->dst.opcode)
    {
    case SQ_OP2_INST_DOT4:
    case SQ_OP2_INST_DOT4_IEEE:
    case SQ_OP2_INST_CUBE:
        return GL_TRUE;
    default:
        return GL_FALSE;
    }
}

/*
 * Translate a GL element type and channel count into a surface format code.
 *
 * eType         GL data type of one channel (GL_BYTE ... GL_DOUBLE).
 * nChannels     number of channels; values 1..4 select a format.
 * pClient_size  optional out-parameter; when non-NULL it receives
 *               (bytes per channel) * nChannels.  Bytes per channel is 0 for
 *               an unrecognized eType.
 *
 * Returns the matching FMT_* code, or FMT_INVALID when eType is not handled
 * or nChannels is outside 1..4.
 *
 * NOTE(review): GL_DOUBLE reports 8 bytes per channel but maps to the same
 * 32-bit float formats as GL_FLOAT -- confirm this is intended.
 */
GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size)
{
    /* Format tables indexed by (nChannels - 1). */
    static const GLuint fmt8[4]  = { FMT_8, FMT_8_8, FMT_8_8_8, FMT_8_8_8_8 };
    static const GLuint fmt16[4] = { FMT_16, FMT_16_16, FMT_16_16_16, FMT_16_16_16_16 };
    static const GLuint fmt32[4] = { FMT_32, FMT_32_32, FMT_32_32_32, FMT_32_32_32_32 };
    static const GLuint fmtFloat[4] = { FMT_32_FLOAT, FMT_32_32_FLOAT,
                                        FMT_32_32_32_FLOAT, FMT_32_32_32_32_FLOAT };

    const GLuint *pTable    = NULL;
    GLuint        elemBytes = 0;
    GLuint        result    = FMT_INVALID;

    switch (eType)
    {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            elemBytes = 1;
            pTable    = fmt8;
            break;

        case GL_UNSIGNED_SHORT:
        case GL_SHORT:
            elemBytes = 2;
            pTable    = fmt16;
            break;

        case GL_UNSIGNED_INT:
        case GL_INT:
            elemBytes = 4;
            pTable    = fmt32;
            break;

        case GL_FLOAT:
            elemBytes = 4;
            pTable    = fmtFloat;
            break;

        case GL_DOUBLE:
            elemBytes = 8;
            pTable    = fmtFloat;
            break;

        default:
            /* Unhandled type: format stays FMT_INVALID, element size 0. */
            break;
    }

    if (pTable != NULL && nChannels >= 1 && nChannels <= 4)
    {
        result = pTable[nChannels - 1];
    }

    if (pClient_size != NULL)
    {
        *pClient_size = elemBytes * nChannels;
    }

    return result;
}

/*
 * Return how many source operands an ALU instruction takes.
 *
 * Any op3 instruction (nIsOp3 non-zero) takes three.  For op2 instructions
 * the count is looked up by opcode; an opcode that is not listed is reported
 * through radeon_error() and treated as taking three operands.
 */
unsigned int r700GetNumOperands(GLuint opcode, GLuint nIsOp3)
{
    unsigned int nOperands = 3;

    if (nIsOp3 == 0)
    {
        switch (opcode)
        {
        /* Two-operand instructions (the *_DX10 MAX/MIN variants are not listed). */
        case SQ_OP2_INST_ADD:
        case SQ_OP2_INST_KILLE:
        case SQ_OP2_INST_KILLGT:
        case SQ_OP2_INST_KILLGE:
        case SQ_OP2_INST_KILLNE:
        case SQ_OP2_INST_MUL:
        case SQ_OP2_INST_MAX:
        case SQ_OP2_INST_MIN:
        case SQ_OP2_INST_SETE:
        case SQ_OP2_INST_SETNE:
        case SQ_OP2_INST_SETGT:
        case SQ_OP2_INST_SETGE:
        case SQ_OP2_INST_PRED_SETE:
        case SQ_OP2_INST_PRED_SETGT:
        case SQ_OP2_INST_PRED_SETGE:
        case SQ_OP2_INST_PRED_SETNE:
        case SQ_OP2_INST_DOT4:
        case SQ_OP2_INST_DOT4_IEEE:
        case SQ_OP2_INST_CUBE:
            nOperands = 2;
            break;

        /* One-operand instructions. */
        case SQ_OP2_INST_MOV:
        case SQ_OP2_INST_MOVA_FLOOR:
        case SQ_OP2_INST_FRACT:
        case SQ_OP2_INST_FLOOR:
        case SQ_OP2_INST_TRUNC:
        case SQ_OP2_INST_EXP_IEEE:
        case SQ_OP2_INST_LOG_CLAMPED:
        case SQ_OP2_INST_LOG_IEEE:
        case SQ_OP2_INST_RECIP_IEEE:
        case SQ_OP2_INST_RECIPSQRT_IEEE:
        case SQ_OP2_INST_FLT_TO_INT:
        case SQ_OP2_INST_SIN:
        case SQ_OP2_INST_COS:
            nOperands = 1;
            break;

        default:
            radeon_error("Need instruction operand number for %x.\n", opcode);
            break;
        }
    }

    return nOperands;
}

/*
 * Put an assembler object into its initial state and bind it to a shader.
 *
 * spt      shader pipe type stored in pAsm->currentShaderType.
 * pAsm     assembler state to initialize.
 * pShader  shader object; it is initialized with Init_R700_Shader() and its
 *          lstCFInstructions list becomes the active CF list.
 *
 * Always returns 0.
 */
int Init_r700_AssemblerBase(SHADER_PIPE_TYPE spt, r700_AssemblerBase* pAsm, R700_Shader* pShader)
{
    GLuint idx;

    /* Bind the shader first; later fields are derived from it. */
    Init_R700_Shader(pShader);
    pAsm->pR700Shader       = pShader;
    pAsm->currentShaderType = spt;

    /* Control-flow clause tracking: no clause has been opened yet. */
    pAsm->cf_last_export_ptr           = NULL;
    pAsm->cf_current_export_clause_ptr = NULL;
    pAsm->cf_current_alu_clause_ptr    = NULL;
    pAsm->cf_current_tex_clause_ptr    = NULL;
    pAsm->cf_current_vtx_clause_ptr    = NULL;
    pAsm->cf_current_cf_clause_ptr     = NULL;
    pAsm->cf_current_clause_type       = CF_EMPTY_CLAUSE;

    /* Export counters. */
    pAsm->number_of_colorandz_exports = 0;
    pAsm->number_of_exports           = 0;
    pAsm->number_of_export_opcodes    = 0;

    pAsm->alu_x_opcode = 0;

    /* Scratch destination / source operand words. */
    pAsm->D2.bits = 0;
    pAsm->D.bits  = 0;
    for (idx = 0; idx < 3; idx++)
    {
        pAsm->S[idx].bits = 0;
    }

    pAsm->uLastPosUpdate = 0;

    /* Clear the fragment output format through a BITS-sized store. */
    *(BITS *) &pAsm->fp_stOutFmt0 = 0;

    pAsm->uIIns                 = 0;
    pAsm->uOIns                 = 0;
    pAsm->number_used_registers = 0;
    pAsm->uUsedConsts           = 256;

    /* Fragment programs */
    pAsm->uBoolConsts = 0;
    pAsm->uIntConsts  = 0;
    pAsm->uInsts      = 0;
    pAsm->uConsts     = 0;

    /* Flow-control stack starts empty. */
    pAsm->FCSP             = 0;
    pAsm->fc_stack[0].type = FC_NONE;

    /* No argument substitutions are pending. */
    for (idx = 0; idx < 4; idx++)
    {
        pAsm->aArgSubst[idx] = (-1);
    }

    pAsm->uOutputs = 0;

    /* Export register numbers: -1 marks "not assigned". */
    for (idx = 0; idx < NUMBER_OF_OUTPUT_COLORS; idx++)
    {
        pAsm->color_export_register_number[idx] = (-1);
    }
    pAsm->depth_export_register_number            = (-1);
    pAsm->stencil_export_register_number          = (-1);
    pAsm->coverage_to_mask_export_register_number = (-1);
    pAsm->mask_export_register_number             = (-1);

    pAsm->starting_export_register_number = 0;
    pAsm->starting_vfetch_register_number = 0;
    pAsm->starting_temp_register_number   = 0;
    pAsm->uFirstHelpReg                   = 0;

    /* Input usage flags. */
    pAsm->input_position_is_used = GL_FALSE;
    pAsm->input_normal_is_used   = GL_FALSE;
    for (idx = 0; idx < NUMBER_OF_INPUT_COLORS; idx++)
    {
        pAsm->input_color_is_used[idx] = GL_FALSE;
    }
    for (idx = 0; idx < NUMBER_OF_TEXTURE_UNITS; idx++)
    {
        pAsm->input_texture_unit_is_used[idx] = GL_FALSE;
    }

    /* No vertex fetch has been assembled for any attribute. */
    for (idx = 0; idx < VERT_ATTRIB_MAX; idx++)
    {
        pAsm->vfetch_instruction_ptr_array[idx] = NULL;
    }

    pAsm->number_of_inputs = 0;

    pAsm->is_tex           = GL_FALSE;
    pAsm->need_tex_barrier = GL_FALSE;

    /* Subroutine and caller bookkeeping. */
    pAsm->subs                 = NULL;
    pAsm->unSubArraySize       = 0;
    pAsm->unSubArrayPointer    = 0;
    pAsm->callers              = NULL;
    pAsm->unCallerArraySize    = 0;
    pAsm->unCallerArrayPointer = 0;

    /* Call stack entry 0 uses the shader's own CF instruction list. */
    pAsm->CALLSP                                = 0;
    pAsm->CALLSTACK[0].FCSP_BeforeEntry         = 0;
    pAsm->CALLSTACK[0].plstCFInstructions_local = &(pAsm->pR700Shader->lstCFInstructions);
    pAsm->CALLSTACK[0].max                      = 0;
    pAsm->CALLSTACK[0].current                  = 0;

    SetActiveCFlist(pAsm->pR700Shader, pAsm->CALLSTACK[0].plstCFInstructions_local);

    pAsm->unCFflags = 0;

    /* Pre-compiled subroutine bookkeeping. */
    pAsm->presubs           = NULL;
    pAsm->unPresubArraySize = 0;
    pAsm->unNumPresub       = 0;
    pAsm->unCurNumILInsts   = 0;

    pAsm->unVetTexBits = 0;

    return 0;
}

/* Return GL_TRUE for the opcodes this assembler treats as texture
 * instructions (TEX, TXP, TXB, DDX, DDY); GL_FALSE for everything else. */
GLboolean IsTex(gl_inst_opcode Opcode)
{
    switch (Opcode)
    {
    case OPCODE_TEX:
    case OPCODE_TXP:
    case OPCODE_TXB:
    case OPCODE_DDX:
    case OPCODE_DDY:
        return GL_TRUE;
    default:
        return GL_FALSE;
    }
}

/* Return GL_TRUE for every opcode that IsTex() does not classify as a
 * texture instruction.
 * TODO : more for fc and ex for higher spec. */
GLboolean IsAlu(gl_inst_opcode Opcode)
{
    return IsTex(Opcode) ? GL_FALSE : GL_TRUE;
}

/*
 * Make sure the assembler's current clause type is new_clause_type.
 *
 * When the type already matches, nothing happens.  Otherwise the pointer to
 * the currently open clause is cleared and the clause type is switched.
 * For CF_EXPORT_CLAUSE a new R700ControlFlowSXClause is also allocated,
 * initialized and appended to the shader's CF instructions.
 *
 * Returns GL_TRUE on success.  Returns GL_FALSE when either clause type is
 * not a known CF_CLAUSE_TYPE or when the export clause cannot be allocated;
 * in those failure cases that occur after the old clause was closed the
 * current clause type is left as CF_EMPTY_CLAUSE.
 */
int check_current_clause(r700_AssemblerBase* pAsm,
                         CF_CLAUSE_TYPE      new_clause_type)
{
    if (pAsm->cf_current_clause_type == new_clause_type)
    {
        return GL_TRUE;
    }

    /* Close the clause that is currently open. */
    switch (pAsm->cf_current_clause_type)
    {
    case CF_ALU_CLAUSE:
        pAsm->cf_current_alu_clause_ptr = NULL;
        break;
    case CF_VTX_CLAUSE:
        pAsm->cf_current_vtx_clause_ptr = NULL;
        break;
    case CF_TEX_CLAUSE:
        pAsm->cf_current_tex_clause_ptr = NULL;
        break;
    case CF_EXPORT_CLAUSE:
        pAsm->cf_current_export_clause_ptr = NULL;
        break;
    case CF_OTHER_CLAUSE:
        pAsm->cf_current_cf_clause_ptr = NULL;
        break;
    case CF_EMPTY_CLAUSE:
        break;
    default:
        /* The offending value here is the *current* type, so report that
         * one rather than the requested type. */
        radeon_error(
                   "Unknown CF_CLAUSE_TYPE (%d) in check_current_clause. \n",
                   (int) pAsm->cf_current_clause_type);
        return GL_FALSE;
    }

    pAsm->cf_current_clause_type = CF_EMPTY_CLAUSE;

    /* Open the requested clause. */
    switch (new_clause_type)
    {
    case CF_ALU_CLAUSE:
        pAsm->cf_current_clause_type = CF_ALU_CLAUSE;
        break;
    case CF_VTX_CLAUSE:
        pAsm->cf_current_clause_type = CF_VTX_CLAUSE;
        break;
    case CF_TEX_CLAUSE:
        pAsm->cf_current_clause_type = CF_TEX_CLAUSE;
        break;
    case CF_EXPORT_CLAUSE:
        {
            R700ControlFlowSXClause* pR700ControlFlowSXClause
                        = (R700ControlFlowSXClause*) CALLOC_STRUCT(R700ControlFlowSXClause);

            if (pR700ControlFlowSXClause == NULL)
            {
                radeon_error(
                           "Error allocating new EXPORT CF instruction in check_current_clause. \n");
                return GL_FALSE;
            }

            /* Add the new export instruction to the control flow program. */
            pAsm->cf_current_export_clause_ptr = pR700ControlFlowSXClause;
            Init_R700ControlFlowSXClause(pR700ControlFlowSXClause);
            AddCFInstruction( pAsm->pR700Shader,
                              (R700ControlFlowInstruction *)pR700ControlFlowSXClause );

            pAsm->cf_current_clause_type = CF_EXPORT_CLAUSE;
        }
        break;
    case CF_EMPTY_CLAUSE:
        break;
    case CF_OTHER_CLAUSE:
        pAsm->cf_current_clause_type = CF_OTHER_CLAUSE;
        break;
    default:
        radeon_error(
                   "Unknown CF_CLAUSE_TYPE (%d) in check_current_clause. \n", (int) new_clause_type);
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Append a new generic control-flow instruction to the shader.
 *
 * Switches the current clause type to CF_OTHER_CLAUSE, allocates an
 * R700ControlFlowGenericClause, initializes it, stores it in
 * pAsm->cf_current_cf_clause_ptr and adds it to the shader's CF
 * instructions.
 *
 * Returns GL_TRUE on success, GL_FALSE when the clause switch or the
 * allocation fails (cf_current_cf_clause_ptr is NULL after an allocation
 * failure).
 */
GLboolean add_cf_instruction(r700_AssemblerBase* pAsm)
{
    if(GL_FALSE == check_current_clause(pAsm, CF_OTHER_CLAUSE))
    {
        return GL_FALSE;
    }

    pAsm->cf_current_cf_clause_ptr =
      (R700ControlFlowGenericClause*) CALLOC_STRUCT(R700ControlFlowGenericClause);

    if (pAsm->cf_current_cf_clause_ptr == NULL)
    {
        /* This is a generic CF clause, not a vertex-fetch clause, so the
         * message must not mention VFetch. */
        radeon_error("Could not allocate a new CF instruction.\n");
        return GL_FALSE;
    }

    Init_R700ControlFlowGenericClause(pAsm->cf_current_cf_clause_ptr);
    AddCFInstruction( pAsm->pR700Shader,
                      (R700ControlFlowInstruction *)pAsm->cf_current_cf_clause_ptr );

    return GL_TRUE;
}

/*
 * Append a vertex-fetch instruction to the shader, opening a new VTX
 * control-flow clause when needed.
 *
 * A new clause is started when no VTX clause is open or when the open
 * clause's count field has reached GetCFMaxInstructions() - 1.  A new clause
 * starts with count 0 and is linked to this instruction; otherwise the open
 * clause's count is incremented.
 *
 * Returns GL_TRUE on success, GL_FALSE when the clause switch or the clause
 * allocation fails.  On failure the instruction has not been added anywhere.
 */
GLboolean add_vfetch_instruction(r700_AssemblerBase*     pAsm,
                                 R700VertexInstruction*  vertex_instruction_ptr)
{
    R700ControlFlowGenericClause *pClause;
    GLboolean                     bNeedNewClause = GL_TRUE;

    if (check_current_clause(pAsm, CF_VTX_CLAUSE) == GL_FALSE)
    {
        return GL_FALSE;
    }

    pClause = pAsm->cf_current_vtx_clause_ptr;
    if (pClause != NULL &&
        pClause->m_Word1.f.count < GetCFMaxInstructions(pClause->m_ShaderInstType)-1)
    {
        /* The open clause still has room. */
        bNeedNewClause = GL_FALSE;
    }

    if (bNeedNewClause)
    {
        /* Create a new VFetch control flow instruction for this new clause. */
        pClause = (R700ControlFlowGenericClause*) CALLOC_STRUCT(R700ControlFlowGenericClause);
        pAsm->cf_current_vtx_clause_ptr = pClause;

        if (pClause == NULL)
        {
            radeon_error("Could not allocate a new VFetch CF instruction.\n");
            return GL_FALSE;
        }

        Init_R700ControlFlowGenericClause(pClause);
        AddCFInstruction( pAsm->pR700Shader,
                          (R700ControlFlowInstruction *)pClause );

        pClause->m_Word1.f.pop_count        = 0x0;
        pClause->m_Word1.f.cf_const         = 0x0;
        pClause->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
        pClause->m_Word1.f.count            = 0x0;
        pClause->m_Word1.f.end_of_program   = 0x0;
        pClause->m_Word1.f.valid_pixel_mode = 0x0;
        pClause->m_Word1.f.cf_inst          = SQ_CF_INST_VTX;
        pClause->m_Word1.f.whole_quad_mode  = 0x0;
        pClause->m_Word1.f.barrier          = 0x1;

        LinkVertexInstruction(pClause, vertex_instruction_ptr );
    }
    else
    {
        pClause->m_Word1.f.count++;
    }

    AddVTXInstruction(pAsm->pR700Shader, vertex_instruction_ptr);

    return GL_TRUE;
}

/*
 * Append a texture instruction to the shader, opening a new TEX
 * control-flow clause when needed.
 *
 * A new clause is started when no TEX clause is open or when the open
 * clause's count field has reached GetCFMaxInstructions() - 1; otherwise the
 * open clause's count is incremented.  The clause's barrier bit is set when
 * the current IL instruction has a destination dependency (nDstDep > -1) or
 * when pAsm->need_tex_barrier is GL_TRUE.  The first texture instruction of
 * a clause is linked to the clause and vice versa.
 *
 * Returns GL_TRUE on success, GL_FALSE when the clause switch or the clause
 * allocation fails.
 */
GLboolean add_tex_instruction(r700_AssemblerBase*     pAsm,
                              R700TextureInstruction* tex_instruction_ptr)
{
    R700ControlFlowGenericClause *pClause;
    GLboolean                     bNeedNewClause = GL_TRUE;

    if (check_current_clause(pAsm, CF_TEX_CLAUSE) == GL_FALSE)
    {
        return GL_FALSE;
    }

    pClause = pAsm->cf_current_tex_clause_ptr;
    if (pClause != NULL &&
        pClause->m_Word1.f.count < GetCFMaxInstructions(pClause->m_ShaderInstType)-1)
    {
        /* The open clause still has room. */
        bNeedNewClause = GL_FALSE;
    }

    if (bNeedNewClause)
    {
        /* New TEX control flow instruction for this new clause. */
        pClause = (R700ControlFlowGenericClause*) CALLOC_STRUCT(R700ControlFlowGenericClause);
        pAsm->cf_current_tex_clause_ptr = pClause;

        if (pClause == NULL)
        {
            radeon_error("Could not allocate a new TEX CF instruction.\n");
            return GL_FALSE;
        }

        Init_R700ControlFlowGenericClause(pClause);
        AddCFInstruction( pAsm->pR700Shader,
                          (R700ControlFlowInstruction *)pClause );

        /* The count field is not written here; it keeps whatever value the
         * allocation and init call left in it. */
        pClause->m_Word1.f.pop_count        = 0x0;
        pClause->m_Word1.f.cf_const         = 0x0;
        pClause->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
        pClause->m_Word1.f.end_of_program   = 0x0;
        pClause->m_Word1.f.valid_pixel_mode = 0x0;
        pClause->m_Word1.f.cf_inst          = SQ_CF_INST_TEX;
        pClause->m_Word1.f.whole_quad_mode  = 0x0;
        pClause->m_Word1.f.barrier          = 0x0;
    }
    else
    {
        pClause->m_Word1.f.count++;
    }

    /* A TEX instruction that depends on a previous instruction needs the
     * clause's barrier bit set. */
    if (pAsm->pInstDeps[pAsm->uiCurInst].nDstDep > (-1) || pAsm->need_tex_barrier == GL_TRUE)
    {
        pClause->m_Word1.f.barrier = 0x1;
    }

    /* Link only the first texture instruction of the clause. */
    if (pClause->m_pLinkedTEXInstruction == NULL)
    {
        pClause->m_pLinkedTEXInstruction = tex_instruction_ptr;
        tex_instruction_ptr->m_pLinkedGenericClause = pClause;
    }

    AddTEXInstruction(pAsm->pR700Shader, tex_instruction_ptr);

    return GL_TRUE;
}

/*
 * Build (or rebuild) the vertex-fetch instruction for one client array.
 *
 * pAsm                  assembler state.
 * gl_client_id          index into pAsm->vfetch_instruction_ptr_array; also
 *                       written to the instruction's buffer_id field.
 * destination_register  GPR that receives the fetched data.
 * number_of_elements    channel count; missing x/y/z channels read as 0 and
 *                       a missing w channel reads as 1.
 * dataElementType       GL type of one channel, passed to GetSurfaceFormat().
 * pFetchMethod          fetch configuration; when bEnableMini is not GL_TRUE
 *                       its mega_fetch_remainder field is updated.
 *
 * If an instruction already exists for gl_client_id it is updated in place.
 * Otherwise a new one is allocated, added to the shader through
 * add_vfetch_instruction() and recorded in the array.
 *
 * Returns GL_TRUE on success, GL_FALSE on allocation failure or when the
 * instruction cannot be added.
 */
GLboolean assemble_vfetch_instruction(r700_AssemblerBase* pAsm,
                                GLuint gl_client_id,
                                GLuint destination_register,
                                GLuint number_of_elements,
                                GLenum dataElementType,
                                VTX_FETCH_METHOD* pFetchMethod)
{
    GLuint client_size_inbyte;
    GLuint data_format;
    GLuint mega_fetch_count;
    GLuint is_mega_fetch_flag;

    R700VertexGenericFetch*   vfetch_instruction_ptr;
    R700VertexGenericFetch*   assembled_vfetch_instruction_ptr = pAsm->vfetch_instruction_ptr_array[ gl_client_id ];

    if (assembled_vfetch_instruction_ptr == NULL)
    {
        vfetch_instruction_ptr = (R700VertexGenericFetch*) CALLOC_STRUCT(R700VertexGenericFetch);
        if (vfetch_instruction_ptr == NULL)
        {
            return GL_FALSE;
        }
        Init_R700VertexGenericFetch(vfetch_instruction_ptr);
    }
    else
    {
        vfetch_instruction_ptr = assembled_vfetch_instruction_ptr;
    }

    /* Only the client size is consumed below.
     * NOTE(review): unlike assemble_vfetch_instruction2, data_format is never
     * written into the instruction here -- confirm that is intended. */
    data_format = GetSurfaceFormat(dataElementType, number_of_elements, &client_size_inbyte);
    (void) data_format;

    if(GL_TRUE == pFetchMethod->bEnableMini) //More conditions here
    {
        //TODO : mini fetch
        mega_fetch_count = 0;
        is_mega_fetch_flag = 0;
    }
    else
    {
        mega_fetch_count = MEGA_FETCH_BYTES - 1;
        is_mega_fetch_flag       = 0x1;
        pFetchMethod->mega_fetch_remainder = MEGA_FETCH_BYTES - client_size_inbyte;
    }

    vfetch_instruction_ptr->m_Word0.f.vtx_inst         = SQ_VTX_INST_FETCH;
    vfetch_instruction_ptr->m_Word0.f.fetch_type       = SQ_VTX_FETCH_VERTEX_DATA;
    vfetch_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;

    vfetch_instruction_ptr->m_Word0.f.buffer_id        = gl_client_id;
    vfetch_instruction_ptr->m_Word0.f.src_gpr          = 0x0;
    vfetch_instruction_ptr->m_Word0.f.src_rel          = SQ_ABSOLUTE;
    vfetch_instruction_ptr->m_Word0.f.src_sel_x        = SQ_SEL_X;
    vfetch_instruction_ptr->m_Word0.f.mega_fetch_count = mega_fetch_count;

    /* Channels the array does not supply default to (0, 0, 0, 1). */
    vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (number_of_elements < 1) ? SQ_SEL_0 : SQ_SEL_X;
    vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (number_of_elements < 2) ? SQ_SEL_0 : SQ_SEL_Y;
    vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (number_of_elements < 3) ? SQ_SEL_0 : SQ_SEL_Z;
    vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (number_of_elements < 4) ? SQ_SEL_1 : SQ_SEL_W;

    vfetch_instruction_ptr->m_Word1.f.use_const_fields = 1;

    // Destination register
    vfetch_instruction_ptr->m_Word1_GPR.f.dst_gpr = destination_register;
    vfetch_instruction_ptr->m_Word1_GPR.f.dst_rel = SQ_ABSOLUTE;

    vfetch_instruction_ptr->m_Word2.f.offset              = 0;
    vfetch_instruction_ptr->m_Word2.f.const_buf_no_stride = 0x0;

    vfetch_instruction_ptr->m_Word2.f.mega_fetch          = is_mega_fetch_flag;

    if (assembled_vfetch_instruction_ptr == NULL)
    {
        if ( GL_FALSE == add_vfetch_instruction(pAsm, (R700VertexInstruction *)vfetch_instruction_ptr) )
        {
            /* add_vfetch_instruction() fails before it links or stores the
             * instruction, so this function still owns the allocation and
             * must release it to avoid a leak. */
            FREE(vfetch_instruction_ptr);
            return GL_FALSE;
        }

        if (pAsm->vfetch_instruction_ptr_array[ gl_client_id ] != NULL)
        {
            /* The instruction now belongs to the shader; do not free it. */
            return GL_FALSE;
        }
        else
        {
            pAsm->vfetch_instruction_ptr_array[ gl_client_id ] = vfetch_instruction_ptr;
        }
    }

    return GL_TRUE;
}

/*
 * Build (or rebuild) the vertex-fetch instruction for one vertex element,
 * including its data format, signedness and normalization.
 *
 * pAsm                  assembler state.
 * destination_register  GPR that receives the fetched data.
 * type                  GL type of one channel, passed to GetSurfaceFormat().
 * size                  channel count; missing x/y/z channels read as 0 and
 *                       a missing w channel reads as 1.
 * element               index into pAsm->vfetch_instruction_ptr_array; also
 *                       written to the instruction's buffer_id field.
 * _signed               1 selects SQ_FORMAT_COMP_SIGNED, anything else
 *                       SQ_FORMAT_COMP_UNSIGNED.
 * normalize             GL_TRUE selects SQ_NUM_FORMAT_NORM, otherwise
 *                       SQ_NUM_FORMAT_INT.
 * format                GL_BGRA swaps the x and z destination selects.
 * pFetchMethod          fetch configuration; when bEnableMini is not GL_TRUE
 *                       its mega_fetch_remainder field is updated.
 *
 * If an instruction already exists for this element it is updated in place.
 * Otherwise a new one is allocated, added to the shader through
 * add_vfetch_instruction() and recorded in the array.
 *
 * Returns GL_TRUE on success, GL_FALSE on allocation failure or when the
 * instruction cannot be added.
 */
GLboolean assemble_vfetch_instruction2(r700_AssemblerBase* pAsm,
                                       GLuint              destination_register,
                                       GLenum              type,
                                       GLint               size,
                                       GLubyte             element,
                                       GLuint              _signed,
                                       GLboolean           normalize,
                                       GLenum              format,
                                       VTX_FETCH_METHOD  * pFetchMethod)
{
    GLuint client_size_inbyte;
    GLuint data_format;
    GLuint mega_fetch_count;
    GLuint is_mega_fetch_flag;

    R700VertexGenericFetch*   vfetch_instruction_ptr;
    R700VertexGenericFetch*   assembled_vfetch_instruction_ptr
                                     = pAsm->vfetch_instruction_ptr_array[element];

    if (assembled_vfetch_instruction_ptr == NULL)
    {
        vfetch_instruction_ptr = (R700VertexGenericFetch*) CALLOC_STRUCT(R700VertexGenericFetch);
        if (vfetch_instruction_ptr == NULL)
        {
            return GL_FALSE;
        }
        Init_R700VertexGenericFetch(vfetch_instruction_ptr);
    }
    else
    {
        vfetch_instruction_ptr = assembled_vfetch_instruction_ptr;
    }

    data_format = GetSurfaceFormat(type, size, &client_size_inbyte);

    if(GL_TRUE == pFetchMethod->bEnableMini) //More conditions here
    {
        //TODO : mini fetch
        mega_fetch_count = 0;
        is_mega_fetch_flag = 0;
    }
    else
    {
        mega_fetch_count = MEGA_FETCH_BYTES - 1;
        is_mega_fetch_flag       = 0x1;
        pFetchMethod->mega_fetch_remainder = MEGA_FETCH_BYTES - client_size_inbyte;
    }

    vfetch_instruction_ptr->m_Word0.f.vtx_inst         = SQ_VTX_INST_FETCH;
    vfetch_instruction_ptr->m_Word0.f.fetch_type       = SQ_VTX_FETCH_VERTEX_DATA;
    vfetch_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;

    vfetch_instruction_ptr->m_Word0.f.buffer_id        = element;
    vfetch_instruction_ptr->m_Word0.f.src_gpr          = 0x0;
    vfetch_instruction_ptr->m_Word0.f.src_rel          = SQ_ABSOLUTE;
    vfetch_instruction_ptr->m_Word0.f.src_sel_x        = SQ_SEL_X;
    vfetch_instruction_ptr->m_Word0.f.mega_fetch_count = mega_fetch_count;

    /* Channels the array does not supply default to (0, 0, 0, 1).
     * For GL_BGRA the x and z selects are exchanged. */
    if(format == GL_BGRA)
    {
        vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_Z;
        vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
        vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_X;
        vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;
    }
    else
    {
        vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_X;
        vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
        vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_Z;
        vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;

    }

    vfetch_instruction_ptr->m_Word1.f.use_const_fields = 1;
    vfetch_instruction_ptr->m_Word1.f.data_format      = data_format;
    vfetch_instruction_ptr->m_Word2.f.endian_swap      = SQ_ENDIAN_NONE;

    if(1 == _signed)
    {
        vfetch_instruction_ptr->m_Word1.f.format_comp_all  = SQ_FORMAT_COMP_SIGNED;
    }
    else
    {
        vfetch_instruction_ptr->m_Word1.f.format_comp_all  = SQ_FORMAT_COMP_UNSIGNED;
    }

    if(GL_TRUE == normalize)
    {
        vfetch_instruction_ptr->m_Word1.f.num_format_all   = SQ_NUM_FORMAT_NORM;
    }
    else
    {
        vfetch_instruction_ptr->m_Word1.f.num_format_all   = SQ_NUM_FORMAT_INT;
    }

    // Destination register
    vfetch_instruction_ptr->m_Word1_GPR.f.dst_gpr = destination_register;
    vfetch_instruction_ptr->m_Word1_GPR.f.dst_rel = SQ_ABSOLUTE;

    vfetch_instruction_ptr->m_Word2.f.offset              = 0;
    vfetch_instruction_ptr->m_Word2.f.const_buf_no_stride = 0x0;

    vfetch_instruction_ptr->m_Word2.f.mega_fetch          = is_mega_fetch_flag;

    if (assembled_vfetch_instruction_ptr == NULL)
    {
        if ( GL_FALSE == add_vfetch_instruction(pAsm, (R700VertexInstruction *)vfetch_instruction_ptr) )
        {
            /* add_vfetch_instruction() fails before it links or stores the
             * instruction, so this function still owns the allocation and
             * must release it to avoid a leak. */
            FREE(vfetch_instruction_ptr);
            return GL_FALSE;
        }

        if (pAsm->vfetch_instruction_ptr_array[element] != NULL)
        {
            /* The instruction now belongs to the shader; do not free it. */
            return GL_FALSE;
        }
        else
        {
            pAsm->vfetch_instruction_ptr_array[element] = vfetch_instruction_ptr;
        }
    }

    return GL_TRUE;
}

/*
 * Forget all assembled vertex-fetch state: the current clause becomes
 * CF_EMPTY_CLAUSE, the open VTX clause pointer and every entry of
 * vfetch_instruction_ptr_array are cleared, and the shader is asked to clean
 * up its vertex-fetch instructions.  Always returns GL_TRUE.
 */
GLboolean cleanup_vfetch_instructions(r700_AssemblerBase* pAsm)
{
    GLint slot = 0;

    pAsm->cf_current_clause_type    = CF_EMPTY_CLAUSE;
    pAsm->cf_current_vtx_clause_ptr = NULL;

    while (slot < VERT_ATTRIB_MAX)
    {
        pAsm->vfetch_instruction_ptr_array[slot] = NULL;
        slot++;
    }

    cleanup_vfetch_shaderinst(pAsm->pR700Shader);

    return GL_TRUE;
}

/* Hand out the next helper register number and advance the helper counter.
 * number_used_registers is raised when the counter moves past it. */
GLuint gethelpr(r700_AssemblerBase* pAsm)
{
    const GLuint allocated = pAsm->uHelpReg;

    pAsm->uHelpReg = pAsm->uHelpReg + 1;

    if (pAsm->number_used_registers < pAsm->uHelpReg)
    {
        pAsm->number_used_registers = pAsm->uHelpReg;
    }

    return allocated;
}
/* Rewind the helper register counter to the first helper register. */
void resethelpr(r700_AssemblerBase* pAsm)
{
    const GLuint firstHelper = pAsm->uFirstHelpReg;

    pAsm->uHelpReg = firstHelper;
}

/*
 * Common prologue of the checkopN() helpers: rewind the helper register
 * counter and set the four aArgSubst entries to -1 (no substitution).
 */
void checkop_init(r700_AssemblerBase* pAsm)
{
    int slot;

    resethelpr(pAsm);

    for (slot = 0; slot < 4; slot++)
    {
        pAsm->aArgSubst[slot] = -1;
    }
}

/*
 * Emit a MOV that copies IL source operand `src` of the current
 * instruction into a freshly allocated helper GPR, then record that GPR
 * in aArgSubst[1 + src] so later assemble_src() calls for this operand
 * read the helper instead.
 *
 * Returns GL_FALSE when assemble_src() or next_ins() fails, GL_TRUE
 * otherwise.
 */
GLboolean mov_temp(r700_AssemblerBase* pAsm, int src)
{
    const GLuint helper_gpr = gethelpr(pAsm);

    /* Destination: the helper GPR, absolute addressing. */
    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg   = helper_gpr;
    nomask_PVSDST(&(pAsm->D.dst));

    /* Source: the IL operand, placed in source slot 0. */
    if (!assemble_src(pAsm, src, 0))
    {
        return GL_FALSE;
    }

    /* The swizzle and negate read from the IL operand are overridden here. */
    noswizzle_PVSSRC(&(pAsm->S[0].src));
    noneg_PVSSRC(&(pAsm->S[0].src));

    if (!next_ins(pAsm))
    {
        return GL_FALSE;
    }

    pAsm->aArgSubst[src + 1] = helper_gpr;

    return GL_TRUE;
}

/*
 * Operand check for one-source instructions: only runs the common
 * checkop_init() prologue.  Always returns GL_TRUE.
 */
GLboolean checkop1(r700_AssemblerBase* pAsm)
{
    checkop_init(pAsm);

    return GL_TRUE;
}

/*
 * Operand check for two-source instructions.
 *
 * After the common prologue, each source is classified as "constant"
 * when its register file is UNIFORM, CONSTANT, LOCAL_PARAM, ENV_PARAM or
 * STATE_VAR.  When both sources are constant and their indices differ,
 * source 1 is copied to a helper GPR through mov_temp().
 *
 * Returns GL_FALSE only when mov_temp() fails.
 */
GLboolean checkop2(r700_AssemblerBase* pAsm)
{
    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
    GLboolean from_const_file[2];
    int arg;

    checkop_init(pAsm);

    for (arg = 0; arg < 2; arg++)
    {
        switch (pInst->SrcReg[arg].File)
        {
        case PROGRAM_UNIFORM:
        case PROGRAM_CONSTANT:
        case PROGRAM_LOCAL_PARAM:
        case PROGRAM_ENV_PARAM:
        case PROGRAM_STATE_VAR:
            from_const_file[arg] = GL_TRUE;
            break;
        default:
            from_const_file[arg] = GL_FALSE;
            break;
        }
    }

    /* Nothing to do unless both operands come from a constant file. */
    if ( (from_const_file[0] != GL_TRUE) || (from_const_file[1] != GL_TRUE) )
    {
        return GL_TRUE;
    }

    /* Same index on both operands: no copy is made. */
    if (pInst->SrcReg[0].Index == pInst->SrcReg[1].Index)
    {
        return GL_TRUE;
    }

    return (GL_FALSE == mov_temp(pAsm, 1)) ? GL_FALSE : GL_TRUE;
}

/*
 * Operand check for three-source instructions.
 *
 * After the common prologue, each source is classified as "constant"
 * when its register file is UNIFORM, CONSTANT, LOCAL_PARAM, ENV_PARAM or
 * STATE_VAR.  Depending on which sources are constant, some of them are
 * copied to helper GPRs through mov_temp():
 *
 *   - all three constant : sources 1 and 2 are copied unconditionally;
 *   - 0 and 1 constant   : source 1 is copied if its index differs from 0;
 *   - 0 and 2 constant   : source 2 is copied if its index differs from 0;
 *   - 1 and 2 constant   : source 2 is copied if its index differs from 1.
 *
 * NOTE(review): presumably this works around a limit on how many distinct
 * constants one ALU instruction may read -- confirm against the hardware
 * documentation.
 *
 * Returns GL_FALSE when any mov_temp() call fails, GL_TRUE otherwise.
 */
GLboolean checkop3(r700_AssemblerBase* pAsm)
{
    GLboolean bSrcConst[3];
    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
    int i;

    checkop_init(pAsm);

    for (i = 0; i < 3; i++)
    {
        switch (pILInst->SrcReg[i].File)
        {
        case PROGRAM_UNIFORM:
        case PROGRAM_CONSTANT:
        case PROGRAM_LOCAL_PARAM:
        case PROGRAM_ENV_PARAM:
        case PROGRAM_STATE_VAR:
            bSrcConst[i] = GL_TRUE;
            break;
        default:
            bSrcConst[i] = GL_FALSE;
            break;
        }
    }

    if( (GL_TRUE == bSrcConst[0]) &&
        (GL_TRUE == bSrcConst[1]) &&
        (GL_TRUE == bSrcConst[2]) )
    {
        if( GL_FALSE == mov_temp(pAsm, 1) )
        {
            return GL_FALSE;
        }
        if( GL_FALSE == mov_temp(pAsm, 2) )
        {
            return GL_FALSE;
        }

        return GL_TRUE;
    }
    else if( (GL_TRUE == bSrcConst[0]) &&
             (GL_TRUE == bSrcConst[1]) )
    {
        if(pILInst->SrcReg[0].Index != pILInst->SrcReg[1].Index)
        {
            if( GL_FALSE == mov_temp(pAsm, 1) )
            {
                /* Propagate the failure; returning 1 here would read as GL_TRUE. */
                return GL_FALSE;
            }
        }

        return GL_TRUE;
    }
    else if ( (GL_TRUE == bSrcConst[0]) &&
              (GL_TRUE == bSrcConst[2]) )
    {
        if(pILInst->SrcReg[0].Index != pILInst->SrcReg[2].Index)
        {
            if( GL_FALSE == mov_temp(pAsm, 2) )
            {
                return GL_FALSE;
            }
        }

        return GL_TRUE;
    }
    else if( (GL_TRUE == bSrcConst[1]) &&
             (GL_TRUE == bSrcConst[2]) )
    {
        if(pILInst->SrcReg[1].Index != pILInst->SrcReg[2].Index)
        {
            if( GL_FALSE == mov_temp(pAsm, 2) )
            {
                return GL_FALSE;
            }
        }

        return GL_TRUE;
    }

    return GL_TRUE;
}

/*
 * Translate IL source operand `src` of the current instruction into the
 * assembler source slot `fld`.
 *
 *   pAsm - assembler state; pILInst[uiCurInst] is the instruction read.
 *   src  - index of the IL source register to read.
 *   fld  - destination slot in pAsm->S[]; -1 means "same as src".
 *
 * When aArgSubst[1 + src] is non-negative, the slot is pointed at that
 * temporary register instead of the IL register file.  In every
 * successful case the swizzle and negate bits are taken from the IL
 * operand.
 *
 * Returns GL_FALSE for a register file that is not handled here,
 * GL_TRUE otherwise.
 */
GLboolean assemble_src(r700_AssemblerBase *pAsm,
                       int src,
                       int fld)
{
    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
    GLuint swz;
    GLuint neg;

    if (-1 == fld)
    {
        fld = src;
    }

    if (pAsm->aArgSubst[1 + src] < 0)
    {
        /* No substitution recorded: decode the IL register file. */
        switch (pInst->SrcReg[src].File)
        {
        case PROGRAM_TEMPORARY:
            setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE);
            pAsm->S[fld].src.rtype = SRC_REG_TEMPORARY;
            pAsm->S[fld].src.reg = pInst->SrcReg[src].Index + pAsm->starting_temp_register_number;
            break;

        case PROGRAM_CONSTANT:
        case PROGRAM_LOCAL_PARAM:
        case PROGRAM_ENV_PARAM:
        case PROGRAM_STATE_VAR:
        case PROGRAM_UNIFORM:
            setaddrmode_PVSSRC(&(pAsm->S[fld].src),
                               (1 == pInst->SrcReg[src].RelAddr) ? ADDR_RELATIVE_A0
                                                                 : ADDR_ABSOLUTE);

            pAsm->S[fld].src.rtype = SRC_REG_CONSTANT;
            if (pInst->SrcReg[src].Index >= 0)
            {
                pAsm->S[fld].src.reg = pInst->SrcReg[src].Index;
            }
            else
            {
                /* Negative indices fall back to register 0. */
                WARN_ONCE("Negative register offsets not supported yet!\n");
                pAsm->S[fld].src.reg  = 0;
            }
            break;

        case PROGRAM_INPUT:
            setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE);
            pAsm->S[fld].src.rtype = SRC_REG_INPUT;
            /* The attribute map used depends on the shader type. */
            if (SPT_FP == pAsm->currentShaderType)
            {
                pAsm->S[fld].src.reg = pAsm->uiFP_AttributeMap[pInst->SrcReg[src].Index];
            }
            else if (SPT_VP == pAsm->currentShaderType)
            {
                pAsm->S[fld].src.reg = pAsm->ucVP_AttributeMap[pInst->SrcReg[src].Index];
            }
            break;

        default:
            radeon_error("Invalid source argument type : %d \n", pInst->SrcReg[src].File);
            return GL_FALSE;
        }
    }
    else
    {
        /* Operand was redirected to a helper temporary. */
        assert(fld >= 0);
        setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE);
        pAsm->S[fld].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[fld].src.reg   = pAsm->aArgSubst[1 + src];
    }

    /* Swizzle: four 3-bit selectors, x in the lowest bits. */
    swz = pInst->SrcReg[src].Swizzle;
    pAsm->S[fld].src.swizzlex = swz & 0x7;
    pAsm->S[fld].src.swizzley = (swz >> 3) & 0x7;
    pAsm->S[fld].src.swizzlez = (swz >> 6) & 0x7;
    pAsm->S[fld].src.swizzlew = (swz >> 9) & 0x7;

    /* Negate: one bit per channel, x in bit 0. */
    neg = pInst->SrcReg[src].Negate;
    pAsm->S[fld].src.negx = neg & 0x1;
    pAsm->S[fld].src.negy = (neg >> 1) & 0x1;
    pAsm->S[fld].src.negz = (neg >> 2) & 0x1;
    pAsm->S[fld].src.negw = (neg >> 3) & 0x1;

    return GL_TRUE;
}

/*
 * Translate the destination register of the current IL instruction into
 * pAsm->D.dst, copy its write mask, and set the saturate flag in
 * pAsm->D2.dst2.
 *
 * Handled register files: TEMPORARY, ADDRESS and OUTPUT.  Returns
 * GL_FALSE for any other file, GL_TRUE otherwise.
 */
GLboolean assemble_dst(r700_AssemblerBase *pAsm)
{
    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
    GLuint mask;

    switch (pInst->DstReg.File)
    {
    case PROGRAM_TEMPORARY:
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg = pInst->DstReg.Index + pAsm->starting_temp_register_number;
        break;

    case PROGRAM_ADDRESS:
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_A0;
        pAsm->D.dst.reg = 0;
        break;

    case PROGRAM_OUTPUT:
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_OUT;
        /* The output map used depends on the shader type. */
        if (SPT_FP == pAsm->currentShaderType)
        {
            pAsm->D.dst.reg = pAsm->uiFP_OutputMap[pInst->DstReg.Index];
        }
        else if (SPT_VP == pAsm->currentShaderType)
        {
            pAsm->D.dst.reg = pAsm->ucVP_OutputMap[pInst->DstReg.Index];
        }
        break;

    default:
        radeon_error("Invalid destination output argument type\n");
        return GL_FALSE;
    }

    /* Write mask: one bit per channel, x in bit 0. */
    mask = pInst->DstReg.WriteMask;
    pAsm->D.dst.writex = mask & 0x1;
    pAsm->D.dst.writey = (mask >> 1) & 0x1;
    pAsm->D.dst.writez = (mask >> 2) & 0x1;
    pAsm->D.dst.writew = (mask >> 3) & 0x1;

    pAsm->D2.dst2.SaturateMode = (pInst->SaturateMode == SATURATE_ZERO_ONE) ? 1 : 0;

    return GL_TRUE;
}

/*
 * Set up pAsm->D.dst for a texture instruction from the destination
 * register of the current IL instruction, then copy its write mask.
 *
 * Only TEMPORARY and OUTPUT destinations are accepted.  Returns GL_FALSE
 * for any other register file, GL_TRUE otherwise.
 */
GLboolean tex_dst(r700_AssemblerBase *pAsm)
{
    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
    GLuint mask;

    switch (pInst->DstReg.File)
    {
    case PROGRAM_TEMPORARY:
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = pInst->DstReg.Index + pAsm->starting_temp_register_number;

        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        break;

    case PROGRAM_OUTPUT:
        pAsm->D.dst.rtype = DST_REG_OUT;
        /* The output map used depends on the shader type. */
        if (SPT_FP == pAsm->currentShaderType)
        {
            pAsm->D.dst.reg = pAsm->uiFP_OutputMap[pInst->DstReg.Index];
        }
        else if (SPT_VP == pAsm->currentShaderType)
        {
            pAsm->D.dst.reg = pAsm->ucVP_OutputMap[pInst->DstReg.Index];
        }

        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        break;

    default:
        radeon_error("Invalid destination output argument type\n");
        return GL_FALSE;
    }

    /* Write mask: one bit per channel, x in bit 0. */
    mask = pInst->DstReg.WriteMask;
    pAsm->D.dst.writex = mask & 0x1;
    pAsm->D.dst.writey = (mask >> 1) & 0x1;
    pAsm->D.dst.writez = (mask >> 2) & 0x1;
    pAsm->D.dst.writew = (mask >> 3) & 0x1;

    return GL_TRUE;
}

/*
 * Set up source slot 0 (the texture coordinate) for a texture instruction
 * from source register 0 of the current IL instruction.
 *
 * Accepted coordinate sources:
 *   - a helper temporary recorded in aArgSubst[1];
 *   - a PROGRAM_TEMPORARY register;
 *   - in a vertex program, inputs VERT_ATTRIB_TEX0..TEX7;
 *   - in a fragment program, inputs WPOS, COL0/1, FOGC, TEX0..TEX7 and the
 *     varyings in [FRAG_ATTRIB_VAR0, FRAG_ATTRIB_MAX).
 * Constant-file sources and every other input are rejected.
 *
 * On success the swizzle and negate bits are copied from the IL operand.
 * Returns GL_FALSE (after logging an error) when the coordinate source is
 * not accepted, GL_TRUE otherwise.
 */
GLboolean tex_src(r700_AssemblerBase *pAsm)
{
    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);

    GLboolean bValidTexCoord = GL_FALSE;

    if(pAsm->aArgSubst[1] >= 0)
    {
        /* Coordinate was redirected to a helper temporary. */
        bValidTexCoord = GL_TRUE;
        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[0].src.reg   = pAsm->aArgSubst[1];
    }
    else
    {
        switch (pILInst->SrcReg[0].File)
        {
        case PROGRAM_UNIFORM:
        case PROGRAM_CONSTANT:
        case PROGRAM_LOCAL_PARAM:
        case PROGRAM_ENV_PARAM:
        case PROGRAM_STATE_VAR:
            /* Not usable directly; bValidTexCoord stays GL_FALSE. */
            break;
        case PROGRAM_TEMPORARY:
            bValidTexCoord = GL_TRUE;
            pAsm->S[0].src.reg   = pILInst->SrcReg[0].Index +
                                   pAsm->starting_temp_register_number;
            pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
            break;
        case PROGRAM_INPUT:
            if(SPT_VP == pAsm->currentShaderType)
            {
                switch (pILInst->SrcReg[0].Index)
                {
                    case VERT_ATTRIB_TEX0:
                    case VERT_ATTRIB_TEX1:
                    case VERT_ATTRIB_TEX2:
                    case VERT_ATTRIB_TEX3:
                    case VERT_ATTRIB_TEX4:
                    case VERT_ATTRIB_TEX5:
                    case VERT_ATTRIB_TEX6:
                    case VERT_ATTRIB_TEX7:
                        bValidTexCoord = GL_TRUE;
                        pAsm->S[0].src.reg   =
                            pAsm->ucVP_AttributeMap[pILInst->SrcReg[0].Index];
                        pAsm->S[0].src.rtype = SRC_REG_INPUT;
                        break;
                    default:
                        break;
                }
            }
            else
            {
                switch (pILInst->SrcReg[0].Index)
                {
                    case FRAG_ATTRIB_WPOS:
                    case FRAG_ATTRIB_COL0:
                    case FRAG_ATTRIB_COL1:
                    case FRAG_ATTRIB_FOGC:
                    case FRAG_ATTRIB_TEX0:
                    case FRAG_ATTRIB_TEX1:
                    case FRAG_ATTRIB_TEX2:
                    case FRAG_ATTRIB_TEX3:
                    case FRAG_ATTRIB_TEX4:
                    case FRAG_ATTRIB_TEX5:
                    case FRAG_ATTRIB_TEX6:
                    case FRAG_ATTRIB_TEX7:
                        bValidTexCoord = GL_TRUE;
                        pAsm->S[0].src.reg   =
                            pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
                        pAsm->S[0].src.rtype = SRC_REG_INPUT;
                        break;
                    case FRAG_ATTRIB_FACE:
                        fprintf(stderr, "FRAG_ATTRIB_FACE unsupported\n");
                        break;
                    case FRAG_ATTRIB_PNTC:
                        fprintf(stderr, "FRAG_ATTRIB_PNTC unsupported\n");
                        break;
                    default:
                        break;
                }

                /*
                 * Varyings: both bounds must hold.  Joining them with ||
                 * would accept every index, including the unsupported
                 * FACE/PNTC inputs reported above.
                 */
                if( (pILInst->SrcReg[0].Index >= FRAG_ATTRIB_VAR0) &&
                    (pILInst->SrcReg[0].Index < FRAG_ATTRIB_MAX) )
                {
                    bValidTexCoord = GL_TRUE;
                    pAsm->S[0].src.reg   =
                        pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
                    pAsm->S[0].src.rtype = SRC_REG_INPUT;
                }
            }

            break;
        default:
            break;
        }
    }

    if(GL_TRUE == bValidTexCoord)
    {
        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    }
    else
    {
        radeon_error("Invalid source texcoord for TEX instruction\n");
        return GL_FALSE;
    }

    /* Swizzle: four 3-bit selectors, x in the lowest bits. */
    pAsm->S[0].src.swizzlex = pILInst->SrcReg[0].Swizzle & 0x7;
    pAsm->S[0].src.swizzley = (pILInst->SrcReg[0].Swizzle >> 3) & 0x7;
    pAsm->S[0].src.swizzlez = (pILInst->SrcReg[0].Swizzle >> 6) & 0x7;
    pAsm->S[0].src.swizzlew = (pILInst->SrcReg[0].Swizzle >> 9) & 0x7;

    /* Negate: one bit per channel, x in bit 0. */
    pAsm->S[0].src.negx = pILInst->SrcReg[0].Negate & 0x1;
    pAsm->S[0].src.negy = (pILInst->SrcReg[0].Negate >> 1) & 0x1;
    pAsm->S[0].src.negz = (pILInst->SrcReg[0].Negate >> 2) & 0x1;
    pAsm->S[0].src.negw = (pILInst->SrcReg[0].Negate >> 3) & 0x1;

    return GL_TRUE;
}

/*
 * Build a texture-fetch instruction from the assembler's current operand
 * state and append it with add_tex_instruction().
 *
 *   pAsm       - assembler state.  S[0].src supplies the texture
 *                coordinate, S[1].src supplies the texture unit (its reg is
 *                used as resource/sampler id and its swizzle as the
 *                destination select), D.dst supplies the destination and
 *                the texture opcode.
 *   normalized - non-zero selects SQ_TEX_NORMALIZED coordinates on all four
 *                channels, zero selects SQ_TEX_UNNORMALIZED.
 *
 * The destination must be a temporary or an output register.  That is
 * checked before anything is allocated, so the rejection path does not
 * leak the instruction.
 *
 * Returns GL_FALSE when the destination type is rejected, when allocation
 * fails, or when add_tex_instruction() fails; GL_TRUE otherwise.
 */
GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm, GLboolean normalized)
{
    PVSSRC *   texture_coordinate_source;
    PVSSRC *   texture_unit_source;
    R700TextureInstruction* tex_instruction_ptr;

    /* Validate first: nothing has been allocated yet, so nothing can leak. */
    if ( (pAsm->D.dst.rtype != DST_REG_TEMPORARY) &&
         (pAsm->D.dst.rtype != DST_REG_OUT) )
    {
        radeon_error("Only temp destination registers supported for TEX dest regs.\n");
        return GL_FALSE;
    }

    tex_instruction_ptr = (R700TextureInstruction*) CALLOC_STRUCT(R700TextureInstruction);
    if (tex_instruction_ptr == NULL)
    {
        return GL_FALSE;
    }
    Init_R700TextureInstruction(tex_instruction_ptr);

    texture_coordinate_source = &(pAsm->S[0].src);
    texture_unit_source       = &(pAsm->S[1].src);

    tex_instruction_ptr->m_Word0.f.tex_inst         = pAsm->D.dst.opcode;
    tex_instruction_ptr->m_Word0.f.bc_frac_mode     = 0x0;
    tex_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;
    tex_instruction_ptr->m_Word0.f.alt_const        = 0;

    if(SPT_VP == pAsm->currentShaderType)
    {
        /* Vertex programs offset the resource id by VERT_ATTRIB_MAX and
         * record the unit in unVetTexBits. */
        tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg + VERT_ATTRIB_MAX;
        pAsm->unVetTexBits |= 1 << texture_unit_source->reg;
    }
    else
    {
        tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg;
    }

    tex_instruction_ptr->m_Word1.f.lod_bias     = 0x0;
    if (normalized) {
        tex_instruction_ptr->m_Word1.f.coord_type_x = SQ_TEX_NORMALIZED;
        tex_instruction_ptr->m_Word1.f.coord_type_y = SQ_TEX_NORMALIZED;
        tex_instruction_ptr->m_Word1.f.coord_type_z = SQ_TEX_NORMALIZED;
        tex_instruction_ptr->m_Word1.f.coord_type_w = SQ_TEX_NORMALIZED;
    } else {
        /* XXX: UNNORMALIZED tex coords have limited wrap modes */
        tex_instruction_ptr->m_Word1.f.coord_type_x = SQ_TEX_UNNORMALIZED;
        tex_instruction_ptr->m_Word1.f.coord_type_y = SQ_TEX_UNNORMALIZED;
        tex_instruction_ptr->m_Word1.f.coord_type_z = SQ_TEX_UNNORMALIZED;
        tex_instruction_ptr->m_Word1.f.coord_type_w = SQ_TEX_UNNORMALIZED;
    }

    tex_instruction_ptr->m_Word2.f.offset_x   = 0x0;
    tex_instruction_ptr->m_Word2.f.offset_y   = 0x0;
    tex_instruction_ptr->m_Word2.f.offset_z   = 0x0;
    tex_instruction_ptr->m_Word2.f.sampler_id = texture_unit_source->reg;

    /* Source coordinate and destination registers. */
    tex_instruction_ptr->m_Word0.f.src_gpr    = texture_coordinate_source->reg;
    tex_instruction_ptr->m_Word0.f.src_rel    = SQ_ABSOLUTE;

    tex_instruction_ptr->m_Word1.f.dst_gpr    = pAsm->D.dst.reg;
    tex_instruction_ptr->m_Word1.f.dst_rel    = SQ_ABSOLUTE;

    /* Channels excluded by the write mask are masked out. */
    tex_instruction_ptr->m_Word1.f.dst_sel_x  = (pAsm->D.dst.writex ? texture_unit_source->swizzlex : SQ_SEL_MASK);
    tex_instruction_ptr->m_Word1.f.dst_sel_y  = (pAsm->D.dst.writey ? texture_unit_source->swizzley : SQ_SEL_MASK);
    tex_instruction_ptr->m_Word1.f.dst_sel_z  = (pAsm->D.dst.writez ? texture_unit_source->swizzlez : SQ_SEL_MASK);
    tex_instruction_ptr->m_Word1.f.dst_sel_w  = (pAsm->D.dst.writew ? texture_unit_source->swizzlew : SQ_SEL_MASK);

    tex_instruction_ptr->m_Word2.f.src_sel_x  = texture_coordinate_source->swizzlex;
    tex_instruction_ptr->m_Word2.f.src_sel_y  = texture_coordinate_source->swizzley;
    tex_instruction_ptr->m_Word2.f.src_sel_z  = texture_coordinate_source->swizzlez;
    tex_instruction_ptr->m_Word2.f.src_sel_w  = texture_coordinate_source->swizzlew;

    /* NOTE(review): who owns tex_instruction_ptr when add_tex_instruction()
     * fails is not visible here -- confirm before adding a free on this path. */
    if( GL_FALSE == add_tex_instruction(pAsm, tex_instruction_ptr) )
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Mark every read-port reservation slot as free by writing -1 to each
 * hw_gpr[cycle][component] entry and to each hw_cfile_addr / hw_cfile_chan
 * entry.
 */
void initialize(r700_AssemblerBase *pAsm)
{
    GLuint cyc;
    GLuint chan;

    for (chan = 0; chan < NUMBER_OF_COMPONENTS; chan++)
    {
        pAsm->hw_cfile_addr[chan] = (-1);
        pAsm->hw_cfile_chan[chan] = (-1);
    }

    for (cyc = 0; cyc < NUMBER_OF_CYCLES; cyc++)
    {
        for (chan = 0; chan < NUMBER_OF_COMPONENTS; chan++)
        {
            pAsm->hw_gpr[cyc][chan] = (-1);
        }
    }
}

/*
 * Encode one scalar source operand of an ALU instruction.
 *
 *   alu_instruction_ptr  - instruction whose srcN_* fields are written.
 *   source_index         - which operand to write: 0, 1 or 2.
 *   pSource              - assembler-level source description.
 *   scalar_channel_index - which channel (0..3 = x..w) of pSource's
 *                          swizzle/negate applies; any other value selects
 *                          SQ_SEL_MASK and no negate.
 *
 * Source select encoding, as described by the comments this function has
 * always carried:
 *   [0,127]   GPR[0..127]
 *   [256,511] cfile constants c[0..255]
 *   248 SQ_ALU_SRC_0        constant 0.0
 *   249 SQ_ALU_SRC_1        constant 1.0 float
 *   250 SQ_ALU_SRC_1_INT    constant 1 integer
 *   251 SQ_ALU_SRC_M_1_INT  constant -1 integer
 *   252 SQ_ALU_SRC_0_5      constant 0.5 float
 *   253 SQ_ALU_SRC_LITERAL  literal constant
 *   254 SQ_ALU_SRC_PV       previous vector result
 *   255 SQ_ALU_SRC_PS       previous scalar result
 *
 * Returns GL_FALSE (after logging) for an unsupported register type, an
 * unknown swizzle selector, or a source_index outside 0..2.
 */
GLboolean assemble_alu_src(R700ALUInstruction*  alu_instruction_ptr,
                           int                  source_index,
                           PVSSRC*              pSource,
                           BITS                 scalar_channel_index)
{
    BITS src_sel;
    BITS src_rel;
    BITS src_chan;
    BITS src_neg;
    BITS chan_select;

    /* Pick the swizzle selector and negate bit for the requested channel. */
    if (0 == scalar_channel_index)
    {
        chan_select = pSource->swizzlex;
        src_neg     = pSource->negx;
    }
    else if (1 == scalar_channel_index)
    {
        chan_select = pSource->swizzley;
        src_neg     = pSource->negy;
    }
    else if (2 == scalar_channel_index)
    {
        chan_select = pSource->swizzlez;
        src_neg     = pSource->negz;
    }
    else if (3 == scalar_channel_index)
    {
        chan_select = pSource->swizzlew;
        src_neg     = pSource->negw;
    }
    else
    {
        chan_select = SQ_SEL_MASK;
        src_neg     = 0;
    }

    /* Source select: inline 0/1 constants win over the register type. */
    switch (chan_select)
    {
        case SQ_SEL_0:
            src_sel = SQ_ALU_SRC_0;
            break;
        case SQ_SEL_1:
            src_sel = SQ_ALU_SRC_1;
            break;
        default:
            if ( (pSource->rtype == SRC_REG_TEMPORARY) ||
                 (pSource->rtype == SRC_REG_INPUT) )
            {
                src_sel = pSource->reg;
            }
            else if (pSource->rtype == SRC_REG_CONSTANT)
            {
                src_sel = pSource->reg + CFILE_REGISTER_OFFSET;
            }
            else if (pSource->rtype == SRC_REC_LITERAL)
            {
                src_sel = SQ_ALU_SRC_LITERAL;
            }
            else
            {
                radeon_error("Source (%d) register type (%d) not one of TEMP, INPUT, or CONSTANT.\n",
                         source_index, pSource->rtype);
                return GL_FALSE;
            }
            break;
    }

    src_rel = (ADDR_ABSOLUTE == addrmode_PVSSRC(pSource)) ? SQ_ABSOLUTE : SQ_RELATIVE;

    /* Source channel. */
    if (SQ_SEL_X == chan_select)
    {
        src_chan = SQ_CHAN_X;
    }
    else if (SQ_SEL_Y == chan_select)
    {
        src_chan = SQ_CHAN_Y;
    }
    else if (SQ_SEL_Z == chan_select)
    {
        src_chan = SQ_CHAN_Z;
    }
    else if (SQ_SEL_W == chan_select)
    {
        src_chan = SQ_CHAN_W;
    }
    else if ( (SQ_SEL_0 == chan_select) || (SQ_SEL_1 == chan_select) )
    {
        /* Channel is irrelevant here: src_sel already names the constant. */
        src_chan = SQ_CHAN_X;
    }
    else
    {
        radeon_error("Unknown source select value (%d) in assemble_alu_src().\n", chan_select);
        return GL_FALSE;
    }

    /* Store into the operand slot named by source_index. */
    switch (source_index)
    {
        case 0:
            assert(alu_instruction_ptr);
            alu_instruction_ptr->m_Word0.f.src0_sel  = src_sel;
            alu_instruction_ptr->m_Word0.f.src0_rel  = src_rel;
            alu_instruction_ptr->m_Word0.f.src0_chan = src_chan;
            alu_instruction_ptr->m_Word0.f.src0_neg  = src_neg;
            return GL_TRUE;
        case 1:
            assert(alu_instruction_ptr);
            alu_instruction_ptr->m_Word0.f.src1_sel  = src_sel;
            alu_instruction_ptr->m_Word0.f.src1_rel  = src_rel;
            alu_instruction_ptr->m_Word0.f.src1_chan = src_chan;
            alu_instruction_ptr->m_Word0.f.src1_neg  = src_neg;
            return GL_TRUE;
        case 2:
            assert(alu_instruction_ptr);
            alu_instruction_ptr->m_Word1_OP3.f.src2_sel  = src_sel;
            alu_instruction_ptr->m_Word1_OP3.f.src2_rel  = src_rel;
            alu_instruction_ptr->m_Word1_OP3.f.src2_chan = src_chan;
            alu_instruction_ptr->m_Word1_OP3.f.src2_neg  = src_neg;
            return GL_TRUE;
        default:
            radeon_error("Only three sources allowed in ALU opcodes.\n");
            return GL_FALSE;
    }
}

/*
 * Append an ALU instruction to the shader, opening a new ALU control-flow
 * clause first when needed.
 *
 *   pAsm                    - assembler state.
 *   alu_instruction_ptr     - instruction to append.
 *   contiguous_slots_needed - slots that must still fit in the current
 *                             clause; used in the "clause is full" test.
 *
 * A new clause is started when a special clause opcode is pending
 * (alu_x_opcode != 0), when there is no current ALU clause, or when the
 * current clause's count has reached the maximum minus
 * contiguous_slots_needed minus one.
 *
 * Returns GL_FALSE when check_current_clause() fails or the new clause
 * cannot be allocated, GL_TRUE otherwise.
 */
GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
                              R700ALUInstruction* alu_instruction_ptr,
                              GLuint              contiguous_slots_needed)
{
    if( GL_FALSE == check_current_clause(pAsm, CF_ALU_CLAUSE) )
    {
        return GL_FALSE;
    }

    if ( pAsm->alu_x_opcode != 0 ||
         pAsm->cf_current_alu_clause_ptr == NULL ||
         ( (pAsm->cf_current_alu_clause_ptr != NULL) && 
           (pAsm->cf_current_alu_clause_ptr->m_Word1.f.count >= (GetCFMaxInstructions(pAsm->cf_current_alu_clause_ptr->m_ShaderInstType)-contiguous_slots_needed-1) )
         ) ) 
    {

        //new cf inst for this clause
        pAsm->cf_current_alu_clause_ptr = (R700ControlFlowALUClause*) CALLOC_STRUCT(R700ControlFlowALUClause);
            
        // link the new cf to cf segment    
        if(NULL != pAsm->cf_current_alu_clause_ptr) 
        {
            Init_R700ControlFlowALUClause(pAsm->cf_current_alu_clause_ptr);
			AddCFInstruction( pAsm->pR700Shader, 
                              (R700ControlFlowInstruction *)pAsm->cf_current_alu_clause_ptr );            
        }
        else 
        {
            radeon_error("Could not allocate a new ALU CF instruction.\n");
            return GL_FALSE;
        }

        // No kcache usage: both banks and addresses zeroed, both modes NOP.
        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_bank0 = 0x0;
        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_bank1 = 0x0;
        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_mode0 = SQ_CF_KCACHE_NOP;

        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_mode1 = SQ_CF_KCACHE_NOP;
        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_addr0 = 0x0;
        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_addr1 = 0x0;

        // The first instruction of a clause leaves count at 0; only later
        // instructions increment it (see the else branch below).
        // NOTE(review): presumably the count field encodes "size - 1" -- confirm.
        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count           = 0x0;

        // A pending special clause opcode is consumed by the clause it opens.
        if(pAsm->alu_x_opcode != 0)
        {
            pAsm->cf_current_alu_clause_ptr->m_Word1.f.cf_inst = pAsm->alu_x_opcode;
            pAsm->alu_x_opcode = 0;
        }
        else
        {
            pAsm->cf_current_alu_clause_ptr->m_Word1.f.cf_inst = SQ_CF_INST_ALU;
        }

        pAsm->cf_current_alu_clause_ptr->m_Word1.f.whole_quad_mode = 0x0;

        pAsm->cf_current_alu_clause_ptr->m_Word1.f.barrier         = 0x1;
    }
    else 
    {
        // Existing clause: grow count by half the instruction size.
        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count += (GetInstructionSize(alu_instruction_ptr->m_ShaderInstType) / 2);
    }

    // If this clause contains any instruction that is forward dependent on a TEX instruction, 
    // set the whole_quad_mode for this clause
    if ( pAsm->pInstDeps[pAsm->uiCurInst].nDstDep > (-1) ) 
    {
        pAsm->cf_current_alu_clause_ptr->m_Word1.f.whole_quad_mode = 0x1;   
    }

    // Clause is at capacity: flag this instruction as the last of its group.
    if (pAsm->cf_current_alu_clause_ptr->m_Word1.f.count >= (GetCFMaxInstructions(pAsm->cf_current_alu_clause_ptr->m_ShaderInstType)-1) ) 
    {
        alu_instruction_ptr->m_Word0.f.last = 1;
    }

    // The first instruction added to a clause is cross-linked with it.
    if(NULL == pAsm->cf_current_alu_clause_ptr->m_pLinkedALUInstruction)
    {
        pAsm->cf_current_alu_clause_ptr->m_pLinkedALUInstruction = alu_instruction_ptr;
        alu_instruction_ptr->m_pLinkedALUClause = pAsm->cf_current_alu_clause_ptr;
    }
    
    AddALUInstruction(pAsm->pR700Shader, alu_instruction_ptr);

    return GL_TRUE;
}

/*
 * Read back the encoded select, relative-addressing flag, channel and
 * negate bit of operand `source_index` (0, 1 or 2) from an ALU
 * instruction.  For any other source_index the outputs are left
 * untouched.
 */
void get_src_properties(R700ALUInstruction*  alu_instruction_ptr,
                        int                  source_index,
                        BITS*                psrc_sel,
                        BITS*                psrc_rel,
                        BITS*                psrc_chan,
                        BITS*                psrc_neg)
{
    if (0 == source_index)
    {
        *psrc_sel  = alu_instruction_ptr->m_Word0.f.src0_sel;
        *psrc_rel  = alu_instruction_ptr->m_Word0.f.src0_rel;
        *psrc_chan = alu_instruction_ptr->m_Word0.f.src0_chan;
        *psrc_neg  = alu_instruction_ptr->m_Word0.f.src0_neg;
    }
    else if (1 == source_index)
    {
        *psrc_sel  = alu_instruction_ptr->m_Word0.f.src1_sel;
        *psrc_rel  = alu_instruction_ptr->m_Word0.f.src1_rel;
        *psrc_chan = alu_instruction_ptr->m_Word0.f.src1_chan;
        *psrc_neg  = alu_instruction_ptr->m_Word0.f.src1_neg;
    }
    else if (2 == source_index)
    {
        /* The third operand lives in the OP3 layout of word 1. */
        *psrc_sel  = alu_instruction_ptr->m_Word1_OP3.f.src2_sel;
        *psrc_rel  = alu_instruction_ptr->m_Word1_OP3.f.src2_rel;
        *psrc_chan = alu_instruction_ptr->m_Word1_OP3.f.src2_chan;
        *psrc_neg  = alu_instruction_ptr->m_Word1_OP3.f.src2_neg;
    }
}

/*
 * Return 1 when `sel` lies in the constant-file select range
 * (256..511 inclusive), 0 otherwise.
 */
int is_cfile(BITS sel)
{
    return (sel > 255 && sel < 512) ? 1 : 0;
}

/*
 * Return 1 when `sel` names a constant: either a constant-file select
 * (see is_cfile) or a value in [SQ_ALU_SRC_0, SQ_ALU_SRC_LITERAL].
 * Returns 0 otherwise.
 */
int is_const(BITS sel)
{
    const int in_cfile   = is_cfile(sel);
    const int in_special = (sel >= SQ_ALU_SRC_0 && sel <= SQ_ALU_SRC_LITERAL);

    return (in_cfile || in_special) ? 1 : 0;
}

/*
 * Return 1 when `sel` lies in the GPR select range (0..127 inclusive),
 * 0 otherwise.
 */
int is_gpr(BITS sel)
{
    return (sel >= 0 && sel < 128) ? 1 : 0;
}

/*
 * Lookup table of vector bank-swizzle selectors, indexed 0..7; the trailing
 * comment on each row is that index in binary.  Indices 6 and 7 hold the
 * same value as index 4 (SQ_ALU_VEC_012).
 * NOTE(review): the meaning of the three index bits is not visible in this
 * part of the file -- confirm against the code that indexes this table.
 */
const GLuint BANK_SWIZZLE_VEC[8] = {SQ_ALU_VEC_210,  //000
                                    SQ_ALU_VEC_120,  //001
                                    SQ_ALU_VEC_102,  //010

                                    SQ_ALU_VEC_201,  //011
                                    SQ_ALU_VEC_012,  //100
                                    SQ_ALU_VEC_021,  //101

                                    SQ_ALU_VEC_012,  //110
                                    SQ_ALU_VEC_012}; //111

/*
 * Bank swizzle used for scalar instructions, looked up by check_scalar()
 * with a 3-bit key: bit 2 is set when src0 is a constant, bit 1 when src1 is
 * a constant and bit 0 when src2 is a constant. The binary key is shown next
 * to each entry.
 */
const GLuint BANK_SWIZZLE_SCL[8] = {SQ_ALU_SCL_210,  //000
                                    SQ_ALU_SCL_122,  //001 
                                    SQ_ALU_SCL_122,  //010

                                    SQ_ALU_SCL_221,  //011
                                    SQ_ALU_SCL_212,  //100
                                    SQ_ALU_SCL_122,  //101

                                    SQ_ALU_SCL_122,  //110
                                    SQ_ALU_SCL_122}; //111

/*
 * Reserve a constant-file read port for the (sel, chan) pair.
 *
 * The four slots of pAsm->hw_cfile_addr / pAsm->hw_cfile_chan are scanned;
 * a negative address marks a free slot. If the pair is already recorded
 * nothing changes; otherwise the lowest-numbered free slot is claimed.
 *
 * Returns GL_TRUE on success, GL_FALSE (after logging an error) when every
 * slot is taken by a different constant.
 */
GLboolean reserve_cfile(r700_AssemblerBase* pAsm, 
                        GLuint sel, 
                        GLuint chan)
{
    int res_match = (-1);
    int res_empty = (-1);

    GLint res;

    /* Walk downwards so the lowest free / matching index is the one kept. */
    for (res=3; res>=0; res--) 
    {
        if(pAsm->hw_cfile_addr[ res] < 0)  
        {
            res_empty = res;
        }
        else if( (pAsm->hw_cfile_addr[res] == (int)sel)
                 &&
                 (pAsm->hw_cfile_chan[ res ] == (int) chan) ) 
        {
            res_match = res;
        }
    }

    if(res_match >= 0) 
    {
        // Read for this scalar component already reserved, nothing to do here.
        return GL_TRUE;
    }

    if(res_empty < 0) 
    {
        /* The message previously contained un-interpolated "$sel"/"$chan". */
        radeon_error("All cfile read ports are used, cannot reference C%u, channel %u.\n",
                     sel, chan);
        return GL_FALSE;
    }

    pAsm->hw_cfile_addr[ res_empty ] = sel;
    pAsm->hw_cfile_chan[ res_empty ] = chan;

    return GL_TRUE;
}

/*
 * Claim the GPR read port pAsm->hw_gpr[cycle][chan] for register sel.
 * A negative entry means the port is free. A port already holding the same
 * register is shared; a port holding a different register is a conflict,
 * which is logged and reported as GL_FALSE.
 */
GLboolean reserve_gpr(r700_AssemblerBase* pAsm, GLuint sel, GLuint chan, GLuint cycle)
{
    if (pAsm->hw_gpr[cycle][chan] < 0)
    {
        /* Free port: take it. */
        pAsm->hw_gpr[cycle][chan] = sel;
        return GL_TRUE;
    }

    if (pAsm->hw_gpr[cycle][chan] == (int)sel)
    {
        /* Same register is already being read through this port. */
        return GL_TRUE;
    }

    radeon_error("Another scalar operation has already used GPR read port for given channel\n");
    return GL_FALSE;
}

/*
 * Translate a scalar bank swizzle and a source operand index (0..2) into the
 * read cycle used for that operand.
 *
 * swiz    one of SQ_ALU_SCL_210 / _122 / _212 / _221
 * sel     source operand index; must be 0, 1 or 2
 * pCycle  receives the cycle on success, untouched on failure
 *
 * Returns GL_TRUE on success; GL_FALSE (after logging) for an unknown
 * swizzle or an out-of-range operand index.
 */
GLboolean cycle_for_scalar_bank_swizzle(const int swiz, const int sel, GLuint* pCycle)
{
    static const GLuint cycles_210[3] = {2, 1, 0};
    static const GLuint cycles_122[3] = {1, 2, 2};
    static const GLuint cycles_212[3] = {2, 1, 2};
    static const GLuint cycles_221[3] = {2, 2, 1};

    const GLuint *cycles = NULL;

    switch (swiz) 
    {
        case SQ_ALU_SCL_210:
            cycles = cycles_210;
            break;
        case SQ_ALU_SCL_122:
            cycles = cycles_122;
            break;
        case SQ_ALU_SCL_212:
            cycles = cycles_212;
            break;
        case SQ_ALU_SCL_221:
            cycles = cycles_221;
            break;
        default:
            radeon_error("Bad Scalar bank swizzle value\n");
            return GL_FALSE;
    }

    /* Each table has exactly three entries; refuse to read outside of it. */
    if (sel < 0 || sel > 2)
    {
        radeon_error("Bad source index for scalar bank swizzle\n");
        return GL_FALSE;
    }

    *pCycle = cycles[sel];
    return GL_TRUE;
}

/*
 * Translate a vector bank swizzle and a source operand index (0..2) into the
 * read cycle used for that operand.
 *
 * swiz    one of the six SQ_ALU_VEC_xyz permutations
 * sel     source operand index; must be 0, 1 or 2
 * pCycle  receives the cycle on success, untouched on failure
 *
 * Returns GL_TRUE on success; GL_FALSE (after logging) for an unknown
 * swizzle or an out-of-range operand index.
 */
GLboolean cycle_for_vector_bank_swizzle(const int swiz, const int sel, GLuint* pCycle)
{
    static const GLuint cycles_012[3] = {0, 1, 2};
    static const GLuint cycles_021[3] = {0, 2, 1};
    static const GLuint cycles_120[3] = {1, 2, 0};
    static const GLuint cycles_102[3] = {1, 0, 2};
    static const GLuint cycles_201[3] = {2, 0, 1};
    static const GLuint cycles_210[3] = {2, 1, 0};

    const GLuint *cycles = NULL;

    switch (swiz) 
    {
        case SQ_ALU_VEC_012:
            cycles = cycles_012;
            break;
        case SQ_ALU_VEC_021:
            cycles = cycles_021;
            break;
        case SQ_ALU_VEC_120:
            cycles = cycles_120;
            break;
        case SQ_ALU_VEC_102:
            cycles = cycles_102;
            break;
        case SQ_ALU_VEC_201:
            cycles = cycles_201;
            break;
        case SQ_ALU_VEC_210:
            cycles = cycles_210;
            break;
        default:
            radeon_error("Bad Vec bank swizzle value\n");
            return GL_FALSE;
    }

    /* Each table has exactly three entries; refuse to read outside of it. */
    if (sel < 0 || sel > 2)
    {
        radeon_error("Bad source index for vector bank swizzle\n");
        return GL_FALSE;
    }

    *pCycle = cycles[sel];
    return GL_TRUE;
}

/*
 * Pick the bank swizzle for a scalar ALU instruction and reserve the read
 * ports its operands need.
 *
 * The swizzle is chosen from BANK_SWIZZLE_SCL using a key that records which
 * of the three sources are constants. Constant-file operands then reserve a
 * cfile port, and GPR operands reserve a GPR port for the cycle the swizzle
 * assigns to them.
 *
 * Returns GL_FALSE as soon as any reservation or swizzle lookup fails,
 * GL_TRUE otherwise. A failed cfile reservation is now propagated to the
 * caller, matching check_vector().
 */
GLboolean check_scalar(r700_AssemblerBase* pAsm,
                       R700ALUInstruction* alu_instruction_ptr)
{
    GLuint cycle;
    GLuint bank_swizzle;
    GLuint const_count = 0;
    GLuint src;
    GLuint swizzle_key;

    BITS src_sel [3] = {0,0,0};
    BITS src_chan[3] = {0,0,0};
    BITS src_rel [3] = {0,0,0};
    BITS src_neg [3] = {0,0,0};

    GLuint number_of_operands = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);

    /* Operands beyond number_of_operands keep their zero initialisers. */
    for (src=0; src<number_of_operands; src++) 
    {
        get_src_properties(alu_instruction_ptr,
                           src,
                           &(src_sel[src]), 
                           &(src_rel[src]), 
                           &(src_chan[src]), 
                           &(src_neg[src]) );
    }

    swizzle_key = ( (is_const( src_sel[0] ) ? 4 : 0) + 
                    (is_const( src_sel[1] ) ? 2 : 0) + 
                    (is_const( src_sel[2] ) ? 1 : 0) );

    alu_instruction_ptr->m_Word1.f.bank_swizzle = BANK_SWIZZLE_SCL[ swizzle_key ];

    /* Pass 1: count constants and reserve constant-file ports. */
    for (src=0; src<number_of_operands; src++) 
    {
        if (!is_const( src_sel[src] )) 
        {
            continue;
        }

        // Any constant, including literal and inline constants
        const_count++;

        if (is_cfile( src_sel[src] )) 
        {
            if( GL_FALSE == reserve_cfile(pAsm, src_sel[src], src_chan[src]) )
            {
                return GL_FALSE;
            }
        }
    }

    /* Pass 2: reserve GPR read ports. */
    bank_swizzle = alu_instruction_ptr->m_Word1.f.bank_swizzle;

    for (src=0; src<number_of_operands; src++) 
    {
        if( !is_gpr(src_sel[src]) ) 
        {
            continue;
        }

        if( GL_FALSE == cycle_for_scalar_bank_swizzle(bank_swizzle, src, &cycle) )
        {
            return GL_FALSE;
        }

        /* NOTE(review): a port is only reserved when the cycle is below the
         * number of constants; the rationale is not visible in this file. */
        if(cycle < const_count) 
        {
            if( GL_FALSE == reserve_gpr(pAsm, src_sel[src], src_chan[src], cycle) )
            {
                return GL_FALSE;
            }
        }
    }

    return GL_TRUE;
}

/*
 * Pick the bank swizzle for one slot of a vector ALU instruction and reserve
 * the read ports its operands need.
 *
 * The swizzle comes from BANK_SWIZZLE_VEC, keyed by which of the three
 * sources are constants. GPR operands reserve a GPR port for the cycle the
 * swizzle assigns them; constant-file operands reserve a cfile port.
 *
 * Returns GL_FALSE as soon as a lookup or reservation fails, else GL_TRUE.
 */
GLboolean check_vector(r700_AssemblerBase* pAsm,
                       R700ALUInstruction* alu_instruction_ptr)
{
    BITS src_sel [3] = {0,0,0};
    BITS src_chan[3] = {0,0,0};
    BITS src_rel [3] = {0,0,0};
    BITS src_neg [3] = {0,0,0};

    GLuint cycle;
    GLuint swizzle;
    GLuint key = 0;
    GLuint idx;

    const GLuint operand_count = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);

    for (idx = 0; idx < operand_count; idx++)
    {
        get_src_properties(alu_instruction_ptr,
                           idx,
                           &src_sel[idx],
                           &src_rel[idx],
                           &src_chan[idx],
                           &src_neg[idx]);
    }

    /* One key bit per source: src0 -> 4, src1 -> 2, src2 -> 1. */
    if (is_const(src_sel[0]))
    {
        key += 4;
    }
    if (is_const(src_sel[1]))
    {
        key += 2;
    }
    if (is_const(src_sel[2]))
    {
        key += 1;
    }

    alu_instruction_ptr->m_Word1.f.bank_swizzle = BANK_SWIZZLE_VEC[key];
    swizzle = alu_instruction_ptr->m_Word1.f.bank_swizzle;

    for (idx = 0; idx < operand_count; idx++)
    {
        const BITS cur_sel  = src_sel[idx];
        const BITS cur_chan = src_chan[idx];

        if (is_gpr(cur_sel))
        {
            if (GL_FALSE == cycle_for_vector_bank_swizzle(swizzle, idx, &cycle))
            {
                return GL_FALSE;
            }

            /* src1 reading the same register channel as src0 needs no
             * reservation of its own. */
            if ((idx == 1) && (cur_sel == src_sel[0]) && (cur_chan == src_chan[0]))
            {
                continue;
            }

            if (GL_FALSE == reserve_gpr(pAsm, cur_sel, cur_chan, cycle))
            {
                return GL_FALSE;
            }
        }
        else if (is_cfile(cur_sel))
        {
            /* Of all constants only constant-file reads take a port. */
            if (GL_FALSE == reserve_cfile(pAsm, cur_sel, cur_chan))
            {
                return GL_FALSE;
            }
        }
    }

    return GL_TRUE;
}

/*
 * Emit the hardware ALU instruction(s) for the operation currently described
 * by pAsm->D, pAsm->D2 and pAsm->S[].
 *
 * When pAsm->D.dst.math is 1 a single scalar instruction is emitted;
 * otherwise four instructions are emitted, one per channel. The final
 * instruction of the group carries the literal constants from pAsm->C[] when
 * pAsm->D2.dst2.literal_slots is 1 or 2.
 *
 * Returns GL_TRUE on success. Returns GL_FALSE when an allocation fails, the
 * literal slot count is unsupported, a source cannot be encoded, the
 * destination register type is unsupported, or the instruction cannot be
 * added or fails the read-port checks.
 *
 * NOTE(review): the early GL_FALSE returns between allocation and
 * add_alu_instruction() do not release the freshly allocated instruction.
 */
GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
{
    R700ALUInstruction            * alu_instruction_ptr = NULL;
    R700ALUInstructionHalfLiteral * alu_instruction_ptr_hl;
    R700ALUInstructionFullLiteral * alu_instruction_ptr_fl;

    GLuint    number_of_scalar_operations;
    GLboolean is_single_scalar_operation;
    GLuint    scalar_channel_index;

    PVSSRC * pcurrent_source;
    int    current_source_index;
    GLuint contiguous_slots_needed;

    GLuint    uNumSrc = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);
    //GLuint    channel_swizzle, j;
    //GLuint    chan_counter[4] = {0, 0, 0, 0};
    //PVSSRC *  pSource[3];
    GLboolean bSplitInst = GL_FALSE;
    BITS      write_mask;

    if (1 == pAsm->D.dst.math) 
    {
        is_single_scalar_operation = GL_TRUE;
        number_of_scalar_operations = 1;
    }
    else 
    {
        is_single_scalar_operation = GL_FALSE;
        number_of_scalar_operations = 4;

/* current assembler doesn't do more than 1 register per source */
#if 0
        /* check read port, only very preliminary algorithm, not count in 
           src0/1 same comp case and prev slot repeat case; also not count relative
           addressing. TODO: improve performance. */
        for(j=0; j<uNumSrc; j++)
        {
            pSource[j] = &(pAsm->S[j].src);
        }
        for(scalar_channel_index=0; scalar_channel_index<4; scalar_channel_index++) 
        {
            for(j=0; j<uNumSrc; j++) 
            {
                switch (scalar_channel_index) 
                {
                    case 0: channel_swizzle = pSource[j]->swizzlex; break;
                    case 1: channel_swizzle = pSource[j]->swizzley; break;
                    case 2: channel_swizzle = pSource[j]->swizzlez; break;
                    case 3: channel_swizzle = pSource[j]->swizzlew; break;
                    default: channel_swizzle = SQ_SEL_MASK; break;
                }
                if ( ((pSource[j]->rtype == SRC_REG_TEMPORARY) || 
                     (pSource[j]->rtype == SRC_REG_INPUT))
                     && (channel_swizzle <= SQ_SEL_W) )
                {                    
                    chan_counter[channel_swizzle]++;                        
                }
            }
        }
        if(   (chan_counter[SQ_SEL_X] > 3)
           || (chan_counter[SQ_SEL_Y] > 3)
           || (chan_counter[SQ_SEL_Z] > 3)
           || (chan_counter[SQ_SEL_W] > 3) ) /* each chan bank has only 3 ports. */
        {
            bSplitInst = GL_TRUE;
        }
#endif
    }

    /* Slots that must fit together: the four channel slots of a vector
     * operation plus any literal slots. Counted down once per emitted slot. */
    contiguous_slots_needed = 0;

    if(!is_single_scalar_operation) 
    {
        contiguous_slots_needed = 4;
    }

    contiguous_slots_needed += pAsm->D2.dst2.literal_slots;

    initialize(pAsm);    

    for (scalar_channel_index=0;
            scalar_channel_index < number_of_scalar_operations; 
                scalar_channel_index++) 
    {
        /* Never carry over the pointer of the previously emitted slot. */
        alu_instruction_ptr = NULL;

        if(scalar_channel_index == (number_of_scalar_operations-1))
        {
            /* Only the last instruction of the group carries literals. */
            switch(pAsm->D2.dst2.literal_slots)
            {
            case 0:
                alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
                if (NULL != alu_instruction_ptr)
                {
                    Init_R700ALUInstruction(alu_instruction_ptr);
                }
                break;
            case 1:
                alu_instruction_ptr_hl = (R700ALUInstructionHalfLiteral*) CALLOC_STRUCT(R700ALUInstructionHalfLiteral);
                if (NULL != alu_instruction_ptr_hl)
                {
                    Init_R700ALUInstructionHalfLiteral(alu_instruction_ptr_hl, pAsm->C[0].f, pAsm->C[1].f);
                }
                alu_instruction_ptr = (R700ALUInstruction*)alu_instruction_ptr_hl;
                break;
            case 2:
                alu_instruction_ptr_fl = (R700ALUInstructionFullLiteral*) CALLOC_STRUCT(R700ALUInstructionFullLiteral);
                if (NULL != alu_instruction_ptr_fl)
                {
                    Init_R700ALUInstructionFullLiteral(alu_instruction_ptr_fl,pAsm->C[0].f, pAsm->C[1].f, pAsm->C[2].f, pAsm->C[3].f);
                }
                alu_instruction_ptr = (R700ALUInstruction*)alu_instruction_ptr_fl;
                break;
            default:
                radeon_error("Unsupported number of literal slots\n");
                return GL_FALSE;
            }
        }
        else
        {
            alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
            if (NULL != alu_instruction_ptr)
            {
                Init_R700ALUInstruction(alu_instruction_ptr);
            }
        }

        if (NULL == alu_instruction_ptr)
        {
            radeon_error("Failed to allocate ALU instruction\n");
            return GL_FALSE;
        }
        
        //src 0
        current_source_index = 0;
        pcurrent_source = &(pAsm->S[0].src);

        if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
                                         current_source_index,
                                         pcurrent_source, 
                                         scalar_channel_index) )     
        {            
            return GL_FALSE;
        }
   
        if (uNumSrc > 1) 
        {            
            // Process source 1            
            current_source_index = 1;
            pcurrent_source = &(pAsm->S[current_source_index].src);

            if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
                                             current_source_index,
                                             pcurrent_source, 
                                             scalar_channel_index) ) 
            {                
                return GL_FALSE;
            }
        }

        //other bits
        alu_instruction_ptr->m_Word0.f.index_mode = pAsm->D2.dst2.index_mode;

        if(   (is_single_scalar_operation == GL_TRUE) 
           || (GL_TRUE == bSplitInst) )
        {
            alu_instruction_ptr->m_Word0.f.last = 1;
        }
        else 
        {
            alu_instruction_ptr->m_Word0.f.last = (scalar_channel_index == 3) ?  1 : 0;
        }

        alu_instruction_ptr->m_Word0.f.pred_sel = (pAsm->D.dst.pred_inv > 0) ? 1 : 0;
        if(1 == pAsm->D.dst.predicated)
        {
            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x1;
            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x1;
        }
        else
        {
            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;
            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
        }

        // dst
        if( (pAsm->D.dst.rtype == DST_REG_TEMPORARY) || 
            (pAsm->D.dst.rtype == DST_REG_OUT) ) 
        {
            alu_instruction_ptr->m_Word1.f.dst_gpr  = pAsm->D.dst.reg;
        }
        else 
        {            
            radeon_error("Only temp destination registers supported for ALU dest regs.\n");
            return GL_FALSE;
        }

        alu_instruction_ptr->m_Word1.f.dst_rel  = SQ_ABSOLUTE;  //D.rtype

        if ( is_single_scalar_operation == GL_TRUE ) 
        {
            // Override scalar_channel_index since only one scalar value will be written
            // (sources 0 and 1 above were already encoded with channel index 0)
            if(pAsm->D.dst.writex) 
            {
                scalar_channel_index = 0;
            }
            else if(pAsm->D.dst.writey) 
            {
                scalar_channel_index = 1;
            }
            else if(pAsm->D.dst.writez) 
            {
                scalar_channel_index = 2;
            }
            else if(pAsm->D.dst.writew) 
            {
                scalar_channel_index = 3;
            }
        }

        alu_instruction_ptr->m_Word1.f.dst_chan = scalar_channel_index;

        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->D2.dst2.SaturateMode;

        if (pAsm->D.dst.op3) 
        {            
            //op3

            alu_instruction_ptr->m_Word1_OP3.f.alu_inst = pAsm->D.dst.opcode;

            //There's 3rd src for op3
            current_source_index = 2;
            pcurrent_source = &(pAsm->S[current_source_index].src);

            if ( GL_FALSE == assemble_alu_src(alu_instruction_ptr,
                                              current_source_index,
                                              pcurrent_source, 
                                              scalar_channel_index) ) 
            {
                return GL_FALSE;
            }
        }
        else 
        {
            //op2: the write mask of this slot is the destination's write bit
            //for the channel being emitted
            switch (scalar_channel_index) 
            {
                case 0: 
                    write_mask = pAsm->D.dst.writex; 
                    break;
                case 1: 
                    write_mask = pAsm->D.dst.writey; 
                    break;
                case 2: 
                    write_mask = pAsm->D.dst.writez; 
                    break;
                case 3: 
                    write_mask = pAsm->D.dst.writew; 
                    break;
                default: 
                    write_mask = 1; //SQ_SEL_MASK;
                    break;
            }

            if (pAsm->bR6xx)
            {
                /* R6xx uses the f6 layout of the OP2 word. */
                alu_instruction_ptr->m_Word1_OP2.f6.alu_inst           = pAsm->D.dst.opcode;

                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs           = pAsm->S[0].src.abs;
                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs           = pAsm->S[1].src.abs;

                alu_instruction_ptr->m_Word1_OP2.f6.write_mask         = write_mask;
                alu_instruction_ptr->m_Word1_OP2.f6.omod               = SQ_ALU_OMOD_OFF;
            }
            else
            {
                alu_instruction_ptr->m_Word1_OP2.f.alu_inst           = pAsm->D.dst.opcode;

                alu_instruction_ptr->m_Word1_OP2.f.src0_abs           = pAsm->S[0].src.abs;
                alu_instruction_ptr->m_Word1_OP2.f.src1_abs           = pAsm->S[1].src.abs;

                alu_instruction_ptr->m_Word1_OP2.f.write_mask         = write_mask;
                alu_instruction_ptr->m_Word1_OP2.f.omod               = SQ_ALU_OMOD_OFF;
            }
        }

        if(GL_FALSE == add_alu_instruction(pAsm, alu_instruction_ptr, contiguous_slots_needed) )
        {            
            return GL_FALSE;
        }

        /*
         * Judge the type of current instruction, is it vector or scalar 
         * instruction.
         */        
        if (is_single_scalar_operation) 
        {
            if(GL_FALSE == check_scalar(pAsm, alu_instruction_ptr) )
            {                
                return GL_FALSE;
            }
        }
        else 
        {
            if(GL_FALSE == check_vector(pAsm, alu_instruction_ptr) )
            {                
                return GL_FALSE; 
            }
        }

        contiguous_slots_needed -= 1;
    }

    return GL_TRUE;
}

/*
 * Emit the instruction currently staged in pAsm (TEX when pAsm->is_tex is
 * set, ALU otherwise), record which output channels it wrote when the
 * destination is an export register, and clear the staging state for the
 * next instruction.
 *
 * Returns GL_FALSE (after logging) when the TEX or ALU assembly fails.
 */
GLboolean next_ins(r700_AssemblerBase *pAsm)
{
    struct prog_instruction *cur_inst = &(pAsm->pILInst[pAsm->uiCurInst]);
    int i;

    if (GL_TRUE == pAsm->is_tex)
    {
        /* Rectangle targets pass GL_FALSE as the second argument, every
         * other target passes GL_TRUE. */
        const GLboolean tex_flag =
            (cur_inst->TexSrcTarget == TEXTURE_RECT_INDEX) ? GL_FALSE : GL_TRUE;

        if (GL_FALSE == assemble_tex_instruction(pAsm, tex_flag))
        {
            radeon_error("Error assembling TEX instruction\n");
            return GL_FALSE;
        }
    }
    else if (GL_FALSE == assemble_alu_instruction(pAsm))
    {
        //ALU
        radeon_error("Error assembling ALU instruction\n");
        return GL_FALSE;
    }

    if (pAsm->D.dst.rtype == DST_REG_OUT)
    {
        if (pAsm->D.dst.op3)
        {
            // There is no mask for OP3 instructions, so all channels are written
            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] = 0xF;
        }
        else
        {
            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number]
                |= (unsigned char)cur_inst->DstReg.WriteMask;
        }
    }

    //reset for next inst.
    pAsm->D.bits  = 0;
    pAsm->D2.bits = 0;

    for (i = 0; i < 3; i++)
    {
        pAsm->S[i].bits = 0;
    }

    for (i = 0; i < 4; i++)
    {
        pAsm->C[i].bits = 0;
    }

    pAsm->is_tex           = GL_FALSE;
    pAsm->need_tex_barrier = GL_FALSE;

    return GL_TRUE;
}

/*
 * Assemble a one-operand scalar math instruction in two steps:
 *
 *     opcode  tmp.x,  src0      (scalar, into a helper register)
 *     MOV     dst,    tmp.x     (replicated to the requested channels)
 *
 * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise. A failing
 * checkop1() is now reported as well, matching assemble_ARL().
 */
GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode)
{
    BITS tmp;

    if( GL_FALSE == checkop1(pAsm) )
    {
        return GL_FALSE;
    }

    tmp = gethelpr(pAsm);

    // opcode  tmp.x,    a.x
    // MOV     dst,      tmp.x

    pAsm->D.dst.opcode = opcode;
    pAsm->D.dst.math = 1;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = tmp;
    pAsm->D.dst.writex = 1;

    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
    {
        return GL_FALSE;
    }

    if ( GL_FALSE == next_ins(pAsm) ) 
    {
        return GL_FALSE;
    }

    // Now replicate result to all necessary channels in destination
    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    /* A source operand takes the SRC_ register type, as in assemble_CMP(). */
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Assemble ABS as MAX(src0, -src0): source 1 is a copy of source 0 with its
 * negate flag flipped.
 *
 * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise. A failing
 * checkop1() is now reported as well, matching assemble_ARL().
 */
GLboolean assemble_ABS(r700_AssemblerBase *pAsm)
{
    if( GL_FALSE == checkop1(pAsm) )
    {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;  

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }
    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
    {
        return GL_FALSE;
    }
 
    /* Second operand: same source, opposite sign. */
    pAsm->S[1].bits = pAsm->S[0].bits;
    flipneg_PVSSRC(&(pAsm->S[1].src));

    if ( GL_FALSE == next_ins(pAsm) ) 
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Assemble ADD, and SUB as ADD with the second operand's negate flag
 * flipped. Returns GL_FALSE as soon as any step fails.
 */
GLboolean assemble_ADD(r700_AssemblerBase *pAsm)
{
    int operand;

    if (GL_FALSE == checkop2(pAsm))
    {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_ADD;

    if (GL_FALSE == assemble_dst(pAsm))
    {
        return GL_FALSE;
    }

    /* Load both operands in order. */
    for (operand = 0; operand < 2; operand++)
    {
        if (GL_FALSE == assemble_src(pAsm, operand, -1))
        {
            return GL_FALSE;
        }
    }

    if (OPCODE_SUB == pAsm->pILInst[pAsm->uiCurInst].Opcode)
    {
        /* a - b == a + (-b) */
        flipneg_PVSSRC(&(pAsm->S[1].src));
    }

    return (GL_FALSE == next_ins(pAsm)) ? GL_FALSE : GL_TRUE;
}

/*
 * Assemble ARL as MOVA_FLOOR on source 0. The destination is set to
 * temporary register 0 with every write bit cleared.
 *
 * TODO: ar values don't persist between clauses
 */
GLboolean assemble_ARL(r700_AssemblerBase *pAsm)
{
    if (GL_FALSE == checkop1(pAsm))
    {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MOVA_FLOOR;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg   = 0;

    /* No channel of the nominal destination is written. */
    pAsm->D.dst.writex = 0;
    pAsm->D.dst.writey = 0;
    pAsm->D.dst.writez = 0;
    pAsm->D.dst.writew = 0;

    if (GL_FALSE == assemble_src(pAsm, 0, -1))
    {
        return GL_FALSE;
    }

    return (GL_FALSE == next_ins(pAsm)) ? GL_FALSE : GL_TRUE;
}

/*
 * Report an opcode the assembler does not implement. Logs the opcode name
 * and always returns GL_FALSE.
 */
GLboolean assemble_BAD(char *opcode_str) 
{
    const GLboolean result = GL_FALSE;

    radeon_error("Not yet implemented instruction (%s)\n", opcode_str);

    return result;
}

/*
 * Assemble CMP with the three-operand CNDGE instruction.
 *
 * OP3 instructions have no write mask, so when the instruction's write mask
 * is not 0xF the result goes to a helper register first and is then copied
 * to the real destination with a MOV, which does honour the mask.
 *
 * Returns GL_FALSE as soon as any step fails.
 */
GLboolean assemble_CMP(r700_AssemblerBase *pAsm)
{
    /* { IL operand index, hardware source slot } in the order they are
     * assembled; operands 1 and 2 swap places. */
    static const int operand_order[3][2] = { {0, -1}, {2, 1}, {1, 2} };

    int helper_reg = (-1);
    int k;

    if (GL_FALSE == checkop3(pAsm))
    {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGE;
    pAsm->D.dst.op3    = 1;

    if (0xF == pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask)
    {
        /* Full mask: write the destination directly. */
        if (GL_FALSE == assemble_dst(pAsm))
        {
            return GL_FALSE;
        }
    }
    else
    {
        //OP3 has no support for write mask
        helper_reg = gethelpr(pAsm);

        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = helper_reg;

        nomask_PVSDST(&(pAsm->D.dst));
    }

    for (k = 0; k < 3; k++)
    {
        if (GL_FALSE == assemble_src(pAsm, operand_order[k][0], operand_order[k][1]))
        {
            return GL_FALSE;
        }
    }

    if (GL_FALSE == next_ins(pAsm))
    {
        return GL_FALSE;
    }

    if (0xF == pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask)
    {
        /* Nothing to copy: the result already sits in the destination. */
        return GL_TRUE;
    }

    /* Masked MOV from the helper register into the real destination. */
    if (GL_FALSE == assemble_dst(pAsm))
    {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    //tmp for source
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = helper_reg;

    noneg_PVSSRC(&(pAsm->S[0].src));
    noswizzle_PVSSRC(&(pAsm->S[0].src));

    return (GL_FALSE == next_ins(pAsm)) ? GL_FALSE : GL_TRUE;
}

/*
 * Assemble a trigonometric instruction in two steps:
 *
 *     MUL     tmp.x,  src0,  1/(2*pi)   (literal constant)
 *     opcode  dst,    tmp.x             (scalar math op)
 *
 * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise. Previously
 * every helper's result was discarded and GL_TRUE was returned regardless.
 *
 * NOTE(review): the pre-scale presumably normalises the angle for the
 * hardware opcode -- confirm against the ISA documentation.
 */
GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode)
{
    int tmp;

    if( GL_FALSE == checkop1(pAsm) )
    {
        return GL_FALSE;
    }

    tmp = gethelpr(pAsm);

    /* Step 1: scale source 0 into the helper register. */
    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = tmp;
    pAsm->D.dst.writex = 1;

    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
    {
        return GL_FALSE;
    }

    /* Source 1 is the X component of a one-slot literal. */
    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
    pAsm->D2.dst2.literal_slots = 1;
    pAsm->C[0].f = 1/(3.1415926535 * 2);
    pAsm->C[1].f = 0.0F;

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    /* Step 2: apply the requested opcode to tmp.x. */
    pAsm->D.dst.opcode = opcode;
    pAsm->D.dst.math = 1;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp;
    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    //TODO - replicate if more channels set in WriteMask
    return GL_TRUE;
}
 
/*
 * Assemble DP3 / DP4 / DPH with the hardware DOT4 instruction.
 * For DP3 component 3 of both sources is forced to zero; for DPH component 3
 * of source 0 is forced to one. Returns GL_FALSE as soon as any step fails.
 */
GLboolean assemble_DOT(r700_AssemblerBase *pAsm)
{
    int operand;

    if (GL_FALSE == checkop2(pAsm))
    {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_DOT4;

    if (GL_FALSE == assemble_dst(pAsm))
    {
        return GL_FALSE;
    }

    for (operand = 0; operand < 2; operand++)
    {
        if (GL_FALSE == assemble_src(pAsm, operand, -1))
        {
            return GL_FALSE;
        }
    }

    switch (pAsm->pILInst[pAsm->uiCurInst].Opcode)
    {
        case OPCODE_DP3:
            /* Drop the fourth term of the dot product. */
            zerocomp_PVSSRC(&(pAsm->S[0].src), 3);
            zerocomp_PVSSRC(&(pAsm->S[1].src), 3);
            break;
        case OPCODE_DPH:
            /* Fourth term becomes 1 * src1.w */
            onecomp_PVSSRC(&(pAsm->S[0].src), 3);
            break;
        default:
            break;
    }

    return (GL_FALSE == next_ins(pAsm)) ? GL_FALSE : GL_TRUE;
}
 
/*
 * Assemble DST as a component-wise MUL in which selected components are
 * forced to one: components 0 and 3 of source 0, and components 0 and 2 of
 * source 1. Returns GL_FALSE as soon as any step fails.
 */
GLboolean assemble_DST(r700_AssemblerBase *pAsm)
{
    PVSSRC *first;
    PVSSRC *second;

    if (GL_FALSE == checkop2(pAsm))
    {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;

    if (GL_FALSE == assemble_dst(pAsm))
    {
        return GL_FALSE;
    }

    if (GL_FALSE == assemble_src(pAsm, 0, -1))
    {
        return GL_FALSE;
    }

    if (GL_FALSE == assemble_src(pAsm, 1, -1))
    {
        return GL_FALSE;
    }

    first  = &(pAsm->S[0].src);
    second = &(pAsm->S[1].src);

    onecomp_PVSSRC(first, 0);
    onecomp_PVSSRC(first, 3);

    onecomp_PVSSRC(second, 0);
    onecomp_PVSSRC(second, 2);

    return (GL_FALSE == next_ins(pAsm)) ? GL_FALSE : GL_TRUE;
}

/* Assemble EX2 through the generic scalar math path using EXP_IEEE. */
GLboolean assemble_EX2(r700_AssemblerBase *pAsm)
{
    const GLboolean ok = assemble_math_function(pAsm, SQ_OP2_INST_EXP_IEEE);

    return ok;
}

/*
 * Assemble EXP. One sequence is emitted per enabled destination channel:
 *
 *     x:  FLOOR tmp.x, src0 ;  EXP_IEEE dst.x, tmp.x
 *     y:  FRACT dst.y, src0
 *     z:  EXP_IEEE dst.z, src0
 *     w:  MOV   dst.w, 1.0
 *
 * The write mask is taken from the instruction currently being assembled
 * (pAsm->pILInst[pAsm->uiCurInst]); it was previously read from the first
 * instruction of the program. A failing checkop1() is now reported too.
 *
 * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise.
 */
GLboolean assemble_EXP(r700_AssemblerBase *pAsm)
{
    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
    BITS tmp;

    if( GL_FALSE == checkop1(pAsm) )
    {
        return GL_FALSE;
    }

    tmp = gethelpr(pAsm);

    // FLOOR   tmp.x,    a.x
    // EX2     dst.x     tmp.x

    if (pILInst->DstReg.WriteMask & 0x1) {
        pAsm->D.dst.opcode = SQ_OP2_INST_FLOOR;

        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
        pAsm->D.dst.reg    = tmp;
        pAsm->D.dst.writex = 1;

        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
        {
            return GL_FALSE;
        }

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }

        pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
        pAsm->D.dst.math = 1;

        if( GL_FALSE == assemble_dst(pAsm) )
        {
            return GL_FALSE;
        }

        /* Restrict this step to the x channel. */
        pAsm->D.dst.writey = pAsm->D.dst.writez = pAsm->D.dst.writew = 0;

        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
        /* A source operand takes the SRC_ register type. */
        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[0].src.reg   = tmp;

        setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
        noneg_PVSSRC(&(pAsm->S[0].src));

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }
    }

    // FRACT   dst.y     a.x

    if ((pILInst->DstReg.WriteMask >> 1) & 0x1) {
        pAsm->D.dst.opcode = SQ_OP2_INST_FRACT;

        if( GL_FALSE == assemble_dst(pAsm) )
        {
            return GL_FALSE;
        }

        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
        {
            return GL_FALSE;
        }

        /* Restrict this step to the y channel. */
        pAsm->D.dst.writex = pAsm->D.dst.writez = pAsm->D.dst.writew = 0;

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }
    }

    // EX2     dst.z,    a.x

    if ((pILInst->DstReg.WriteMask >> 2) & 0x1) {
        pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
        pAsm->D.dst.math = 1;

        if( GL_FALSE == assemble_dst(pAsm) )
        {
            return GL_FALSE;
        }

        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
        {
            return GL_FALSE;
        }

        /* Restrict this step to the z channel. */
        pAsm->D.dst.writex = pAsm->D.dst.writey = pAsm->D.dst.writew = 0;

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }
    }

    // MOV     dst.w     1.0

    if ((pILInst->DstReg.WriteMask >> 3) & 0x1) {
        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

        if( GL_FALSE == assemble_dst(pAsm) )
        {
            return GL_FALSE;
        }

        /* Restrict this step to the w channel. */
        pAsm->D.dst.writex = pAsm->D.dst.writey = pAsm->D.dst.writez = 0;

        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[0].src.reg   = tmp;

        /* The SQ_SEL_1 swizzle selects the constant one on every channel. */
        setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_1);
        noneg_PVSSRC(&(pAsm->S[0].src));

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }
    }

    return GL_TRUE;
}
 
/*
 * Assemble FLR as a vector FLOOR of source 0 into the destination.
 *
 * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise. A failing
 * checkop1() is now reported as well, matching assemble_ARL().
 */
GLboolean assemble_FLR(r700_AssemblerBase *pAsm)
{
    if ( GL_FALSE == checkop1(pAsm) )
    {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_FLOOR;  

    if ( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }

    if ( GL_FALSE == assemble_src(pAsm, 0, -1) )
    {
        return GL_FALSE;
    }

    if ( GL_FALSE == next_ins(pAsm) ) 
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Thin wrapper: forwards to assemble_math_function() with the
 * SQ_OP2_INST_FLT_TO_INT opcode and returns its result unchanged.
 */
GLboolean assemble_FLR_INT(r700_AssemblerBase *pAsm)
{
    GLboolean result = assemble_math_function(pAsm, SQ_OP2_INST_FLT_TO_INT);

    return result;
}

/*
 * Assemble FRC: select SQ_OP2_INST_FRACT, let assemble_dst() and
 * assemble_src() fill in the destination and source slot 0, then hand the
 * instruction to next_ins().
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_FRC(r700_AssemblerBase *pAsm)
{
    checkop1(pAsm);

    pAsm->D.dst.opcode = SQ_OP2_INST_FRACT;

    /* Short-circuit evaluation keeps the dst -> src -> next_ins order. */
    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Assemble a kill instruction using the caller-supplied ALU opcode.
 *
 * The destination is temporary register 0 with all four channel writes
 * disabled.  Source slot 0 is temporary 0 swizzled to SQ_SEL_0.  Source
 * slot 1 is temporary 0 swizzled to SQ_SEL_1 and negated when the IL opcode
 * is OPCODE_KIL_NV; otherwise IL source 0 is assembled into slot 1.
 *
 * After the instruction has been passed to next_ins(), the shader is flagged
 * as using kill and alu_x_opcode is set to SQ_CF_INST_ALU.
 *
 * Returns GL_TRUE on success, GL_FALSE if assemble_src() or next_ins() fails.
 */
GLboolean assemble_KIL(r700_AssemblerBase *pAsm, GLuint opcode)
{
    struct prog_instruction *pCurInst = &(pAsm->pILInst[pAsm->uiCurInst]);

    if (OPCODE_KIL == pCurInst->Opcode) {
        checkop1(pAsm);
    }

    pAsm->D.dst.opcode = opcode;

    /* Destination: temporary 0, nothing is actually written. */
    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = 0;
    pAsm->D.dst.writex = 0;
    pAsm->D.dst.writey = 0;
    pAsm->D.dst.writez = 0;
    pAsm->D.dst.writew = 0;

    /* Source slot 0: SQ_SEL_0 on every channel, not negated. */
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = 0;
    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_0);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if (OPCODE_KIL_NV != pCurInst->Opcode) {
        /* Compare against the instruction's own operand. */
        if (!assemble_src(pAsm, 0, 1)) {
            return GL_FALSE;
        }
    } else {
        /* Source slot 1: SQ_SEL_1 on every channel, negated. */
        setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
        pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[1].src.reg   = 0;
        setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_1);
        neg_PVSSRC(&(pAsm->S[1].src));
    }

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    /* Doc says KILL has to be last(end) ALU clause */
    pAsm->pR700Shader->killIsUsed = GL_TRUE;
    pAsm->alu_x_opcode = SQ_CF_INST_ALU;

    return GL_TRUE;
}

/*
 * Thin wrapper: forwards to assemble_math_function() with the
 * SQ_OP2_INST_LOG_IEEE opcode and returns its result unchanged.
 */
GLboolean assemble_LG2(r700_AssemblerBase *pAsm)
{
    GLboolean result = assemble_math_function(pAsm, SQ_OP2_INST_LOG_IEEE);

    return result;
}

/*
 * Assemble LRP as three instructions that go through one helper register:
 *
 *   ADD     helper, src1, -src2
 *   MULADD  helper, helper, src0, src2
 *   MOV     dst,    helper
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_LRP(r700_AssemblerBase *pAsm)
{
    BITS helper;

    if (!checkop3(pAsm)) {
        return GL_FALSE;
    }

    helper = gethelpr(pAsm);

    /* ADD helper, src1, -src2 */
    pAsm->D.dst.opcode = SQ_OP2_INST_ADD;

    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg   = helper;
    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    nomask_PVSDST(&(pAsm->D.dst));

    /* IL source 1 goes to slot 0, IL source 2 to slot 1. */
    if (!assemble_src(pAsm, 1, 0) ||
        !assemble_src(pAsm, 2, 1)) {
        return GL_FALSE;
    }

    neg_PVSSRC(&(pAsm->S[1].src));

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    /* MULADD helper, helper, src0, src2 */
    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
    pAsm->D.dst.op3    = 1;

    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg   = helper;
    nomask_PVSDST(&(pAsm->D.dst));
    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = helper;
    noswizzle_PVSSRC(&(pAsm->S[0].src));

    /* IL source 0 goes to slot 1, IL source 2 stays in slot 2. */
    if (!assemble_src(pAsm, 0, 1) ||
        !assemble_src(pAsm, 2, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    /* MOV dst, helper */
    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    if (!assemble_dst(pAsm)) {
        return GL_FALSE;
    }

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = helper;
    noswizzle_PVSSRC(&(pAsm->S[0].src));

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Assemble LOG as the following instruction sequence, using three helper
 * registers (tmp1, tmp2, tmp3):
 *
 *   MAX     tmp1.x,   a.x,    -a.x     (fabs(a.x))
 *   LG2     tmp2.x,   tmp1.x
 *   FLOOR   tmp3.x,   tmp2.x
 *   MOV     dst.x,    tmp3.x
 *   ADD     tmp3.x,   tmp2.x, -tmp3.x
 *   EX2     dst.y,    tmp3.x
 *   MOV     dst.z,    tmp2.x
 *   MOV     dst.w,    1.0
 *
 * Source operands that read a helper register are tagged with
 * SRC_REG_TEMPORARY (the source-side enum); DST_REG_TEMPORARY is reserved
 * for destination operands.
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_LOG(r700_AssemblerBase *pAsm)
{
    BITS tmp1, tmp2, tmp3;

    checkop1(pAsm);

    tmp1 = gethelpr(pAsm);
    tmp2 = gethelpr(pAsm);
    tmp3 = gethelpr(pAsm);

    // FIXME: The hardware can do fabs() directly on input
    //        elements, but the compiler doesn't have the
    //        capability to use that.

    // MAX     tmp1.x,   a.x,    -a.x   (fabs(a.x))

    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = tmp1;
    pAsm->D.dst.writex = 1;

    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
    {
        return GL_FALSE;
    }

    /* Slot 1 is a copy of slot 0 with the negate flag flipped. */
    pAsm->S[1].bits = pAsm->S[0].bits;
    flipneg_PVSSRC(&(pAsm->S[1].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // LG2     tmp2.x,   tmp1.x

    pAsm->D.dst.opcode = SQ_OP2_INST_LOG_IEEE;
    pAsm->D.dst.math = 1;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = tmp2;
    pAsm->D.dst.writex = 1;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp1;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // FLOOR   tmp3.x,   tmp2.x

    pAsm->D.dst.opcode = SQ_OP2_INST_FLOOR;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = tmp3;
    pAsm->D.dst.writex = 1;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp2;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // MOV     dst.x,    tmp3.x

    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }

    /* Keep only the x channel of whatever mask assemble_dst() produced. */
    pAsm->D.dst.writey = pAsm->D.dst.writez = pAsm->D.dst.writew = 0;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp3;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // ADD     tmp3.x,   tmp2.x,    -tmp3.x

    pAsm->D.dst.opcode = SQ_OP2_INST_ADD;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = tmp3;
    pAsm->D.dst.writex = 1;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp2;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[1].src.reg   = tmp3;

    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
    neg_PVSSRC(&(pAsm->S[1].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // EX2     dst.y,    tmp3.x

    pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
    pAsm->D.dst.math = 1;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }

    /* Keep only the y channel. */
    pAsm->D.dst.writex = pAsm->D.dst.writez = pAsm->D.dst.writew = 0;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp3;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // MOV     dst.z,    tmp2.x

    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }

    /* Keep only the z channel. */
    pAsm->D.dst.writex = pAsm->D.dst.writey = pAsm->D.dst.writew = 0;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp2;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // MOV     dst.w     1.0

    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }

    /* Keep only the w channel. */
    pAsm->D.dst.writex = pAsm->D.dst.writey = pAsm->D.dst.writez = 0;

    /* The SQ_SEL_1 swizzle is applied to every channel of tmp1. */
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp1;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_1);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Assemble MAD as SQ_OP3_INST_MULADD over IL sources 0, 1 and 2.
 *
 * The result is routed through a helper register, followed by a MOV into the
 * real destination, when either
 *   - the write mask is not 0xF (OP3 has no support for write mask), or
 *   - the destination is a PROGRAM_TEMPORARY whose index matches one of the
 *     PROGRAM_TEMPORARY sources.
 * Otherwise MULADD writes the destination directly.
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_MAD(struct r700_AssemblerBase *pAsm)
{
    int helperReg = -1;
    int srcIdx;
    GLboolean useHelper = GL_FALSE;
    struct prog_instruction *pCurInst = &(pAsm->pILInst[pAsm->uiCurInst]);

    if (!checkop3(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
    pAsm->D.dst.op3    = 1;

    if (0xF != pCurInst->DstReg.WriteMask) {
        /* OP3 has no support for write mask */
        useHelper = GL_TRUE;
    } else if (PROGRAM_TEMPORARY == pCurInst->DstReg.File) {
        /* TODO : more investigation on MAD src and dst using same register */
        for (srcIdx = 0; srcIdx < 3 && GL_FALSE == useHelper; srcIdx++) {
            if (PROGRAM_TEMPORARY == pCurInst->SrcReg[srcIdx].File &&
                pCurInst->DstReg.Index == pCurInst->SrcReg[srcIdx].Index) {
                useHelper = GL_TRUE;
            }
        }
    }

    if (GL_FALSE == useHelper) {
        if (!assemble_dst(pAsm)) {
            return GL_FALSE;
        }
    } else {
        helperReg = gethelpr(pAsm);

        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = helperReg;

        nomask_PVSDST(&(pAsm->D.dst));
    }

    if (!assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 1, -1) ||
        !assemble_src(pAsm, 2, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    if (GL_FALSE == useHelper) {
        return GL_TRUE;
    }

    /* MOV dst, helper -- the MOV honours the original write mask. */
    if (!assemble_dst(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = helperReg;

    noneg_PVSSRC(&(pAsm->S[0].src));
    noswizzle_PVSSRC(&(pAsm->S[0].src));

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/* LIT dst, src */
/*
 * Assemble LIT as five instructions:
 *
 *   MOV          dst.xw, 1.0
 *   MAX          dst.y,  src.x, 0.0
 *   LOG_CLAMPED  dst.z,  src.y
 *   MUL_LIT      tmp.x,  src.w, dst.z, src.x
 *   EXP_IEEE     dst.z,  tmp.x
 *
 * The destination and source register number/type are captured once from
 * assemble_dst()/assemble_src() and then re-applied by hand for every
 * instruction.  A helper register supplies the constant swizzles and holds
 * the MUL_LIT result.
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_LIT(r700_AssemblerBase *pAsm)
{
    unsigned int dstReg;
    unsigned int dstType;
    unsigned int srcReg;
    unsigned int srcType;
    int tmp;

    checkop1(pAsm);
    tmp = gethelpr(pAsm);

    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, -1)) {
        return GL_FALSE;
    }

    dstReg  = pAsm->D.dst.reg;
    dstType = pAsm->D.dst.rtype;
    srcReg  = pAsm->S[0].src.reg;
    srcType = pAsm->S[0].src.rtype;

    /* dst.xw, <- 1.0  */
    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
    pAsm->D.dst.rtype  = dstType;
    pAsm->D.dst.reg    = dstReg;
    pAsm->D.dst.writex = 1;
    pAsm->D.dst.writey = 0;
    pAsm->D.dst.writez = 0;
    pAsm->D.dst.writew = 1;

    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp;
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    noneg_PVSSRC(&(pAsm->S[0].src));
    pAsm->S[0].src.swizzlex = SQ_SEL_1;
    pAsm->S[0].src.swizzley = SQ_SEL_1;
    pAsm->S[0].src.swizzlez = SQ_SEL_1;
    pAsm->S[0].src.swizzlew = SQ_SEL_1;

    /* Finish the MOV, then reload the IL source for the next instruction. */
    if (!next_ins(pAsm) ||
        !assemble_src(pAsm, 0, -1)) {
        return GL_FALSE;
    }

    /* dst.y = max(src.x, 0.0) */
    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;
    pAsm->D.dst.rtype  = dstType;
    pAsm->D.dst.reg    = dstReg;
    pAsm->D.dst.writex = 0;
    pAsm->D.dst.writey = 1;
    pAsm->D.dst.writez = 0;
    pAsm->D.dst.writew = 0;

    pAsm->S[0].src.rtype = srcType;
    pAsm->S[0].src.reg   = srcReg;
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X, SQ_SEL_X, SQ_SEL_X, SQ_SEL_X);

    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[1].src.reg   = tmp;
    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
    noneg_PVSSRC(&(pAsm->S[1].src));
    pAsm->S[1].src.swizzlex = SQ_SEL_0;
    pAsm->S[1].src.swizzley = SQ_SEL_0;
    pAsm->S[1].src.swizzlez = SQ_SEL_0;
    pAsm->S[1].src.swizzlew = SQ_SEL_0;

    if (!next_ins(pAsm) ||
        !assemble_src(pAsm, 0, -1)) {
        return GL_FALSE;
    }

    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_Y, SQ_SEL_Y, SQ_SEL_Y, SQ_SEL_Y);

    /* dst.z = log(src.y) */
    pAsm->D.dst.opcode = SQ_OP2_INST_LOG_CLAMPED;
    pAsm->D.dst.math   = 1;
    pAsm->D.dst.rtype  = dstType;
    pAsm->D.dst.reg    = dstReg;
    pAsm->D.dst.writex = 0;
    pAsm->D.dst.writey = 0;
    pAsm->D.dst.writez = 1;
    pAsm->D.dst.writew = 0;

    pAsm->S[0].src.rtype = srcType;
    pAsm->S[0].src.reg   = srcReg;
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);

    /* The IL source is loaded twice for MUL_LIT: into slot 0 and slot 2. */
    if (!next_ins(pAsm) ||
        !assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 0, 2)) {
        return GL_FALSE;
    }

    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_W, SQ_SEL_W, SQ_SEL_W, SQ_SEL_W);

    swizzleagain_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X, SQ_SEL_X, SQ_SEL_X, SQ_SEL_X);

    /* tmp.x = amd MUL_LIT(src.w, dst.z, src.x ) */
    pAsm->D.dst.opcode = SQ_OP3_INST_MUL_LIT;
    pAsm->D.dst.math   = 1;
    pAsm->D.dst.op3    = 1;
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = tmp;
    pAsm->D.dst.writex = 1;
    pAsm->D.dst.writey = 0;
    pAsm->D.dst.writez = 0;
    pAsm->D.dst.writew = 0;

    pAsm->S[0].src.rtype = srcType;
    pAsm->S[0].src.reg   = srcReg;
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);

    /* Slot 1 reads back the z channel of the destination register number. */
    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[1].src.reg   = dstReg;
    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
    noneg_PVSSRC(&(pAsm->S[1].src));
    pAsm->S[1].src.swizzlex = SQ_SEL_Z;
    pAsm->S[1].src.swizzley = SQ_SEL_Z;
    pAsm->S[1].src.swizzlez = SQ_SEL_Z;
    pAsm->S[1].src.swizzlew = SQ_SEL_Z;

    pAsm->S[2].src.rtype = srcType;
    pAsm->S[2].src.reg   = srcReg;
    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    /* dst.z = exp(tmp.x) */
    pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
    pAsm->D.dst.math   = 1;
    pAsm->D.dst.rtype  = dstType;
    pAsm->D.dst.reg    = dstReg;
    pAsm->D.dst.writex = 0;
    pAsm->D.dst.writey = 0;
    pAsm->D.dst.writez = 1;
    pAsm->D.dst.writew = 0;

    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp;
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    noneg_PVSSRC(&(pAsm->S[0].src));
    pAsm->S[0].src.swizzlex = SQ_SEL_X;
    pAsm->S[0].src.swizzley = SQ_SEL_X;
    pAsm->S[0].src.swizzlez = SQ_SEL_X;
    pAsm->S[0].src.swizzlew = SQ_SEL_X;

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Assemble MAX: after checkop2() succeeds, select SQ_OP2_INST_MAX, assemble
 * the destination and IL sources 0 and 1, then hand the instruction to
 * next_ins().
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_MAX(r700_AssemblerBase *pAsm)
{
    if (!checkop2(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;

    /* Short-circuit evaluation keeps the dst -> src0 -> src1 -> next_ins order. */
    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 1, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Assemble MIN: after checkop2() succeeds, select SQ_OP2_INST_MIN, assemble
 * the destination and IL sources 0 and 1, then hand the instruction to
 * next_ins().
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_MIN(r700_AssemblerBase *pAsm)
{
    if (!checkop2(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MIN;

    /* Short-circuit evaluation keeps the dst -> src0 -> src1 -> next_ins order. */
    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 1, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Assemble MOV: select SQ_OP2_INST_MOV, assemble the destination and IL
 * source 0, then hand the instruction to next_ins().
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_MOV(r700_AssemblerBase *pAsm)
{
    checkop1(pAsm);

    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    /* Short-circuit evaluation keeps the dst -> src -> next_ins order. */
    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Assemble MUL: after checkop2() succeeds, select SQ_OP2_INST_MUL, assemble
 * the destination and IL sources 0 and 1, then hand the instruction to
 * next_ins().
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_MUL(r700_AssemblerBase *pAsm)
{
    if (!checkop2(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;

    /* Short-circuit evaluation keeps the dst -> src0 -> src1 -> next_ins order. */
    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 1, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Assemble POW as four instructions going through one helper register:
 *
 *   LG2  tmp,       a.swizzle
 *   MUL  tmp,       tmp.x, b.swizzle
 *   EX2  tmp,       tmp.x
 *   MOV  dst.mask,  tmp.x
 *
 * Every source operand that reads the helper register is tagged with
 * SRC_REG_TEMPORARY (the source-side enum), including the final MOV.
 *
 * NOTE(review): this calls checkop1() although IL source 1 is consumed as
 * well; the other two-operand assemblers in this file call checkop2().
 * Confirm whether that is intentional.
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_POW(r700_AssemblerBase *pAsm)
{
    BITS tmp;

    checkop1(pAsm);

    tmp = gethelpr(pAsm);

    // LG2 tmp.x,     a.swizzle
    pAsm->D.dst.opcode = SQ_OP2_INST_LOG_IEEE;
    pAsm->D.dst.math = 1;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg   = tmp;
    nomask_PVSDST(&(pAsm->D.dst));

    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
    {
        return GL_FALSE;
    }

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // MUL tmp.x,     tmp.x, b.swizzle
    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg = tmp;
    nomask_PVSDST(&(pAsm->D.dst));

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg = tmp;
    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
    {
        return GL_FALSE;
    }

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // EX2 tmp.x,             tmp.x
    pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
    pAsm->D.dst.math = 1;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg = tmp;
    nomask_PVSDST(&(pAsm->D.dst));

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg = tmp;
    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // Now replicate result to all necessary channels in destination
    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp;

    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Thin wrapper: forwards to assemble_math_function() with the
 * SQ_OP2_INST_RECIP_IEEE opcode and returns its result unchanged.
 */
GLboolean assemble_RCP(r700_AssemblerBase *pAsm)
{
    GLboolean result = assemble_math_function(pAsm, SQ_OP2_INST_RECIP_IEEE);

    return result;
}
 
/*
 * Thin wrapper: forwards to assemble_math_function() with the
 * SQ_OP2_INST_RECIPSQRT_IEEE opcode and returns its result unchanged.
 */
GLboolean assemble_RSQ(r700_AssemblerBase *pAsm)
{
    GLboolean result = assemble_math_function(pAsm, SQ_OP2_INST_RECIPSQRT_IEEE);

    return result;
}
 
/*
 * Assemble SCS as three instructions going through one helper register:
 *
 *   MUL  tmp.x, src, 1/(2*PI)    (literal constant)
 *   COS  dst,   tmp.x            (y channel write disabled)
 *   SIN  dst,   tmp.x            (x channel write disabled)
 *
 * Every helper call is checked so that a failure while building any of the
 * three instructions is reported to the caller instead of being dropped.
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_SCS(r700_AssemblerBase *pAsm)
{
    BITS tmp;

    checkop1(pAsm);

    tmp = gethelpr(pAsm);

    /* tmp.x = src * 1/(2*PI) */
    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = tmp;
    pAsm->D.dst.writex = 1;

    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
    {
        return GL_FALSE;
    }

    /* Second operand is a literal taken from C[0]. */
    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
    pAsm->D2.dst2.literal_slots = 1;
    pAsm->C[0].f = 1/(3.1415926535 * 2);
    pAsm->C[1].f = 0.0F;

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // COS dst.x,    a.x
    pAsm->D.dst.opcode = SQ_OP2_INST_COS;
    pAsm->D.dst.math = 1;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }
    /* mask y */
    pAsm->D.dst.writey = 0;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp;
    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    // SIN dst.y,    a.x
    pAsm->D.dst.opcode = SQ_OP2_INST_SIN;
    pAsm->D.dst.math = 1;

    if( GL_FALSE == assemble_dst(pAsm) )
    {
        return GL_FALSE;
    }
    /* mask x */
    pAsm->D.dst.writex = 0;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = tmp;
    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
    noneg_PVSSRC(&(pAsm->S[0].src));

    if( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Assemble a two-operand instruction with a caller-supplied opcode: after
 * checkop2() succeeds, assemble the destination and IL sources 0 and 1, then
 * hand the instruction to next_ins().
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_LOGIC(r700_AssemblerBase *pAsm, BITS opcode)
{
    if (!checkop2(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = opcode;

    /* Short-circuit evaluation keeps the dst -> src0 -> src1 -> next_ins order. */
    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 1, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Assemble a predicated instruction with a caller-supplied opcode.
 *
 * The destination is the x channel of the assembler's help register
 * (uHelpReg).  Source slot 0 reads the last condition register (offset by
 * starting_temp_register_number), with its x swizzle taken from the low
 * three bits of the IL instruction's CondSwizzle.  Source slot 1 is the
 * help register with SQ_SEL_0 on every channel.
 *
 * Returns GL_TRUE on success, GL_FALSE if next_ins() fails.
 */
GLboolean assemble_LOGIC_PRED(r700_AssemblerBase *pAsm, BITS opcode)
{
    struct prog_instruction *pCurInst = &(pAsm->pILInst[pAsm->uiCurInst]);

    pAsm->D.dst.opcode     = opcode;
    pAsm->D.dst.math       = 1;
    pAsm->D.dst.predicated = 1;

    /* Destination: help register, x channel only. */
    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = pAsm->uHelpReg;
    pAsm->D.dst.writex = 1;
    pAsm->D.dst.writew = 0;
    pAsm->D.dst.writez = 0;
    pAsm->D.dst.writey = 0;

    /* Source slot 0: the condition register. */
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype    = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg      = pAsm->last_cond_register + pAsm->starting_temp_register_number;
    pAsm->S[0].src.swizzlex = pCurInst->DstReg.CondSwizzle & 0x7;
    noneg_PVSSRC(&(pAsm->S[0].src));

    /* Source slot 1: SQ_SEL_0 on every channel. */
    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[1].src.reg   = pAsm->uHelpReg;
    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
    noneg_PVSSRC(&(pAsm->S[1].src));
    pAsm->S[1].src.swizzlex = SQ_SEL_0;
    pAsm->S[1].src.swizzley = SQ_SEL_0;
    pAsm->S[1].src.swizzlez = SQ_SEL_0;
    pAsm->S[1].src.swizzlew = SQ_SEL_0;

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Assemble SGE: after checkop2() succeeds, select SQ_OP2_INST_SETGE,
 * assemble the destination and IL sources 0 and 1, then hand the
 * instruction to next_ins().
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_SGE(r700_AssemblerBase *pAsm)
{
    if (!checkop2(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_SETGE;

    /* Short-circuit evaluation keeps the dst -> src0 -> src1 -> next_ins order. */
    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 1, -1) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/*
 * Assemble SLT using SQ_OP2_INST_SETGT with the operands exchanged: IL
 * source 0 is assembled into slot 1 and IL source 1 into slot 0.
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_SLT(r700_AssemblerBase *pAsm)
{
    if (!checkop2(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_SETGT;

    /* Short-circuit evaluation keeps the original call order. */
    if (!assemble_dst(pAsm) ||
        !assemble_src(pAsm, 0, 1) ||
        !assemble_src(pAsm, 1, 0) ||
        !next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/* Stub: does nothing with the assembler state and always reports success. */
GLboolean assemble_STP(r700_AssemblerBase *pAsm)
{
    (void)pAsm; /* intentionally unused */

    return GL_TRUE;
}
 
/*
 * Assemble a texture instruction (sampling or gradient fetch).
 *
 * Steps, in order:
 *   1. If IL source 0 comes from a constant-like register file, copy it to a
 *      temporary with mov_temp().
 *   2. For OPCODE_TXP, compute 1/src.w into a helper and multiply src.xyz by
 *      it; the helper replaces the texture coordinate (aArgSubst[1]).
 *   3. For TEXTURE_CUBE_INDEX targets, run CUBE / RECIP / MULADD(+1.5) / MOV
 *      to build the coordinate in a helper, which again replaces the
 *      coordinate via aArgSubst[1].
 *   4. Choose the TEX opcode from the IL opcode and TexShadow flag.
 *   5. Fill in sampler unit, destination and source, and adjust the source
 *      swizzle for TXP, cube maps and shadow compares.
 *   6. For shadow lookups, append a saturated ADD of the per-unit constant
 *      from shadow_regs[].
 *
 * need_tex_barrier is set whenever any ALU pre-processing (steps 1-3) was
 * emitted before the TEX instruction.
 *
 * Every next_ins() call is checked, including those in the cube-map path.
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
{
    GLboolean src_const;
    GLboolean need_barrier = GL_FALSE;

    checkop1(pAsm);

    switch (pAsm->pILInst[pAsm->uiCurInst].SrcReg[0].File)
    {
    case PROGRAM_UNIFORM:
    case PROGRAM_CONSTANT:
    case PROGRAM_LOCAL_PARAM:
    case PROGRAM_ENV_PARAM:
    case PROGRAM_STATE_VAR:
        src_const = GL_TRUE;
        break;
    case PROGRAM_TEMPORARY:
    case PROGRAM_INPUT:
    default:
        src_const = GL_FALSE;
        break;
    }

    if (GL_TRUE == src_const)
    {
        if ( GL_FALSE == mov_temp(pAsm, 0) )
        {
            return GL_FALSE;
        }
        need_barrier = GL_TRUE;
    }

    if (pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_TXP)
    {
        GLuint tmp = gethelpr(pAsm);

        /* tmp.w = 1 / src.w */
        pAsm->D.dst.opcode = SQ_OP2_INST_RECIP_IEEE;
        pAsm->D.dst.math = 1;
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = tmp;
        pAsm->D.dst.writew = 1;

        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
        {
            return GL_FALSE;
        }
        swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_W, SQ_SEL_W, SQ_SEL_W, SQ_SEL_W);
        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }

        /* tmp.xyz = src.xyz * tmp.w */
        pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = tmp;
        pAsm->D.dst.writex = 1;
        pAsm->D.dst.writey = 1;
        pAsm->D.dst.writez = 1;
        pAsm->D.dst.writew = 0;

        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
        {
            return GL_FALSE;
        }
        setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
        pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[1].src.reg   = tmp;
        setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_W);

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }

        pAsm->aArgSubst[1] = tmp;
        need_barrier = GL_TRUE;
    }

    if (pAsm->pILInst[pAsm->uiCurInst].TexSrcTarget == TEXTURE_CUBE_INDEX )
    {
        GLuint tmp1 = gethelpr(pAsm);
        GLuint tmp2 = gethelpr(pAsm);

        /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
        pAsm->D.dst.opcode = SQ_OP2_INST_CUBE;
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = tmp1;
        nomask_PVSDST(&(pAsm->D.dst));

        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
        {
            return GL_FALSE;
        }

        if( GL_FALSE == assemble_src(pAsm, 0, 1) )
        {
            return GL_FALSE;
        }

        swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_Z, SQ_SEL_Z, SQ_SEL_X, SQ_SEL_Y);
        swizzleagain_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Y, SQ_SEL_X, SQ_SEL_Z, SQ_SEL_Z);

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }

        /* tmp1.z = RCP_e(|tmp1.z|) */
        pAsm->D.dst.opcode = SQ_OP2_INST_RECIP_IEEE;
        pAsm->D.dst.math = 1;
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = tmp1;
        pAsm->D.dst.writez = 1;

        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[0].src.reg = tmp1;
        pAsm->S[0].src.swizzlex = SQ_SEL_Z;
        pAsm->S[0].src.abs = 1;

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }

        /* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
         * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
         * muladd has no writemask, have to use another temp
         */
        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
        pAsm->D.dst.op3    = 1;
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = tmp2;

        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[0].src.reg   = tmp1;
        noswizzle_PVSSRC(&(pAsm->S[0].src));
        setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
        pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[1].src.reg   = tmp1;
        setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Z);
        setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
        /* immediate c 1.5 */
        pAsm->D2.dst2.literal_slots = 1;
        pAsm->C[0].f = 1.5F;
        pAsm->S[2].src.rtype = SRC_REC_LITERAL;
        pAsm->S[2].src.reg   = tmp1;
        setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }

        /* tmp1.xy = temp2.xy */
        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = tmp1;
        pAsm->D.dst.writex = 1;
        pAsm->D.dst.writey = 1;
        pAsm->D.dst.writez = 0;
        pAsm->D.dst.writew = 0;

        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[0].src.reg   = tmp2;
        noswizzle_PVSSRC(&(pAsm->S[0].src));

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }

        pAsm->aArgSubst[1] = tmp1;
        need_barrier = GL_TRUE;
    }

    switch(pAsm->pILInst[pAsm->uiCurInst].Opcode)
    {
        case OPCODE_DDX:
            /* will these need WQM(1) on CF inst ? */
            pAsm->D.dst.opcode = SQ_TEX_INST_GET_GRADIENTS_H;
            break;
        case OPCODE_DDY:
            pAsm->D.dst.opcode = SQ_TEX_INST_GET_GRADIENTS_V;
            break;
        case OPCODE_TXB:
            pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE_L;
            break;
        default:
            if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
                pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE_C;
            else
                pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
            break;
    }

    pAsm->is_tex = GL_TRUE;
    if ( GL_TRUE == need_barrier )
    {
        pAsm->need_tex_barrier = GL_TRUE;
    }

    // Set src1 to tex unit id
    pAsm->S[1].src.reg   = pAsm->SamplerUnits[pAsm->pILInst[pAsm->uiCurInst].TexSrcUnit];
    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;

    //No sw info from mesa compiler, so hard code here.
    pAsm->S[1].src.swizzlex = SQ_SEL_X;
    pAsm->S[1].src.swizzley = SQ_SEL_Y;
    pAsm->S[1].src.swizzlez = SQ_SEL_Z;
    pAsm->S[1].src.swizzlew = SQ_SEL_W;

    if( GL_FALSE == tex_dst(pAsm) )
    {
        return GL_FALSE;
    }

    if( GL_FALSE == tex_src(pAsm) )
    {
        return GL_FALSE;
    }

    if(pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_TXP)
    {
        /* hopefully did swizzles before */
        noswizzle_PVSSRC(&(pAsm->S[0].src));
    }

    if(pAsm->pILInst[pAsm->uiCurInst].TexSrcTarget == TEXTURE_CUBE_INDEX)
    {
        /* SAMPLE dst, tmp.yxwy, CUBE */
        pAsm->S[0].src.swizzlex = SQ_SEL_Y;
        pAsm->S[0].src.swizzley = SQ_SEL_X;
        pAsm->S[0].src.swizzlez = SQ_SEL_W;
        pAsm->S[0].src.swizzlew = SQ_SEL_Y;
    }

    if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
    {
        /* compare value goes to w chan ? */
        pAsm->S[0].src.swizzlew = SQ_SEL_Z;
    }

    if ( GL_FALSE == next_ins(pAsm) )
    {
        return GL_FALSE;
    }

    /* add ARB shadow ambient but clamp to 0..1 */
    if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
    {
        /* ADD_SAT dst,  dst,  ambient[texunit] */
        pAsm->D.dst.opcode = SQ_OP2_INST_ADD;

        if( GL_FALSE == assemble_dst(pAsm) )
        {
            return GL_FALSE;
        }
        pAsm->D2.dst2.SaturateMode = 1;

        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[0].src.reg = pAsm->D.dst.reg;
        noswizzle_PVSSRC(&(pAsm->S[0].src));
        noneg_PVSSRC(&(pAsm->S[0].src));

        pAsm->S[1].src.rtype = SRC_REG_CONSTANT;
        pAsm->S[1].src.reg = pAsm->shadow_regs[pAsm->pILInst[pAsm->uiCurInst].TexSrcUnit];
        noswizzle_PVSSRC(&(pAsm->S[1].src));
        noneg_PVSSRC(&(pAsm->S[1].src));

        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }
    }

    return GL_TRUE;
}

/*
 * Assemble XPD (cross product) as:
 *
 *   MUL     helperA, src0.zxy0, src1.yzx0
 *   MULADD  out,     src0.yzx0, src1.zxy0, -helperA
 *
 * When the IL write mask is not 0xF, the MULADD result goes to a second
 * helper register and a trailing MOV copies it into the real destination;
 * otherwise MULADD writes the destination directly.
 *
 * Returns GL_TRUE on success, GL_FALSE as soon as one of the helpers fails.
 */
GLboolean assemble_XPD(r700_AssemblerBase *pAsm)
{
    BITS helperA;
    BITS helperB = 0;

    if (!checkop2(pAsm)) {
        return GL_FALSE;
    }

    helperA = gethelpr(pAsm);

    /* First product, written to helperA with no write mask. */
    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg   = helperA;
    nomask_PVSDST(&(pAsm->D.dst));

    if (!assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 1, -1)) {
        return GL_FALSE;
    }

    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_Z, SQ_SEL_X, SQ_SEL_Y, SQ_SEL_0);
    swizzleagain_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Y, SQ_SEL_Z, SQ_SEL_X, SQ_SEL_0);

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    /* Second product minus the first, as a single MULADD. */
    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
    pAsm->D.dst.op3    = 1;

    if (0xF == pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask) {
        if (!assemble_dst(pAsm)) {
            return GL_FALSE;
        }
    } else {
        helperB = gethelpr(pAsm);

        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = helperB;

        nomask_PVSDST(&(pAsm->D.dst));
    }

    if (!assemble_src(pAsm, 0, -1) ||
        !assemble_src(pAsm, 1, -1)) {
        return GL_FALSE;
    }

    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_Y, SQ_SEL_Z, SQ_SEL_X, SQ_SEL_0);
    swizzleagain_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Z, SQ_SEL_X, SQ_SEL_Y, SQ_SEL_0);

    /* Third operand: the negated first product. */
    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
    pAsm->S[2].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[2].src.reg   = helperA;

    neg_PVSSRC(&(pAsm->S[2].src));
    noswizzle_PVSSRC(&(pAsm->S[2].src));

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    if (0xF == pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask) {
        /* MULADD already wrote the real destination. */
        return GL_TRUE;
    }

    /* MOV dst, helperB */
    if (!assemble_dst(pAsm)) {
        return GL_FALSE;
    }

    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = helperB;

    noneg_PVSSRC(&(pAsm->S[0].src));
    noswizzle_PVSSRC(&(pAsm->S[0].src));

    if (!next_ins(pAsm)) {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/* Stub: does nothing with the assembler state and always reports success. */
GLboolean assemble_EXPORT(r700_AssemblerBase *pAsm)
{
    (void)pAsm; /* intentionally unused */

    return GL_TRUE;
}

/*
 * Give back the stack entries that were charged to the current CALLSTACK
 * frame for a flow-control construct of kind uReason.  This is the inverse
 * of the non-"max only" path of checkStackDepth().
 *
 * pAsm    : assembler state; CALLSTACK[CALLSP].current is decremented.
 * uReason : FC_PUSH_VPM, FC_PUSH_WQM, FC_LOOP or FC_REP.  Any other value
 *           leaves the counter untouched.
 */
static inline void decreaseCurrent(r700_AssemblerBase *pAsm, GLuint uReason)
{
    int nRelease;

    switch (uReason)
    {
    case FC_PUSH_VPM:
        nRelease = 1;
        break;
    case FC_PUSH_WQM:
    case FC_LOOP:
        nRelease = 4;
        break;
    case FC_REP:
        /* TODO : for 16 vp asic, should be 2 */
        nRelease = 1;
        break;
    default:
        /* unknown reason : nothing was charged, nothing to release */
        return;
    }

    pAsm->CALLSTACK[pAsm->CALLSP].current -= nRelease;
}

/*
 * Account for the stack entries a flow-control construct of kind uReason
 * needs in the current CALLSTACK frame.
 *
 * bCheckMaxOnly == GL_TRUE :
 *     only the high-water mark (max) is raised, as if the entries had been
 *     pushed; 'current' is not changed.  Only FC_PUSH_VPM and FC_PUSH_WQM
 *     are handled in this mode, every other reason does nothing.
 *
 * otherwise :
 *     'current' grows by the cost of uReason (unknown reasons cost nothing)
 *     and 'max' is raised to 'current' when it has been exceeded.
 */
static inline void checkStackDepth(r700_AssemblerBase *pAsm, GLuint uReason, GLboolean bCheckMaxOnly)
{
    int nCost = 0;

    if(bCheckMaxOnly == GL_TRUE)
    {
        switch (uReason)
        {
        case FC_PUSH_VPM:
            nCost = 1;
            break;
        case FC_PUSH_WQM:
            nCost = 4;
            break;
        default:
            /* reason not tracked in peek mode */
            return;
        }

        if((pAsm->CALLSTACK[pAsm->CALLSP].current + nCost)
                > pAsm->CALLSTACK[pAsm->CALLSP].max)
        {
            pAsm->CALLSTACK[pAsm->CALLSP].max =
                pAsm->CALLSTACK[pAsm->CALLSP].current + nCost;
        }
        return;
    }

    switch (uReason)
    {
    case FC_PUSH_VPM:
        nCost = 1;
        break;
    case FC_PUSH_WQM:
    case FC_LOOP:
        nCost = 4;
        break;
    case FC_REP:
        /* TODO : for 16 vp asic, should be 2 */
        nCost = 1;
        break;
    default:
        nCost = 0;
        break;
    }

    pAsm->CALLSTACK[pAsm->CALLSP].current += nCost;

    /* keep the high-water mark in sync with the new depth */
    if(pAsm->CALLSTACK[pAsm->CALLSP].max
         < pAsm->CALLSTACK[pAsm->CALLSP].current)
    {
        pAsm->CALLSTACK[pAsm->CALLSP].max =
            pAsm->CALLSTACK[pAsm->CALLSP].current;
    }
}

/*
 * Append a CF JUMP instruction whose target is expressed relative to the
 * jump itself.
 *
 * pAsm   : assembler state; a new CF instruction is added to it.
 * pops   : value stored in the instruction's pop_count field.
 * offset : added to the new instruction's own index (m_uIndex) to form the
 *          jump address.
 *
 * Returns GL_FALSE when the CF instruction could not be added, GL_TRUE
 * otherwise.
 */
GLboolean jumpToOffest(r700_AssemblerBase *pAsm, GLuint pops, GLint offset)
{
    R700ControlFlowGenericClause *pJump;

    if(add_cf_instruction(pAsm) == GL_FALSE)
    {
        return GL_FALSE;
    }

    /* add_cf_instruction() made the new clause the current one */
    pJump = pAsm->cf_current_cf_clause_ptr;

    pJump->m_Word1.f.cf_inst          = SQ_CF_INST_JUMP;
    pJump->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
    pJump->m_Word1.f.pop_count        = pops;
    pJump->m_Word1.f.cf_const         = 0;
    pJump->m_Word1.f.end_of_program   = 0;
    pJump->m_Word1.f.valid_pixel_mode = 0;
    pJump->m_Word1.f.whole_quad_mode  = 0;
    pJump->m_Word1.f.barrier          = 1;

    /* target = index of this jump + caller supplied offset */
    pJump->m_Word0.f.addr = pJump->m_uIndex + offset;

    return GL_TRUE;
}

/*
 * Append a CF POP instruction.
 *
 * pAsm : assembler state; a new CF instruction is added to it.
 * pops : value stored in the instruction's pop_count field.
 *
 * The address field is set to the index of the instruction that follows
 * the POP (its own m_uIndex + 1).
 *
 * Returns GL_FALSE when the CF instruction could not be added, GL_TRUE
 * otherwise.
 */
GLboolean pops(r700_AssemblerBase *pAsm, GLuint pops)
{
    R700ControlFlowGenericClause *pPop;

    if(add_cf_instruction(pAsm) == GL_FALSE)
    {
        return GL_FALSE;
    }

    /* add_cf_instruction() made the new clause the current one */
    pPop = pAsm->cf_current_cf_clause_ptr;

    pPop->m_Word1.f.cf_inst          = SQ_CF_INST_POP;
    pPop->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
    pPop->m_Word1.f.pop_count        = pops;
    pPop->m_Word1.f.cf_const         = 0;
    pPop->m_Word1.f.end_of_program   = 0;
    pPop->m_Word1.f.valid_pixel_mode = 0;
    pPop->m_Word1.f.whole_quad_mode  = 0;
    pPop->m_Word1.f.barrier          = 1;

    pPop->m_Word0.f.addr = pPop->m_uIndex + 1;

    return GL_TRUE;
}

/*
 * Assemble the opening of an IF construct.
 *
 * The predicate is evaluated with PRED_SETNE in an ALU_PUSH_BEFORE clause,
 * then a CF JUMP is emitted.  The jump address is not known yet: it is
 * filled in later by assemble_ELSE() / assemble_ENDIF() through the FC_IF
 * frame pushed on fc_stack here.
 *
 * pAsm     : assembler state.
 * bHasElse : GL_TRUE when the IF has an ELSE branch; in that case the JUMP
 *            gets pop_count 0, otherwise pop_count 1.
 *
 * Returns GL_FALSE when the CF instruction could not be added, GL_TRUE
 * otherwise.
 */
GLboolean assemble_IF(r700_AssemblerBase *pAsm, GLboolean bHasElse)
{
    R700ControlFlowGenericClause *pJump;

    pAsm->alu_x_opcode = SQ_CF_INST_ALU_PUSH_BEFORE;

    /* NOTE(review): the result of assemble_LOGIC_PRED is not checked here. */
    assemble_LOGIC_PRED(pAsm, SQ_OP2_INST_PRED_SETNE);

    if(add_cf_instruction(pAsm) == GL_FALSE)
    {
        return GL_FALSE;
    }

    pJump = pAsm->cf_current_cf_clause_ptr;

    pJump->m_Word1.f.pop_count        = (GL_TRUE != bHasElse) ? 1 : 0;
    pJump->m_Word1.f.cf_inst          = SQ_CF_INST_JUMP;
    pJump->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
    pJump->m_Word1.f.cf_const         = 0;
    pJump->m_Word1.f.end_of_program   = 0;
    pJump->m_Word1.f.valid_pixel_mode = 0;
    pJump->m_Word1.f.whole_quad_mode  = 0;
    pJump->m_Word1.f.barrier          = 1;

    /* open an IF frame that remembers the jump to be patched */
    pAsm->FCSP++;
    pAsm->fc_stack[pAsm->FCSP].type   = FC_IF;
    pAsm->fc_stack[pAsm->FCSP].first  = pJump;
    pAsm->fc_stack[pAsm->FCSP].mid    = NULL;
    pAsm->fc_stack[pAsm->FCSP].midLen = 0;

#ifndef USE_CF_FOR_POP_AFTER
    if(GL_TRUE != bHasElse)
    {
        pAsm->alu_x_opcode = SQ_CF_INST_ALU_POP_AFTER;
    }
#endif /* USE_CF_FOR_POP_AFTER */

    checkStackDepth(pAsm, FC_PUSH_VPM, GL_FALSE);

    return GL_TRUE;
}

/*
 * Assemble the ELSE of an IF construct.
 *
 * A CF ELSE instruction is emitted and remembered in the current FC_IF
 * frame (mid[0]) so that assemble_ENDIF() can patch its address.  The JUMP
 * emitted by assemble_IF() (frame member 'first') is patched to the index
 * of the last CF instruction in the active list (uNumOfNode - 1).
 *
 * pAsm : assembler state; its top fc_stack frame must be an FC_IF frame.
 *
 * Returns GL_FALSE when the top frame is not an IF, when the CF
 * instruction could not be added or when the 'mid' array could not be
 * allocated; GL_TRUE otherwise.
 */
GLboolean assemble_ELSE(r700_AssemblerBase *pAsm)
{
    R700ControlFlowGenericClause *pElse;

    /* 'first' and 'mid' are only meaningful for an IF frame */
    if(FC_IF != pAsm->fc_stack[pAsm->FCSP].type)
    {
        radeon_error("else in shader code is not inside if/endif pair. \n");
        return GL_FALSE;
    }

    if(GL_FALSE == add_cf_instruction(pAsm) )
    {
        return GL_FALSE;
    }

    pElse = pAsm->cf_current_cf_clause_ptr;

    pElse->m_Word1.f.pop_count        = 1;
    pElse->m_Word1.f.cf_const         = 0x0;
    pElse->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;

    pElse->m_Word1.f.end_of_program   = 0x0;
    pElse->m_Word1.f.valid_pixel_mode = 0x0;
    pElse->m_Word1.f.cf_inst          = SQ_CF_INST_ELSE;
    pElse->m_Word1.f.whole_quad_mode  = 0x0;

    pElse->m_Word1.f.barrier          = 0x1;

    /* one-entry array holding the ELSE instruction for assemble_ENDIF() */
    pAsm->fc_stack[pAsm->FCSP].mid = (R700ControlFlowGenericClause **)_mesa_realloc( (void *)pAsm->fc_stack[pAsm->FCSP].mid,
                                                                                     0,
                                                                                     sizeof(R700ControlFlowGenericClause *) );
    if(NULL == pAsm->fc_stack[pAsm->FCSP].mid)
    {
        /* allocation failed : do not write through a NULL array */
        return GL_FALSE;
    }
    pAsm->fc_stack[pAsm->FCSP].mid[0] = pElse;

#ifndef USE_CF_FOR_POP_AFTER
    pAsm->alu_x_opcode = SQ_CF_INST_ALU_POP_AFTER;
#endif /* USE_CF_FOR_POP_AFTER */

    /* the IF's jump lands on the instruction just emitted */
    pAsm->fc_stack[pAsm->FCSP].first->m_Word0.f.addr = pAsm->pR700Shader->plstCFInstructions_active->uNumOfNode - 1;

    return GL_TRUE;
}

/*
 * Assemble the end of an IF construct.
 *
 * The pending jump of the construct is patched to the current number of
 * CF instructions in the active list: the ELSE instruction (mid[0]) when
 * there was an ELSE, the IF's JUMP (first) otherwise.  The FC_IF frame is
 * then popped and the stack entry charged by assemble_IF() is released.
 *
 * pAsm : assembler state; its top fc_stack frame must be an FC_IF frame.
 *
 * Returns GL_FALSE when the top frame is not an IF or when the POP
 * instruction could not be added; GL_TRUE otherwise.
 */
GLboolean assemble_ENDIF(r700_AssemblerBase *pAsm)
{
    /* Validate the frame before touching 'first' / 'mid': frames of other
     * kinds (e.g. the FC_REP frame pushed by assemble_BGNSUB) never set
     * those members. */
    if(pAsm->fc_stack[pAsm->FCSP].type != FC_IF)
    {
        radeon_error("if/endif in shader code are not paired. \n");
        return GL_FALSE;
    }

#ifdef USE_CF_FOR_POP_AFTER
    if(GL_FALSE == pops(pAsm, 1))
    {
        return GL_FALSE;
    }
#endif /* USE_CF_FOR_POP_AFTER */

    pAsm->alu_x_opcode = SQ_CF_INST_ALU;

    if(NULL == pAsm->fc_stack[pAsm->FCSP].mid)
    {
        /* no else in between */
        pAsm->fc_stack[pAsm->FCSP].first->m_Word0.f.addr = pAsm->pR700Shader->plstCFInstructions_active->uNumOfNode;
    }
    else
    {
        pAsm->fc_stack[pAsm->FCSP].mid[0]->m_Word0.f.addr = pAsm->pR700Shader->plstCFInstructions_active->uNumOfNode;

        FREE(pAsm->fc_stack[pAsm->FCSP].mid);
        /* do not leave a dangling pointer in the (reusable) stack slot */
        pAsm->fc_stack[pAsm->FCSP].mid = NULL;
    }

    pAsm->FCSP--;

    decreaseCurrent(pAsm, FC_PUSH_VPM);

    return GL_TRUE;
}

/*
 * Assemble the start of a loop.
 *
 * Emits a CF LOOP_START_NO_AL instruction and pushes an FC_LOOP frame on
 * fc_stack.  The frame keeps the loop-start instruction in 'first' and an
 * initially empty 'mid' array, which assemble_BRK / assemble_CONT /
 * breakLoopOnFlag fill and assemble_ENDLOOP consumes.
 *
 * pAsm : assembler state.
 *
 * Returns GL_FALSE when the CF instruction could not be added, GL_TRUE
 * otherwise.
 */
GLboolean assemble_BGNLOOP(r700_AssemblerBase *pAsm)
{
    R700ControlFlowGenericClause *pLoopStart;

    if(add_cf_instruction(pAsm) == GL_FALSE)
    {
        return GL_FALSE;
    }

    pLoopStart = pAsm->cf_current_cf_clause_ptr;

    pLoopStart->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_START_NO_AL;
    pLoopStart->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
    pLoopStart->m_Word1.f.pop_count        = 0;
    pLoopStart->m_Word1.f.cf_const         = 0;
    pLoopStart->m_Word1.f.end_of_program   = 0;
    pLoopStart->m_Word1.f.valid_pixel_mode = 0;
    pLoopStart->m_Word1.f.whole_quad_mode  = 0;
    pLoopStart->m_Word1.f.barrier          = 1;

    /* open the loop frame; the start address is patched at ENDLOOP */
    pAsm->FCSP++;
    pAsm->fc_stack[pAsm->FCSP].type     = FC_LOOP;
    pAsm->fc_stack[pAsm->FCSP].first    = pLoopStart;
    pAsm->fc_stack[pAsm->FCSP].mid      = NULL;
    pAsm->fc_stack[pAsm->FCSP].unNumMid = 0;
    pAsm->fc_stack[pAsm->FCSP].midLen   = 0;

    checkStackDepth(pAsm, FC_LOOP, GL_FALSE);

    return GL_TRUE;
}

/*
 * Assemble a conditional BRK.
 *
 * With USE_CF_FOR_CONTINUE_BREAK defined, the predicate is evaluated with
 * PRED_SETNE in an ALU_PUSH_BEFORE clause, then a CF LOOP_BREAK followed
 * by a CF POP is emitted.  The LOOP_BREAK is recorded in the 'mid' array of
 * the innermost enclosing FC_LOOP frame so that assemble_ENDLOOP() can
 * patch its address.  Without the define nothing is emitted.
 *
 * pAsm : assembler state.
 *
 * Returns GL_FALSE when there is no enclosing loop, when a CF instruction
 * could not be added or when the 'mid' array could not be grown; GL_TRUE
 * otherwise.
 */
GLboolean assemble_BRK(r700_AssemblerBase *pAsm)
{
#ifdef USE_CF_FOR_CONTINUE_BREAK
    unsigned int unFCSP;
    R700ControlFlowGenericClause *pBreak;

    pAsm->alu_x_opcode = SQ_CF_INST_ALU_PUSH_BEFORE;

    assemble_LOGIC_PRED(pAsm, SQ_OP2_INST_PRED_SETNE);

    /* Walk down the flow-control stack to the innermost loop frame.
     * Slot 0 is never examined, so ending at 0 means "no loop found". */
    for(unFCSP=pAsm->FCSP; unFCSP>0; unFCSP--)
    {
        if(FC_LOOP == pAsm->fc_stack[unFCSP].type)
        {
            break;
        }
    }
    if(0 == unFCSP)
    {
        radeon_error("Break is not inside loop/endloop pair.\n");
        return GL_FALSE;
    }

    if(GL_FALSE == add_cf_instruction(pAsm) )
    {
        return GL_FALSE;
    }

    pBreak = pAsm->cf_current_cf_clause_ptr;

    pBreak->m_Word1.f.pop_count        = 1;
    pBreak->m_Word1.f.cf_const         = 0x0;
    pBreak->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;

    pBreak->m_Word1.f.end_of_program   = 0x0;
    pBreak->m_Word1.f.valid_pixel_mode = 0x0;
    pBreak->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_BREAK;

    pBreak->m_Word1.f.whole_quad_mode  = 0x0;

    pBreak->m_Word1.f.barrier          = 0x1;

    /* remember the break so ENDLOOP can fill in its address */
    pAsm->fc_stack[unFCSP].mid = (R700ControlFlowGenericClause **)_mesa_realloc(
                                              (void *)pAsm->fc_stack[unFCSP].mid,
                                              sizeof(R700ControlFlowGenericClause *) * pAsm->fc_stack[unFCSP].unNumMid,
                                              sizeof(R700ControlFlowGenericClause *) * (pAsm->fc_stack[unFCSP].unNumMid + 1) );
    if(NULL == pAsm->fc_stack[unFCSP].mid)
    {
        /* mid is NULL now : drop the count too so that ENDLOOP does not
         * index a NULL array */
        pAsm->fc_stack[unFCSP].unNumMid = 0;
        return GL_FALSE;
    }
    pAsm->fc_stack[unFCSP].mid[pAsm->fc_stack[unFCSP].unNumMid] = pBreak;
    pAsm->fc_stack[unFCSP].unNumMid++;

    /* same POP as the one emitted by pops(): pop_count 1, address of the
     * following instruction */
    if(GL_FALSE == pops(pAsm, 1))
    {
        return GL_FALSE;
    }

    checkStackDepth(pAsm, FC_PUSH_VPM, GL_TRUE);

#endif /* USE_CF_FOR_CONTINUE_BREAK */
    return GL_TRUE;
}

/*
 * Assemble a conditional CONT (continue).
 *
 * With USE_CF_FOR_CONTINUE_BREAK defined, the predicate is evaluated with
 * PRED_SETNE in an ALU_PUSH_BEFORE clause, then a CF LOOP_CONTINUE
 * followed by a CF POP is emitted.  The LOOP_CONTINUE is recorded in the
 * 'mid' array of the innermost enclosing FC_LOOP frame so that
 * assemble_ENDLOOP() can patch its address.  Without the define nothing is
 * emitted.
 *
 * pAsm : assembler state.
 *
 * Returns GL_FALSE when there is no enclosing loop, when a CF instruction
 * could not be added or when the 'mid' array could not be grown; GL_TRUE
 * otherwise.
 */
GLboolean assemble_CONT(r700_AssemblerBase *pAsm)
{
#ifdef USE_CF_FOR_CONTINUE_BREAK
    unsigned int unFCSP;
    R700ControlFlowGenericClause *pContinue;

    pAsm->alu_x_opcode = SQ_CF_INST_ALU_PUSH_BEFORE;

    assemble_LOGIC_PRED(pAsm, SQ_OP2_INST_PRED_SETNE);

    /* Walk down the flow-control stack to the innermost loop frame.
     * Slot 0 is never examined, so ending at 0 means "no loop found". */
    for(unFCSP=pAsm->FCSP; unFCSP>0; unFCSP--)
    {
        if(FC_LOOP == pAsm->fc_stack[unFCSP].type)
        {
            break;
        }
    }
    if(0 == unFCSP)
    {
        radeon_error("Continue is not inside loop/endloop pair.\n");
        return GL_FALSE;
    }

    if(GL_FALSE == add_cf_instruction(pAsm) )
    {
        return GL_FALSE;
    }

    pContinue = pAsm->cf_current_cf_clause_ptr;

    pContinue->m_Word1.f.pop_count        = 1;
    pContinue->m_Word1.f.cf_const         = 0x0;
    pContinue->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;

    pContinue->m_Word1.f.end_of_program   = 0x0;
    pContinue->m_Word1.f.valid_pixel_mode = 0x0;
    pContinue->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_CONTINUE;

    pContinue->m_Word1.f.whole_quad_mode  = 0x0;

    pContinue->m_Word1.f.barrier          = 0x1;

    /* remember the continue so ENDLOOP can fill in its address */
    pAsm->fc_stack[unFCSP].mid = (R700ControlFlowGenericClause **)_mesa_realloc(
                                              (void *)pAsm->fc_stack[unFCSP].mid,
                                              sizeof(R700ControlFlowGenericClause *) * pAsm->fc_stack[unFCSP].unNumMid,
                                              sizeof(R700ControlFlowGenericClause *) * (pAsm->fc_stack[unFCSP].unNumMid + 1) );
    if(NULL == pAsm->fc_stack[unFCSP].mid)
    {
        /* mid is NULL now : drop the count too so that ENDLOOP does not
         * index a NULL array */
        pAsm->fc_stack[unFCSP].unNumMid = 0;
        return GL_FALSE;
    }
    pAsm->fc_stack[unFCSP].mid[pAsm->fc_stack[unFCSP].unNumMid] = pContinue;
    pAsm->fc_stack[unFCSP].unNumMid++;

    /* same POP as the one emitted by pops(): pop_count 1, address of the
     * following instruction */
    if(GL_FALSE == pops(pAsm, 1))
    {
        return GL_FALSE;
    }

    checkStackDepth(pAsm, FC_PUSH_VPM, GL_TRUE);

#endif /* USE_CF_FOR_CONTINUE_BREAK */

    return GL_TRUE;
}

/*
 * Assemble the end of a loop.
 *
 * Emits a CF LOOP_END and cross-links it with the loop start kept in the
 * FC_LOOP frame: LOOP_END points at the instruction after the loop start,
 * the loop start points at the instruction after LOOP_END.  With
 * USE_CF_FOR_CONTINUE_BREAK every break/continue recorded in 'mid' is
 * pointed at the LOOP_END itself and the array is released.
 *
 * When a RET inside the loop is pending (HAS_CURRENT_LOOPRET set in
 * unCFflags), the return is propagated outwards: if another loop of the
 * same subroutine encloses this one, a flag-controlled break out of that
 * loop is emitted; otherwise a flag-controlled return is emitted and the
 * pending bit is cleared.
 *
 * pAsm : assembler state; its top fc_stack frame must be an FC_LOOP frame.
 *
 * Returns GL_FALSE when the top frame is not a loop or when any of the
 * emitted instruction sequences fails; GL_TRUE otherwise.
 */
GLboolean assemble_ENDLOOP(r700_AssemblerBase *pAsm)
{
    GLuint i;
    GLuint unFCSP;
    GLuint unIF = 0;
    R700ControlFlowGenericClause *pLoopEnd;

    /* Validate the frame before touching 'first' / 'mid': frames of other
     * kinds (e.g. the FC_REP frame pushed by assemble_BGNSUB) never set
     * those members. */
    if(pAsm->fc_stack[pAsm->FCSP].type != FC_LOOP)
    {
        radeon_error("loop/endloop in shader code are not paired. \n");
        return GL_FALSE;
    }

    if(GL_FALSE == add_cf_instruction(pAsm) )
    {
        return GL_FALSE;
    }

    pLoopEnd = pAsm->cf_current_cf_clause_ptr;

    pLoopEnd->m_Word1.f.pop_count        = 0;
    pLoopEnd->m_Word1.f.cf_const         = 0x0;
    pLoopEnd->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;

    pLoopEnd->m_Word1.f.end_of_program   = 0x0;
    pLoopEnd->m_Word1.f.valid_pixel_mode = 0x0;
    pLoopEnd->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_END;
    pLoopEnd->m_Word1.f.whole_quad_mode  = 0x0;

    pLoopEnd->m_Word1.f.barrier          = 0x1;

    /* cross-link loop start and loop end */
    pLoopEnd->m_Word0.f.addr                         = pAsm->fc_stack[pAsm->FCSP].first->m_uIndex + 1;
    pAsm->fc_stack[pAsm->FCSP].first->m_Word0.f.addr = pLoopEnd->m_uIndex + 1;

#ifdef USE_CF_FOR_CONTINUE_BREAK
    /* every recorded break / continue targets the LOOP_END */
    for(i=0; i<pAsm->fc_stack[pAsm->FCSP].unNumMid; i++)
    {
        pAsm->fc_stack[pAsm->FCSP].mid[i]->m_Word0.f.addr = pLoopEnd->m_uIndex;
    }
    if(NULL != pAsm->fc_stack[pAsm->FCSP].mid)
    {
        FREE(pAsm->fc_stack[pAsm->FCSP].mid);
        /* do not leave a dangling pointer in the (reusable) stack slot */
        pAsm->fc_stack[pAsm->FCSP].mid      = NULL;
        pAsm->fc_stack[pAsm->FCSP].unNumMid = 0;
    }
#endif

    if((pAsm->unCFflags & HAS_CURRENT_LOOPRET) > 0)
    {
        /* look for an enclosing loop inside the current subroutine,
         * counting the IF frames passed on the way */
        for(unFCSP=(pAsm->FCSP-1); unFCSP>pAsm->CALLSTACK[pAsm->CALLSP].FCSP_BeforeEntry; unFCSP--)
        {
            if(FC_LOOP == pAsm->fc_stack[unFCSP].type)
            {
                if(GL_FALSE == breakLoopOnFlag(pAsm, unFCSP))
                {
                    return GL_FALSE;
                }
                break;
            }
            else if(FC_IF == pAsm->fc_stack[unFCSP].type)
            {
                unIF++;
            }
        }
        if(unFCSP <= pAsm->CALLSTACK[pAsm->CALLSP].FCSP_BeforeEntry)
        {
            /* this was the outermost loop : return from here */
#ifdef USE_CF_FOR_POP_AFTER
            const GLuint unPops = unIF;
#else
            const GLuint unPops = 0;
#endif /* USE_CF_FOR_POP_AFTER */

            if(GL_FALSE == returnOnFlag(pAsm, unPops))
            {
                return GL_FALSE;
            }
            pAsm->unCFflags &= ~HAS_CURRENT_LOOPRET;
        }
    }

    pAsm->FCSP--;

    decreaseCurrent(pAsm, FC_LOOP);

    return GL_TRUE;
}

/*
 * Append a CF RETURN instruction with pop_count 0.
 *
 * pAsm : assembler state; a new CF instruction is added to it.
 *
 * Nothing is emitted (and nothing is reported) when the CF instruction
 * cannot be added.
 */
void add_return_inst(r700_AssemblerBase *pAsm)
{
    R700ControlFlowGenericClause *pReturn;

    if(add_cf_instruction(pAsm) == GL_FALSE)
    {
        return;
    }

    pReturn = pAsm->cf_current_cf_clause_ptr;

    pReturn->m_Word1.f.cf_inst          = SQ_CF_INST_RETURN;
    pReturn->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
    pReturn->m_Word1.f.pop_count        = 0;
    pReturn->m_Word1.f.cf_const         = 0;
    pReturn->m_Word1.f.end_of_program   = 0;
    pReturn->m_Word1.f.valid_pixel_mode = 0;
    pReturn->m_Word1.f.whole_quad_mode  = 0;
    pReturn->m_Word1.f.barrier          = 1;
}

/*
 * Assemble the start of a subroutine.
 *
 * A new SUB_OFFSET descriptor is appended to pAsm->subs (the array grows by
 * 10 entries when full), a new CALLSTACK frame is pushed for it, and the
 * subroutine's own CF instruction list becomes the active list of the
 * shader.  An FC_REP frame is pushed on fc_stack; assemble_ENDSUB() expects
 * to find it.
 *
 * pAsm       : assembler state.
 * nILindex   : index of the BGNSUB instruction.
 * uiIL_Shift : added to nILindex to form the offset stored in the
 *              descriptor (subIL_Offset).
 *
 * Returns GL_FALSE when the descriptor array could not be grown, GL_TRUE
 * otherwise.
 *
 * NOTE(review): CALLSTACK frames keep a pointer into pAsm->subs; it looks
 * like growing the array while an outer subroutine frame is still open
 * would leave that pointer stale - confirm.
 */
GLboolean assemble_BGNSUB(r700_AssemblerBase *pAsm, GLint nILindex, GLuint uiIL_Shift)
{
    SUB_OFFSET *pSub;

    /* make room for one more descriptor */
    if( pAsm->unSubArraySize < (pAsm->unSubArrayPointer + 1) )
    {
        pAsm->subs = (SUB_OFFSET*)_mesa_realloc( (void *)pAsm->subs,
                                  sizeof(SUB_OFFSET) * pAsm->unSubArraySize,
                                  sizeof(SUB_OFFSET) * (pAsm->unSubArraySize + 10) );
        if(pAsm->subs == NULL)
        {
            return GL_FALSE;
        }
        pAsm->unSubArraySize += 10;
    }

    /* fresh descriptor with an empty CF instruction list */
    pSub = &(pAsm->subs[pAsm->unSubArrayPointer]);
    pSub->subIL_Offset                       = nILindex + uiIL_Shift;
    pSub->lstCFInstructions_local.pHead      = NULL;
    pSub->lstCFInstructions_local.pTail      = NULL;
    pSub->lstCFInstructions_local.uNumOfNode = 0;

    /* new call-stack frame, CF instructions now go to the sub's list */
    pAsm->CALLSP++;
    pAsm->CALLSTACK[pAsm->CALLSP].subDescIndex            = pAsm->unSubArrayPointer;
    pAsm->CALLSTACK[pAsm->CALLSP].FCSP_BeforeEntry        = pAsm->FCSP;
    pAsm->CALLSTACK[pAsm->CALLSP].plstCFInstructions_local = &(pSub->lstCFInstructions_local);
    pAsm->CALLSTACK[pAsm->CALLSP].max                     = 0;
    pAsm->CALLSTACK[pAsm->CALLSP].current                 = 0;
    SetActiveCFlist(pAsm->pR700Shader,
                    pAsm->CALLSTACK[pAsm->CALLSP].plstCFInstructions_local);

    pAsm->unSubArrayPointer++;

    /* start sub */
    pAsm->alu_x_opcode = SQ_CF_INST_ALU;

    pAsm->FCSP++;
    pAsm->fc_stack[pAsm->FCSP].type = FC_REP;

    checkStackDepth(pAsm, FC_REP, GL_FALSE);

    return GL_TRUE;
}

/*
 * Assemble the end of a subroutine.
 *
 * The FC_REP frame pushed by assemble_BGNSUB() is popped, the maximum
 * stack depth reached inside the subroutine is stored in its descriptor,
 * the CALLSTACK frame is popped and the caller's CF instruction list
 * becomes the active list again.
 *
 * pAsm : assembler state; its top fc_stack frame must be an FC_REP frame.
 *
 * Returns GL_FALSE when the top frame is not FC_REP, GL_TRUE otherwise.
 */
GLboolean assemble_ENDSUB(r700_AssemblerBase *pAsm)
{
    if(FC_REP != pAsm->fc_stack[pAsm->FCSP].type)
    {
        radeon_error("BGNSUB/ENDSUB in shader code are not paired. \n");
        return GL_FALSE;
    }

    /* close the FC_REP frame; what follows goes to a plain ALU clause */
    pAsm->FCSP--;
    pAsm->alu_x_opcode = SQ_CF_INST_ALU;

    /* copy max to sub structure before the call-stack frame goes away */
    pAsm->subs[pAsm->CALLSTACK[pAsm->CALLSP].subDescIndex].unStackDepthMax
        = pAsm->CALLSTACK[pAsm->CALLSP].max;

    decreaseCurrent(pAsm, FC_REP);

    /* back to the caller's frame and its CF instruction list */
    pAsm->CALLSP--;
    SetActiveCFlist(pAsm->pR700Shader,
                    pAsm->CALLSTACK[pAsm->CALLSP].plstCFInstructions_local);

    return GL_TRUE;
}

/*
 * Assemble a RET.
 *
 * Inside a subroutine (CALLSP > 0) the flow-control frames opened since
 * the subroutine was entered are scanned from the innermost outwards:
 *   - if a loop frame is found, the return cannot be emitted directly.
 *     The return flag is set (SQ_SEL_1), a flag-controlled break out of
 *     that loop is emitted and LOOPRET_FLAGS is or'ed into unCFflags;
 *   - IF frames met before any loop are counted.
 * When no loop is in the way, the counted IF frames are popped (only with
 * USE_CF_FOR_POP_AFTER) and a CF RETURN is emitted.
 *
 * pAsm : assembler state.
 *
 * Returns GL_FALSE when one of the emitted instruction sequences fails,
 * GL_TRUE otherwise.  A failure inside add_return_inst() is not reported
 * because that function returns nothing.
 */
GLboolean assemble_RET(r700_AssemblerBase *pAsm)
{
    GLuint unIF = 0;

    if(pAsm->CALLSP > 0)
    {   /* in sub */
        GLuint unFCSP;
        for(unFCSP=pAsm->FCSP; unFCSP>pAsm->CALLSTACK[pAsm->CALLSP].FCSP_BeforeEntry; unFCSP--)
        {
            if(FC_LOOP == pAsm->fc_stack[unFCSP].type)
            {
                /* return from inside a loop : raise the flag and leave
                 * the loop; propagate any emission failure */
                if(GL_FALSE == setRetInLoopFlag(pAsm, SQ_SEL_1))
                {
                    return GL_FALSE;
                }
                if(GL_FALSE == breakLoopOnFlag(pAsm, unFCSP))
                {
                    return GL_FALSE;
                }
                pAsm->unCFflags |= LOOPRET_FLAGS;

                return GL_TRUE;
            }
            else if(FC_IF == pAsm->fc_stack[unFCSP].type)
            {
                unIF++;
            }
        }
    }

#ifdef USE_CF_FOR_POP_AFTER
    if(unIF > 0)
    {
        if(GL_FALSE == pops(pAsm, unIF))
        {
            return GL_FALSE;
        }
    }
#endif /* USE_CF_FOR_POP_AFTER */

    add_return_inst(pAsm);

    return GL_TRUE;
}

/*
 * Assemble a CAL (subroutine call).
 *
 * A CF CALL instruction is emitted and recorded in pAsm->callers together
 * with the offset of the callee (nILindex + uiIL_Shift); the array grows by
 * 10 entries when full.  If a subroutine with that offset is already in
 * pAsm->subs the call is simply bound to it.  Otherwise the subroutine is
 * assembled now through AssembleInstr().  In both cases the caller's
 * maximum stack depth is raised to "current depth + callee maximum" when
 * that is larger.
 *
 * pAsm          : assembler state.
 * nILindex      : index of the callee's first instruction; also passed to
 *                 AssembleInstr() as first instruction.
 * uiIL_Shift    : added to nILindex to form the callee offset.
 * uiNumberInsts : passed through to AssembleInstr().
 * pILInst       : instruction array passed through to AssembleInstr().
 * pPresubDesc   : stored in the descriptor of a newly assembled subroutine.
 *
 * Returns GL_FALSE when the CF instruction could not be added or the
 * caller array could not be grown; otherwise GL_TRUE for an already known
 * subroutine, or the result of AssembleInstr() for a new one.
 */
GLboolean assemble_CAL(r700_AssemblerBase *pAsm,
                       GLint nILindex,
                       GLuint uiIL_Shift,
                       GLuint uiNumberInsts,
                       struct prog_instruction *pILInst,
                       PRESUB_DESC * pPresubDesc)
{
    R700ControlFlowGenericClause *pCall;
    CALLER_POINTER *pCaller;
    GLint     uiIL_Offset;
    GLuint    unDepth;
    GLuint    unSubID;
    GLboolean bRet;
    int       j;

    pAsm->alu_x_opcode = SQ_CF_INST_ALU;

    if(add_cf_instruction(pAsm) == GL_FALSE)
    {
        return GL_FALSE;
    }

    pCall = pAsm->cf_current_cf_clause_ptr;

    pCall->m_Word1.f.cf_inst          = SQ_CF_INST_CALL;
    pCall->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
    pCall->m_Word1.f.call_count       = 1;
    pCall->m_Word1.f.pop_count        = 0;
    pCall->m_Word1.f.cf_const         = 0;
    pCall->m_Word1.f.end_of_program   = 0;
    pCall->m_Word1.f.valid_pixel_mode = 0;
    pCall->m_Word1.f.whole_quad_mode  = 0;
    pCall->m_Word1.f.barrier          = 1;

    /* Put in caller : make room for one more entry first */
    if( pAsm->unCallerArraySize < (pAsm->unCallerArrayPointer + 1) )
    {
        pAsm->callers = (CALLER_POINTER*)_mesa_realloc( (void *)pAsm->callers,
                       sizeof(CALLER_POINTER) * pAsm->unCallerArraySize,
                       sizeof(CALLER_POINTER) * (pAsm->unCallerArraySize + 10) );
        if(pAsm->callers == NULL)
        {
            return GL_FALSE;
        }
        pAsm->unCallerArraySize += 10;
    }

    uiIL_Offset = nILindex + uiIL_Shift;

    pCaller = &(pAsm->callers[pAsm->unCallerArrayPointer]);
    pCaller->subIL_Offset   = uiIL_Offset;
    pCaller->cf_ptr         = pCall;
    pCaller->finale_cf_ptr  = NULL;
    pCaller->prelude_cf_ptr = NULL;

    pAsm->unCallerArrayPointer++;

    /* was the callee compiled before ? */
    for(j=0; j<pAsm->unSubArrayPointer; j++)
    {
        if(uiIL_Offset != pAsm->subs[j].subIL_Offset)
        {
            continue;
        }

        unDepth = pAsm->subs[j].unStackDepthMax
                + pAsm->CALLSTACK[pAsm->CALLSP].current;
        if(unDepth > pAsm->CALLSTACK[pAsm->CALLSP].max)
        {
            pAsm->CALLSTACK[pAsm->CALLSP].max = unDepth;
        }

        pCaller->subDescIndex = j;
        return GL_TRUE;
    }

    /* new callee : it will get the next free descriptor slot.  pCaller is
     * not used past this point because assembling the callee may grow the
     * caller array. */
    pCaller->subDescIndex = pAsm->unSubArrayPointer;
    unSubID = pAsm->unSubArrayPointer;

    bRet = AssembleInstr(nILindex, uiIL_Shift, uiNumberInsts, pILInst, pAsm);
    if(GL_TRUE != bRet)
    {
        return bRet;
    }

    unDepth = pAsm->subs[unSubID].unStackDepthMax
            + pAsm->CALLSTACK[pAsm->CALLSP].current;
    if(unDepth > pAsm->CALLSTACK[pAsm->CALLSP].max)
    {
        pAsm->CALLSTACK[pAsm->CALLSP].max = unDepth;
    }

    pAsm->subs[unSubID].pPresubDesc = pPresubDesc;

    return bRet;
}

/*
 * Emit a MOV that writes the "return inside loop" flag: the x channel of
 * the temporary register flag_reg_index.
 *
 * pAsm      : assembler state.
 * flagValue : used as the swizzle selector of all four source channels;
 *             callers in this file pass SQ_SEL_1 to set and SQ_SEL_0 to
 *             clear the flag.
 *
 * Returns GL_FALSE when next_ins() fails, GL_TRUE otherwise.
 */
GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue)
{
    /* operation */
    pAsm->D.dst.opcode     = SQ_OP2_INST_MOV;
    pAsm->D.dst.op3        = 0;
    pAsm->D.dst.predicated = 0;
    /* in reloc where dislink flag init inst, only one slot alu inst is handled. */
    pAsm->D.dst.math       = 1; /* TODO : not math really, but one channel op, more generic alu assembler needed */

    /* destination : flag register, x channel only */
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = pAsm->flag_reg_index;
    pAsm->D.dst.writex = 1;
    pAsm->D.dst.writey = 0;
    pAsm->D.dst.writez = 0;
    pAsm->D.dst.writew = 0;

    pAsm->D2.dst2.literal_slots = 1;
    pAsm->D2.dst2.SaturateMode  = SATURATE_OFF;
    pAsm->D2.dst2.index_mode    = SQ_INDEX_LOOP; /* Check this ! */

    /* source : register 0 with every channel replaced by the selector
     * given in flagValue */
    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
    pAsm->S[0].src.reg   = 0;
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    noneg_PVSSRC(&(pAsm->S[0].src));
    pAsm->S[0].src.swizzlex = flagValue;
    pAsm->S[0].src.swizzley = flagValue;
    pAsm->S[0].src.swizzlez = flagValue;
    pAsm->S[0].src.swizzlew = flagValue;

    if(next_ins(pAsm) == GL_FALSE)
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/*
 * Emit a PRED_SETE in an ALU_PUSH_BEFORE clause that compares the
 * "return inside loop" flag (temporary register flag_reg_index) with a
 * source whose channels are all selected as SQ_SEL_1.  The result is
 * written to the x channel of a helper register.
 *
 * Only the maximum stack depth is updated for the push (checkStackDepth in
 * "max only" mode).
 *
 * pAsm : assembler state.
 *
 * Returns GL_FALSE when next_ins() fails, GL_TRUE otherwise.
 */
GLboolean testFlag(r700_AssemblerBase *pAsm)
{
    /* scratch register receiving the comparison result */
    GLuint uiScratch = gethelpr(pAsm);

    pAsm->alu_x_opcode = SQ_CF_INST_ALU_PUSH_BEFORE;

    /* operation */
    pAsm->D.dst.opcode     = SQ_OP2_INST_PRED_SETE;
    pAsm->D.dst.math       = 1;
    pAsm->D.dst.predicated = 1;

    /* destination : x channel of the scratch register */
    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
    pAsm->D.dst.reg    = uiScratch;
    pAsm->D.dst.writex = 1;
    pAsm->D.dst.writey = 0;
    pAsm->D.dst.writez = 0;
    pAsm->D.dst.writew = 0;

    pAsm->D2.dst2.literal_slots = 1;
    pAsm->D2.dst2.SaturateMode  = SATURATE_OFF;
    pAsm->D2.dst2.index_mode    = SQ_INDEX_LOOP; /* Check this ! */

    /* first operand : the flag register, unswizzled */
    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
    pAsm->S[0].src.reg   = pAsm->flag_reg_index;
    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    noneg_PVSSRC(&(pAsm->S[0].src));
    pAsm->S[0].src.swizzlex = SQ_SEL_X;
    pAsm->S[0].src.swizzley = SQ_SEL_Y;
    pAsm->S[0].src.swizzlez = SQ_SEL_Z;
    pAsm->S[0].src.swizzlew = SQ_SEL_W;

    /* second operand : register 0 with every channel selected as SQ_SEL_1 */
    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
    pAsm->S[1].src.reg   = 0;
    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
    noneg_PVSSRC(&(pAsm->S[1].src));
    pAsm->S[1].src.swizzlex = SQ_SEL_1;
    pAsm->S[1].src.swizzley = SQ_SEL_1;
    pAsm->S[1].src.swizzlez = SQ_SEL_1;
    pAsm->S[1].src.swizzlew = SQ_SEL_1;

    if(next_ins(pAsm) == GL_FALSE)
    {
        return GL_FALSE;
    }

    checkStackDepth(pAsm, FC_PUSH_VPM, GL_TRUE);

    return GL_TRUE;
}

/*
 * Emit the sequence that returns from the subroutine when the "return
 * inside loop" flag is set:
 *
 *     test flag
 *     JUMP  (pop_count 1, target = own index + 4)
 *     clear flag (SQ_SEL_0)
 *     POP   (unIF + 1)
 *     RETURN
 *
 * pAsm : assembler state.
 * unIF : number of additional stack entries to pop before returning
 *        (the callers in this file pass the number of enclosing IF frames).
 *
 * Returns GL_FALSE as soon as one of the steps fails, GL_TRUE otherwise.
 * A failure inside add_return_inst() is not reported because that function
 * returns nothing.
 */
GLboolean returnOnFlag(r700_AssemblerBase *pAsm, GLuint unIF)
{
    if(GL_FALSE == testFlag(pAsm))
    {
        return GL_FALSE;
    }

    /* NOTE(review): offset 4 presumably skips the flag-clear clause, the
     * POP and the RETURN emitted below - confirm against the CF layout. */
    if(GL_FALSE == jumpToOffest(pAsm, 1, 4))
    {
        return GL_FALSE;
    }

    if(GL_FALSE == setRetInLoopFlag(pAsm, SQ_SEL_0))
    {
        return GL_FALSE;
    }

    if(GL_FALSE == pops(pAsm, unIF + 1))
    {
        return GL_FALSE;
    }

    add_return_inst(pAsm);

    return GL_TRUE;
}

/*
 * Emit the sequence that leaves a loop when the "return inside loop" flag
 * is set: test flag, CF LOOP_BREAK (pop_count 1), CF POP (1).
 *
 * The LOOP_BREAK is recorded in the 'mid' array of the loop frame
 * fc_stack[unFCSP] so that assemble_ENDLOOP() can patch its address.
 *
 * pAsm   : assembler state.
 * unFCSP : index in fc_stack of the FC_LOOP frame to break out of.
 *
 * Returns GL_FALSE when the flag test, a CF instruction or the growth of
 * the 'mid' array fails; GL_TRUE otherwise.
 */
GLboolean breakLoopOnFlag(r700_AssemblerBase *pAsm, GLuint unFCSP)
{
    R700ControlFlowGenericClause *pBreak;

    if(GL_FALSE == testFlag(pAsm))
    {
        return GL_FALSE;
    }

    //break
    if(GL_FALSE == add_cf_instruction(pAsm) )
    {
        return GL_FALSE;
    }

    pBreak = pAsm->cf_current_cf_clause_ptr;

    pBreak->m_Word1.f.pop_count        = 1;
    pBreak->m_Word1.f.cf_const         = 0x0;
    pBreak->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;

    pBreak->m_Word1.f.end_of_program   = 0x0;
    pBreak->m_Word1.f.valid_pixel_mode = 0x0;
    pBreak->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_BREAK;
    pBreak->m_Word1.f.whole_quad_mode  = 0x0;

    pBreak->m_Word1.f.barrier          = 0x1;

    /* remember the break so ENDLOOP can fill in its address */
    pAsm->fc_stack[unFCSP].mid = (R700ControlFlowGenericClause **)_mesa_realloc(
                                              (void *)pAsm->fc_stack[unFCSP].mid,
                                              sizeof(R700ControlFlowGenericClause *) * pAsm->fc_stack[unFCSP].unNumMid,
                                              sizeof(R700ControlFlowGenericClause *) * (pAsm->fc_stack[unFCSP].unNumMid + 1) );
    if(NULL == pAsm->fc_stack[unFCSP].mid)
    {
        /* mid is NULL now : drop the count too so that ENDLOOP does not
         * index a NULL array */
        pAsm->fc_stack[unFCSP].unNumMid = 0;
        return GL_FALSE;
    }
    pAsm->fc_stack[unFCSP].mid[pAsm->fc_stack[unFCSP].unNumMid] = pBreak;
    pAsm->fc_stack[unFCSP].unNumMid++;

    if(GL_FALSE == pops(pAsm, 1))
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}

/**
 * Assemble a range of Mesa IL instructions.
 *
 * Iterates i from uiFirstInst while i < uiNumberInsts and dispatches
 * pILInst[i] to the matching assemble_* routine.  Before each dispatch the
 * assembler's uiCurInst is set to i, so the assemble_* routines operate on
 * the current instruction.
 *
 * @param uiFirstInst    index of the first instruction to assemble.
 * @param uiIL_Shift     forwarded unchanged to assemble_BGNSUB / assemble_CAL.
 * @param uiNumberInsts  exclusive upper bound for the instruction index.
 * @param pILInst        instruction array; SLT/SLE entries have their first
 *                       two source operands swapped while they are assembled
 *                       and are restored afterwards.
 * @param pR700AsmCode   assembler state (pILInst, uiCurInst and
 *                       last_cond_register are written here).
 *
 * @return GL_FALSE as soon as an assemble_* routine fails or an unknown /
 *         unimplemented opcode (e.g. ARR) is met; the result of
 *         assemble_ENDSUB on OPCODE_ENDSUB; GL_TRUE on OPCODE_END or when
 *         the range is exhausted.
 */
GLboolean AssembleInstr(GLuint uiFirstInst,
                        GLuint uiIL_Shift,
                        GLuint uiNumberInsts,
                        struct prog_instruction *pILInst,
                        r700_AssemblerBase *pR700AsmCode)
{
    GLuint i;

    pR700AsmCode->pILInst = pILInst;
    for(i=uiFirstInst; i<uiNumberInsts; i++)
    {
        pR700AsmCode->uiCurInst = i;

#ifndef USE_CF_FOR_CONTINUE_BREAK
        /* A comparison directly followed by BRK is inverted.  Only look
         * ahead when a following instruction exists inside the range. */
        if( ((i+1) < uiNumberInsts) && (OPCODE_BRK == pILInst[i+1].Opcode) )
        {
            switch(pILInst[i].Opcode)
            {
            case OPCODE_SLE:
                pILInst[i].Opcode = OPCODE_SGT;
                break;
            case OPCODE_SLT:
                pILInst[i].Opcode = OPCODE_SGE;
                break;
            case OPCODE_SGE:
                pILInst[i].Opcode = OPCODE_SLT;
                break;
            case OPCODE_SGT:
                pILInst[i].Opcode = OPCODE_SLE;
                break;
            case OPCODE_SEQ:
                pILInst[i].Opcode = OPCODE_SNE;
                break;
            case OPCODE_SNE:
                pILInst[i].Opcode = OPCODE_SEQ;
                break;
            default:
                break;
            }
        }
#endif
        if(pILInst[i].CondUpdate == 1)
        {
            /* remember dest register used for cond evaluation */
            /* XXX also handle PROGRAM_OUTPUT registers here? */
            pR700AsmCode->last_cond_register = pILInst[i].DstReg.Index;
        }

        switch (pILInst[i].Opcode)
        {
        case OPCODE_ABS:
            if ( GL_FALSE == assemble_ABS(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_ADD:
        case OPCODE_SUB:
            if ( GL_FALSE == assemble_ADD(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_ARL:
            if ( GL_FALSE == assemble_ARL(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_ARR:
            /* Not implemented: always a hard failure. */
            radeon_error("Not yet implemented instruction OPCODE_ARR \n");
            return GL_FALSE;

        case OPCODE_CMP:
            if ( GL_FALSE == assemble_CMP(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_COS:
            if ( GL_FALSE == assemble_TRIG(pR700AsmCode, SQ_OP2_INST_COS) )
                return GL_FALSE;
            break;

        case OPCODE_DP3:
        case OPCODE_DP4:
        case OPCODE_DPH:
            if ( GL_FALSE == assemble_DOT(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_DST:
            if ( GL_FALSE == assemble_DST(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_EX2:
            if ( GL_FALSE == assemble_EX2(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_EXP:
            if ( GL_FALSE == assemble_EXP(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_FLR:
            if ( GL_FALSE == assemble_FLR(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_FRC:
            if ( GL_FALSE == assemble_FRC(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_KIL:
        case OPCODE_KIL_NV:
            if ( GL_FALSE == assemble_KIL(pR700AsmCode, SQ_OP2_INST_KILLGT) )
                return GL_FALSE;
            break;

        case OPCODE_LG2:
            if ( GL_FALSE == assemble_LG2(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_LIT:
            if ( GL_FALSE == assemble_LIT(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_LRP:
            if ( GL_FALSE == assemble_LRP(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_LOG:
            if ( GL_FALSE == assemble_LOG(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_MAD:
            if ( GL_FALSE == assemble_MAD(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_MAX:
            if ( GL_FALSE == assemble_MAX(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_MIN:
            if ( GL_FALSE == assemble_MIN(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_MOV:
            if ( GL_FALSE == assemble_MOV(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_MUL:
            if ( GL_FALSE == assemble_MUL(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_NOISE1:
            /* The output register is derived from the CURRENT instruction
             * (pILInst[i]), not from the first one in the array.
             * NOTE(review): the result of callPreSub is ignored, as before;
             * noise1 is reported as unsupported regardless. */
            callPreSub(pR700AsmCode,
                       GLSL_NOISE1,
                       &noise1_presub,
                       pILInst[i].DstReg.Index + pR700AsmCode->starting_temp_register_number,
                       1);
            radeon_error("noise1: not yet supported shader instruction\n");
            break;

        case OPCODE_NOISE2:
            radeon_error("noise2: not yet supported shader instruction\n");
            break;

        case OPCODE_NOISE3:
            radeon_error("noise3: not yet supported shader instruction\n");
            break;

        case OPCODE_NOISE4:
            radeon_error("noise4: not yet supported shader instruction\n");
            break;

        case OPCODE_POW:
            if ( GL_FALSE == assemble_POW(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_RCP:
            if ( GL_FALSE == assemble_RCP(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_RSQ:
            if ( GL_FALSE == assemble_RSQ(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_SIN:
            if ( GL_FALSE == assemble_TRIG(pR700AsmCode, SQ_OP2_INST_SIN) )
                return GL_FALSE;
            break;

        case OPCODE_SCS:
            if ( GL_FALSE == assemble_SCS(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_SEQ:
            if ( GL_FALSE == assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETE) )
                return GL_FALSE;
            break;

        case OPCODE_SGT:
            if ( GL_FALSE == assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETGT) )
                return GL_FALSE;
            break;

        case OPCODE_SGE:
            if ( GL_FALSE == assemble_SGE(pR700AsmCode) )
                return GL_FALSE;
            break;

        /* No LT/LE operation is used here: "a < b" is assembled as "b > a"
         * and "a <= b" as "b >= a" by swapping the two source operands for
         * the duration of the call.  The IL instruction is restored whether
         * or not assembling succeeds.
         * TODO: alternatively use SQ_CF_COND_FALSE for SQ_CF_COND_ACTIVE. */
        case OPCODE_SLT:
        case OPCODE_SLE:
            {
                const struct prog_src_register srcFirst  = pILInst[i].SrcReg[0];
                const struct prog_src_register srcSecond = pILInst[i].SrcReg[1];
                GLboolean bAssembled;

                pILInst[i].SrcReg[0] = srcSecond;
                pILInst[i].SrcReg[1] = srcFirst;

                if(OPCODE_SLT == pILInst[i].Opcode)
                {
                    bAssembled = assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETGT);
                }
                else
                {
                    bAssembled = assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETGE);
                }

                pILInst[i].SrcReg[0] = srcFirst;
                pILInst[i].SrcReg[1] = srcSecond;

                if ( GL_FALSE == bAssembled )
                    return GL_FALSE;
            }
            break;

        case OPCODE_SNE:
            if ( GL_FALSE == assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETNE) )
                return GL_FALSE;
            break;

        case OPCODE_SWZ:
            if ( GL_FALSE == assemble_MOV(pR700AsmCode) )
                return GL_FALSE;

            /* If the next instruction in range is a texture instruction
             * (and not END), record a destination dependency for it. */
            if(    ((i+1) < uiNumberInsts)
                && (OPCODE_END != pILInst[i+1].Opcode)
                && (GL_TRUE == IsTex(pILInst[i+1].Opcode)) )
            {
                pR700AsmCode->pInstDeps[i+1].nDstDep = i+1; //=1?
            }
            break;

        case OPCODE_DDX:
        case OPCODE_DDY:
        case OPCODE_TEX:
        case OPCODE_TXB:
        case OPCODE_TXP:
            if ( GL_FALSE == assemble_TEX(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_TRUNC:
            if ( GL_FALSE == assemble_math_function(pR700AsmCode, SQ_OP2_INST_TRUNC) )
                return GL_FALSE;
            break;

        case OPCODE_XPD:
            if ( GL_FALSE == assemble_XPD(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_IF:
            {
                /* The IF has an else part when its branch target is an ELSE. */
                GLboolean bHasElse = GL_FALSE;

                if(OPCODE_ELSE == pILInst[pILInst[i].BranchTarget].Opcode)
                {
                    bHasElse = GL_TRUE;
                }

                if ( GL_FALSE == assemble_IF(pR700AsmCode, bHasElse) )
                    return GL_FALSE;
            }
            break;

        case OPCODE_ELSE:
            if ( GL_FALSE == assemble_ELSE(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_ENDIF:
            if ( GL_FALSE == assemble_ENDIF(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_BGNLOOP:
            if ( GL_FALSE == assemble_BGNLOOP(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_BRK:
            if ( GL_FALSE == assemble_BRK(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_CONT:
            if ( GL_FALSE == assemble_CONT(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_ENDLOOP:
            if ( GL_FALSE == assemble_ENDLOOP(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_BGNSUB:
            if ( GL_FALSE == assemble_BGNSUB(pR700AsmCode, i, uiIL_Shift) )
                return GL_FALSE;
            break;

        case OPCODE_RET:
            if ( GL_FALSE == assemble_RET(pR700AsmCode) )
                return GL_FALSE;
            break;

        case OPCODE_CAL:
            if ( GL_FALSE == assemble_CAL(pR700AsmCode,
                                          pILInst[i].BranchTarget,
                                          uiIL_Shift,
                                          uiNumberInsts,
                                          pILInst,
                                          NULL) )
                return GL_FALSE;
            break;

        case OPCODE_ENDSUB:
            /* End of a subroutine body: stop and hand back its result. */
            return assemble_ENDSUB(pR700AsmCode);

        case OPCODE_END:
            /* Reminder: if a later export has a depth/stencil export, a MOV
             * is needed to re-arrange the DST channel; that pseudo
             * instruction re-uses this END instruction (uiCurInst still
             * points at it). */
            return GL_TRUE;

        default:
            radeon_error("internal: unknown instruction\n");
            return GL_FALSE;
        }
    }

    return GL_TRUE;
}

/**
 * Put the assembler into its initial state for a new shader program.
 *
 * Calls setRetInLoopFlag() with SQ_SEL_0 and then selects SQ_CF_INST_ALU as
 * the current ALU clause opcode.  The result of setRetInLoopFlag() is not
 * inspected.
 *
 * @param pAsm  assembler state to initialise.
 * @return always GL_TRUE.
 */
GLboolean InitShaderProgram(r700_AssemblerBase * pAsm)
{
    /* Flag first, opcode second: the order of the original is kept. */
    setRetInLoopFlag(pAsm, SQ_SEL_0);

    pAsm->alu_x_opcode = SQ_CF_INST_ALU;

    return GL_TRUE;
}

GLboolean RelocProgram(r700_AssemblerBase * pAsm, struct gl_program * pILProg)
{
    GLuint i;
    GLuint unCFoffset;
    TypedShaderList * plstCFmain;
    TypedShaderList * plstCFsub;

    R700ShaderInstruction *        pInst;
    R700ControlFlowGenericClause * pCFInst;

    R700ControlFlowALUClause * pCF_ALU;
    R700ALUInstruction       * pALU;
    GLuint                     unConstOffset = 0;
    GLuint                     unRegOffset;
    GLuint                     unMinRegIndex;

    plstCFmain = pAsm->CALLSTACK[0].plstCFInstructions_local;

    /* remove flags init if they are not used */
    if((pAsm->unCFflags & HAS_LOOPRET) == 0)
    {
        R700ControlFlowALUClause * pCF_ALU;
        pInst = plstCFmain->pHead;
        while(pInst)
        {
            if(SIT_CF_ALU == pInst->m_ShaderInstType)
            {
                pCF_ALU = (R700ControlFlowALUClause *)pInst;
                if(0 == pCF_ALU->m_Word1.f.count)
                {
                    pCF_ALU->m_Word1.f.cf_inst = SQ_CF_INST_NOP;
                }
                else
                {
                    R700ALUInstruction * pALU = pCF_ALU->m_pLinkedALUInstruction;
                    
                    pALU->m_pLinkedALUClause = NULL;
                    pALU = (R700ALUInstruction *)(pALU->pNextInst);
                    pALU->m_pLinkedALUClause = pCF_ALU;
                    pCF_ALU->m_pLinkedALUInstruction = pALU;

                    pCF_ALU->m_Word1.f.count--;
                }
                break;
            }
            pInst = pInst->pNextInst;
        };
    }

    if(pAsm->CALLSTACK[0].max > 0)
    {
        pAsm->pR700Shader->uStackSize = ((pAsm->CALLSTACK[0].max + 3)>>2) + 2;
    }

    if(0 == pAsm->unSubArrayPointer)
    {
        return GL_TRUE;
    }

    unCFoffset = plstCFmain->uNumOfNode;

    if(NULL != pILProg->Parameters)
    {        
        unConstOffset = pILProg->Parameters->NumParameters;
    }

    /* Reloc subs */
    for(i=0; i<pAsm->unSubArrayPointer; i++)
    {
        pAsm->subs[i].unCFoffset = unCFoffset;
        plstCFsub = &(pAsm->subs[i].lstCFInstructions_local);

        pInst = plstCFsub->pHead;

        /* reloc instructions */
        while(pInst)
        {
            if(SIT_CF_GENERIC == pInst->m_ShaderInstType)
            {
                pCFInst = (R700ControlFlowGenericClause *)pInst;

                switch (pCFInst->m_Word1.f.cf_inst)
                {
                case SQ_CF_INST_POP:
                case SQ_CF_INST_JUMP:
                case SQ_CF_INST_ELSE:
                case SQ_CF_INST_LOOP_END:
                case SQ_CF_INST_LOOP_START:
                case SQ_CF_INST_LOOP_START_NO_AL:
                case SQ_CF_INST_LOOP_CONTINUE:
                case SQ_CF_INST_LOOP_BREAK:
                    pCFInst->m_Word0.f.addr += unCFoffset;
                    break;
                default:
                    break;
                }
            }  
            
            pInst->m_uIndex += unCFoffset;

            pInst = pInst->pNextInst;
        };

        if(NULL != pAsm->subs[i].pPresubDesc)
        {
            GLuint                     uNumSrc;            
            
            unMinRegIndex  = pAsm->subs[i].pPresubDesc->pCompiledSub->MinRegIndex;
            unRegOffset    = pAsm->subs[i].pPresubDesc->maxStartReg;            
            unConstOffset += pAsm->subs[i].pPresubDesc->unConstantsStart;

            pInst = plstCFsub->pHead;
            while(pInst)
            {
                if(SIT_CF_ALU == pInst->m_ShaderInstType)
                {
                    pCF_ALU = (R700ControlFlowALUClause *)pInst;

                    pALU = pCF_ALU->m_pLinkedALUInstruction;
                    for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
                    {
                        pALU->m_Word1.f.dst_gpr = pALU->m_Word1.f.dst_gpr + unRegOffset - unMinRegIndex;

                        if(pALU->m_Word0.f.src0_sel < SQ_ALU_SRC_GPR_SIZE)
                        {   
                            pALU->m_Word0.f.src0_sel = pALU->m_Word0.f.src0_sel + unRegOffset - unMinRegIndex;
                        }
                        else if(pALU->m_Word0.f.src0_sel >= SQ_ALU_SRC_CFILE_BASE)
                        {   
                            pALU->m_Word0.f.src0_sel += unConstOffset;
                        }

                        if( ((pALU->m_Word1.val >> SQ_ALU_WORD1_OP3_ALU_INST_SHIFT) & 0x0000001F) 
                            >= SQ_OP3_INST_MUL_LIT )
                        {   /* op3 : 3 srcs */
                            if(pALU->m_Word1_OP3.f.src2_sel < SQ_ALU_SRC_GPR_SIZE)
                            {   
                                pALU->m_Word1_OP3.f.src2_sel = pALU->m_Word1_OP3.f.src2_sel + unRegOffset - unMinRegIndex;
                            }
                            else if(pALU->m_Word1_OP3.f.src2_sel >= SQ_ALU_SRC_CFILE_BASE)
                            {   
                                pALU->m_Word1_OP3.f.src2_sel += unConstOffset;
                            }    
                            if(pALU->m_Word0.f.src1_sel < SQ_ALU_SRC_GPR_SIZE)
                            {   
                                pALU->m_Word0.f.src1_sel = pALU->m_Word0.f.src1_sel + unRegOffset - unMinRegIndex;
                            }
                            else if(pALU->m_Word0.f.src1_sel >= SQ_ALU_SRC_CFILE_BASE)
                            {   
                                pALU->m_Word0.f.src1_sel += unConstOffset;
                            }                                 
                        }
                        else
                        {
                            if(pAsm->bR6xx)
                            {
                                uNumSrc = r700GetNumOperands(pALU->m_Word1_OP2.f6.alu_inst, 0);
                            }
                            else
                            {
                                uNumSrc = r700GetNumOperands(pALU->m_Word1_OP2.f.alu_inst, 0);
                            }
                            if(2 == uNumSrc)
                            {   /* 2 srcs */
                                if(pALU->m_Word0.f.src1_sel < SQ_ALU_SRC_GPR_SIZE)
                                {   
                                    pALU->m_Word0.f.src1_sel = pALU->m_Word0.f.src1_sel + unRegOffset - unMinRegIndex;
                                }
                                else if(pALU->m_Word0.f.src1_sel >= SQ_ALU_SRC_CFILE_BASE)
                                {   
                                    pALU->m_Word0.f.src1_sel += unConstOffset;
                                }                                  
                            }                            
                        }
                        pALU = (R700ALUInstruction*)(pALU->pNextInst);
                    }                    
                }             
                pInst = pInst->pNextInst;
            };
        }

        /* Put sub into main */
        plstCFmain->pTail->pNextInst = plstCFsub->pHead;
        plstCFmain->pTail            = plstCFsub->pTail;
        plstCFmain->uNumOfNode      += plstCFsub->uNumOfNode;

        unCFoffset += plstCFsub->uNumOfNode;
    }

    /* reloc callers */
    for(i=0; i<pAsm->unCallerArrayPointer; i++)
    {
        pAsm->callers[i].cf_ptr->m_Word0.f.addr
            = pAsm->subs[pAsm->callers[i].subDescIndex].unCFoffset; 

        if(NULL != pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc)
        {                 
            unMinRegIndex = pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc->pCompiledSub->MinRegIndex;
            unRegOffset = pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc->maxStartReg;

            if(NULL != pAsm->callers[i].prelude_cf_ptr)
            {                
                pCF_ALU = (R700ControlFlowALUClause * )(pAsm->callers[i].prelude_cf_ptr);
                pALU = pCF_ALU->m_pLinkedALUInstruction;
                for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
                {
                    pALU->m_Word1.f.dst_gpr = pALU->m_Word1.f.dst_gpr + unRegOffset - unMinRegIndex;
                    pALU = (R700ALUInstruction*)(pALU->pNextInst);
                }
            }
            if(NULL != pAsm->callers[i].finale_cf_ptr)
            {
                pCF_ALU = (R700ControlFlowALUClause * )(pAsm->callers[i].finale_cf_ptr);
                pALU = pCF_ALU->m_pLinkedALUInstruction;
                for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
                {
                    pALU->m_Word0.f.src0_sel = pALU->m_Word0.f.src0_sel + unRegOffset - unMinRegIndex;
                    pALU = (R700ALUInstruction*)(pALU->pNextInst);
                }
            }
        }
    }

    return GL_TRUE;
}

/**
 * Emit a call to a pre-compiled ("presub") built-in subroutine.
 *
 * Sequence:
 *  1. Prelude: for each of the first uNumValidSrc sources of the current IL
 *     instruction, emit a MOV into pCompiledSub->srcRegIndex[i].
 *  2. Look the subroutine up in pAsm->presubs by scriptSigniture.  When it
 *     is not registered yet, a new descriptor is appended (the array grows
 *     in steps of 4) and the subroutine is assembled through assemble_CAL
 *     inside a temporary register context, which is restored afterwards.
 *     When it is already registered, only assemble_CAL is issued.
 *  3. Finale: emit a MOV from pCompiledSub->dstRegIndex (with the
 *     subroutine's output swizzle) into the current instruction's
 *     destination, and record prelude/finale clauses in the last caller
 *     entry so they can be relocated later.
 *  4. Update the register high-water marks.
 *
 * @param pAsm             assembler state.
 * @param scriptSigniture  identifies the built-in subroutine.
 * @param pCompiledSub     pre-compiled subroutine description.
 * @param uOutReg          not used by this function.
 * @param uNumValidSrc     number of source operands to copy in.
 *
 * @return GL_FALSE when operand assembling, instruction emission or the
 *         descriptor allocation fails; otherwise the result of assemble_CAL.
 */
GLboolean callPreSub(r700_AssemblerBase* pAsm, 
                         LOADABLE_SCRIPT_SIGNITURE scriptSigniture,                          
                         COMPILED_SUB * pCompiledSub,                                               
                         GLshort uOutReg,
                         GLshort uNumValidSrc)
{
    /* saved assemble context */
    GLuint starting_temp_register_number_save;
    GLuint number_used_registers_save;
    GLuint uFirstHelpReg_save;
    GLuint uHelpReg_save;
    GLuint uiCurInst_save;
    struct prog_instruction *pILInst_save;
    PRESUB_DESC * pPresubDesc;
    GLboolean     bRet;
    int i;

    R700ControlFlowGenericClause* prelude_cf_ptr = NULL;

    /* copy srcs to presub inputs */
    pAsm->alu_x_opcode = SQ_CF_INST_ALU;
    for(i=0; i<uNumValidSrc; i++)
    {
        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
        pAsm->D.dst.reg   = pCompiledSub->srcRegIndex[i];
        pAsm->D.dst.writex = 1;
        pAsm->D.dst.writey = 1;
        pAsm->D.dst.writez = 1;
        pAsm->D.dst.writew = 1;

        if( GL_FALSE == assemble_src(pAsm, i, 0) )
        {
            return GL_FALSE;
        }

        /* A failed emit would leave the clause pointer recorded below
         * pointing at something other than the prelude. */
        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }
    }
    if(uNumValidSrc > 0)
    {
        prelude_cf_ptr     = pAsm->cf_current_alu_clause_ptr;
        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
    }

    /* Search the already registered presubs; i == unNumPresub means
     * "not found". */
    for(i=0; i<pAsm->unNumPresub; i++)
    {
        if(pAsm->presubs[i].sptSigniture == scriptSigniture)
        {
            break;
        }
    }

    if(i == pAsm->unNumPresub)
    {   /* not loaded yet */
        /* save assemble context */
        number_used_registers_save         = pAsm->number_used_registers;
        uFirstHelpReg_save                 = pAsm->uFirstHelpReg;
        uHelpReg_save                      = pAsm->uHelpReg;
        starting_temp_register_number_save = pAsm->starting_temp_register_number;
        pILInst_save                       = pAsm->pILInst;
        uiCurInst_save                     = pAsm->uiCurInst;

        /* grow the descriptor array when it is full */
        if( (pAsm->unNumPresub + 1) > pAsm->unPresubArraySize )
        {
            pAsm->presubs = (PRESUB_DESC*)_mesa_realloc( (void *)pAsm->presubs,
                                      sizeof(PRESUB_DESC) * pAsm->unPresubArraySize,
                                      sizeof(PRESUB_DESC) * (pAsm->unPresubArraySize + 4) );
            if(NULL == pAsm->presubs)
            {
                radeon_error("No memeory to allocate built in shader function description structures. \n");
                return GL_FALSE;
            }
            pAsm->unPresubArraySize += 4;
        }
        
        pPresubDesc = &(pAsm->presubs[i]);
        pPresubDesc->sptSigniture = scriptSigniture;

        /* Constant offsets are only relative to the other presubs here;
         * they are finally resolved at reloc. */
        if(0 == pAsm->unNumPresub)
        {
            pPresubDesc->unConstantsStart = 0; 
        }
        else
        {
            pPresubDesc->unConstantsStart =  pAsm->presubs[i-1].unConstantsStart
                                           + pAsm->presubs[i-1].pCompiledSub->NumParameters;
        }

        pPresubDesc->pCompiledSub = pCompiledSub;

        pPresubDesc->subIL_Shift = pAsm->unCurNumILInsts;
        pPresubDesc->maxStartReg  = uFirstHelpReg_save;
        pAsm->unCurNumILInsts    += pCompiledSub->NumInstructions;

        pAsm->unNumPresub++;

        /* setup new assemble context: the sub's registers start at 0 */
        pAsm->starting_temp_register_number = 0;
        pAsm->number_used_registers = pCompiledSub->NumTemporaries;
        pAsm->uFirstHelpReg         = pAsm->number_used_registers;
        pAsm->uHelpReg              = pAsm->uFirstHelpReg;

        bRet = assemble_CAL(pAsm, 
                            0, 
                            pPresubDesc->subIL_Shift, 
                            pCompiledSub->NumInstructions,
                            pCompiledSub->Instructions,
                            pPresubDesc);

        /* remember how many registers the sub ended up using */
        pPresubDesc->number_used_registers = pAsm->number_used_registers;        

        /* restore assemble context */
        pAsm->number_used_registers         = number_used_registers_save; 
        pAsm->uFirstHelpReg                 = uFirstHelpReg_save;
        pAsm->uHelpReg                      = uHelpReg_save;
        pAsm->starting_temp_register_number = starting_temp_register_number_save;
        pAsm->pILInst                       = pILInst_save; 
        pAsm->uiCurInst                     = uiCurInst_save;
    }
    else
    {   /* was loaded */
        pPresubDesc = &(pAsm->presubs[i]);  
        
        bRet = assemble_CAL(pAsm, 
                            0, 
                            pPresubDesc->subIL_Shift, 
                            pCompiledSub->NumInstructions,
                            pCompiledSub->Instructions,
                            pPresubDesc);
    }

    if(GL_FALSE == bRet)
    {
        radeon_error("Shader presub assemble failed. \n");
    }
    else
    {
        /* copy presub output to real dst */ 
        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

        if( GL_FALSE == assemble_dst(pAsm) )
        {
            return GL_FALSE;
        }

        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
        pAsm->S[0].src.reg   = pCompiledSub->dstRegIndex;
        pAsm->S[0].src.swizzlex = pCompiledSub->outputSwizzleX;
        pAsm->S[0].src.swizzley = pCompiledSub->outputSwizzleY;
        pAsm->S[0].src.swizzlez = pCompiledSub->outputSwizzleZ;
        pAsm->S[0].src.swizzlew = pCompiledSub->outputSwizzleW;

        /* Same reasoning as for the prelude: do not record a finale clause
         * for an instruction that was never emitted. */
        if( GL_FALSE == next_ins(pAsm) )
        {
            return GL_FALSE;
        }

        /* The last caller entry is taken to be the one for the call above. */
        pAsm->callers[pAsm->unCallerArrayPointer - 1].finale_cf_ptr  = pAsm->cf_current_alu_clause_ptr;
        pAsm->callers[pAsm->unCallerArrayPointer - 1].prelude_cf_ptr = prelude_cf_ptr;
        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
    }

    /* The sub's registers live above the caller's first help register. */
    if( (pPresubDesc->number_used_registers + pAsm->uFirstHelpReg) > pAsm->number_used_registers )
    {
        pAsm->number_used_registers = pPresubDesc->number_used_registers + pAsm->uFirstHelpReg;
    }
    if(pAsm->uFirstHelpReg > pPresubDesc->maxStartReg)
    {
        pPresubDesc->maxStartReg = pAsm->uFirstHelpReg;
    }

    return bRet;
}

/**
 * Fill in an export control-flow clause.
 *
 * Opens an export clause through check_current_clause(), then sets its
 * type, array base, source GPR, burst count and per-channel swizzle, and
 * finally records it as pAsm->cf_last_export_ptr.
 *
 * @param pAsm                      assembler state.
 * @param type                      SQ_EXPORT_PIXEL, SQ_EXPORT_POS or
 *                                  SQ_EXPORT_PARAM; anything else fails.
 * @param export_starting_index     added to the array base of the type.
 * @param export_count              number of exports; burst_count is set
 *                                  to export_count - 1.
 * @param starting_register_number  written to rw_gpr.
 * @param is_depth_export           for pixel exports selects SQ_CF_PIXEL_Z
 *                                  and restricts the write mask to X.
 *
 * @return GL_FALSE for an unknown export type, GL_TRUE otherwise.
 */
GLboolean Process_Export(r700_AssemblerBase* pAsm,
                         GLuint type,
                         GLuint export_starting_index,
                         GLuint export_count, 
                         GLuint starting_register_number,
                         GLboolean is_depth_export)
{
    /* Bit n set = channel n (x, y, z, w) is exported.  Bursts of several
     * registers always export all four channels; this should only be used
     * if all components for all registers have been written. */
    unsigned char ucChannels = 0xF;

    check_current_clause(pAsm, CF_EMPTY_CLAUSE);
    check_current_clause(pAsm, CF_EXPORT_CLAUSE); /* allocates cf_current_export_clause_ptr */

    pAsm->cf_current_export_clause_ptr->m_Word0.f.type = type;

    if(SQ_EXPORT_PIXEL == type)
    {
        if(GL_TRUE == is_depth_export)
        {
            pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base = SQ_CF_PIXEL_Z;
        }
        else
        {
            pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base = SQ_CF_PIXEL_MRT0 + export_starting_index;
        }
    }
    else if(SQ_EXPORT_POS == type)
    {
        pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base = SQ_CF_POS_0 + export_starting_index;
    }
    else if(SQ_EXPORT_PARAM == type)
    {
        pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base = 0x0 + export_starting_index;
    }
    else
    {
        radeon_error("Unknown export type: %d\n", type);
        return GL_FALSE;
    }

    /* Word 0: source register, absolute addressing. */
    pAsm->cf_current_export_clause_ptr->m_Word0.f.rw_gpr    = starting_register_number;
    pAsm->cf_current_export_clause_ptr->m_Word0.f.rw_rel    = SQ_ABSOLUTE;
    pAsm->cf_current_export_clause_ptr->m_Word0.f.index_gpr = 0x0;
    pAsm->cf_current_export_clause_ptr->m_Word0.f.elem_size = 0x3;

    /* Word 1: a plain EXPORT; the caller may later promote the last one
     * to EXPORT_DONE. */
    pAsm->cf_current_export_clause_ptr->m_Word1.f.burst_count      = (export_count - 1);
    pAsm->cf_current_export_clause_ptr->m_Word1.f.end_of_program   = 0x0;
    pAsm->cf_current_export_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
    pAsm->cf_current_export_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_EXPORT;
    pAsm->cf_current_export_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
    pAsm->cf_current_export_clause_ptr->m_Word1.f.barrier          = 0x1;

    if(1 == export_count)
    {
        /* A single register uses the write mask recorded for it ... */
        ucChannels = pAsm->pucOutMask[starting_register_number - pAsm->starting_export_register_number];

        /* ... except depth, which exports Z as a float into the Red channel. */
        if(GL_TRUE == is_depth_export)
        {
            ucChannels = 0x1;
        }
    }

    /* Channels that are not written are masked out. */
    pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_x =
        (0 != (ucChannels & 0x1)) ? SQ_SEL_X : SQ_SEL_MASK;
    pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_y =
        (0 != (ucChannels & 0x2)) ? SQ_SEL_Y : SQ_SEL_MASK;
    pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_z =
        (0 != (ucChannels & 0x4)) ? SQ_SEL_Z : SQ_SEL_MASK;
    pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_w =
        (0 != (ucChannels & 0x8)) ? SQ_SEL_W : SQ_SEL_MASK;

    pAsm->cf_last_export_ptr = pAsm->cf_current_export_clause_ptr;

    return GL_TRUE;
}

/**
 * Emit "MOV depth_reg.x, depth_reg.<depth_channel_select>" so that the depth
 * value ends up in the channel written by this function (X).
 *
 * The current IL instruction (expected to be OPCODE_END) is temporarily
 * re-labelled as OPCODE_MOV while the instruction is emitted.  Its original
 * opcode is restored on every path, including when next_ins() fails.
 *
 * @param pAsm                  assembler state; uses
 *                              depth_export_register_number as both source
 *                              and destination register.
 * @param depth_channel_select  source swizzle applied to the depth register.
 *
 * @return GL_FALSE if next_ins() fails, GL_TRUE otherwise.
 */
GLboolean Move_Depth_Exports_To_Correct_Channels(r700_AssemblerBase *pAsm, BITS depth_channel_select)
{
    gl_inst_opcode Opcode_save = pAsm->pILInst[pAsm->uiCurInst].Opcode; //Should be OPCODE_END
    GLboolean      bEmitted;

    pAsm->pILInst[pAsm->uiCurInst].Opcode = OPCODE_MOV;

    // MOV depth_export_register.hw_depth_channel, depth_export_register.depth_channel_select

    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;

    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
    pAsm->D.dst.reg   = pAsm->depth_export_register_number;

    pAsm->D.dst.writex = 1;   // depth goes in R channel for HW

    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
    /* Source operands take the SRC_REG_* register type, as in callPreSub. */
    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
    pAsm->S[0].src.reg   = pAsm->depth_export_register_number;

    setswizzle_PVSSRC(&(pAsm->S[0].src), depth_channel_select);

    noneg_PVSSRC(&(pAsm->S[0].src));

    bEmitted = next_ins(pAsm);

    /* Undo the temporary opcode change before reporting the result, so a
     * failed emit does not leave the IL program modified. */
    pAsm->pILInst[pAsm->uiCurInst].Opcode = Opcode_save;

    if( GL_FALSE == bEmitted )
    {
        return GL_FALSE;
    }

    return GL_TRUE;
}
 
/**
 * Emit the pixel export instructions for a fragment program.
 *
 * Steps, in order:
 *  - if depth_export_register_number is non-negative, move the depth value
 *    (selected with SQ_SEL_Z) into the channel used for export;
 *  - export FRAG_RESULT_COLOR and FRAG_RESULT_DEPTH when their bits are set
 *    in OutputsWritten;
 *  - if neither was exported, emit one export of register 0 so that the
 *    program exports something;
 *  - mark the last export (when there is one) as EXPORT_DONE and flag it as
 *    the end of the program.
 *
 * \param pR700AsmCode   assembler state.
 * \param OutputsWritten bitfield indexed by FRAG_RESULT_* values.
 * \return GL_TRUE on success, GL_FALSE if any instruction could not be
 *         emitted, including the fallback export.
 */
GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode,
                                   GLbitfield          OutputsWritten)
{
    unsigned int unBit;
    GLuint export_count = 0;

    if(pR700AsmCode->depth_export_register_number >= 0)
    {
        if( GL_FALSE == Move_Depth_Exports_To_Correct_Channels(pR700AsmCode, SQ_SEL_Z) )  // depth
        {
            return GL_FALSE;
        }
    }

    unBit = 1u << FRAG_RESULT_COLOR;
    if(OutputsWritten & unBit)
    {
        if( GL_FALSE == Process_Export(pR700AsmCode,
                                       SQ_EXPORT_PIXEL,
                                       0,
                                       1,
                                       pR700AsmCode->uiFP_OutputMap[FRAG_RESULT_COLOR],
                                       GL_FALSE) )
        {
            return GL_FALSE;
        }
        export_count++;
    }

    unBit = 1u << FRAG_RESULT_DEPTH;
    if(OutputsWritten & unBit)
    {
        /* Same call as for colour except for the final argument, which is
         * GL_TRUE here (presumably the depth-export flag of Process_Export). */
        if( GL_FALSE == Process_Export(pR700AsmCode,
                                       SQ_EXPORT_PIXEL,
                                       0,
                                       1,
                                       pR700AsmCode->uiFP_OutputMap[FRAG_RESULT_DEPTH],
                                       GL_TRUE) )
        {
            return GL_FALSE;
        }
        export_count++;
    }

    /* Need to export something, otherwise we'll hang
     * results are undefined anyway */
    if(export_count == 0)
    {
        /* A failure here must be reported: otherwise the caller would be
         * told the program is fine although no export was emitted. */
        if( GL_FALSE == Process_Export(pR700AsmCode, SQ_EXPORT_PIXEL, 0, 1, 0, GL_FALSE) )
        {
            return GL_FALSE;
        }
    }

    if(pR700AsmCode->cf_last_export_ptr != NULL)
    {
        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst        = SQ_CF_INST_EXPORT_DONE;
        pR700AsmCode->cf_last_export_ptr->m_Word1.f.end_of_program = 0x1;
    }

    return GL_TRUE;
}

GLboolean Process_Vertex_Exports(r700_AssemblerBase *pR700AsmCode,
                                 GLbitfield          OutputsWritten)  
{
    unsigned int unBit;
    unsigned int i;

    GLuint export_starting_index  = 0;
    GLuint export_count           = pR700AsmCode->number_of_exports;

    unBit = 1 << VERT_RESULT_HPOS;
	if(OutputsWritten & unBit)
	{
        if( GL_FALSE == Process_Export(pR700AsmCode, 
                                       SQ_EXPORT_POS, 
                                       export_starting_index, 
                                       1, 
                                       pR700AsmCode->ucVP_OutputMap[VERT_RESULT_HPOS],
                                       GL_FALSE) )
        {
            return GL_FALSE;
        }

        export_count--;

        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;
	}

    pR700AsmCode->number_of_exports = export_count;

	unBit = 1 << VERT_RESULT_COL0;
	if(OutputsWritten & unBit)
	{
        if( GL_FALSE == Process_Export(pR700AsmCode, 
                                       SQ_EXPORT_PARAM, 
                                       export_starting_index, 
                                       1, 
                                       pR700AsmCode->ucVP_OutputMap[VERT_RESULT_COL0],
                                       GL_FALSE) )
        {
            return GL_FALSE;
        }

        export_starting_index++;
	}

	unBit = 1 << VERT_RESULT_COL1;
	if(OutputsWritten & unBit)
	{
        if( GL_FALSE == Process_Export(pR700AsmCode, 
                                       SQ_EXPORT_PARAM, 
                                       export_starting_index, 
                                       1, 
                                       pR700AsmCode->ucVP_OutputMap[VERT_RESULT_COL1],
                                       GL_FALSE) )
        {
            return GL_FALSE;
        }

        export_starting_index++;
	}

        unBit = 1 << VERT_RESULT_FOGC;
        if(OutputsWritten & unBit)
        {
        if( GL_FALSE == Process_Export(pR700AsmCode,
                                       SQ_EXPORT_PARAM,
                                       export_starting_index,
                                       1,
                                       pR700AsmCode->ucVP_OutputMap[VERT_RESULT_FOGC],
                                       GL_FALSE) )
        {
            return GL_FALSE;
        }

        export_starting_index++;
        }

	for(i=0; i<8; i++)
	{
		unBit = 1 << (VERT_RESULT_TEX0 + i);
		if(OutputsWritten & unBit)
		{
            if( GL_FALSE == Process_Export(pR700AsmCode,
                                          SQ_EXPORT_PARAM, 
                                          export_starting_index, 
                                          1, 
                                          pR700AsmCode->ucVP_OutputMap[VERT_RESULT_TEX0 + i],
                                          GL_FALSE) )
            {
                return GL_FALSE;
            }

            export_starting_index++;
		}
	}
    
    for(i=VERT_RESULT_VAR0; i<VERT_RESULT_MAX; i++)
	{
        unBit = 1 << i;
        if(OutputsWritten & unBit)
		{
            if( GL_FALSE == Process_Export(pR700AsmCode,
                                          SQ_EXPORT_PARAM, 
                                          export_starting_index, 
                                          1, 
                                          pR700AsmCode->ucVP_OutputMap[i],
                                          GL_FALSE) )
            {                
                return GL_FALSE;
            }

            export_starting_index++;
		}
    }

    // At least one param should be exported
    if (export_count) 
    {
        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;    
    }
    else
    {
        if( GL_FALSE == Process_Export(pR700AsmCode,
                                       SQ_EXPORT_PARAM, 
                                       0, 
                                       1, 
                                       pR700AsmCode->starting_export_register_number,
                                       GL_FALSE) )
        {
            return GL_FALSE;
        }
      
        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_x = SQ_SEL_0;
        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_y = SQ_SEL_0;
        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_z = SQ_SEL_0;
        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_w = SQ_SEL_1;
        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;
    }

    pR700AsmCode->cf_last_export_ptr->m_Word1.f.end_of_program = 0x1;

    return GL_TRUE;
}

/**
 * Release the heap buffers owned by the assembler state.
 *
 * Frees pucOutMask and pInstDeps unconditionally, and subs, callers and
 * presubs when they are non-NULL.  Every freed pointer is reset to NULL so
 * that the structure holds no dangling pointers afterwards and the optional
 * buffers are not freed twice if this function is called again.
 *
 * \param pR700AsmCode assembler state whose buffers are released; the
 *                     structure itself is not freed.
 * \return always GL_TRUE.
 */
GLboolean Clean_Up_Assembler(r700_AssemblerBase *pR700AsmCode)
{
    FREE(pR700AsmCode->pucOutMask);
    pR700AsmCode->pucOutMask = NULL;

    FREE(pR700AsmCode->pInstDeps);
    pR700AsmCode->pInstDeps = NULL;

    if(NULL != pR700AsmCode->subs)
    {
        FREE(pR700AsmCode->subs);
        pR700AsmCode->subs = NULL;
    }

    if(NULL != pR700AsmCode->callers)
    {
        FREE(pR700AsmCode->callers);
        pR700AsmCode->callers = NULL;
    }

    if(NULL != pR700AsmCode->presubs)
    {
        FREE(pR700AsmCode->presubs);
        pR700AsmCode->presubs = NULL;
    }

    return GL_TRUE;
}