// Emit mov: one mul-pipe MOV per component enabled in the destination
// write mask, followed by a trailing NOP.
void Vc4Shader::Emit_Mov(CInstruction &Inst)
{
    assert(this->uShaderType == D3D10_SB_PIXEL_SHADER ||
           this->uShaderType == D3D10_SB_VERTEX_SHADER);

    VC4_ASSERT(Inst.m_NumOperands == 2);

    for (uint8_t component = 0; component < 4; component++)
    {
        uint8_t channelMask = (uint8_t)(D3D10_SB_OPERAND_4_COMPONENT_MASK_X << component);
        if ((Inst.m_Operands[0].m_WriteMask & channelMask) == 0)
        {
            continue; // component not written by this instruction.
        }

        Vc4Register dst = Find_Vc4Register_M(Inst.m_Operands[0], channelMask);
        Vc4Register src[1];
        Setup_SourceRegisters(Inst, 1, ARRAYSIZE(src), component, src);

        // MOV on the mul pipe so the per-component pack can be applied.
        Vc4Instruction Vc4Inst;
        Vc4Inst.Vc4_m_MOV(dst, src[0]);
        Vc4Inst.Vc4_m_Pack(dst.GetPack(component));
        Vc4Inst.Emit(CurrentStorage);
    }

    { // Emit a NOP
        Vc4Instruction Vc4Inst;
        Vc4Inst.Emit(CurrentStorage);
    }
}
// Emit a mul-pipe ALU op (currently only D3D10 mul) for each component
// enabled in the destination write mask, followed by a trailing NOP.
void Vc4Shader::Emit_with_Mul_pipe(CInstruction &Inst)
{
    assert(this->uShaderType == D3D10_SB_PIXEL_SHADER ||
           this->uShaderType == D3D10_SB_VERTEX_SHADER);

    VC4_ASSERT(Inst.m_NumOperands == 3);

    for (uint8_t component = 0; component < 4; component++)
    {
        uint8_t channelMask = (uint8_t)(D3D10_SB_OPERAND_4_COMPONENT_MASK_X << component);
        if ((Inst.m_Operands[0].m_WriteMask & channelMask) == 0)
        {
            continue; // component not written by this instruction.
        }

        Vc4Register dst = Find_Vc4Register_M(Inst.m_Operands[0], channelMask);

        // Packed (8-bit per channel) destinations pack via the mul pipe;
        // otherwise write the full 32-bit result.
        uint8_t pack = dst.GetFlags().packed
            ? (uint8_t)(VC4_QPU_PACK_MUL_8a + component)
            : (uint8_t)(VC4_QPU_PACK_A_32);

        Vc4Register src[2];
        this->Setup_SourceRegisters(Inst, 1, ARRAYSIZE(src), component, src);

        Vc4Instruction Vc4Inst;
        switch (Inst.m_OpCode)
        {
        case D3D10_SB_OPCODE_MUL:
            Vc4Inst.Vc4_m_FMUL(dst, src[0], src[1]);
            break;
        default:
            VC4_ASSERT(false);
        }
        Vc4Inst.Vc4_m_Pack(pack);
        Vc4Inst.Emit(CurrentStorage);
    }

    { // Emit a NOP
        Vc4Instruction Vc4Inst;
        Vc4Inst.Emit(CurrentStorage);
    }
}
// Pixel shader prologue: pull each input attribute out of the hardware
// varying pipeline and finish its interpolation.
//
// For every valid entry in InputRegister this emits:
//   r0  = varying * ra15   (mul pipe)
//   raX = r0 + r5          (add pipe)
// NOTE(review): ra15 presumably holds W and r5 the per-pixel term the
// varyings hardware provides -- confirm against the VC4 varyings
// interpolation model.
// Ends with a NOP carrying the scoreboard-wait signal.
void Vc4Shader::Emit_Prologue_PS()
{
    assert(this->uShaderType == D3D10_SB_PIXEL_SHADER);
    
    // i walks the InputRegister table; iRegUsed counts valid entries so
    // the loop stops once all cInput inputs have been processed.
    for (uint8_t i = 0, iRegUsed = 0; iRegUsed < this->cInput; i++)
    {
        Vc4Register raX = this->InputRegister[i / 4][i % 4];

        if (raX.GetFlags().valid)
        {
            assert(raX.GetMux() == VC4_QPU_ALU_REG_A || raX.GetMux() == VC4_QPU_ALU_REG_B);

            // r0 is scratch for the multiply result.
            Vc4Register r0(VC4_QPU_ALU_R0, VC4_QPU_WADDR_ACC0);

            // Issue mul inputs (from varying) with ra15 (W).
            {
                Vc4Instruction Vc4Inst;
                Vc4Register varying(VC4_QPU_ALU_REG_B, VC4_QPU_RADDR_VERYING);
                Vc4Register ra15(VC4_QPU_ALU_REG_A, 15);
                Vc4Inst.Vc4_m_FMUL(r0, varying, ra15);
                Vc4Inst.Emit(CurrentStorage);
            }

            // Issue add r5 to each input data to complete interpolation.
            {
                Vc4Instruction Vc4Inst;
                Vc4Register r5(VC4_QPU_ALU_R5);
                Vc4Inst.Vc4_a_FADD(raX, r0, r5);
                Vc4Inst.Emit(CurrentStorage);
            }

            iRegUsed++;
        }
    }

    { // Emit a NOP with 'sbwait'
        Vc4Instruction Vc4Inst;
        Vc4Inst.Vc4_Sig(VC4_QPU_SIG_WAIT_FOR_SCOREBOARD);
        Vc4Inst.Emit(CurrentStorage);
    }
}
// Vertex shader prologue: program the VPM and load the input registers.
//
// VR_SETUP is loaded (32-bit immediate) so the VPM read FIFO delivers
// cInput 32-bit values per vertex; VW_SETUP is programmed here for the
// output writes performed later in the epilogue. Each valid input
// register is then filled with a MOV from the VPM read port.
void Vc4Shader::Emit_Prologue_VS()
{
    assert(this->uShaderType == D3D10_SB_VERTEX_SHADER);

    VC4_ASSERT(cInput < 16); // VR_SETUP:NUM limitation, vpm only can read up to 16 values.

    // Program the VPM read setup register with a load-immediate.
    {
        Vc4Instruction Vc4Inst(vc4_load_immediate_32);
        Vc4Register vr_setup(VC4_QPU_ALU_REG_A, VC4_QPU_WADDR_VPMVCD_RD_SETUP);
        Vc4Register value; value.SetImmediateI(MAKE_VR_SETUP(cInput, 1, true, false, VC4_QPU_32BIT_VECTOR, 0));
        Vc4Inst.Vc4_a_LOAD32(vr_setup, value);
        Vc4Inst.Emit(CurrentStorage);
    }

    // Program the VPM write setup register with a load-immediate.
    {
        Vc4Instruction Vc4Inst(vc4_load_immediate_32);
        Vc4Register vw_setup(VC4_QPU_ALU_REG_B, VC4_QPU_WADDR_VPMVCD_WR_SETUP);
        Vc4Register value; value.SetImmediateI(MAKE_VW_SETUP(1, true, false, VC4_QPU_32BIT_VECTOR, 0));
        Vc4Inst.Vc4_a_LOAD32(vw_setup, value);
        Vc4Inst.Emit(CurrentStorage);
    }

    // iRegIndex walks the InputRegister table; iRegUsed counts valid
    // entries so the loop stops once all cInput inputs have been loaded.
    for (uint8_t iRegUsed = 0, iRegIndex = 0; iRegUsed < this->cInput; iRegIndex++)
    {
        Vc4Instruction Vc4Inst;
        Vc4Register raX = this->InputRegister[iRegIndex / 4][iRegIndex % 4];
        if (raX.GetFlags().valid)
        {
            assert(raX.GetMux() == VC4_QPU_ALU_REG_A || raX.GetMux() == VC4_QPU_ALU_REG_B);
            // Read the next per-vertex value from the VPM read port.
            Vc4Register vpm(raX.GetMux(), VC4_QPU_RADDR_VPM);
            Vc4Inst.Vc4_a_MOV(raX, vpm);
            Vc4Inst.Emit(CurrentStorage);
            iRegUsed++;
        }
    }

    { // Emit a NOP
        Vc4Instruction Vc4Inst;
        Vc4Inst.Emit(CurrentStorage);
    }
}
// Emit code for the D3D sample instruction.
//
// Operands: [0] destination (output or temp), [1] texture coordinates,
// [2] resource, [3] sampler. The coordinates are written to the TMU0
// address registers (t/r first, s last -- writing s triggers the fetch
// request), uniform references for the sampler configuration are
// recorded, the TMU result is signalled into r4, and finally r4 is
// moved/unpacked into the destination register(s).
void Vc4Shader::Emit_Sample(CInstruction &Inst)
{
    assert(this->uShaderType == D3D10_SB_PIXEL_SHADER);
    
    VC4_ASSERT(Inst.m_NumOperands == 4);

    // NOTE(review): bUnpack is set below but never read in this
    // function; the temp-destination path uses per-channel unpack moves
    // directly instead.
    boolean bUnpack = false;

    // Destination registers: o[0] alone for a packed output register,
    // or one entry per written component for a temp destination.
    Vc4Register o[4];

    VC4_ASSERT(Inst.m_Operands[0].m_NumComponents == D3D10_SB_OPERAND_4_COMPONENT);
    VC4_ASSERT(Inst.m_Operands[0].m_IndexDimension == D3D10_SB_OPERAND_INDEX_1D);
    VC4_ASSERT(Inst.m_Operands[0].m_IndexType[0] == D3D10_SB_OPERAND_INDEX_IMMEDIATE32);
    VC4_ASSERT(Inst.m_Operands[0].m_ComponentSelection == D3D10_SB_OPERAND_4_COMPONENT_MASK_MODE);
    VC4_ASSERT(Inst.m_Operands[0].m_WriteMask == (D3D10_SB_OPERAND_4_COMPONENT_MASK_R | D3D10_SB_OPERAND_4_COMPONENT_MASK_G | D3D10_SB_OPERAND_4_COMPONENT_MASK_B | D3D10_SB_OPERAND_4_COMPONENT_MASK_A));
    switch (Inst.m_Operands[0].m_Type)
    {
    case D3D10_SB_OPERAND_TYPE_OUTPUT:
        // Output destination must be a packed 8888 register.
        o[0] = Find_Vc4Register_M(Inst.m_Operands[0], (Inst.m_Operands[0].m_WriteMask & D3D10_SB_OPERAND_4_COMPONENT_MASK_MASK));
        VC4_ASSERT(o[0].GetFlags().packed);
        break;
    case D3D10_SB_OPERAND_TYPE_TEMP:
        // Temp destination: collect one register per written component.
        for (uint8_t i = 0, aCurrent = D3D10_SB_OPERAND_4_COMPONENT_MASK_X; i < 4; i++)
        {
            if (Inst.m_Operands[0].m_WriteMask & aCurrent)
            {
                o[i] = Find_Vc4Register_M(Inst.m_Operands[0], aCurrent);
            }
            aCurrent <<= 1;
        }
        bUnpack = true;
        break;
    default:
        VC4_ASSERT(false);
    }

    // Resource
    VC4_ASSERT(Inst.m_Operands[2].m_Type == D3D10_SB_OPERAND_TYPE_RESOURCE);
    VC4_ASSERT(Inst.m_Operands[2].m_NumComponents == D3D10_SB_OPERAND_4_COMPONENT);
    VC4_ASSERT(Inst.m_Operands[2].m_IndexDimension == D3D10_SB_OPERAND_INDEX_1D);
    VC4_ASSERT(Inst.m_Operands[2].m_IndexType[0] == D3D10_SB_OPERAND_INDEX_IMMEDIATE32);
    uint32_t resourceIndex = Inst.m_Operands[2].m_Index[0].m_RegIndex;
    uint32_t texDimension = this->ResourceDimension[resourceIndex];

    // Only 8888 UNORM formats are supported here.
    DXGI_FORMAT texFormat = UmdCompiler->GetShaderResourceFormat((uint8_t)resourceIndex);
    VC4_ASSERT((texFormat == DXGI_FORMAT_B8G8R8A8_UNORM) || (texFormat == DXGI_FORMAT_R8G8B8A8_UNORM));
        
    // TODO: more generic color channel swizzle support.
    boolean bSwapColorChannel = (texFormat != DXGI_FORMAT_R8G8B8A8_UNORM);
    
    // Texture coordinate
    VC4_ASSERT(Inst.m_Operands[1].m_NumComponents == D3D10_SB_OPERAND_4_COMPONENT);
    VC4_ASSERT(Inst.m_Operands[1].m_IndexDimension == D3D10_SB_OPERAND_INDEX_1D);
    VC4_ASSERT(Inst.m_Operands[1].m_IndexType[0] == D3D10_SB_OPERAND_INDEX_IMMEDIATE32);
    VC4_ASSERT(Inst.m_Operands[1].m_ComponentSelection == D3D10_SB_OPERAND_4_COMPONENT_SWIZZLE_MODE);

    // Coordinate components; only the ones the dimension needs become
    // valid (cube: s/t/r, 2D: s/t, 1D: s).
    Vc4Register s;
    Vc4Register t;
    Vc4Register r;

    switch (texDimension)
    {
    case D3D10_SB_RESOURCE_DIMENSION_TEXTURECUBE:
        r = Find_Vc4Register_M(Inst.m_Operands[1], D3D10_SB_OPERAND_4_COMPONENT_MASK(Inst.m_Operands[1].m_Swizzle[2]));
        __fallthrough;
    case D3D10_SB_RESOURCE_DIMENSION_TEXTURE2D:
        t = Find_Vc4Register_M(Inst.m_Operands[1], D3D10_SB_OPERAND_4_COMPONENT_MASK(Inst.m_Operands[1].m_Swizzle[1]));
        __fallthrough;
    case D3D10_SB_RESOURCE_DIMENSION_TEXTURE1D:
        s = Find_Vc4Register_M(Inst.m_Operands[1], D3D10_SB_OPERAND_4_COMPONENT_MASK(Inst.m_Operands[1].m_Swizzle[0]));
        break;
    case D3D10_SB_RESOURCE_DIMENSION_BUFFER:
    case D3D10_SB_RESOURCE_DIMENSION_TEXTURE3D:
    case D3D10_SB_RESOURCE_DIMENSION_TEXTURE2DMS:
    case D3D10_SB_RESOURCE_DIMENSION_TEXTURE1DARRAY:
    case D3D10_SB_RESOURCE_DIMENSION_TEXTURE2DARRAY:
    case D3D10_SB_RESOURCE_DIMENSION_TEXTURE2DMSARRAY:
    default:
        assert(false);
    }
    
    // Sampler
    VC4_ASSERT(Inst.m_Operands[3].m_Type == D3D10_SB_OPERAND_TYPE_SAMPLER);
    VC4_ASSERT(Inst.m_Operands[3].m_IndexDimension == D3D10_SB_OPERAND_INDEX_1D);
    VC4_ASSERT(Inst.m_Operands[3].m_IndexType[0] == D3D10_SB_OPERAND_INDEX_IMMEDIATE32);
    uint32_t samplerIndex = Inst.m_Operands[3].m_Index[0].m_RegIndex;
               
    // texture address : z
    if (r.GetFlags().valid)
    {
        Vc4Instruction Vc4Inst;
        Vc4Register tmu0_r(VC4_QPU_ALU_REG_A, VC4_QPU_WADDR_TMU0_R);
        Vc4Inst.Vc4_a_MOV(tmu0_r, r);
        Vc4Inst.Emit(CurrentStorage);
    }

    // texture address : y
    if (t.GetFlags().valid)
    {
        Vc4Instruction Vc4Inst;
        Vc4Register tmu0_t(VC4_QPU_ALU_REG_A, VC4_QPU_WADDR_TMU0_T);
        Vc4Inst.Vc4_a_MOV(tmu0_t, t);
        Vc4Inst.Emit(CurrentStorage);
    }

    // texture address : x and must write 's' at last.
    assert(s.GetFlags().valid);
    {
        Vc4Instruction Vc4Inst;
        Vc4Register tmu0_s(VC4_QPU_ALU_REG_A, VC4_QPU_WADDR_TMU0_S);
        Vc4Inst.Vc4_a_MOV(tmu0_s, s);
        Vc4Inst.Emit(CurrentStorage);
    }

    // add uniform references.
    // The TMU reads its configuration from the uniform stream, so the
    // P0/P1 (and, for cube maps, P2) sampler configuration entries are
    // recorded here in the order the hardware will consume them.
    {
        {
            VC4_UNIFORM_FORMAT u;
            u.Type = VC4_UNIFORM_TYPE_SAMPLER_CONFIG_P0;
            u.samplerConfiguration.samplerIndex = samplerIndex;
            u.samplerConfiguration.resourceIndex = resourceIndex;
            this->AddUniformReference(u);
        }

        {
            VC4_UNIFORM_FORMAT u;
            u.Type = VC4_UNIFORM_TYPE_SAMPLER_CONFIG_P1;
            u.samplerConfiguration.samplerIndex = samplerIndex;
            u.samplerConfiguration.resourceIndex = resourceIndex;
            this->AddUniformReference(u);
        }

        if (r.GetFlags().valid) // only cube needs P2 config.
        {
            VC4_UNIFORM_FORMAT u;
            u.Type = VC4_UNIFORM_TYPE_SAMPLER_CONFIG_P2;
            u.samplerConfiguration.samplerIndex = samplerIndex;
            u.samplerConfiguration.resourceIndex = resourceIndex;
            this->AddUniformReference(u);
        }
    }

    // Sample texture, result come up in r4.
    {
        Vc4Instruction Vc4Inst;
        Vc4Inst.Vc4_Sig(VC4_QPU_SIG_LOAD_TMU0);
        Vc4Inst.Emit(CurrentStorage);
    }

    // Sample result is now at r4.
    Vc4Register r4(VC4_QPU_ALU_R4);

    // Move result at r4 to output register.
    if (Inst.m_Operands[0].m_Type == D3D10_SB_OPERAND_TYPE_OUTPUT)
    {
        // Channel orders match: a single 32-bit move suffices.
        if (bSwapColorChannel == o[0].GetFlags().swap_color_channel)
        {
            Vc4Instruction Vc4Inst;
            Vc4Inst.Vc4_a_MOV(o[0], r4);
            Vc4Inst.Emit(CurrentStorage);
        }
        else
        {
            // Orders differ: re-pack channel by channel, reversing the
            // R and B positions (unpack 8a/8b/8c -> pack 8c/8b/8a).
            // R, G, B channel
            for (uint8_t i = 0; i < 3; i++)
            {
                Vc4Instruction Vc4Inst;
                Vc4Inst.Vc4_m_MOV(o[0], r4);
                Vc4Inst.Vc4_m_Pack(VC4_QPU_PACK_MUL_8c - i);
                Vc4Inst.Vc4_m_Unpack(VC4_QPU_UNPACK_8a + i, true); // Use R4 unpack.
                Vc4Inst.Emit(CurrentStorage);
            }

            // A channel
            {
                Vc4Instruction Vc4Inst;
                Vc4Inst.Vc4_m_MOV(o[0], r4);
                Vc4Inst.Vc4_m_Pack(VC4_QPU_PACK_MUL_8d);
                Vc4Inst.Vc4_m_Unpack(VC4_QPU_UNPACK_8d, true); // Use R4 unpack.
                Vc4Inst.Emit(CurrentStorage);
            }
        }
    }
    else
    {
        // Move each color channel at r4 to o[i].
        // R, G, B channel
        for (uint8_t i = 0; i < 3; i++)
        {
            // For a swapped-channel format read the registers in
            // reverse (BGR) order.
            Vc4Register out = bSwapColorChannel ? o[2 - i] : o[i];
            if (out.GetFlags().valid)
            {
                Vc4Instruction Vc4Inst;
                Vc4Inst.Vc4_m_MOV(out, r4);
                Vc4Inst.Vc4_m_Unpack(VC4_QPU_UNPACK_8a + i, true); // Use R4 unpack.
                Vc4Inst.Emit(CurrentStorage);
            }
        }

        // A channel
        if (o[3].GetFlags().valid)
        {
            Vc4Instruction Vc4Inst;
            Vc4Inst.Vc4_m_MOV(o[3], r4);
            Vc4Inst.Vc4_m_Unpack(VC4_QPU_UNPACK_8d, true); // Use R4 unpack.
            Vc4Inst.Emit(CurrentStorage);
        }
    }

    { // Emit a NOP
        Vc4Instruction Vc4Inst;
        Vc4Inst.Emit(CurrentStorage);
    }
}
// Emit an add-pipe ALU op (add/max/min/iadd) for each component enabled
// in the destination write mask, followed by a trailing NOP.
void Vc4Shader::Emit_with_Add_pipe(CInstruction &Inst)
{
    assert(this->uShaderType == D3D10_SB_PIXEL_SHADER ||
           this->uShaderType == D3D10_SB_VERTEX_SHADER);

    VC4_ASSERT(Inst.m_NumOperands == 3);

    for (uint8_t component = 0; component < 4; component++)
    {
        uint8_t channelMask = (uint8_t)(D3D10_SB_OPERAND_4_COMPONENT_MASK_X << component);
        if ((Inst.m_Operands[0].m_WriteMask & channelMask) == 0)
        {
            continue; // component not written by this instruction.
        }

        Vc4Register dst = Find_Vc4Register_M(Inst.m_Operands[0], channelMask);

        Vc4Register src[2];
        this->Setup_SourceRegisters(Inst, 1, ARRAYSIZE(src), component, src);

        // Packing is only available on the mul pipe, so a packed
        // destination first takes the add-pipe result in r3 and is then
        // finished with a mul-pipe MOV (which can pack) below.
        Vc4Register result;
        if (dst.GetFlags().packed)
        {
            Vc4Register r3(VC4_QPU_ALU_R3, VC4_QPU_WADDR_ACC3);
            result = r3;
        }
        else
        {
            result = dst;
        }

        {
            Vc4Instruction Vc4Inst;
            switch (Inst.m_OpCode)
            {
            case D3D10_SB_OPCODE_ADD:
                Vc4Inst.Vc4_a_FADD(result, src[0], src[1]);
                break;
            case D3D10_SB_OPCODE_MAX:
                Vc4Inst.Vc4_a_FMAX(result, src[0], src[1]);
                break;
            case D3D10_SB_OPCODE_MIN:
                Vc4Inst.Vc4_a_FMIN(result, src[0], src[1]);
                break;
            case D3D10_SB_OPCODE_IADD:
                Vc4Inst.Vc4_a_IADD(result, src[0], src[1]);
                break;
            default:
                VC4_ASSERT(false);
            }
            Vc4Inst.Emit(CurrentStorage);
        }

        // Packed destination: move r3 into place with the component pack.
        if (dst.GetFlags().packed)
        {
            Vc4Instruction Vc4Inst;
            Vc4Inst.Vc4_m_MOV(dst, result);
            Vc4Inst.Vc4_m_Pack(dst.GetPack(component));
            Vc4Inst.Emit(CurrentStorage);
        }
    }

    { // Emit a NOP
        Vc4Instruction Vc4Inst;
        Vc4Inst.Emit(CurrentStorage);
    }
}
// Emit a dot product (dp2/dp3/dp4).
//
// Strategy: zero an accumulator (r3), then for each component issue the
// product on the mul pipe while the add pipe folds the previous product
// into the accumulator; a trailing FADD folds the last product. The
// scalar result is then replicated to every component selected by the
// destination write mask.
void Vc4Shader::Emit_DPx(CInstruction &Inst)
{
    assert(this->uShaderType == D3D10_SB_PIXEL_SHADER ||
           this->uShaderType == D3D10_SB_VERTEX_SHADER); 

    VC4_ASSERT(Inst.m_NumOperands == 3);

    // DP2/DP3/DP4 are consecutive opcodes in the D3D10 tokenized
    // format, so the component count (2, 3 or 4) is derived from the
    // opcode itself instead of a hard-coded enum value.
    VC4_ASSERT(Inst.m_OpCode == D3D10_SB_OPCODE_DP2 ||
               Inst.m_OpCode == D3D10_SB_OPCODE_DP3 ||
               Inst.m_OpCode == D3D10_SB_OPCODE_DP4);

    {
        // DP2 loop 2 times.
        // DP3 loop 3 times.
        // DP4 loop 4 times.
        uint8_t c = (uint8_t)(Inst.m_OpCode - D3D10_SB_OPCODE_DP2 + 2);

        // where to accumulate result of mul.
        Vc4Register accum(VC4_QPU_ALU_R3, VC4_QPU_WADDR_ACC3);

        // accum = 0.
        {
            Vc4Register zero(VC4_QPU_ALU_REG_B, 0); // 0 as small immediate in raddr_b
            Vc4Instruction Vc4Inst(vc4_alu_small_immediate);
            Vc4Inst.Vc4_m_MOV(accum, zero);
            Vc4Inst.Emit(CurrentStorage);
        }
           
        for(uint8_t i = 0; i < c; i++)
        {
            Vc4Register temp(VC4_QPU_ALU_R1, VC4_QPU_WADDR_ACC1);
            Vc4Register src[2];
            Setup_SourceRegisters(Inst, 1, ARRAYSIZE(src), i, src);

            {
                // Mul pipe computes product i; in the same instruction
                // the add pipe folds in temp, which still holds product
                // i-1. NOTE(review): relies on both pipes reading their
                // sources before writes land within one instruction --
                // confirm against the VC4 QPU pipeline description.
                Vc4Instruction Vc4Inst;
                Vc4Inst.Vc4_m_FMUL(temp, src[0], src[1]);
                if (i > 0)
                {
                    Vc4Inst.Vc4_a_FADD(accum, accum, temp);
                }
                Vc4Inst.Emit(CurrentStorage);
            }

            // The final product has not been folded in yet; do it now.
            if (i+1 == c)
            {
                Vc4Instruction Vc4Inst;
                Vc4Inst.Vc4_a_FADD(accum, accum, temp);
                Vc4Inst.Emit(CurrentStorage);
            }
        }

        // replicate output where specified by the write mask.
        for (uint8_t i = 0, aCurrent = D3D10_SB_OPERAND_4_COMPONENT_MASK_X; i < 4; i++)
        {
            if (Inst.m_Operands[0].m_WriteMask & aCurrent)
            {
                Vc4Register dst = Find_Vc4Register_M(Inst.m_Operands[0], aCurrent);
                Vc4Instruction Vc4Inst;
                Vc4Inst.Vc4_m_MOV(dst, accum);
                Vc4Inst.Vc4_m_Pack(dst.GetPack(i));
                Vc4Inst.Emit(CurrentStorage);
            }
     
            aCurrent <<= 1;
        }
    }

    { // Emit a NOP
        Vc4Instruction Vc4Inst;
        Vc4Inst.Emit(CurrentStorage);
    }
}
// Emit mad (dst = src0 * src1 + src2), one pass per component enabled
// in the destination write mask.
//
// The multiply runs on the mul pipe into r3; the add runs on the add
// pipe. Since only the mul pipe can pack, a packed destination takes
// the add result back into r3 and is finished with a mul-pipe MOV.
void Vc4Shader::Emit_Mad(CInstruction &Inst)
{
    assert(this->uShaderType == D3D10_SB_PIXEL_SHADER ||
           this->uShaderType == D3D10_SB_VERTEX_SHADER);

    VC4_ASSERT(Inst.m_NumOperands == 4);

    {
        for (uint8_t i = 0, aCurrent = D3D10_SB_OPERAND_4_COMPONENT_MASK_X; i < 4; i++)
        {
            if (Inst.m_Operands[0].m_WriteMask & aCurrent)
            {
                // r3 holds the multiply result (and, for packed
                // destinations, the add result too).
                Vc4Register accum(VC4_QPU_ALU_R3, VC4_QPU_WADDR_ACC3);
                Vc4Register dst = Find_Vc4Register_M(Inst.m_Operands[0], aCurrent);

                // perform mul first 2 operands
                {
                    Vc4Register src[2];
                    this->Setup_SourceRegisters(Inst, 1, ARRAYSIZE(src), i, src);

                    {
                        Vc4Instruction Vc4Inst;
                        Vc4Inst.Vc4_m_FMUL(accum, src[0], src[1]);
                        Vc4Inst.Emit(CurrentStorage);
                    }
                }

                Vc4Register _dst;
                if (dst.GetFlags().packed)
                {
                    // pack has to be done at mul pipe, so result to r3, 
                    // then use mul pipe to move to final dst (with pack).
                    _dst = accum;
                }
                else
                {
                    _dst = dst;
                }

                // perform add with 3rd operand.
                {
                    Vc4Register src[1];
                    this->Setup_SourceRegisters(Inst, 3, ARRAYSIZE(src), i, src);

                    {
                        Vc4Instruction Vc4Inst;
                        Vc4Inst.Vc4_a_FADD(_dst, accum, src[0]);
                        Vc4Inst.Emit(CurrentStorage);
                    }
                }

                // move to destination (with packing).
                if (dst.GetFlags().packed)
                {
                    Vc4Instruction Vc4Inst;
                    Vc4Inst.Vc4_m_MOV(dst, accum);
                    Vc4Inst.Vc4_m_Pack(dst.GetPack(i));
                    Vc4Inst.Emit(CurrentStorage);
                }
            }

            aCurrent <<= 1;
        }
    }

    { // Emit a NOP
        Vc4Instruction Vc4Inst;
        Vc4Inst.Emit(CurrentStorage);
    }
}
// Emit the epilogue that writes shaded vertex data to the VPM.
//
// bVS == true  : vertex shader output -- Ys/Xs (packed 16-bit), Zs,
//                1/Wc, then the varying outputs.
// bVS == false : coordinate shader output -- raw Xc/Yc/Zc/Wc first,
//                then the same Ys/Xs, Zs, 1/Wc block, no varyings.
void Vc4Shader::Emit_ShaderOutput_VS(boolean bVS)
{
    assert(this->uShaderType == D3D10_SB_VERTEX_SHADER);
    
    Vc4Register vpm(VC4_QPU_ALU_REG_B, VC4_QPU_WADDR_VPM);

    // Locate the 4 consecutive position output registers and order them
    // X/Y/Z/W by their swizzle masks.
    Vc4Register pos[4]; // X/Y/Z/W.
    for (uint8_t iPos = 0; iPos < this->cOutput; iPos++)
    {
        if (this->OutputRegister[iPos / 4][iPos % 4].GetFlags().position)
        {
            for (uint8_t i = iPos; i < iPos + 4; i++)
            {
                Vc4Register reg = this->OutputRegister[i / 4][i % 4];
                assert(reg.GetFlags().position);
                pos[this->SwizzleMaskToIndex(reg.GetSwizzleMask())] = reg;
            }
            break;
        }
    }

    // Only CS emits raw coordinates.
    if (!bVS)
    {
        // Xc, Yc, Zc, Wc
        for (uint8_t i = 0; i < 4; i++)
        {
            Vc4Register src = pos[i];
            Vc4Instruction Vc4Inst;
            Vc4Inst.Vc4_m_MOV(vpm, src);
            Vc4Inst.Emit(CurrentStorage);
        }
    }

    // calculate 1/W
    {
        // send W to sfu_recip.
        {
            // SFU input must come from the opposite register file to W's.
            Vc4Register sfu_recip((pos[3].GetMux() == VC4_QPU_ALU_REG_A ? VC4_QPU_ALU_REG_B : VC4_QPU_ALU_REG_A), VC4_QPU_WADDR_SFU_RECIP);
            Vc4Instruction Vc4Inst;
            Vc4Inst.Vc4_a_MOV(sfu_recip, pos[3]);
            Vc4Inst.Emit(CurrentStorage);
        }

        // Issue 2 NOPs to wait result of sfu_recip.
        for (uint8_t i = 0; i < 2; i++)
        {
            Vc4Instruction Vc4Inst;
            Vc4Inst.Emit(CurrentStorage);
        }
    }

    // Now, r4 is 1/W.
    Vc4Register r4(VC4_QPU_ALU_R4);
    
    // Ys/Xs
    {
        // scale by RT dimension (read from uniform). Result in r0/r1.
        {
            for (uint8_t i = 0; i < 2; i++)
            {
                Vc4Register rX(VC4_QPU_ALU_R0 + i, VC4_QPU_WADDR_ACC0 + i); // r0 and r1.
                
                { // divide Xc/Yc by W.
                    Vc4Instruction Vc4Inst;
                    Vc4Inst.Vc4_m_FMUL(rX, pos[i], r4);
                    Vc4Inst.Emit(CurrentStorage);
                }

                { // Scale Xc/Yc with viewport.
                    Vc4Instruction Vc4Inst;
                    Vc4Register unif((rX.GetMux() == VC4_QPU_ALU_REG_A ? VC4_QPU_ALU_REG_B : VC4_QPU_ALU_REG_A), VC4_QPU_RADDR_UNIFORM); // use opossit register file.
                    Vc4Inst.Vc4_m_FMUL(rX, rX, unif);
                    Vc4Inst.Emit(CurrentStorage);
                    {
                        // Record which viewport scale uniform this
                        // UNIFORM read consumes.
                        VC4_UNIFORM_FORMAT u;
                        u.Type = (i == 0 ? VC4_UNIFORM_TYPE_VIEWPORT_SCALE_X : VC4_UNIFORM_TYPE_VIEWPORT_SCALE_Y);
                        this->AddUniformReference(u);
                    }
                }
            }
        }

        // Convert r0/r1 to 16bits float and pack them into ra16, then output to vpm.
        {
            Vc4Register ra16(VC4_QPU_ALU_REG_A, 16);

            for (uint8_t i = 0; i < 2; i++)
            {
                Vc4Register rX(VC4_QPU_ALU_R0 + i); // r0 and r1.
                Vc4Instruction Vc4Inst;
                Vc4Inst.Vc4_a_FTOI(ra16, rX); // REUSE ra16 as temp is no longer needed.
                Vc4Inst.Vc4_a_Pack(VC4_QPU_PACK_A_16a + i); // Pack to 16a or 16b.
                Vc4Inst.Emit(CurrentStorage);
            }

            // Issue NOP as ra16 is just written.
            {
                Vc4Instruction Vc4Inst;
                Vc4Inst.Emit(CurrentStorage);
            }

            // Output to vpm.
            {
                Vc4Instruction Vc4Inst;
                Vc4Inst.Vc4_m_MOV(vpm, ra16);
                Vc4Inst.Emit(CurrentStorage);
            }
        }
    }

    // Zs
    { // Zs = Zc / W // TODO: Z offset
        Vc4Instruction Vc4Inst;
        Vc4Inst.Vc4_m_FMUL(vpm, pos[2], r4);
        Vc4Inst.Emit(CurrentStorage);
    }

    // 1/Wc
    {
        // Move result of sfu_recip (come up at r4) to vpm.
        {
            Vc4Instruction Vc4Inst;
            Vc4Inst.Vc4_m_MOV(vpm, r4);
            Vc4Inst.Emit(CurrentStorage);
        }
    }

    // Only VS emits "varying" (everything read from vpm except position data).
    if (bVS)
    {
        // i walks the OutputRegister table; iRegUsed counts valid
        // entries so iteration stops after all cOutput outputs.
        for (uint8_t i = 0, iRegUsed = 0; iRegUsed < this->cOutput; i++ )
        {
            Vc4Register src = this->OutputRegister[i / 4][i % 4];
            if (src.GetFlags().valid)
            {
                // Only linkage (inter-stage) outputs go to the varyings.
                if (src.GetFlags().linkage)
                {
                    Vc4Instruction Vc4Inst;
                    Vc4Inst.Vc4_m_MOV(vpm, src);
                    // Vc4Inst.Vc4_m_FMUL(vpm, src, r4);
                    Vc4Inst.Emit(CurrentStorage);
                }
                iRegUsed++;
            }
        }
    }
}