Example #1
0
inline void
internal::Trr2kNTTT
( UpperOrLower uplo,
  Orientation orientationOfB, 
  Orientation orientationOfC, Orientation orientationOfD,
  T alpha, const DistMatrix<T,MC,MR>& A, const DistMatrix<T,MC,MR>& B,
           const DistMatrix<T,MC,MR>& C, const DistMatrix<T,MC,MR>& D,
  T beta,        DistMatrix<T,MC,MR>& E )
{
#ifndef RELEASE
    PushCallStack("internal::Trr2kNTTT");
    if( E.Height() != E.Width()  || A.Width()  != C.Height() ||
        A.Height() != E.Height() || C.Width()  != E.Height() ||
        B.Height() != E.Width()  || D.Height() != E.Width()  ||
        A.Width()  != B.Width()  || C.Height() != D.Width() )
        throw std::logic_error("Nonconformal Trr2kNTTT");
#endif
    const Grid& g = E.Grid();

    DistMatrix<T,MC,MR> AL(g), AR(g),
                        A0(g), A1(g), A2(g);
    DistMatrix<T,MC,MR> BL(g), BR(g),
                        B0(g), B1(g), B2(g);

    DistMatrix<T,MC,MR> CT(g),  C0(g),
                        CB(g),  C1(g),
                                C2(g);
    DistMatrix<T,MC,MR> DL(g), DR(g),
                        D0(g), D1(g), D2(g);

    DistMatrix<T,MC,  STAR> A1_MC_STAR(g);
    DistMatrix<T,VR,  STAR> B1_VR_STAR(g);
    DistMatrix<T,STAR,MR  > B1AdjOrTrans_STAR_MR(g);
    DistMatrix<T,STAR,MC  > C1_STAR_MC(g);
    DistMatrix<T,VR,  STAR> D1_VR_STAR(g);
    DistMatrix<T,STAR,MR  > D1AdjOrTrans_STAR_MR(g);

    LockedPartitionRight( A, AL, AR, 0 );
    LockedPartitionRight( B, BL, BR, 0 );
    LockedPartitionDown
    ( C, CT,
         CB, 0 );
    LockedPartitionRight( D, DL, DR, 0 );
    while( AL.Width() < A.Width() )
    {
        LockedRepartitionRight
        ( AL, /**/ AR,
          A0, /**/ A1, A2 );
        LockedRepartitionRight
        ( BL, /**/ BR,
          B0, /**/ B1, B2 );
        LockedRepartitionDown
        ( CT,  C0,
         /**/ /**/
               C1,
          CB,  C2 );
        LockedRepartitionRight
        ( DL, /**/ DR,
          D0, /**/ D1, D2 );

        A1_MC_STAR.AlignWith( E );
        B1_VR_STAR.AlignWith( E );
        B1AdjOrTrans_STAR_MR.AlignWith( E );
        C1_STAR_MC.AlignWith( E );
        D1_VR_STAR.AlignWith( E );
        D1AdjOrTrans_STAR_MR.AlignWith( E );
        //--------------------------------------------------------------------//
        A1_MC_STAR = A1;
        C1_STAR_MC = C1;
        B1_VR_STAR = B1;
        D1_VR_STAR = D1;
        if( orientationOfB == ADJOINT )
            B1AdjOrTrans_STAR_MR.AdjointFrom( B1_VR_STAR );
        else
            B1AdjOrTrans_STAR_MR.TransposeFrom( B1_VR_STAR );
        if( orientationOfD == ADJOINT )
            D1AdjOrTrans_STAR_MR.AdjointFrom( D1_VR_STAR );
        else
            D1AdjOrTrans_STAR_MR.TransposeFrom( D1_VR_STAR );
        internal::LocalTrr2k
        ( uplo, orientationOfC,
          alpha, A1_MC_STAR, B1AdjOrTrans_STAR_MR, 
                 C1_STAR_MC, D1AdjOrTrans_STAR_MR,
          beta,  E );
        //--------------------------------------------------------------------//
        D1AdjOrTrans_STAR_MR.FreeAlignments();
        D1_VR_STAR.FreeAlignments();
        C1_STAR_MC.FreeAlignments();
        B1AdjOrTrans_STAR_MR.FreeAlignments();
        B1_VR_STAR.FreeAlignments();
        A1_MC_STAR.FreeAlignments();

        SlideLockedPartitionRight
        ( DL,     /**/ DR,
          D0, D1, /**/ D2 );
        SlideLockedPartitionDown
        ( CT,  C0,
               C1,
         /**/ /**/
          CB,  C2 );
        SlideLockedPartitionRight
        ( BL,     /**/ BR,
          B0, B1, /**/ B2 );
        SlideLockedPartitionRight
        ( AL,     /**/ AR,
          A0, A1, /**/ A2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
Example #2
0
inline void
HemmRUC
( T alpha, const DistMatrix<T>& A,
           const DistMatrix<T>& B,
  T beta,        DistMatrix<T>& C )
{
#ifndef RELEASE
    PushCallStack("internal::HemmRUC");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        throw std::logic_error("{A,B,C} must be distributed on the same grid");
#endif
    const Grid& g = A.Grid();

    // Matrix views
    DistMatrix<T> 
        ATL(g), ATR(g),  A00(g), A01(g), A02(g),  AColPan(g),
        ABL(g), ABR(g),  A10(g), A11(g), A12(g),  ARowPan(g),
                         A20(g), A21(g), A22(g);
    DistMatrix<T> BL(g), BR(g),
                  B0(g), B1(g), B2(g);
    DistMatrix<T> CL(g), CR(g),
                  C0(g), C1(g), C2(g),
                  CLeft(g), CRight(g);

    // Temporary distributions
    DistMatrix<T,MC,STAR> B1_MC_STAR(g);
    DistMatrix<T,VR,  STAR> AColPan_VR_STAR(g);
    DistMatrix<T,STAR,MR  > AColPanAdj_STAR_MR(g);
    DistMatrix<T,MR,  STAR> ARowPanAdj_MR_STAR(g);

    B1_MC_STAR.AlignWith( C );

    // Start the algorithm
    Scale( beta, C );
    LockedPartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    LockedPartitionRight( B, BL, BR, 0 );
    PartitionRight( C, CL, CR, 0 );
    while( CR.Width() > 0 )
    {
        LockedRepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        LockedRepartitionRight
        ( BL, /**/ BR,
          B0, /**/ B1, B2 );

        RepartitionRight
        ( CL, /**/ CR,
          C0, /**/ C1, C2 );

        ARowPan.LockedView1x2( A11, A12 );
        AColPan.LockedView2x1
        ( A01,
          A11 );

        CLeft.View1x2( C0, C1 );
        CRight.View1x2( C1, C2 );

        AColPan_VR_STAR.AlignWith( CLeft );
        AColPanAdj_STAR_MR.AlignWith( CLeft );
        ARowPanAdj_MR_STAR.AlignWith( CRight );
        //--------------------------------------------------------------------//
        B1_MC_STAR = B1;

        AColPan_VR_STAR = AColPan;
        AColPanAdj_STAR_MR.AdjointFrom( AColPan_VR_STAR );
        ARowPanAdj_MR_STAR.AdjointFrom( ARowPan );
        MakeTrapezoidal( LEFT,  LOWER,  0, ARowPanAdj_MR_STAR );
        MakeTrapezoidal( RIGHT, LOWER, -1, AColPanAdj_STAR_MR );

        LocalGemm
        ( NORMAL, ADJOINT, 
          alpha, B1_MC_STAR, ARowPanAdj_MR_STAR, T(1), CRight );

        LocalGemm
        ( NORMAL, NORMAL,
          alpha, B1_MC_STAR, AColPanAdj_STAR_MR, T(1), CLeft );
        //--------------------------------------------------------------------//
        AColPan_VR_STAR.FreeAlignments();
        AColPanAdj_STAR_MR.FreeAlignments();
        ARowPanAdj_MR_STAR.FreeAlignments();

        SlideLockedPartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );

        SlideLockedPartitionRight
        ( BL,     /**/ BR,
          B0, B1, /**/ B2 );

        SlidePartitionRight
        ( CL,     /**/ CR,
          C0, C1, /**/ C2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
Example #3
0
void JitArmILAsmRoutineManager::Generate()
{
	enterCode = GetCodePtr();
	PUSH(9, R4, R5, R6, R7, R8, R9, R10, R11, _LR);
	// Take care to 8-byte align stack for function calls.
	// We are misaligned here because of an odd number of args for PUSH.
	// It's not like x86 where you need to account for an extra 4 bytes
	// consumed by CALL.
	SUB(_SP, _SP, 4);

	MOVI2R(R0, (u32)&CoreTiming::downcount);
	MOVI2R(R9, (u32)&PowerPC::ppcState.spr[0]);

	FixupBranch skipToRealDispatcher = B();
	dispatcher = GetCodePtr();
		printf("ILDispatcher is %p\n", dispatcher);

		// Downcount Check
		// The result of slice decrementation should be in flags if somebody jumped here
		// IMPORTANT - We jump on negative, not carry!!!
		FixupBranch bail = B_CC(CC_MI);

		SetJumpTarget(skipToRealDispatcher);
		dispatcherNoCheck = GetCodePtr();

		// This block of code gets the address of the compiled block of code
		// It runs though to the compiling portion if it isn't found
			LDR(R12, R9, PPCSTATE_OFF(pc));// Load the current PC into R12

			Operand2 iCacheMask = Operand2(0xE, 2); // JIT_ICACHE_MASK
			BIC(R12, R12, iCacheMask); // R12 contains PC & JIT_ICACHE_MASK here.

			MOVI2R(R14, (u32)jit->GetBlockCache()->iCache);

			LDR(R12, R14, R12); // R12 contains iCache[PC & JIT_ICACHE_MASK] here
			// R12 Confirmed this is the correct iCache Location loaded.
			TST(R12, 0x80); // Test  to see if it is a JIT block.

			SetCC(CC_EQ);
				// Success, it is our Jitblock.
				MOVI2R(R14, (u32)jit->GetBlockCache()->GetCodePointers());
				// LDR R14 right here to get CodePointers()[0] pointer.
				LSL(R12, R12, 2); // Multiply by four because address locations are u32 in size
				LDR(R14, R14, R12); // Load the block address in to R14

				B(R14);
				// No need to jump anywhere after here, the block will go back to dispatcher start
			SetCC();

		// If we get to this point, that means that we don't have the block cached to execute
		// So call ArmJit to compile the block and then execute it.
		MOVI2R(R14, (u32)&Jit);
		BL(R14);

		B(dispatcherNoCheck);

		// fpException()
		// Floating Point Exception Check, Jumped to if false
		fpException = GetCodePtr();
			LDR(R0, R9, PPCSTATE_OFF(Exceptions));
			ORR(R0, R0, EXCEPTION_FPU_UNAVAILABLE);
			STR(R0, R9, PPCSTATE_OFF(Exceptions));
				QuickCallFunction(R14, (void*)&PowerPC::CheckExceptions);
			LDR(R0, R9, PPCSTATE_OFF(npc));
			STR(R0, R9, PPCSTATE_OFF(pc));
		B(dispatcher);

		SetJumpTarget(bail);
		doTiming = GetCodePtr();
		// XXX: In JIT64, Advance() gets called /after/ the exception checking
		// once it jumps back to the start of outerLoop
		QuickCallFunction(R14, (void*)&CoreTiming::Advance);

		// Does exception checking
		testExceptions = GetCodePtr();
			LDR(R0, R9, PPCSTATE_OFF(pc));
			STR(R0, R9, PPCSTATE_OFF(npc));
				QuickCallFunction(R14, (void*)&PowerPC::CheckExceptions);
			LDR(R0, R9, PPCSTATE_OFF(npc));
			STR(R0, R9, PPCSTATE_OFF(pc));
		// Check the state pointer to see if we are exiting
		// Gets checked on every exception check
			MOVI2R(R0, (u32)PowerPC::GetStatePtr());
			MVN(R1, 0);
			LDR(R0, R0);
			TST(R0, R1);
			FixupBranch Exit = B_CC(CC_NEQ);

	B(dispatcher);

	SetJumpTarget(Exit);

	ADD(_SP, _SP, 4);

	POP(9, R4, R5, R6, R7, R8, R9, R10, R11, _PC);  // Returns

	GenerateCommon();

	FlushIcache();
}
Example #4
0
void JitArm::SafeLoadToReg(bool fastmem, u32 dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse)
{
	ARMReg RD = gpr.R(dest);

	if (Core::g_CoreStartupParameter.bFastmem && fastmem)
	{
		// Preload for fastmem
		if (offsetReg != -1)
			gpr.R(offsetReg);

		if (addr != -1)
			MOV(R10, gpr.R(addr));
		else
			MOV(R10, 0);

		UnsafeLoadToReg(RD, R10, accessSize, offsetReg, offset);
		return;
	}
	ARMReg rA = gpr.GetReg();
	ARMReg rB = gpr.GetReg();

	if (offsetReg == -1)
	{
		MOVI2R(rA, offset);
		if (addr != -1)
			ADD(rA, rA, gpr.R(addr));
	}
	else
	{
		if (addr != -1)
			ADD(rA, gpr.R(addr), gpr.R(offsetReg));
		else
			MOV(rA, gpr.R(offsetReg));
	}

	switch (accessSize)
	{
		case 8:
			MOVI2R(rB, (u32)&Memory::Read_U8);
		break;
		case 16:
			MOVI2R(rB, (u32)&Memory::Read_U16);
		break;
		case 32:
			MOVI2R(rB, (u32)&Memory::Read_U32);
		break;
	}
	PUSH(4, R0, R1, R2, R3);
	MOV(R0, rA);
	BL(rB);
	MOV(rA, R0);
	POP(4, R0, R1, R2, R3);
	MOV(RD, rA);
	if (signExtend) // Only on 16 loads
		SXTH(RD, RD);
	if (reverse)
	{
		if (accessSize == 32)
			REV(RD, RD);
		else if (accessSize == 16)
			REV16(RD, RD);
	}
	gpr.Unlock(rA, rB);
}
Example #5
0
inline void
SymmLLA
( T alpha, const DistMatrix<T,MC,MR>& A, 
           const DistMatrix<T,MC,MR>& B,
  T beta,        DistMatrix<T,MC,MR>& C )
{
#ifndef RELEASE
    PushCallStack("internal::SymmLLA");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        throw std::logic_error
        ("{A,B,C} must be distributed over the same grid");
#endif
    const Grid& g = A.Grid();

    DistMatrix<T,MC,MR> 
        BL(g), BR(g),
        B0(g), B1(g), B2(g);

    DistMatrix<T,MC,MR>
        CL(g), CR(g),
        C0(g), C1(g), C2(g);

    DistMatrix<T,MC,STAR> B1_MC_STAR(g);
    DistMatrix<T,VR,STAR> B1_VR_STAR(g);
    DistMatrix<T,STAR,MR> B1Trans_STAR_MR(g);
    DistMatrix<T,MC,MR  > Z1(g);
    DistMatrix<T,MC,STAR> Z1_MC_STAR(g);
    DistMatrix<T,MR,STAR> Z1_MR_STAR(g);
    DistMatrix<T,MR,MC  > Z1_MR_MC(g);

    B1_MC_STAR.AlignWith( A );
    B1_VR_STAR.AlignWith( A );
    B1Trans_STAR_MR.AlignWith( A );
    Z1_MC_STAR.AlignWith( A );
    Z1_MR_STAR.AlignWith( A );

    Scale( beta, C );
    LockedPartitionRight
    ( B, BL, BR, 0 );
    PartitionRight
    ( C, CL, CR, 0 );
    while( CL.Width() < C.Width() )
    {
        LockedRepartitionRight 
        ( BL, /**/ BR,
          B0, /**/ B1, B2 );

        RepartitionRight
        ( CL, /**/ CR,
          C0, /**/ C1, C2 );

        Z1.AlignWith( C1 );
        Zeros( C1.Height(), C1.Width(), Z1_MC_STAR );
        Zeros( C1.Height(), C1.Width(), Z1_MR_STAR );
        //--------------------------------------------------------------------//
        B1_MC_STAR = B1;
        B1_VR_STAR = B1_MC_STAR;
        B1Trans_STAR_MR.TransposeFrom( B1_VR_STAR );
        LocalSymmetricAccumulateLL
        ( TRANSPOSE, 
          alpha, A, B1_MC_STAR, B1Trans_STAR_MR, Z1_MC_STAR, Z1_MR_STAR );

        Z1_MR_MC.SumScatterFrom( Z1_MR_STAR );
        Z1 = Z1_MR_MC;
        Z1.SumScatterUpdate( T(1), Z1_MC_STAR );
        Axpy( T(1), Z1, C1 );
        //--------------------------------------------------------------------//
        Z1.FreeAlignments();

        SlideLockedPartitionRight
        ( BL,     /**/ BR,
          B0, B1, /**/ B2 );

        SlidePartitionRight
        ( CL,     /**/ CR,
          C0, C1, /**/ C2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
Example #6
0
unsigned char GR(int i,int j)
{
  BL(i,j);
}
Example #7
0
inline void
internal::GemmTNA
( Orientation orientationOfA,
  T alpha, const DistMatrix<T,MC,MR>& A,
           const DistMatrix<T,MC,MR>& B,
  T beta,        DistMatrix<T,MC,MR>& C )
{
#ifndef RELEASE
    PushCallStack("internal::GemmTNA");    
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        throw std::logic_error
        ("{A,B,C} must be distributed over the same grid");
    if( orientationOfA == NORMAL )
        throw std::logic_error("GemmTNA assumes A is (Conjugate)Transposed");
    if( A.Width()  != C.Height() ||
        B.Width()  != C.Width()  ||
        A.Height() != B.Height()   )
    {
        std::ostringstream msg;
        msg << "Nonconformal GemmTNA: \n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  B ~ " << B.Height() << " x " << B.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = A.Grid();

    // Matrix views
    DistMatrix<T,MC,MR> BL(g), BR(g),
                        B0(g), B1(g), B2(g);

    DistMatrix<T,MC,MR> CL(g), CR(g),
                        C0(g), C1(g), C2(g);

    // Temporary distributions
    DistMatrix<T,MC,STAR> B1_MC_STAR(g);
    DistMatrix<T,MR,STAR> D1_MR_STAR(g);
    DistMatrix<T,MR,MC  > D1_MR_MC(g);
    DistMatrix<T,MC,MR  > D1(g);

    // Start the algorithm
    Scal( beta, C );
    LockedPartitionRight( B, BL, BR, 0 );
    PartitionRight( C, CL, CR, 0 );
    while( BR.Width() > 0 )
    {
        LockedRepartitionRight
        ( BL, /**/     BR,
          B0, /**/ B1, B2 );
 
        RepartitionRight
        ( CL, /**/     CR,
          C0, /**/ C1, C2 );

        B1_MC_STAR.AlignWith( A );
        D1_MR_STAR.AlignWith( A );
        D1_MR_STAR.ResizeTo( C1.Height(), C1.Width() );
        D1.AlignWith( C1 );
        //--------------------------------------------------------------------//
        B1_MC_STAR = B1; // B1[MC,*] <- B1[MC,MR]

        // D1[MR,*] := alpha (A1[MC,MR])^T B1[MC,*]
        //           = alpha (A1^T)[MR,MC] B1[MC,*]
        internal::LocalGemm
        ( orientationOfA, NORMAL, alpha, A, B1_MC_STAR, (T)0, D1_MR_STAR );

        // C1[MC,MR] += scattered & transposed D1[MR,*] summed over grid cols
        D1_MR_MC.SumScatterFrom( D1_MR_STAR );
        D1 = D1_MR_MC; 
        Axpy( (T)1, D1, C1 );
        //--------------------------------------------------------------------//
        B1_MC_STAR.FreeAlignments();
        D1_MR_STAR.FreeAlignments();
        D1.FreeAlignments();

        SlideLockedPartitionRight
        ( BL,     /**/ BR,
          B0, B1, /**/ B2 );

        SlidePartitionRight
        ( CL,     /**/ CR,
          C0, C1, /**/ C2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
Example #8
0
inline void
internal::Syr2kLN
( T alpha, const DistMatrix<T,MC,MR>& A,
           const DistMatrix<T,MC,MR>& B,
  T beta,        DistMatrix<T,MC,MR>& C )
{
#ifndef RELEASE
    PushCallStack("internal::Syr2kLN");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        throw std::logic_error
        ("{A,B,C} must be distributed over the same grid");
    if( A.Height() != C.Height() || A.Height() != C.Width() ||
        B.Height() != C.Height() || B.Height() != C.Width() ||
        A.Width() != B.Width()                                 )
    {
        std::ostringstream msg;
        msg << "Nonconformal Syr2kLN:\n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  B ~ " << B.Height() << " x " << B.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = A.Grid();

    // Matrix views 
    DistMatrix<T,MC,MR> AL(g), AR(g),
                        A0(g), A1(g), A2(g);

    DistMatrix<T,MC,MR> BL(g), BR(g),
                        B0(g), B1(g), B2(g);

    // Temporary distributions
    DistMatrix<T,MC,  STAR> A1_MC_STAR(g);
    DistMatrix<T,MC,  STAR> B1_MC_STAR(g);
    DistMatrix<T,VR,  STAR> A1_VR_STAR(g);
    DistMatrix<T,VR,  STAR> B1_VR_STAR(g);
    DistMatrix<T,STAR,MR  > A1Trans_STAR_MR(g);
    DistMatrix<T,STAR,MR  > B1Trans_STAR_MR(g);

    // Start the algorithm
    ScaleTrapezoid( beta, LEFT, LOWER, 0, C );
    LockedPartitionRight( A, AL, AR, 0 );
    LockedPartitionRight( B, BL, BR, 0 );
    while( AR.Width() > 0 )
    {
        LockedRepartitionRight
        ( AL, /**/ AR,
          A0, /**/ A1, A2 );

        LockedRepartitionRight
        ( BL, /**/ BR,
          B0, /**/ B1, B2 );

        A1_MC_STAR.AlignWith( C );
        B1_MC_STAR.AlignWith( C );
        A1_VR_STAR.AlignWith( C );
        B1_VR_STAR.AlignWith( C );
        A1Trans_STAR_MR.AlignWith( C );
        B1Trans_STAR_MR.AlignWith( C );
        //--------------------------------------------------------------------//
        A1_VR_STAR = A1_MC_STAR = A1;
        A1Trans_STAR_MR.TransposeFrom( A1_VR_STAR );

        B1_VR_STAR = B1_MC_STAR = B1;
        B1Trans_STAR_MR.TransposeFrom( B1_VR_STAR );

        internal::LocalTrr2k
        ( LOWER, 
          alpha, A1_MC_STAR, B1Trans_STAR_MR,
                 B1_MC_STAR, A1Trans_STAR_MR,
          (T)1,  C );
        //--------------------------------------------------------------------//
        A1_MC_STAR.FreeAlignments();
        B1_MC_STAR.FreeAlignments();
        A1_VR_STAR.FreeAlignments();
        B1_VR_STAR.FreeAlignments();
        A1Trans_STAR_MR.FreeAlignments();
        B1Trans_STAR_MR.FreeAlignments();

        SlideLockedPartitionRight
        ( AL,     /**/ AR,
          A0, A1, /**/ A2 );

        SlideLockedPartitionRight
        ( BL,     /**/ BR,
          B0, B1, /**/ B2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
Example #9
0
inline void 
SUMMA_NNDot
( T alpha, const DistMatrix<T>& A,
           const DistMatrix<T>& B,
  T beta,        DistMatrix<T>& C )
{
#ifndef RELEASE
    CallStackEntry entry("gemm::SUMMA_NNDot");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        LogicError("{A,B,C} must have the same grid");
    if( A.Height() != C.Height() ||
        B.Width()  != C.Width()  ||
        A.Width()  != B.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal matrices: \n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  B ~ " << B.Height() << " x " << B.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        LogicError( msg.str() );
    }
#endif
    const Grid& g = A.Grid();

    if( A.Height() > B.Width() )
    {
        // Matrix views
        DistMatrix<T> AT(g), AB(g),
                      A0(g), A1(g), A2(g);         
        DistMatrix<T> BL(g),  B0(g),
                      BR(g),  B1(g),
                              B2(g);
        DistMatrix<T> CT(g), C0(g), C1L(g), C1R(g),
                      CB(g), C1(g), C10(g), C11(g), C12(g),
                             C2(g);

        // Temporary distributions
        DistMatrix<T,STAR,VC> A1_STAR_VC(g);
        DistMatrix<T,VC,STAR> B1_VC_STAR(g);
        DistMatrix<T,STAR,STAR> C11_STAR_STAR(g);

        // Star the algorithm
        Scale( beta, C );
        LockedPartitionDown
        ( A, AT,
             AB, 0 );
        PartitionDown
        ( C, CT,
             CB, 0 );
        while( AB.Height() > 0 )
        {
            LockedRepartitionDown
            ( AT,  A0,
             /**/ /**/
                   A1,
              AB,  A2 );

            RepartitionDown
            ( CT,  C0,
             /**/ /**/
                   C1,
              CB,  C2 );

            A1_STAR_VC = A1; 
            B1_VC_STAR.AlignWith( A1_STAR_VC );

            LockedPartitionRight( B, BL, BR, 0 );
            PartitionRight( C1, C1L, C1R, 0 );
            while( BR.Width() > 0 )
            {
                LockedRepartitionRight
                ( BL, /**/ BR,
                  B0, /**/ B1, B2 );

                RepartitionRight
                ( C1L, /**/ C1R,
                  C10, /**/ C11, C12 );

                //------------------------------------------------------------//
                B1_VC_STAR = B1;
                LocalGemm
                ( NORMAL, NORMAL, 
                  alpha, A1_STAR_VC, B1_VC_STAR, C11_STAR_STAR );
                C11.SumScatterUpdate( T(1), C11_STAR_STAR );
                //------------------------------------------------------------//

                SlideLockedPartitionRight
                ( BL,     /**/ BR,
                  B0, B1, /**/ B2 );

                SlidePartitionRight
                ( C1L,      /**/ C1R,
                  C10, C11, /**/ C12 );
            }

            SlideLockedPartitionDown
            ( AT,  A0,
                   A1,
             /**/ /**/
              AB,  A2 );

            SlidePartitionDown
            ( CT,  C0,
                   C1,
             /**/ /**/
              CB,  C2 );
        }
    }
    else
    {
        // Matrix views
        DistMatrix<T> AT(g), AB(g),
                      A0(g), A1(g), A2(g);         
        DistMatrix<T> BL(g),  B0(g),
                      BR(g),  B1(g),
                              B2(g);
        DistMatrix<T> 
            CL(g), CR(g),         C1T(g),  C01(g),
            C0(g), C1(g), C2(g),  C1B(g),  C11(g),
                                           C21(g);

        // Temporary distributions
        DistMatrix<T,STAR,VR> A1_STAR_VR(g);
        DistMatrix<T,VR,STAR> B1_VR_STAR(g);
        DistMatrix<T,STAR,STAR> C11_STAR_STAR(g);

        // Star the algorithm
        Scale( beta, C );
        LockedPartitionRight( B, BL, BR, 0 );
        PartitionRight( C, CL, CR, 0 );
        while( BR.Width() > 0 )
        {
            LockedRepartitionRight
            ( BL, /**/ BR,
              B0, /**/ B1, B2 );

            RepartitionRight
            ( CL, /**/ CR,
              C0, /**/ C1, C2 );

            B1_VR_STAR = B1;
            A1_STAR_VR.AlignWith( B1_VR_STAR );

            LockedPartitionDown
            ( A, AT,
                 AB, 0 );
            PartitionDown
            ( C1, C1T,
                  C1B, 0 );
            while( AB.Height() > 0 )
            {
                LockedRepartitionDown
                ( AT,  A0,
                 /**/ /**/
                       A1,
                  AB,  A2 );

                RepartitionDown
                ( C1T,  C01,
                 /***/ /***/
                        C11,
                  C1B,  C21 );

                //------------------------------------------------------------//
                A1_STAR_VR = A1;
                LocalGemm
                ( NORMAL, NORMAL, 
                  alpha, A1_STAR_VR, B1_VR_STAR, C11_STAR_STAR );
                C11.SumScatterUpdate( T(1), C11_STAR_STAR );
                //------------------------------------------------------------//

                SlideLockedPartitionDown
                ( AT,  A0,
                       A1,
                 /**/ /**/
                  AB,  A2 );

                SlidePartitionDown
                ( C1T,  C01,
                        C11,
                 /***/ /***/
                  C1B,  C21 );
            }

            SlideLockedPartitionRight
            ( BL,     /**/ BR,
              B0, B1, /**/ B2 ); 

            SlidePartitionRight
            ( CL,     /**/ CR,
              C0, C1, /**/ C2 );
        }
    }
}
void
MAST::NPSOLOptimizationInterface::optimize() {
    
#if MAST_ENABLE_NPSOL == 1
    // make sure that functions have been provided
    libmesh_assert(_funobj);
    libmesh_assert(_funcon);
    
    int
    N      =  _feval->n_vars(),
    NCLIN  =  0,
    NCNLN  =  _feval->n_eq()+_feval->n_ineq(),
    NCTOTL =  N+NCLIN+NCNLN,
    LDA    =  std::max(NCLIN, 1),
    LDJ    =  std::max(NCNLN, 1),
    LDR    =  N,
    INFORM =  0,           // on exit: Reports result of call to NPSOL
                           // < 0 either funobj or funcon has set this to -ve
                           // 0 => converged to point x
                           // 1 => x satisfies optimality conditions, but sequence of iterates has not converged
                           // 2 => Linear constraints and bounds cannot be satisfied. No feasible solution
                           // 3 => Nonlinear constraints and bounds cannot be satisfied. No feasible solution
                           // 4 => Major iter limit was reached
                           // 6 => x does not satisfy first-order optimality to required accuracy
                           // 7 => function derivatives seem to be incorrect
                           // 9 => input parameter invalid
    ITER   = 0,            // iter count
    LENIW  = 3*N + NCLIN + 2*NCNLN,
    LENW   = 2*N*N + N*NCLIN + 2*N*NCNLN + 20*N + 11*NCLIN + 21*NCNLN;
    
    Real
    F      =  0.;          // on exit: final objective

    std::vector<int>
    IW      (LENIW,  0),
    ISTATE  (NCTOTL, 0);    // status of constraints l <= r(x) <= u,
                            // -2 => lower bound is violated by more than delta
                            // -1 => upper bound is violated by more than delta
                            // 0  => both bounds are satisfied by more than delta
                            // 1  => lower bound is active (to within delta)
                            // 2  => upper bound is active (to within delta)
                            // 3  => boundars are equal and equality constraint is satisfied
    
    std::vector<Real>
    A       (LDA,    0.),   // this is used for liear constraints, not currently handled
    BL      (NCTOTL, 0.),
    BU      (NCTOTL, 0.),
    C       (NCNLN,  0.),   // on exit: nonlinear constraints
    CJAC    (LDJ* N, 0.),   //
                            // on exit: CJAC(i,j) is the partial derivative of ith nonlinear constraint
    CLAMBDA (NCTOTL, 0.),   // on entry: need not be initialized for cold start
                            // on exit: QP multiplier from the QP subproblem, >=0 if istate(j)=1, <0 if istate(j)=2
    G       (N,      0.),   // on exit: objective gradient
    R       (LDR*N,  0.),   // on entry: need not be initialized if called with Cold Statrt
                            // on exit: information about Hessian, if Hessian=Yes, R is upper Cholesky factor of approx H
    X       (N,      0.),   // on entry: initial point
                            // on exit: final estimate of solution
    W       (LENW,   0.),   // workspace
    xmin    (N,      0.),
    xmax    (N,      0.);
    
    
    // now setup the lower and upper limits for the variables and constraints
    _feval->init_dvar(X, xmin, xmax);
    for (unsigned int i=0; i<N; i++) {
        BL[i] = xmin[i];
        BU[i] = xmax[i];
    }
    
    // all constraints are assumed to be g_i(x) <= 0, so that the upper
    // bound is 0 and lower bound is -infinity
    for (unsigned int i=0; i<NCNLN; i++) {
        BL[i+N] = -1.e20;
        BU[i+N] =     0.;
    }
    
    std::string nm;
//    nm = "List";
//    npoptn_(nm.c_str(), (int)nm.length());
//    nm = "Verify level 3";
//    npoptn_(nm.c_str(), (int)nm.length());
    
    npsol_(&N,
           &NCLIN,
           &NCNLN,
           &LDA,
           &LDJ,
           &LDR,
           &A[0],
           &BL[0],
           &BU[0],
           _funcon,
           _funobj,
           &INFORM,
           &ITER,
           &ISTATE[0],
           &C[0],
           &CJAC[0],
           &CLAMBDA[0],
           &F,
           &G[0],
           &R[0],
           &X[0],
           &IW[0],
           &LENIW,
           &W[0],
           &LENW);
    
#endif // MAST_ENABLE_NPSOL 1
}
Example #11
0
inline void
SUMMA_NNA
( T alpha, const DistMatrix<T>& A,
           const DistMatrix<T>& B,
  T beta,        DistMatrix<T>& C )
{
#ifndef RELEASE
    CallStackEntry entry("gemm::SUMMA_NNA");
    if( A.Grid() != B.Grid() || B.Grid() != C.Grid() )
        LogicError("{A,B,C} must have the same grid");
    if( A.Height() != C.Height() ||
        B.Width()  != C.Width()  ||
        A.Width()  != B.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal matrices: \n"
            << "  A ~ " << A.Height() << " x " << A.Width() << "\n"
            << "  B ~ " << B.Height() << " x " << B.Width() << "\n"
            << "  C ~ " << C.Height() << " x " << C.Width() << "\n";
        LogicError( msg.str() );
    }
#endif
    const Grid& g = A.Grid();

    // Matrix views
    DistMatrix<T> BL(g), BR(g),
                  B0(g), B1(g), B2(g);
    DistMatrix<T> CL(g), CR(g),
                  C0(g), C1(g), C2(g);

    // Temporary distributions
    DistMatrix<T,VR,STAR> B1_VR_STAR(g);
    DistMatrix<T,STAR,MR> B1Trans_STAR_MR(g);
    DistMatrix<T,MC,STAR> D1_MC_STAR(g);

    B1_VR_STAR.AlignWith( A );
    B1Trans_STAR_MR.AlignWith( A );
    D1_MC_STAR.AlignWith( A );

    // Start the algorithm
    Scale( beta, C );
    LockedPartitionRight( B, BL, BR, 0 );
    PartitionRight( C, CL, CR, 0 );
    while( BR.Width() > 0 )
    {
        LockedRepartitionRight
        ( BL, /**/     BR,
          B0, /**/ B1, B2 );

        RepartitionRight
        ( CL, /**/     CR,
          C0, /**/ C1, C2 );

        //--------------------------------------------------------------------//
        B1_VR_STAR = B1;
        B1Trans_STAR_MR.TransposeFrom( B1_VR_STAR );

        // D1[MC,*] := alpha A[MC,MR] B1[MR,*]
        LocalGemm( NORMAL, TRANSPOSE, alpha, A, B1Trans_STAR_MR, D1_MC_STAR );

        // C1[MC,MR] += scattered result of D1[MC,*] summed over grid rows
        C1.SumScatterUpdate( T(1), D1_MC_STAR );
        //--------------------------------------------------------------------//

        SlideLockedPartitionRight
        ( BL,     /**/ BR,
          B0, B1, /**/ B2 );

        SlidePartitionRight
        ( CL,     /**/ CR,
          C0, C1, /**/ C2 );
    }
}
Example #12
0
void test_basic_types() {
    bcon basic_types[] = {"string", BS("a string"), "f(double)", BF(3.14159), "boolean", BB(1), "time", BT(time(0)), "null", BNULL, "symbol", BX("a symbol"), "int", BI(123), "long", BL(456789L), BEND};
    test_bson_from_bcon( basic_types, BCON_OK, BSON_VALID );
}
Example #13
0
void decodeInstruction(instruction_t instruction,unsigned long *r[],unsigned long *bandera,unsigned long *PC,unsigned long*LR,uint8_t*memoria,unsigned long *codificacion)
{
    int auxban;
    unsigned long aux1,aux2,des;
	//            codificacion funciones de la alu
	if(strcmp(instruction.mnemonic,"ADDS") == 0)
	{
		if(instruction.op1_type=='R')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
				r[instruction.op1_value]=ADD(r[instruction.op2_value],r[instruction.op3_value],&bandera);
			}
            if((instruction.op2_type== '#' )&&(instruction.op3_type =='R' ))
            {
				r[instruction.op1_value]=ADD(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='#' ))
            {
				r[instruction.op1_value]=ADD(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
            if((instruction.op2_type== '#' )&&(instruction.op3_type =='#' ))
			{
				r[instruction.op1_value]=ADD(instruction.op2_value,instruction.op3_value,&bandera);
            }
            mostrar(r[instruction.op1_value]);
        }
		if(instruction.op1_type=='N')
		{
			if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
				ADD(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
				ADD(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
				ADD(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
				ADD(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
	}
	if(strcmp(instruction.mnemonic,"CMN") == 0)
	{
        if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' ))
        {
            ADD(r[instruction.op1_value],r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' ))
        {
            ADD(instruction.op1_value,r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' ))
        {
            ADD(r[instruction.op1_value],instruction.op2_value,&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type == '#' ))
        {
            ADD(instruction.op1_value,instruction.op2_value,&bandera);
        }
	}
	if( strcmp(instruction.mnemonic,"ADCS") == 0)
	{
		if(instruction.op1_type=='R')
		{
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
				r[instruction.op1_value]=ADC(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
			{
				r[instruction.op1_value]=ADC(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
			{
                r[instruction.op1_value]=ADC(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
				r[instruction.op1_value]=ADC(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
				ADC(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
				ADC(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
				ADC(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
				ADC(instruction.op2_value,instruction.op3_value,&bandera);
            }
            mostrar(r[instruction.op1_value]);
		}
    }
    if( strcmp(instruction.mnemonic,"ANDS") == 0)
    {
        if(instruction.op1_type=='R')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                r[instruction.op1_value]=AND(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                r[instruction.op1_value]=AND(instruction.op2_value,r[instruction.op3_value],&bandera);
			}
            if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=AND(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=AND(instruction.op2_value,instruction.op3_value,&bandera);
            }
           mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
			if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
                AND(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                AND(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                AND(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                AND(instruction.op2_value,instruction.op3_value,&bandera);
            }
            mostrar(r[instruction.op1_value]);
		}
    }
    if(strcmp(instruction.mnemonic,"TEST") == 0)
    {
        if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' ))
        {
            AND(r[instruction.op1_value],r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' ))
        {
            AND(instruction.op1_value,r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' ))
        {
            AND(r[instruction.op1_value],instruction.op2_value,&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type == '#' ))
        {
            AND(instruction.op1_value,instruction.op2_value,&bandera);
        }
    }
    if( strcmp(instruction.mnemonic,"EORS") == 0)
    {
        if(instruction.op1_type=='R')
       {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
                r[instruction.op1_value]=EOR(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
			{
                r[instruction.op1_value]=EOR(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=EOR(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=EOR(instruction.op2_value,instruction.op3_value,&bandera);
            }
           mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                EOR(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
				EOR(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
				EOR(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                EOR(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
    }
    if( (strcmp(instruction.mnemonic,"MOVS") == 0)||(strcmp(instruction.mnemonic,"MOV") == 0))
    {

		if((instruction.op1_type == 'R')&&(instruction.op2_type=='R') )
		{
			r[instruction.op1_value]=MOV(r[instruction.op1_value],r[instruction.op2_value],&bandera);
			mostrar(r[instruction.op1_value]);
		}
        if((instruction.op1_type == 'R')&&(instruction.op2_type=='#') )
        {
            r[instruction.op1_value]=MOV(instruction.op1_value,instruction.op2_value,&bandera);
            mostrar(r[instruction.op1_value]);
        }
    }
    if( strcmp(instruction.mnemonic,"ORRS") == 0)
    {
        if(instruction.op1_type=='R')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                r[instruction.op1_value]=ORR(r[instruction.op2_value],r[instruction.op3_value],&bandera);
			}
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                r[instruction.op1_value]=ORR(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=ORR(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=ORR(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                ORR(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                ORR(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
				ORR(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                ORR(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
        }
    }
	if( strcmp(instruction.mnemonic,"SUBS") == 0)
    {
        if(instruction.op1_type== 'R' )
		{
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                r[instruction.op1_value]=SUB(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                r[instruction.op1_value]=SUB(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
           if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=SUB(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=SUB(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
                SUB(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                SUB(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                SUB(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                SUB(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
        }
    }
    if(strcmp(instruction.mnemonic,"CMP") == 0)
    {
        if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' ))
        {
            SUB(r[instruction.op1_value],r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' ))
        {
            SUB(instruction.op1_value,r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' ))
		{
            SUB(r[instruction.op1_value],instruction.op2_value,&bandera);
        }
		if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
        {
            SUB(instruction.op1_value,instruction.op2_value,&bandera);
        }
    }
    //                 decodificacion funciones branch
	if(strcmp(instruction.mnemonic,"B")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(28<<11)+(instruction.op1_value);
			B(&PC,instruction.op1_value);
		}
	}
	if(strcmp(instruction.mnemonic,"BEQ")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(instruction.op1_value);
			BEQ(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BNE")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(1<<8)+(instruction.op1_value);
			BNE(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BCS")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(2<<8)+(instruction.op1_value);
			BCS(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BCC")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(3<<8)+(instruction.op1_value);
			BCC(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BMI")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(4<<8)+(instruction.op1_value);
			BMI(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BPL")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(5<<8)+(instruction.op1_value);
			BPL(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BVS")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(6<<8)+(instruction.op1_value);
			BVS(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BVC")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(2<<7)+(instruction.op1_value);
			BVC(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BHI")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(8<<8)+(instruction.op1_value);
			BHI(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BLS")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(9<<8)+(instruction.op1_value);
			BLS(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BGE")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(10<<8)+(instruction.op1_value);
			BGE(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BLT")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(11<<8)+(instruction.op1_value);
			BLT(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BGT")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(12<<8)+(instruction.op1_value);
			BGT(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BLE")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(13<<8)+(instruction.op1_value);
			BLE(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BL")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(31<<11)+(2047&instruction.op1_value+(((1<<31)&instruction.op1_value)>>20));
			BL(&PC,instruction.op1_value,&LR);
		}
	}
Example #14
0
void JitArm::lXX(UGeckoInstruction inst)
{
    INSTRUCTION_START
    JITDISABLE(bJITLoadStoreOff)

    u32 a = inst.RA, b = inst.RB, d = inst.RD;
    s32 offset = inst.SIMM_16;
    u32 accessSize = 0;
    s32 offsetReg = -1;
    bool zeroA = true;
    bool update = false;
    bool signExtend = false;
    bool reverse = false;
    bool fastmem = false;

    switch (inst.OPCD)
    {
    case 31:
        switch (inst.SUBOP10)
        {
        case 55: // lwzux
            zeroA = false;
            update = true;
        case 23: // lwzx
            accessSize = 32;
            offsetReg = b;
            break;
        case 119: //lbzux
            zeroA = false;
            update = true;
        case 87: // lbzx
            accessSize = 8;
            offsetReg = b;
            break;
        case 311: // lhzux
            zeroA = false;
            update = true;
        case 279: // lhzx
            accessSize = 16;
            offsetReg = b;
            break;
        case 375: // lhaux
            zeroA = false;
            update = true;
        case 343: // lhax
            accessSize = 16;
            signExtend = true;
            offsetReg = b;
            break;
        case 534: // lwbrx
            accessSize = 32;
            reverse = true;
            break;
        case 790: // lhbrx
            accessSize = 16;
            reverse = true;
            break;
        }
        break;
    case 33: // lwzu
        zeroA = false;
        update = true;
    case 32: // lwz
        fastmem = true;
        accessSize = 32;
        break;
    case 35: // lbzu
        zeroA = false;
        update = true;
    case 34: // lbz
        fastmem = true;
        accessSize = 8;
        break;
    case 41: // lhzu
        zeroA = false;
        update = true;
    case 40: // lhz
        fastmem = true;
        accessSize = 16;
        break;
    case 43: // lhau
        zeroA = false;
        update = true;
    case 42: // lha
        signExtend = true;
        accessSize = 16;
        break;
    }

    // Check for exception before loading
    ARMReg rA = gpr.GetReg(false);

    LDR(rA, R9, PPCSTATE_OFF(Exceptions));
    CMP(rA, EXCEPTION_DSI);
    FixupBranch DoNotLoad = B_CC(CC_EQ);

    SafeLoadToReg(fastmem, d, zeroA ? a ? a : -1 : a, offsetReg, accessSize, offset, signExtend, reverse);

    if (update)
    {
        rA = gpr.GetReg(false);
        ARMReg RA = gpr.R(a);
        if (offsetReg == -1)
            MOVI2R(rA, offset);
        else
            MOV(RA, gpr.R(offsetReg));
        ADD(RA, RA, rA);
    }

    SetJumpTarget(DoNotLoad);

    // LWZ idle skipping
    if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSkipIdle &&
            inst.OPCD == 32 &&
            (inst.hex & 0xFFFF0000) == 0x800D0000 &&
            (Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 ||
             (SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
            Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
    {
        ARMReg RD = gpr.R(d);
        gpr.Flush();
        fpr.Flush();

        // if it's still 0, we can wait until the next event
        TST(RD, RD);
        FixupBranch noIdle = B_CC(CC_NEQ);
        rA = gpr.GetReg();

        MOVI2R(rA, (u32)&PowerPC::OnIdle);
        MOVI2R(R0, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
        BL(rA);

        gpr.Unlock(rA);
        WriteExceptionExit();

        SetJumpTarget(noIdle);

        //js.compilerPC += 8;
        return;
    }

}
Example #15
0
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
             void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const dt_iop_rawprepare_data_t *const d = (dt_iop_rawprepare_data_t *)piece->data;

  // fprintf(stderr, "roi in %d %d %d %d\n", roi_in->x, roi_in->y, roi_in->width, roi_in->height);
  // fprintf(stderr, "roi out %d %d %d %d\n", roi_out->x, roi_out->y, roi_out->width, roi_out->height);

  const float scale = roi_in->scale / piece->iscale;
  const int csx = (int)roundf((float)d->x * scale), csy = (int)roundf((float)d->y * scale);

  if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && piece->pipe->filters)
  { // raw mosaic

    const uint16_t *const in = (const uint16_t *const)ivoid;
    float *const out = (float *const)ovoid;

#ifdef _OPENMP
#pragma omp parallel for SIMD() default(none) schedule(static) collapse(2)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      for(int i = 0; i < roi_out->width; i++)
      {
        const size_t pin = (size_t)(roi_in->width * (j + csy) + csx) + i;
        const size_t pout = (size_t)j * roi_out->width + i;

        const int id = BL(roi_out, d, j, i);
        out[pout] = (in[pin] - d->sub[id]) / d->div[id];
      }
    }

    piece->pipe->filters = dt_rawspeed_crop_dcraw_filters(piece->pipe->filters, csx, csy);
    adjust_xtrans_filters(piece->pipe->xtrans, csx, csy);
  }
  else
  { // pre-downsampled buffer that needs black/white scaling

    const float *const in = (const float *const)ivoid;
    float *const out = (float *const)ovoid;

    const float sub = d->sub[0], div = d->div[0];

    const int ch = piece->colors;

#ifdef _OPENMP
#pragma omp parallel for SIMD() default(none) schedule(static) collapse(3)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      for(int i = 0; i < roi_out->width; i++)
      {
        for(int c = 0; c < ch; c++)
        {
          const size_t pin = (size_t)ch * (roi_in->width * (j + csy) + csx + i) + c;
          const size_t pout = (size_t)ch * (j * roi_out->width + i) + c;

          out[pout] = (in[pin] - sub) / div;
        }
      }
    }
  }
}
Example #16
0
void JitArm::SafeStoreFromReg(bool fastmem, s32 dest, u32 value, s32 regOffset, int accessSize, s32 offset)
{
    if (Core::g_CoreStartupParameter.bFastmem && fastmem)
    {
        ARMReg RA;
        ARMReg RB;
        ARMReg RS = gpr.R(value);

        if (dest != -1)
            RA = gpr.R(dest);

        if (regOffset != -1)
        {
            RB = gpr.R(regOffset);
            MOV(R10, RB);
            NOP(1);
        }
        else
            MOVI2R(R10, (u32)offset, false);

        if (dest != -1)
            ADD(R10, R10, RA);
        else
            NOP(1);

        MOV(R12, RS);
        UnsafeStoreFromReg(R10, R12, accessSize, 0);
        return;
    }
    ARMReg rA = gpr.GetReg();
    ARMReg rB = gpr.GetReg();
    ARMReg rC = gpr.GetReg();
    ARMReg RA;
    ARMReg RB;
    if (dest != -1)
        RA = gpr.R(dest);
    if (regOffset != -1)
        RB = gpr.R(regOffset);
    ARMReg RS = gpr.R(value);
    switch (accessSize)
    {
    case 32:
        MOVI2R(rA, (u32)&Memory::Write_U32);
        break;
    case 16:
        MOVI2R(rA, (u32)&Memory::Write_U16);
        break;
    case 8:
        MOVI2R(rA, (u32)&Memory::Write_U8);
        break;
    }
    MOV(rB, RS);
    if (regOffset == -1)
        MOVI2R(rC, offset);
    else
        MOV(rC, RB);
    if (dest != -1)
        ADD(rC, rC, RA);

    PUSH(4, R0, R1, R2, R3);
    MOV(R0, rB);
    MOV(R1, rC);
    BL(rA);
    POP(4, R0, R1, R2, R3);
    gpr.Unlock(rA, rB, rC);
}
Example #17
0
void process_sse2(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const dt_iop_rawprepare_data_t *const d = (dt_iop_rawprepare_data_t *)piece->data;

  // fprintf(stderr, "roi in %d %d %d %d\n", roi_in->x, roi_in->y, roi_in->width, roi_in->height);
  // fprintf(stderr, "roi out %d %d %d %d\n", roi_out->x, roi_out->y, roi_out->width, roi_out->height);

  const float scale = roi_in->scale / piece->iscale;
  const int csx = (int)roundf((float)d->x * scale), csy = (int)roundf((float)d->y * scale);

  if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && piece->pipe->filters)
  { // raw mosaic
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const uint16_t *in = ((uint16_t *)ivoid) + ((size_t)roi_in->width * (j + csy) + csx);
      float *out = ((float *)ovoid) + (size_t)roi_out->width * j;

      int i = 0;

      // FIXME: figure alignment!  !!! replace with for !!!
      while((!dt_is_aligned(in, 16) || !dt_is_aligned(out, 16)) && (i < roi_out->width))
      {
        const int id = BL(roi_out, d, j, i);
        *out = (((float)(*in)) - d->sub[id]) / d->div[id];
        i++;
        in++;
        out++;
      }

      const __m128 sub = _mm_set_ps(d->sub[BL(roi_out, d, j, i + 3)], d->sub[BL(roi_out, d, j, i + 2)],
                                    d->sub[BL(roi_out, d, j, i + 1)], d->sub[BL(roi_out, d, j, i)]);

      const __m128 div = _mm_set_ps(d->div[BL(roi_out, d, j, i + 3)], d->div[BL(roi_out, d, j, i + 2)],
                                    d->div[BL(roi_out, d, j, i + 1)], d->div[BL(roi_out, d, j, i)]);

      // process aligned pixels with SSE
      for(; i < roi_out->width - (8 - 1); i += 8, in += 8)
      {
        const __m128i input = _mm_load_si128((__m128i *)in);

        __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0));
        __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0));

        __m128 flo = _mm_cvtepi32_ps(ilo);
        __m128 fhi = _mm_cvtepi32_ps(ihi);

        flo = _mm_div_ps(_mm_sub_ps(flo, sub), div);
        fhi = _mm_div_ps(_mm_sub_ps(fhi, sub), div);

        _mm_stream_ps(out, flo);
        out += 4;
        _mm_stream_ps(out, fhi);
        out += 4;
      }

      // process the rest
      for(; i < roi_out->width; i++, in++, out++)
      {
        const int id = BL(roi_out, d, j, i);
        *out = MAX(0.0f, ((float)(*in)) - d->sub[id]) / d->div[id];
      }
    }

    piece->pipe->filters = dt_rawspeed_crop_dcraw_filters(piece->pipe->filters, csx, csy);
    adjust_xtrans_filters(piece->pipe->xtrans, csx, csy);
  }
  else
  { // pre-downsampled buffer that needs black/white scaling

    const __m128 sub = _mm_load_ps(d->sub), div = _mm_load_ps(d->div);

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const float *in = ((float *)ivoid) + (size_t)4 * (roi_in->width * (j + csy) + csx);
      float *out = ((float *)ovoid) + (size_t)4 * roi_out->width * j;

      // process aligned pixels with SSE
      for(int i = 0; i < roi_out->width; i++, in += 4, out += 4)
      {
        const __m128 input = _mm_load_ps(in);

        const __m128 scaled = _mm_div_ps(_mm_sub_ps(input, sub), div);

        _mm_stream_ps(out, scaled);
      }
    }
  }
  _mm_sfence();
}
Example #18
0
void decodeInstruction(instruction_t instruction, uint32_t *dir_reg, char *dir_flags, uint8_t *SRAM, uint16_t *dec)
{
	uint8_t *R_activos=instruction.registers_list;
	/* Comparacion de mnemonic y Llamado de las funciones */
	if( strcmp(instruction.mnemonic,"ADC") == 0 || strcmp(instruction.mnemonic,"ADCS") == 0){
		dir_reg[PC]++;
		*dec=16704;
		*dec=*dec|instruction.op3_value<<3|instruction.op1_value;
		dir_reg[instruction.op1_value]=ADC(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
	}

	if( strcmp(instruction.mnemonic,"ADDS") == 0 || strcmp(instruction.mnemonic,"ADD") == 0){
		dir_reg[PC]++;
		if(instruction.op2_type=='S'){
			*dec=45056;
			dir_reg[SP]=ADD(dir_reg[SP],instruction.op3_value,dir_flags);
			*dec=*dec|instruction.op3_value;}
		else if(instruction.op3_type=='#'){
			*dec=7168;
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;
			dir_reg[instruction.op1_value]=ADD(dir_reg[instruction.op2_value], instruction.op3_value,dir_flags);
			mvprintw(4,20,"%X",*dec);}
		else{
			*dec=6144;
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;
			dir_reg[instruction.op1_value]=ADD(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);}
	}
	
	if( strcmp(instruction.mnemonic,"AND") == 0 || strcmp(instruction.mnemonic,"ANDS") == 0){
		dir_reg[PC]++;
		*dec=16384;
		if(instruction.op3_type=='#'){
			dir_reg[instruction.op1_value]=AND(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);}
		else
			dir_reg[instruction.op1_value]=AND(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"ASR") == 0 || strcmp(instruction.mnemonic,"ASRS") == 0){
		dir_reg[PC]++;
		if(instruction.op3_type=='#'){
			*dec=4096;
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;
			dir_reg[instruction.op1_value]=ASR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);}
		else{
			*dec=16640;
			*dec=*dec|instruction.op3_value<<3|instruction.op1_value;
			dir_reg[instruction.op1_value]=ASR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);}
	}
	
	if( strcmp(instruction.mnemonic,"BICS") == 0 || strcmp(instruction.mnemonic,"BICS") == 0){
		dir_reg[PC]++;
		if(instruction.op3_type=='#')
			dir_reg[instruction.op1_value]=BIC(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);
		else{
			*dec=17280;
			dir_reg[instruction.op1_value]=BIC(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
			*dec=*dec|instruction.op3_value<<3|instruction.op1_value;}
	}
	
	if( strcmp(instruction.mnemonic,"CMN" ) == 0 || strcmp(instruction.mnemonic,"CMNS") == 0){
		dir_reg[PC]++;
		CMN(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value],dir_flags);
		*dec=17088;
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
		mvprintw(4,20,"%X",*dec);
	}
	
	if( strcmp(instruction.mnemonic,"CMP") == 0 || strcmp(instruction.mnemonic,"CMPS") == 0){
		dir_reg[PC]++;
		CMP(dir_reg[instruction.op1_value],dir_reg[instruction.op2_value],dir_flags);
		*dec=17024;
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
		mvprintw(4,20,"%X",*dec);
	}
	
	if( strcmp(instruction.mnemonic,"EOR") == 0 || strcmp(instruction.mnemonic,"EORS") == 0){
		dir_reg[PC]++;
		*dec=16448;
		if(instruction.op3_type=='#')
			dir_reg[instruction.op1_value]=EOR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);
		else
			dir_reg[instruction.op1_value]=EOR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"LSLS") == 0 || strcmp(instruction.mnemonic,"LSL") == 0){
		dir_reg[PC]++;
		if(instruction.op3_type=='#'){
			*dec=0;
			dir_reg[instruction.op1_value]=LSL(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
		else{
			*dec=16512;
			dir_reg[instruction.op1_value]=LSL(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
			*dec=*dec|instruction.op3_value<<3|instruction.op1_value;}
	}
	
	if( strcmp(instruction.mnemonic,"LSRS") == 0 || strcmp(instruction.mnemonic,"LSR") == 0){
		dir_reg[PC]++;
		if(instruction.op3_type=='#'){
			*dec=2048;
			dir_reg[instruction.op1_value]=LSR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
		else{
			*dec=16576;
			dir_reg[instruction.op1_value]=LSR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
			*dec=*dec|instruction.op3_value<<3|instruction.op1_value;}
	}
	
	if( strcmp(instruction.mnemonic,"MOV") == 0 || strcmp(instruction.mnemonic,"MOVS") == 0){
		dir_reg[PC]++;
		if(instruction.op2_type=='#'){
			*dec=8192;
			dir_reg[instruction.op1_value]=MOV(instruction.op2_value,dir_flags);
			*dec=*dec|instruction.op1_value<<8|instruction.op2_value;}
		else{
			*dec=0;
			dir_reg[instruction.op1_value]=MOV(dir_reg[instruction.op2_value],dir_flags);
			*dec=*dec|instruction.op2_value<<3|instruction.op1_value;}
	}
	
	if( strcmp(instruction.mnemonic,"MUL") == 0 || strcmp(instruction.mnemonic,"MULS") == 0){
		dir_reg[PC]++;
		*dec=17216;
		if(instruction.op3_type=='#'){
			dir_reg[instruction.op1_value]=MUL(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);}
		else{
			dir_reg[instruction.op1_value]=MUL(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
			*dec=*dec|instruction.op2_value<<3|instruction.op1_value;}
	}
	
	if( strcmp(instruction.mnemonic,"MVN") == 0 || strcmp(instruction.mnemonic,"MVNS") == 0){
		dir_reg[PC]++;
		*dec=17344;
		dir_reg[instruction.op1_value]=MVN(dir_reg[instruction.op2_value], dir_flags);
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
	}
	
	if( strcmp(instruction.mnemonic,"ORR") == 0 || strcmp(instruction.mnemonic,"ORRS") == 0){
		dir_reg[PC]++;
		*dec=17152;
		if(instruction.op3_type=='#'){
			dir_reg[instruction.op1_value]=ORR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);}
		else{
			dir_reg[instruction.op1_value]=ORR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
			*dec=*dec|instruction.op3_value<<3|instruction.op1_value;}
	}
	
	if( strcmp(instruction.mnemonic,"REV") == 0 || strcmp(instruction.mnemonic,"REVS") == 0){
		dir_reg[PC]++;
		*dec=47616;
		dir_reg[instruction.op1_value]=REV(dir_reg[instruction.op2_value]);
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
	}
	
	if( strcmp(instruction.mnemonic,"REVG") == 0 || strcmp(instruction.mnemonic,"REVGS") == 0){
		dir_reg[PC]++;
		*dec=47680;
		dir_reg[instruction.op1_value]=REVG(dir_reg[instruction.op2_value]);
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
	}
	
	if( strcmp(instruction.mnemonic,"REVSH") == 0 || strcmp(instruction.mnemonic,"REVSHS") == 0){
		dir_reg[PC]++;
		*dec=47808;
		dir_reg[instruction.op1_value]=REVSH(dir_reg[instruction.op2_value]);
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
	}
	
	if( strcmp(instruction.mnemonic,"ROR") == 0 || strcmp(instruction.mnemonic,"RORS") == 0){
		dir_reg[PC]++;
		*dec=16832;
		if(instruction.op3_type=='#'){
			dir_reg[instruction.op1_value]=ROR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);}
		else{
			dir_reg[instruction.op1_value]=ROR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
			*dec=*dec|instruction.op3_value<<3|instruction.op1_value;}
	}
	
	if( strcmp(instruction.mnemonic,"RSB") == 0 || strcmp(instruction.mnemonic,"RSBS") == 0){
		dir_reg[PC]++;
		*dec=16690;
		dir_reg[instruction.op1_value]=RSB(dir_reg[instruction.op2_value], dir_flags);
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
	}
	
	if( strcmp(instruction.mnemonic,"SBC") == 0 || strcmp(instruction.mnemonic,"SBCS") == 0){
		dir_reg[PC]++;
		*dec=16768;
		SBC(dir_reg[instruction.op1_value],dir_reg[instruction.op2_value], dir_flags);
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
	}
	
	if( strcmp(instruction.mnemonic,"SUBS") == 0 || strcmp(instruction.mnemonic,"SUB") == 0){
		dir_reg[PC]++;
		if(instruction.op2_type=='S'){
			*dec=45184;
			dir_reg[SP]=SUB(dir_reg[SP],instruction.op3_value,dir_flags);
			*dec=*dec|instruction.op3_value;}
		else if(instruction.op3_type=='#'){
			*dec=7680;
			dir_reg[instruction.op1_value]=SUB(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
		else{
			*dec=6656;
			dir_reg[instruction.op1_value]=SUB(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
	}
	
	if( strcmp(instruction.mnemonic,"TST") == 0 || strcmp(instruction.mnemonic,"TSTS") == 0){
		dir_reg[PC]++;
		*dec=16896;
		TST(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], dir_flags);
		*dec=*dec|instruction.op2_value<<3|instruction.op1_value;
	}
	
	if( strcmp(instruction.mnemonic,"NOP") == 0 ){
		NOP(dir_reg);
		*dec=48896;
	}
	
	if( strcmp(instruction.mnemonic,"B") == 0 ){
		*dec=57344;
		*dec=*dec|instruction.op1_value;
		B(instruction.op1_value, dir_reg);
	}
	
	if( strcmp(instruction.mnemonic,"BL") == 0 ){
		*dec=0;
		BL(instruction.op1_value, dir_reg);
	}
	
	if( strcmp(instruction.mnemonic,"BX") == 0 ){
		*dec=18176;
		BX(dir_reg);
	}
	
	if( strcmp(instruction.mnemonic,"BEQ") == 0 ){
		*dec=0;
		BEQ(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BNE") == 0 ){
		*dec=0;
		BNE(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BCS") == 0 ){
		*dec=0;
		BCS(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BCC") == 0 ){
		*dec=0;
		BCC(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BMI") == 0 ){
		*dec=0;
		BMI(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BPL") == 0 ){
		*dec=0;
		BPL(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BVS") == 0 ){
		*dec=0;
		BVS(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BVC") == 0 ){
		*dec=0;
		BVC(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BHI") == 0 ){
		*dec=0;
		BHI(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BLS") == 0 ){
		*dec=0;
		BLS(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BGE") == 0 ){
		*dec=0;
		BGE(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BLT") == 0 ){
		*dec=0;
		BLT(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BGT") == 0 ){
		*dec=0;
		BGT(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BLE") == 0 ){
		*dec=0;
		BLE(instruction.op1_value, dir_reg, dir_flags);
	}
	
	if( strcmp(instruction.mnemonic,"BAL") == 0 ){
		*dec=0;
		BAL(instruction.op1_value, dir_reg);
	}
	
	if(strcmp(instruction.mnemonic,"PUSH")==0){
		dir_reg[PC]++;
		*dec=46080;
		PUSH(SRAM, dir_reg,R_activos);
	}
	
	if(strcmp(instruction.mnemonic,"POP")==0){
		dir_reg[PC]++;
		*dec=48128;
		POP(SRAM,dir_reg,R_activos);
	}
	
	data=(uint8_t)dir_reg[instruction.op1_value];
	if(strcmp(instruction.mnemonic,"LDR")==0){
		dir_reg[PC]++;
		if(instruction.op2_type=='=' && instruction.op3_type=='N'){
			*dec=0;
			dir_reg[instruction.op1_value]=instruction.op2_value;}
		else if(instruction.op2_type=='S'){
			*dec=38912;
			dir_reg[instruction.op1_value]=LDR(dir_reg[SP], instruction.op3_value<<2, SRAM);
			*dec=*dec|instruction.op3_value|instruction.op1_value<<8;}
		else if(instruction.op3_type=='#' || instruction.op3_type=='N'){
			*dec=26624;
			if((dir_reg[instruction.op2_value]+(instruction.op3_value<<2))>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(instruction.op3_value<<2)), &data,Read);
			else
				dir_reg[instruction.op1_value]=LDR(dir_reg[instruction.op2_value], instruction.op3_value<<2, SRAM);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value|instruction.op1_value;}
		else{
			*dec=22528;
			if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read);
			else
				dir_reg[instruction.op1_value]=LDR(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
	}
	
	if(strcmp(instruction.mnemonic,"LDRB")==0){
		dir_reg[PC]++;
		if(instruction.op3_type=='#' || instruction.op3_type=='N'){
			*dec=30720;
			if((dir_reg[instruction.op2_value]+instruction.op3_value)>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+instruction.op3_value), &data,Read);	
			else
				dir_reg[instruction.op1_value]=LDRB(dir_reg[instruction.op2_value], instruction.op3_value, SRAM);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
		else{
			*dec=23552;
			if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read);
			else 
				dir_reg[instruction.op1_value]=LDRB(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
	}
	
	if(strcmp(instruction.mnemonic,"LDRH")==0){
		dir_reg[PC]++;
		if(instruction.op3_type=='#' || instruction.op3_type=='N'){
			*dec=34816;
			if((dir_reg[instruction.op2_value]+(instruction.op3_value<<1))>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(instruction.op3_value<<1)), &data,Read);
			else
				dir_reg[instruction.op1_value]=LDRH(dir_reg[instruction.op2_value], instruction.op3_value<<1, SRAM);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
		else{
			*dec=23040;
			if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read);
			else
				dir_reg[instruction.op1_value]=LDRH(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
	}
	
	if(strcmp(instruction.mnemonic,"LDRSB")==0){
		dir_reg[PC]++;
		*dec=22016;
		*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;
		if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000)
			IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read);
		else
			dir_reg[instruction.op1_value]=LDRSB(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM);
	}
	
	if(strcmp(instruction.mnemonic,"LDRSH")==0){
		dir_reg[PC]++;
		*dec=24064;
		*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;
		if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000)
			IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read);
		else
			dir_reg[instruction.op1_value]=LDRSH(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM);
	}
	
	if(strcmp(instruction.mnemonic,"STR")==0){
		dir_reg[PC]++;
		if(instruction.op2_type=='S'){
			*dec=38912;
			STR(dir_reg[instruction.op1_value],dir_reg[SP], instruction.op3_value<<2, SRAM);
			*dec=*dec|instruction.op3_value|instruction.op1_value<<8;}
		else if(instruction.op3_type=='#' || instruction.op3_type=='N'){
			*dec=24576;
			if((dir_reg[instruction.op2_value]+(instruction.op3_value<<2))>=0x40000000){
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(instruction.op3_value<<2)), &data,Write);}
			else{
				STR(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], instruction.op3_value<<2, SRAM);}
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;
			mvprintw(1,3,"Hola");}
		else{
			*dec=20480;
			if((dir_reg[instruction.op2_value]+dir_reg[instruction.op2_value])>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(dir_reg[instruction.op3_value])), &data,Write);
			else{
				STR(dir_reg[instruction.op1_value], instruction.op2_value, instruction.op3_value, SRAM);}
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
	}
	
	if(strcmp(instruction.mnemonic,"STRB")==0){
		dir_reg[PC]++;
		if(instruction.op3_type=='#' || instruction.op3_type=='N'){
			*dec=28672;
			if(dir_reg[instruction.op2_value]+instruction.op3_value>=0x40000000){
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+instruction.op3_value), &data,Write);}
			else{			
				STRB(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], instruction.op3_value, SRAM);}
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
		else{
			*dec=21504;
			if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000){
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(dir_reg[instruction.op3_value])), &data,Write);}
			else{
				STRB(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM);}
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
	}
	
	if(strcmp(instruction.mnemonic,"STRH")==0){
		dir_reg[PC]++;
		if(instruction.op3_type=='#' || instruction.op3_type=='N'){
			*dec=32768;
			if(((dir_reg[instruction.op2_value])+(instruction.op3_value<<1))>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(instruction.op3_value<<1)),&data,Write);
			else
				STRH(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], instruction.op3_value<<1, SRAM);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
		else{
			*dec=20992;
			if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000)
				IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(dir_reg[instruction.op3_value])), &data,Write);
			else
				STRH(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM);
			*dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;}
	}
}
Example #19
0
	GString GetSVGText(int diffIndex, double intervalduration = 1, double peakMargin = 1.2)
	{
		std::stringstream out;
		vector<int> dataPoints = GetDataPoints(diffIndex, intervalduration);
		auto peak_it = std::max_element(dataPoints.begin(), dataPoints.end());
		float peakf = *peak_it;
		double peak = *peak_it * peakMargin;

		out << "<svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\">\n";

		auto ptIdx = 0;
		float ImageHeight = CfgValNPS("GraphHeight", 300);
		float GraphYOffset = CfgValNPS("GraphYOffs", 50);
		float GraphXOffset = CfgValNPS("GraphXOffs", 100);
		
		float IntervalWidth = 10;
		float GraphWidth = dataPoints.size() * IntervalWidth;

		float RealGraphWidth = CfgValNPS("Width", 1000);
		float XRatio = RealGraphWidth / GraphWidth;

		Vec2 BL(GraphXOffset, GraphYOffset + ImageHeight);
		Vec2 BR(RealGraphWidth, 0);
		Vec2 TL(GraphXOffset, GraphYOffset);
		BR += BL;

		GString DiffAuth = Song->GetDifficulty(diffIndex)->Author;

		if (!DiffAuth.length())
			DiffAuth = "an anonymous charter";

		float avgNPS = Song->GetDifficulty(diffIndex)->TotalScoringObjects / Song->GetDifficulty(diffIndex)->Duration;

		out << Utility::Format("<text x=\"%d\" y=\"%d\" fill=\"black\">%s - %s (%s) by %s (Max NPS: %.2f/Avg NPS: %.2f)</text>\n",
			20, 20,
			Song->SongName, Song->SongAuthor, Song->GetDifficulty(diffIndex)->Name, DiffAuth,
			peakf / intervalduration,
			avgNPS);

		out << Utility::Format("\t<line x1 = \"%d\" y1 = \"%d\" x2 = \"%d\" y2 = \"%d\" style = \"stroke:rgb(0,0,0);stroke-width:4\"/>\n",
			BL.x, BL.y, BR.x, BR.y);

		out << Utility::Format("\t<line x1 = \"%d\" y1 = \"%d\" x2 = \"%d\" y2 = \"%d\" style = \"stroke:rgb(0,0,0);stroke-width:4\"/>\n",
			TL.x, TL.y, BL.x, BL.y);

		auto ptAmt = 5;
		for (auto i = 1; i <= ptAmt; i++)
		{
			float X = (BL.x - GraphXOffset / 2);
			float Y = (BL.y - i * (ImageHeight / ptAmt / peakMargin));
			float Value = (peakf * i / ptAmt / intervalduration);
			out << Utility::Format("\t<text x=\"%d\" y=\"%d\" fill=\"black\">%.2f</text>\n",
				X, Y, Value);

			out << Utility::Format("\t<line x1 = \"%d\" y1 = \"%d\" x2 = \"%d\" y2 = \"%d\" style = \"stroke:rgb(0,0,0);stroke-width:0.5\"/>\n",
				X, Y, GraphXOffset + RealGraphWidth, Y);
		}

		for (auto point : dataPoints)
		{
			double relativeFreq = point / peak;
			double relFreqNext;
			int x1, y1, x2, y2;

			if (ptIdx + 1 < dataPoints.size())
			{
				relFreqNext = dataPoints[ptIdx + 1] / peak;
			}
			else
				relFreqNext = 0;

			x1 = IntervalWidth * ptIdx * XRatio + GraphXOffset;
			y1 = ImageHeight - ImageHeight * relativeFreq + GraphYOffset;

			x2 = IntervalWidth * (ptIdx + 1) * XRatio + GraphXOffset;
			y2 = ImageHeight - ImageHeight * relFreqNext  + GraphYOffset;

			out << Utility::Format("\t<line x1 = \"%d\" y1 = \"%d\" x2 = \"%d\" y2 = \"%d\" style = \"stroke:rgb(255,0,0);stroke-width:2\"/>\n",
				x1, y1, x2, y2);


			ptIdx++;
		}

		out << "</svg>";
		return out.str();
	}
Example #20
0
void PHILinkItem::html( const PHIRequest *req, QByteArray &out, QByteArray &script, const QByteArray &indent ) const
{
    PHILabelItem::html( req, out, script, indent );
    QByteArray arr=data( DUrl ).toByteArray();
    script+=BL( "$$link('" )+id()+BL( "'," );
    if ( arr.isEmpty() ) {
        script+=BL( "undefined" );
    } else {
        script+=BL( "function(){" );
        if ( arr.startsWith( "javascript:" ) ) {
            arr.remove( 0, 11 );
            script+=arr;
        } else {
            arr.replace( '\'', BL( "\\'" ) );
            script+=BL( "phi.href('" )+arr+BL( "');" );
        }
        script+='}';
    }
    script+=BL( ",'" )+cssColor( realColor() )+BL( "','" );
    if ( realBackgroundColor()!=QColor( Qt::transparent ) ) script+=cssColor( realBackgroundColor() );
    script+=BL( "','" )+cssColor( realHoverColor() )+BL( "','" );
    if ( realHoverBgColor()!=QColor( Qt::transparent ) ) script+=cssColor( realHoverBgColor() );
    script+=BL( "')" );
    if ( arr.isEmpty() ) script+=BL( ";\n" );
    else script+=BL( ".cursor('pointer');\n" );
}
Example #21
0
transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
{
    uint32_t offsets[8] = {0, 4*N, 2*N, 6*N, N, 5*N, 7*N, 3*N};
    uint32_t offsets_o[8] = {0, 4*N, 2*N, 6*N, 7*N, 3*N, N, 5*N};

    int32_t pAddr = 0;
    int32_t pN = 0;
    int32_t pLUT = 0;

    insns_t  *fp;
    insns_t  *start;
    insns_t  *x_4_addr;
    insns_t  *x_8_addr;
    uint32_t  loop_count;

    int       count;
    ptrdiff_t len;

    size_t   *ps;
    size_t   *pps;

    count = ffts_tree_count(N, leaf_N, 0) + 1;

    ps = pps = malloc(2 * count * sizeof(*ps));
    if (!ps) {
        return NULL;
    }

    ffts_elaborate_tree(&pps, N, leaf_N, 0);

    pps[0] = 0;
    pps[1] = 0;

    pps = ps;

#ifdef HAVE_SSE
    if (sign < 0) {
        p->constants = (const void*) sse_constants;
    } else {
        p->constants = (const void*) sse_constants_inv;
    }
#endif

    fp = (insns_t*) p->transform_base;

    /* generate base cases */
    x_4_addr = generate_size4_base_case(&fp, sign);
    x_8_addr = generate_size8_base_case(&fp, sign);

#ifdef __arm__
    start = generate_prologue(&fp, p);

#ifdef HAVE_NEON
    memcpy(fp, neon_ee, neon_oo - neon_ee);
    if (sign < 0) {
        fp[33] ^= 0x00200000;
        fp[37] ^= 0x00200000;
        fp[38] ^= 0x00200000;
        fp[39] ^= 0x00200000;
        fp[40] ^= 0x00200000;
        fp[41] ^= 0x00200000;
        fp[44] ^= 0x00200000;
        fp[45] ^= 0x00200000;
        fp[46] ^= 0x00200000;
        fp[47] ^= 0x00200000;
        fp[48] ^= 0x00200000;
        fp[57] ^= 0x00200000;
    }

    fp += (neon_oo - neon_ee) / 4;
#else
    memcpy(fp, vfp_e, vfp_o - vfp_e);

    if (sign > 0) {
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[68] ^= 0x00000040;
        fp[75] ^= 0x00000040;
        fp[76] ^= 0x00000040;
        fp[79] ^= 0x00000040;
        fp[80] ^= 0x00000040;
        fp[83] ^= 0x00000040;
        fp[84] ^= 0x00000040;
        fp[87] ^= 0x00000040;
        fp[91] ^= 0x00000040;
        fp[93] ^= 0x00000040;
    }
    fp += (vfp_o - vfp_e) / 4;
#endif
#else
    /* generate functions */
    start = generate_prologue(&fp, p);

    loop_count = 4 * p->i0;
    generate_leaf_init(&fp, loop_count);

    if (ffts_ctzl(N) & 1) {
        generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0);

        if (p->i1) {
            loop_count += 4 * p->i1;
            generate_leaf_oo(&fp, loop_count, offsets_o, 7);
        }

        loop_count += 4;
        generate_leaf_oe(&fp, offsets_o);
    } else {
        generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8);

        loop_count += 4;
        generate_leaf_eo(&fp, offsets);

        if (p->i1) {
            loop_count += 4 * p->i1;
            generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7);
        }
    }

    if (p->i1) {
        uint32_t offsets_oe[8] = {7*N, 3*N, N, 5*N, 0, 4*N, 6*N, 2*N};

        loop_count += 4 * p->i1;

        /* align loop/jump destination */
#ifdef _M_X64
        x86_mov_reg_imm(fp, X86_EBX, loop_count);
#else
        x86_mov_reg_imm(fp, X86_ECX, loop_count);
        ffts_align_mem16(&fp, 9);
#endif

        generate_leaf_ee(&fp, offsets_oe, 0);
    }

    generate_transform_init(&fp);

    /* generate subtransform calls */
    count = 2;
    while (pps[0]) {
        size_t ws_is;

        if (!pN) {
#ifdef _M_X64
            x86_mov_reg_imm(fp, X86_EBX, pps[0]);
#else
            x86_mov_reg_imm(fp, X86_ECX, pps[0] / 4);
#endif
        } else {
            int offset = (4 * pps[1]) - pAddr;
            if (offset) {
#ifdef _M_X64
                x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#else
                x64_alu_reg_imm_size(fp, X86_ADD, X64_RDX, offset, 8);
#endif
            }

            if (pps[0] > leaf_N && pps[0] - pN) {
                int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN);

#ifdef _M_X64
                if (factor > 0) {
                    x86_shift_reg_imm(fp, X86_SHL, X86_EBX, factor);
                } else {
                    x86_shift_reg_imm(fp, X86_SHR, X86_EBX, -factor);
                }
#else
                if (factor > 0) {
                    x86_shift_reg_imm(fp, X86_SHL, X86_ECX, factor);
                } else {
                    x86_shift_reg_imm(fp, X86_SHR, X86_ECX, -factor);
                }
#endif
            }
        }

        ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1];
        if (ws_is != pLUT) {
            int offset = (int) (ws_is - pLUT);

#ifdef _M_X64
            x64_alu_reg_imm_size(fp, X86_ADD, X64_R9, offset, 8);
#else
            x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#endif
        }

        if (pps[0] == 2 * leaf_N) {
            x64_call_code(fp, x_4_addr);
        } else {
            x64_call_code(fp, x_8_addr);
        }

        pAddr = 4 * pps[1];
        if (pps[0] > leaf_N) {
            pN = pps[0];
        }

        pLUT = ws_is;//LUT_offset(pps[0], leafN);
        //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
        count += 4;
        pps += 2;
    }
#endif

#ifdef __arm__
#ifdef HAVE_NEON
    if (ffts_ctzl(N) & 1) {
        ADDI(&fp, 2, 7, 0);
        ADDI(&fp, 7, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 8, 0);
        ADDI(&fp, 8, 10, 0);
        ADDI(&fp, 10, 2, 0);

        if(p->i1) {
            MOVI(&fp, 11, p->i1);
            memcpy(fp, neon_oo, neon_eo - neon_oo);
            if(sign < 0) {
                fp[12] ^= 0x00200000;
                fp[13] ^= 0x00200000;
                fp[14] ^= 0x00200000;
                fp[15] ^= 0x00200000;
                fp[27] ^= 0x00200000;
                fp[29] ^= 0x00200000;
                fp[30] ^= 0x00200000;
                fp[31] ^= 0x00200000;
                fp[46] ^= 0x00200000;
                fp[47] ^= 0x00200000;
                fp[48] ^= 0x00200000;
                fp[57] ^= 0x00200000;
            }
            fp += (neon_eo - neon_oo) / 4;
        }

        *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p));
        fp++;

        memcpy(fp, neon_oe, neon_end - neon_oe);
        if(sign < 0) {
            fp[19] ^= 0x00200000;
            fp[20] ^= 0x00200000;
            fp[22] ^= 0x00200000;
            fp[23] ^= 0x00200000;
            fp[37] ^= 0x00200000;
            fp[38] ^= 0x00200000;
            fp[40] ^= 0x00200000;
            fp[41] ^= 0x00200000;
            fp[64] ^= 0x00200000;
            fp[65] ^= 0x00200000;
            fp[66] ^= 0x00200000;
            fp[67] ^= 0x00200000;
        }
        fp += (neon_end - neon_oe) / 4;

    } else {

        *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p));
        fp++;

        memcpy(fp, neon_eo, neon_oe - neon_eo);
        if(sign < 0) {
            fp[10] ^= 0x00200000;
            fp[11] ^= 0x00200000;
            fp[13] ^= 0x00200000;
            fp[14] ^= 0x00200000;
            fp[31] ^= 0x00200000;
            fp[33] ^= 0x00200000;
            fp[34] ^= 0x00200000;
            fp[35] ^= 0x00200000;
            fp[59] ^= 0x00200000;
            fp[60] ^= 0x00200000;
            fp[61] ^= 0x00200000;
            fp[62] ^= 0x00200000;
        }
        fp += (neon_oe - neon_eo) / 4;

        ADDI(&fp, 2, 7, 0);
        ADDI(&fp, 7, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 8, 0);
        ADDI(&fp, 8, 10, 0);
        ADDI(&fp, 10, 2, 0);

        if(p->i1) {
            MOVI(&fp, 11, p->i1);
            memcpy(fp, neon_oo, neon_eo - neon_oo);
            if(sign < 0) {
                fp[12] ^= 0x00200000;
                fp[13] ^= 0x00200000;
                fp[14] ^= 0x00200000;
                fp[15] ^= 0x00200000;
                fp[27] ^= 0x00200000;
                fp[29] ^= 0x00200000;
                fp[30] ^= 0x00200000;
                fp[31] ^= 0x00200000;
                fp[46] ^= 0x00200000;
                fp[47] ^= 0x00200000;
                fp[48] ^= 0x00200000;
                fp[57] ^= 0x00200000;
            }
            fp += (neon_eo - neon_oo) / 4;
        }
    }

    if(p->i1) {
        ADDI(&fp, 2, 3, 0);
        ADDI(&fp, 3, 7, 0);
        ADDI(&fp, 7, 2, 0);

        ADDI(&fp, 2, 4, 0);
        ADDI(&fp, 4, 8, 0);
        ADDI(&fp, 8, 2, 0);

        ADDI(&fp, 2, 5, 0);
        ADDI(&fp, 5, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 6, 0);
        ADDI(&fp, 6, 10, 0);
        ADDI(&fp, 10, 2, 0);

        ADDI(&fp, 2, 9, 0);
        ADDI(&fp, 9, 10, 0);
        ADDI(&fp, 10, 2, 0);

        *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
        fp++;
        MOVI(&fp, 11, p->i1);
        memcpy(fp, neon_ee, neon_oo - neon_ee);
        if(sign < 0) {
            fp[33] ^= 0x00200000;
            fp[37] ^= 0x00200000;
            fp[38] ^= 0x00200000;
            fp[39] ^= 0x00200000;
            fp[40] ^= 0x00200000;
            fp[41] ^= 0x00200000;
            fp[44] ^= 0x00200000;
            fp[45] ^= 0x00200000;
            fp[46] ^= 0x00200000;
            fp[47] ^= 0x00200000;
            fp[48] ^= 0x00200000;
            fp[57] ^= 0x00200000;
        }
        fp += (neon_oo - neon_ee) / 4;
    }
#else
    ADDI(&fp, 2, 7, 0);
    ADDI(&fp, 7, 9, 0);
    ADDI(&fp, 9, 2, 0);

    ADDI(&fp, 2, 8, 0);
    ADDI(&fp, 8, 10, 0);
    ADDI(&fp, 10, 2, 0);

    MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
    memcpy(fp, vfp_o, vfp_x4 - vfp_o);
    if(sign > 0) {
        fp[22] ^= 0x00000040;
        fp[24] ^= 0x00000040;
        fp[25] ^= 0x00000040;
        fp[26] ^= 0x00000040;
        fp[62] ^= 0x00000040;
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[66] ^= 0x00000040;
    }
    fp += (vfp_x4 - vfp_o) / 4;

    ADDI(&fp, 2, 3, 0);
    ADDI(&fp, 3, 7, 0);
    ADDI(&fp, 7, 2, 0);

    ADDI(&fp, 2, 4, 0);
    ADDI(&fp, 4, 8, 0);
    ADDI(&fp, 8, 2, 0);

    ADDI(&fp, 2, 5, 0);
    ADDI(&fp, 5, 9, 0);
    ADDI(&fp, 9, 2, 0);

    ADDI(&fp, 2, 6, 0);
    ADDI(&fp, 6, 10, 0);
    ADDI(&fp, 10, 2, 0);

    ADDI(&fp, 2, 9, 0);
    ADDI(&fp, 9, 10, 0);
    ADDI(&fp, 10, 2, 0);

    *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
    fp++;
    MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
    memcpy(fp, vfp_e, vfp_o - vfp_e);
    if(sign > 0) {
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[68] ^= 0x00000040;
        fp[75] ^= 0x00000040;
        fp[76] ^= 0x00000040;
        fp[79] ^= 0x00000040;
        fp[80] ^= 0x00000040;
        fp[83] ^= 0x00000040;
        fp[84] ^= 0x00000040;
        fp[87] ^= 0x00000040;
        fp[91] ^= 0x00000040;
        fp[93] ^= 0x00000040;
    }
    fp += (vfp_o - vfp_e) / 4;

#endif
    *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p));
    fp++; // load offsets into r12
    //ADDI(&fp, 2, 1, 0);
    MOVI(&fp, 1, 0);

    // args: r0 - out
    //       r1 - N
    //       r2 - ws
    //	ADDI(&fp, 3, 1, 0); // put N into r3 for counter

    count = 2;
    while(pps[0]) {

        //	fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
        if(!pN) {
            MOVI(&fp, 1, pps[0]);
        } else {
            if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
            if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
        }

        if (p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT) {
            ADDI(&fp, 2, 2, p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
        }

        if(pps[0] == 2 * leaf_N) {
            *fp = BL(fp+2, x_4_addr);
            fp++;
        } else if(!pps[2]) {
            //uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
            memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
            if(sign < 0) {
                fp[31] ^= 0x00200000;
                fp[32] ^= 0x00200000;
                fp[33] ^= 0x00200000;
                fp[34] ^= 0x00200000;
                fp[65] ^= 0x00200000;
                fp[66] ^= 0x00200000;
                fp[70] ^= 0x00200000;
                fp[74] ^= 0x00200000;
                fp[97] ^= 0x00200000;
                fp[98] ^= 0x00200000;
                fp[102] ^= 0x00200000;
                fp[104] ^= 0x00200000;
            }
            fp += (neon_ee - neon_x8_t) / 4;
            //*fp++ = BL(fp+2, x_8_t_addr);

#else
            *fp = BL(fp+2, x_8_addr);
            fp++;
#endif
        } else {
            *fp = BL(fp+2, x_8_addr);
            fp++;
        }

        pAddr = pps[1] * 4;
        pN = pps[0];
        pLUT = p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN);
        //	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
        count += 4;
        pps += 2;
    }

    *fp++ = 0xecbd8b10;
    *fp++ = POP_LR();
    count++;
#else
    generate_epilogue(&fp);
#endif

    //	*fp++ = B(14); count++;

    //for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
    //	fprintf(stderr, "%08x\n", x_4_addr[i]);
    //fprintf(stderr, "\n");
    //for(int i=0;i<count;i++)

    //fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp));

    free(ps);

#if defined(_MSC_VER)
#pragma warning(push)

    /* disable type cast warning from data pointer to function pointer */
#pragma warning(disable : 4055)
#endif

    return (transform_func_t) start;

#if defined(_MSC_VER)
#pragma warning(pop)
#endif
}