inline void internal::Trr2kNTTT ( UpperOrLower uplo, Orientation orientationOfB, Orientation orientationOfC, Orientation orientationOfD, T alpha, const DistMatrix<T,MC,MR>& A, const DistMatrix<T,MC,MR>& B, const DistMatrix<T,MC,MR>& C, const DistMatrix<T,MC,MR>& D, T beta, DistMatrix<T,MC,MR>& E ) { #ifndef RELEASE PushCallStack("internal::Trr2kNTTT"); if( E.Height() != E.Width() || A.Width() != C.Height() || A.Height() != E.Height() || C.Width() != E.Height() || B.Height() != E.Width() || D.Height() != E.Width() || A.Width() != B.Width() || C.Height() != D.Width() ) throw std::logic_error("Nonconformal Trr2kNTTT"); #endif const Grid& g = E.Grid(); DistMatrix<T,MC,MR> AL(g), AR(g), A0(g), A1(g), A2(g); DistMatrix<T,MC,MR> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T,MC,MR> CT(g), C0(g), CB(g), C1(g), C2(g); DistMatrix<T,MC,MR> DL(g), DR(g), D0(g), D1(g), D2(g); DistMatrix<T,MC, STAR> A1_MC_STAR(g); DistMatrix<T,VR, STAR> B1_VR_STAR(g); DistMatrix<T,STAR,MR > B1AdjOrTrans_STAR_MR(g); DistMatrix<T,STAR,MC > C1_STAR_MC(g); DistMatrix<T,VR, STAR> D1_VR_STAR(g); DistMatrix<T,STAR,MR > D1AdjOrTrans_STAR_MR(g); LockedPartitionRight( A, AL, AR, 0 ); LockedPartitionRight( B, BL, BR, 0 ); LockedPartitionDown ( C, CT, CB, 0 ); LockedPartitionRight( D, DL, DR, 0 ); while( AL.Width() < A.Width() ) { LockedRepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2 ); LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); LockedRepartitionDown ( CT, C0, /**/ /**/ C1, CB, C2 ); LockedRepartitionRight ( DL, /**/ DR, D0, /**/ D1, D2 ); A1_MC_STAR.AlignWith( E ); B1_VR_STAR.AlignWith( E ); B1AdjOrTrans_STAR_MR.AlignWith( E ); C1_STAR_MC.AlignWith( E ); D1_VR_STAR.AlignWith( E ); D1AdjOrTrans_STAR_MR.AlignWith( E ); //--------------------------------------------------------------------// A1_MC_STAR = A1; C1_STAR_MC = C1; B1_VR_STAR = B1; D1_VR_STAR = D1; if( orientationOfB == ADJOINT ) B1AdjOrTrans_STAR_MR.AdjointFrom( B1_VR_STAR ); else B1AdjOrTrans_STAR_MR.TransposeFrom( B1_VR_STAR ); if( orientationOfD == ADJOINT ) D1AdjOrTrans_STAR_MR.AdjointFrom( D1_VR_STAR ); else D1AdjOrTrans_STAR_MR.TransposeFrom( D1_VR_STAR ); internal::LocalTrr2k ( uplo, orientationOfC, alpha, A1_MC_STAR, B1AdjOrTrans_STAR_MR, C1_STAR_MC, D1AdjOrTrans_STAR_MR, beta, E ); //--------------------------------------------------------------------// D1AdjOrTrans_STAR_MR.FreeAlignments(); D1_VR_STAR.FreeAlignments(); C1_STAR_MC.FreeAlignments(); B1AdjOrTrans_STAR_MR.FreeAlignments(); B1_VR_STAR.FreeAlignments(); A1_MC_STAR.FreeAlignments(); SlideLockedPartitionRight ( DL, /**/ DR, D0, D1, /**/ D2 ); SlideLockedPartitionDown ( CT, C0, C1, /**/ /**/ CB, C2 ); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlideLockedPartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void HemmRUC ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::HemmRUC"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error("{A,B,C} must be distributed on the same grid"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> ATL(g), ATR(g), A00(g), A01(g), A02(g), AColPan(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), ARowPan(g), A20(g), A21(g), A22(g); DistMatrix<T> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g), CLeft(g), CRight(g); // Temporary distributions DistMatrix<T,MC,STAR> B1_MC_STAR(g); DistMatrix<T,VR, STAR> AColPan_VR_STAR(g); DistMatrix<T,STAR,MR > AColPanAdj_STAR_MR(g); DistMatrix<T,MR, STAR> ARowPanAdj_MR_STAR(g); B1_MC_STAR.AlignWith( C ); // Start the algorithm Scale( beta, C ); LockedPartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( CR.Width() > 0 ) { LockedRepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); ARowPan.LockedView1x2( A11, A12 ); AColPan.LockedView2x1 ( A01, A11 ); CLeft.View1x2( C0, C1 ); CRight.View1x2( C1, C2 ); AColPan_VR_STAR.AlignWith( CLeft ); AColPanAdj_STAR_MR.AlignWith( CLeft ); ARowPanAdj_MR_STAR.AlignWith( CRight ); //--------------------------------------------------------------------// B1_MC_STAR = B1; AColPan_VR_STAR = AColPan; AColPanAdj_STAR_MR.AdjointFrom( AColPan_VR_STAR ); ARowPanAdj_MR_STAR.AdjointFrom( ARowPan ); MakeTrapezoidal( LEFT, LOWER, 0, ARowPanAdj_MR_STAR ); MakeTrapezoidal( RIGHT, LOWER, -1, AColPanAdj_STAR_MR ); LocalGemm ( NORMAL, ADJOINT, alpha, B1_MC_STAR, ARowPanAdj_MR_STAR, T(1), CRight ); LocalGemm ( NORMAL, NORMAL, alpha, B1_MC_STAR, AColPanAdj_STAR_MR, T(1), CLeft ); //--------------------------------------------------------------------// AColPan_VR_STAR.FreeAlignments(); AColPanAdj_STAR_MR.FreeAlignments(); ARowPanAdj_MR_STAR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
void JitArmILAsmRoutineManager::Generate() { enterCode = GetCodePtr(); PUSH(9, R4, R5, R6, R7, R8, R9, R10, R11, _LR); // Take care to 8-byte align stack for function calls. // We are misaligned here because of an odd number of args for PUSH. // It's not like x86 where you need to account for an extra 4 bytes // consumed by CALL. SUB(_SP, _SP, 4); MOVI2R(R0, (u32)&CoreTiming::downcount); MOVI2R(R9, (u32)&PowerPC::ppcState.spr[0]); FixupBranch skipToRealDispatcher = B(); dispatcher = GetCodePtr(); printf("ILDispatcher is %p\n", dispatcher); // Downcount Check // The result of slice decrementation should be in flags if somebody jumped here // IMPORTANT - We jump on negative, not carry!!! FixupBranch bail = B_CC(CC_MI); SetJumpTarget(skipToRealDispatcher); dispatcherNoCheck = GetCodePtr(); // This block of code gets the address of the compiled block of code // It runs though to the compiling portion if it isn't found LDR(R12, R9, PPCSTATE_OFF(pc));// Load the current PC into R12 Operand2 iCacheMask = Operand2(0xE, 2); // JIT_ICACHE_MASK BIC(R12, R12, iCacheMask); // R12 contains PC & JIT_ICACHE_MASK here. MOVI2R(R14, (u32)jit->GetBlockCache()->iCache); LDR(R12, R14, R12); // R12 contains iCache[PC & JIT_ICACHE_MASK] here // R12 Confirmed this is the correct iCache Location loaded. TST(R12, 0x80); // Test to see if it is a JIT block. SetCC(CC_EQ); // Success, it is our Jitblock. MOVI2R(R14, (u32)jit->GetBlockCache()->GetCodePointers()); // LDR R14 right here to get CodePointers()[0] pointer. LSL(R12, R12, 2); // Multiply by four because address locations are u32 in size LDR(R14, R14, R12); // Load the block address in to R14 B(R14); // No need to jump anywhere after here, the block will go back to dispatcher start SetCC(); // If we get to this point, that means that we don't have the block cached to execute // So call ArmJit to compile the block and then execute it. MOVI2R(R14, (u32)&Jit); BL(R14); B(dispatcherNoCheck); // fpException() // Floating Point Exception Check, Jumped to if false fpException = GetCodePtr(); LDR(R0, R9, PPCSTATE_OFF(Exceptions)); ORR(R0, R0, EXCEPTION_FPU_UNAVAILABLE); STR(R0, R9, PPCSTATE_OFF(Exceptions)); QuickCallFunction(R14, (void*)&PowerPC::CheckExceptions); LDR(R0, R9, PPCSTATE_OFF(npc)); STR(R0, R9, PPCSTATE_OFF(pc)); B(dispatcher); SetJumpTarget(bail); doTiming = GetCodePtr(); // XXX: In JIT64, Advance() gets called /after/ the exception checking // once it jumps back to the start of outerLoop QuickCallFunction(R14, (void*)&CoreTiming::Advance); // Does exception checking testExceptions = GetCodePtr(); LDR(R0, R9, PPCSTATE_OFF(pc)); STR(R0, R9, PPCSTATE_OFF(npc)); QuickCallFunction(R14, (void*)&PowerPC::CheckExceptions); LDR(R0, R9, PPCSTATE_OFF(npc)); STR(R0, R9, PPCSTATE_OFF(pc)); // Check the state pointer to see if we are exiting // Gets checked on every exception check MOVI2R(R0, (u32)PowerPC::GetStatePtr()); MVN(R1, 0); LDR(R0, R0); TST(R0, R1); FixupBranch Exit = B_CC(CC_NEQ); B(dispatcher); SetJumpTarget(Exit); ADD(_SP, _SP, 4); POP(9, R4, R5, R6, R7, R8, R9, R10, R11, _PC); // Returns GenerateCommon(); FlushIcache(); }
void JitArm::SafeLoadToReg(bool fastmem, u32 dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse) { ARMReg RD = gpr.R(dest); if (Core::g_CoreStartupParameter.bFastmem && fastmem) { // Preload for fastmem if (offsetReg != -1) gpr.R(offsetReg); if (addr != -1) MOV(R10, gpr.R(addr)); else MOV(R10, 0); UnsafeLoadToReg(RD, R10, accessSize, offsetReg, offset); return; } ARMReg rA = gpr.GetReg(); ARMReg rB = gpr.GetReg(); if (offsetReg == -1) { MOVI2R(rA, offset); if (addr != -1) ADD(rA, rA, gpr.R(addr)); } else { if (addr != -1) ADD(rA, gpr.R(addr), gpr.R(offsetReg)); else MOV(rA, gpr.R(offsetReg)); } switch (accessSize) { case 8: MOVI2R(rB, (u32)&Memory::Read_U8); break; case 16: MOVI2R(rB, (u32)&Memory::Read_U16); break; case 32: MOVI2R(rB, (u32)&Memory::Read_U32); break; } PUSH(4, R0, R1, R2, R3); MOV(R0, rA); BL(rB); MOV(rA, R0); POP(4, R0, R1, R2, R3); MOV(RD, rA); if (signExtend) // Only on 16 loads SXTH(RD, RD); if (reverse) { if (accessSize == 32) REV(RD, RD); else if (accessSize == 16) REV16(RD, RD); } gpr.Unlock(rA, rB); }
inline void SymmLLA ( T alpha, const DistMatrix<T,MC,MR>& A, const DistMatrix<T,MC,MR>& B, T beta, DistMatrix<T,MC,MR>& C ) { #ifndef RELEASE PushCallStack("internal::SymmLLA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); #endif const Grid& g = A.Grid(); DistMatrix<T,MC,MR> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T,MC,MR> CL(g), CR(g), C0(g), C1(g), C2(g); DistMatrix<T,MC,STAR> B1_MC_STAR(g); DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,MR> B1Trans_STAR_MR(g); DistMatrix<T,MC,MR > Z1(g); DistMatrix<T,MC,STAR> Z1_MC_STAR(g); DistMatrix<T,MR,STAR> Z1_MR_STAR(g); DistMatrix<T,MR,MC > Z1_MR_MC(g); B1_MC_STAR.AlignWith( A ); B1_VR_STAR.AlignWith( A ); B1Trans_STAR_MR.AlignWith( A ); Z1_MC_STAR.AlignWith( A ); Z1_MR_STAR.AlignWith( A ); Scale( beta, C ); LockedPartitionRight ( B, BL, BR, 0 ); PartitionRight ( C, CL, CR, 0 ); while( CL.Width() < C.Width() ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); Z1.AlignWith( C1 ); Zeros( C1.Height(), C1.Width(), Z1_MC_STAR ); Zeros( C1.Height(), C1.Width(), Z1_MR_STAR ); //--------------------------------------------------------------------// B1_MC_STAR = B1; B1_VR_STAR = B1_MC_STAR; B1Trans_STAR_MR.TransposeFrom( B1_VR_STAR ); LocalSymmetricAccumulateLL ( TRANSPOSE, alpha, A, B1_MC_STAR, B1Trans_STAR_MR, Z1_MC_STAR, Z1_MR_STAR ); Z1_MR_MC.SumScatterFrom( Z1_MR_STAR ); Z1 = Z1_MR_MC; Z1.SumScatterUpdate( T(1), Z1_MC_STAR ); Axpy( T(1), Z1, C1 ); //--------------------------------------------------------------------// Z1.FreeAlignments(); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
unsigned char GR(int i,int j) { BL(i,j); }
inline void internal::GemmTNA ( Orientation orientationOfA, T alpha, const DistMatrix<T,MC,MR>& A, const DistMatrix<T,MC,MR>& B, T beta, DistMatrix<T,MC,MR>& C ) { #ifndef RELEASE PushCallStack("internal::GemmTNA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( orientationOfA == NORMAL ) throw std::logic_error("GemmTNA assumes A is (Conjugate)Transposed"); if( A.Width() != C.Height() || B.Width() != C.Width() || A.Height() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal GemmTNA: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T,MC,MR> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T,MC,MR> CL(g), CR(g), C0(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,MC,STAR> B1_MC_STAR(g); DistMatrix<T,MR,STAR> D1_MR_STAR(g); DistMatrix<T,MR,MC > D1_MR_MC(g); DistMatrix<T,MC,MR > D1(g); // Start the algorithm Scal( beta, C ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); B1_MC_STAR.AlignWith( A ); D1_MR_STAR.AlignWith( A ); D1_MR_STAR.ResizeTo( C1.Height(), C1.Width() ); D1.AlignWith( C1 ); //--------------------------------------------------------------------// B1_MC_STAR = B1; // B1[MC,*] <- B1[MC,MR] // D1[MR,*] := alpha (A1[MC,MR])^T B1[MC,*] // = alpha (A1^T)[MR,MC] B1[MC,*] internal::LocalGemm ( orientationOfA, NORMAL, alpha, A, B1_MC_STAR, (T)0, D1_MR_STAR ); // C1[MC,MR] += scattered & transposed D1[MR,*] summed over grid cols D1_MR_MC.SumScatterFrom( D1_MR_STAR ); D1 = D1_MR_MC; Axpy( (T)1, D1, C1 ); //--------------------------------------------------------------------// B1_MC_STAR.FreeAlignments(); D1_MR_STAR.FreeAlignments(); D1.FreeAlignments(); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void internal::Syr2kLN ( T alpha, const DistMatrix<T,MC,MR>& A, const DistMatrix<T,MC,MR>& B, T beta, DistMatrix<T,MC,MR>& C ) { #ifndef RELEASE PushCallStack("internal::Syr2kLN"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Height() != C.Height() || A.Height() != C.Width() || B.Height() != C.Height() || B.Height() != C.Width() || A.Width() != B.Width() ) { std::ostringstream msg; msg << "Nonconformal Syr2kLN:\n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T,MC,MR> AL(g), AR(g), A0(g), A1(g), A2(g); DistMatrix<T,MC,MR> BL(g), BR(g), B0(g), B1(g), B2(g); // Temporary distributions DistMatrix<T,MC, STAR> A1_MC_STAR(g); DistMatrix<T,MC, STAR> B1_MC_STAR(g); DistMatrix<T,VR, STAR> A1_VR_STAR(g); DistMatrix<T,VR, STAR> B1_VR_STAR(g); DistMatrix<T,STAR,MR > A1Trans_STAR_MR(g); DistMatrix<T,STAR,MR > B1Trans_STAR_MR(g); // Start the algorithm ScaleTrapezoid( beta, LEFT, LOWER, 0, C ); LockedPartitionRight( A, AL, AR, 0 ); LockedPartitionRight( B, BL, BR, 0 ); while( AR.Width() > 0 ) { LockedRepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2 ); LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); A1_MC_STAR.AlignWith( C ); B1_MC_STAR.AlignWith( C ); A1_VR_STAR.AlignWith( C ); B1_VR_STAR.AlignWith( C ); A1Trans_STAR_MR.AlignWith( C ); B1Trans_STAR_MR.AlignWith( C ); //--------------------------------------------------------------------// A1_VR_STAR = A1_MC_STAR = A1; A1Trans_STAR_MR.TransposeFrom( A1_VR_STAR ); B1_VR_STAR = B1_MC_STAR = B1; B1Trans_STAR_MR.TransposeFrom( B1_VR_STAR ); internal::LocalTrr2k ( LOWER, alpha, A1_MC_STAR, B1Trans_STAR_MR, B1_MC_STAR, A1Trans_STAR_MR, (T)1, C ); //--------------------------------------------------------------------// A1_MC_STAR.FreeAlignments(); B1_MC_STAR.FreeAlignments(); A1_VR_STAR.FreeAlignments(); B1_VR_STAR.FreeAlignments(); A1Trans_STAR_MR.FreeAlignments(); B1Trans_STAR_MR.FreeAlignments(); SlideLockedPartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void SUMMA_NNDot ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE CallStackEntry entry("gemm::SUMMA_NNDot"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) LogicError("{A,B,C} must have the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal matrices: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; LogicError( msg.str() ); } #endif const Grid& g = A.Grid(); if( A.Height() > B.Width() ) { // Matrix views DistMatrix<T> AT(g), AB(g), A0(g), A1(g), A2(g); DistMatrix<T> BL(g), B0(g), BR(g), B1(g), B2(g); DistMatrix<T> CT(g), C0(g), C1L(g), C1R(g), CB(g), C1(g), C10(g), C11(g), C12(g), C2(g); // Temporary distributions DistMatrix<T,STAR,VC> A1_STAR_VC(g); DistMatrix<T,VC,STAR> B1_VC_STAR(g); DistMatrix<T,STAR,STAR> C11_STAR_STAR(g); // Star the algorithm Scale( beta, C ); LockedPartitionDown ( A, AT, AB, 0 ); PartitionDown ( C, CT, CB, 0 ); while( AB.Height() > 0 ) { LockedRepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); RepartitionDown ( CT, C0, /**/ /**/ C1, CB, C2 ); A1_STAR_VC = A1; B1_VC_STAR.AlignWith( A1_STAR_VC ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C1, C1L, C1R, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( C1L, /**/ C1R, C10, /**/ C11, C12 ); //------------------------------------------------------------// B1_VC_STAR = B1; LocalGemm ( NORMAL, NORMAL, alpha, A1_STAR_VC, B1_VC_STAR, C11_STAR_STAR ); C11.SumScatterUpdate( T(1), C11_STAR_STAR ); //------------------------------------------------------------// SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( C1L, /**/ C1R, C10, C11, /**/ C12 ); } SlideLockedPartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); SlidePartitionDown ( CT, C0, C1, /**/ /**/ CB, C2 ); } } else { // Matrix views DistMatrix<T> AT(g), AB(g), A0(g), A1(g), A2(g); DistMatrix<T> BL(g), B0(g), BR(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C1T(g), C01(g), C0(g), C1(g), C2(g), C1B(g), C11(g), C21(g); // Temporary distributions DistMatrix<T,STAR,VR> A1_STAR_VR(g); DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,STAR> C11_STAR_STAR(g); // Star the algorithm Scale( beta, C ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); B1_VR_STAR = B1; A1_STAR_VR.AlignWith( B1_VR_STAR ); LockedPartitionDown ( A, AT, AB, 0 ); PartitionDown ( C1, C1T, C1B, 0 ); while( AB.Height() > 0 ) { LockedRepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); RepartitionDown ( C1T, C01, /***/ /***/ C11, C1B, C21 ); //------------------------------------------------------------// A1_STAR_VR = A1; LocalGemm ( NORMAL, NORMAL, alpha, A1_STAR_VR, B1_VR_STAR, C11_STAR_STAR ); C11.SumScatterUpdate( T(1), C11_STAR_STAR ); //------------------------------------------------------------// SlideLockedPartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); SlidePartitionDown ( C1T, C01, C11, /***/ /***/ C1B, C21 ); } SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } } }
void MAST::NPSOLOptimizationInterface::optimize() { #if MAST_ENABLE_NPSOL == 1 // make sure that functions have been provided libmesh_assert(_funobj); libmesh_assert(_funcon); int N = _feval->n_vars(), NCLIN = 0, NCNLN = _feval->n_eq()+_feval->n_ineq(), NCTOTL = N+NCLIN+NCNLN, LDA = std::max(NCLIN, 1), LDJ = std::max(NCNLN, 1), LDR = N, INFORM = 0, // on exit: Reports result of call to NPSOL // < 0 either funobj or funcon has set this to -ve // 0 => converged to point x // 1 => x satisfies optimality conditions, but sequence of iterates has not converged // 2 => Linear constraints and bounds cannot be satisfied. No feasible solution // 3 => Nonlinear constraints and bounds cannot be satisfied. No feasible solution // 4 => Major iter limit was reached // 6 => x does not satisfy first-order optimality to required accuracy // 7 => function derivatives seem to be incorrect // 9 => input parameter invalid ITER = 0, // iter count LENIW = 3*N + NCLIN + 2*NCNLN, LENW = 2*N*N + N*NCLIN + 2*N*NCNLN + 20*N + 11*NCLIN + 21*NCNLN; Real F = 0.; // on exit: final objective std::vector<int> IW (LENIW, 0), ISTATE (NCTOTL, 0); // status of constraints l <= r(x) <= u, // -2 => lower bound is violated by more than delta // -1 => upper bound is violated by more than delta // 0 => both bounds are satisfied by more than delta // 1 => lower bound is active (to within delta) // 2 => upper bound is active (to within delta) // 3 => boundars are equal and equality constraint is satisfied std::vector<Real> A (LDA, 0.), // this is used for liear constraints, not currently handled BL (NCTOTL, 0.), BU (NCTOTL, 0.), C (NCNLN, 0.), // on exit: nonlinear constraints CJAC (LDJ* N, 0.), // // on exit: CJAC(i,j) is the partial derivative of ith nonlinear constraint CLAMBDA (NCTOTL, 0.), // on entry: need not be initialized for cold start // on exit: QP multiplier from the QP subproblem, >=0 if istate(j)=1, <0 if istate(j)=2 G (N, 0.), // on exit: objective gradient R (LDR*N, 0.), // on entry: need not be initialized if called with Cold Statrt // on exit: information about Hessian, if Hessian=Yes, R is upper Cholesky factor of approx H X (N, 0.), // on entry: initial point // on exit: final estimate of solution W (LENW, 0.), // workspace xmin (N, 0.), xmax (N, 0.); // now setup the lower and upper limits for the variables and constraints _feval->init_dvar(X, xmin, xmax); for (unsigned int i=0; i<N; i++) { BL[i] = xmin[i]; BU[i] = xmax[i]; } // all constraints are assumed to be g_i(x) <= 0, so that the upper // bound is 0 and lower bound is -infinity for (unsigned int i=0; i<NCNLN; i++) { BL[i+N] = -1.e20; BU[i+N] = 0.; } std::string nm; // nm = "List"; // npoptn_(nm.c_str(), (int)nm.length()); // nm = "Verify level 3"; // npoptn_(nm.c_str(), (int)nm.length()); npsol_(&N, &NCLIN, &NCNLN, &LDA, &LDJ, &LDR, &A[0], &BL[0], &BU[0], _funcon, _funobj, &INFORM, &ITER, &ISTATE[0], &C[0], &CJAC[0], &CLAMBDA[0], &F, &G[0], &R[0], &X[0], &IW[0], &LENIW, &W[0], &LENW); #endif // MAST_ENABLE_NPSOL 1 }
inline void SUMMA_NNA ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE CallStackEntry entry("gemm::SUMMA_NNA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) LogicError("{A,B,C} must have the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal matrices: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; LogicError( msg.str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,MR> B1Trans_STAR_MR(g); DistMatrix<T,MC,STAR> D1_MC_STAR(g); B1_VR_STAR.AlignWith( A ); B1Trans_STAR_MR.AlignWith( A ); D1_MC_STAR.AlignWith( A ); // Start the algorithm Scale( beta, C ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); //--------------------------------------------------------------------// B1_VR_STAR = B1; B1Trans_STAR_MR.TransposeFrom( B1_VR_STAR ); // D1[MC,*] := alpha A[MC,MR] B1[MR,*] LocalGemm( NORMAL, TRANSPOSE, alpha, A, B1Trans_STAR_MR, D1_MC_STAR ); // C1[MC,MR] += scattered result of D1[MC,*] summed over grid rows C1.SumScatterUpdate( T(1), D1_MC_STAR ); //--------------------------------------------------------------------// SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } }
void test_basic_types() { bcon basic_types[] = {"string", BS("a string"), "f(double)", BF(3.14159), "boolean", BB(1), "time", BT(time(0)), "null", BNULL, "symbol", BX("a symbol"), "int", BI(123), "long", BL(456789L), BEND}; test_bson_from_bcon( basic_types, BCON_OK, BSON_VALID ); }
void decodeInstruction(instruction_t instruction,unsigned long *r[],unsigned long *bandera,unsigned long *PC,unsigned long*LR,uint8_t*memoria,unsigned long *codificacion) { int auxban; unsigned long aux1,aux2,des; // codificacion funciones de la alu if(strcmp(instruction.mnemonic,"ADDS") == 0) { if(instruction.op1_type=='R') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { r[instruction.op1_value]=ADD(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type== '#' )&&(instruction.op3_type =='R' )) { r[instruction.op1_value]=ADD(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type== 'R' )&&(instruction.op3_type =='#' )) { r[instruction.op1_value]=ADD(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type== '#' )&&(instruction.op3_type =='#' )) { r[instruction.op1_value]=ADD(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } if(instruction.op1_type=='N') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { ADD(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { ADD(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { ADD(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { ADD(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } } if(strcmp(instruction.mnemonic,"CMN") == 0) { if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' )) { ADD(r[instruction.op1_value],r[instruction.op2_value],&bandera); } if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' )) { ADD(instruction.op1_value,r[instruction.op2_value],&bandera); } if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' )) { ADD(r[instruction.op1_value],instruction.op2_value,&bandera); } if((instruction.op1_type == '#' )&&(instruction.op2_type == '#' )) { ADD(instruction.op1_value,instruction.op2_value,&bandera); } } if( strcmp(instruction.mnemonic,"ADCS") == 0) { if(instruction.op1_type=='R') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { r[instruction.op1_value]=ADC(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { r[instruction.op1_value]=ADC(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=ADC(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=ADC(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } if(instruction.op1_type=='N') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { ADC(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { ADC(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { ADC(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { ADC(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } } if( strcmp(instruction.mnemonic,"ANDS") == 0) { if(instruction.op1_type=='R') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { r[instruction.op1_value]=AND(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { r[instruction.op1_value]=AND(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=AND(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=AND(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } if(instruction.op1_type=='N') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { AND(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { AND(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { AND(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { AND(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } } if(strcmp(instruction.mnemonic,"TEST") == 0) { if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' )) { AND(r[instruction.op1_value],r[instruction.op2_value],&bandera); } if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' )) { AND(instruction.op1_value,r[instruction.op2_value],&bandera); } if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' )) { AND(r[instruction.op1_value],instruction.op2_value,&bandera); } if((instruction.op1_type == '#' )&&(instruction.op2_type == '#' )) { AND(instruction.op1_value,instruction.op2_value,&bandera); } } if( strcmp(instruction.mnemonic,"EORS") == 0) { if(instruction.op1_type=='R') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { r[instruction.op1_value]=EOR(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { r[instruction.op1_value]=EOR(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=EOR(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=EOR(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } if(instruction.op1_type=='N') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { EOR(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { EOR(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { EOR(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { EOR(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } } if( (strcmp(instruction.mnemonic,"MOVS") == 0)||(strcmp(instruction.mnemonic,"MOV") == 0)) { if((instruction.op1_type == 'R')&&(instruction.op2_type=='R') ) { r[instruction.op1_value]=MOV(r[instruction.op1_value],r[instruction.op2_value],&bandera); mostrar(r[instruction.op1_value]); } if((instruction.op1_type == 'R')&&(instruction.op2_type=='#') ) { r[instruction.op1_value]=MOV(instruction.op1_value,instruction.op2_value,&bandera); mostrar(r[instruction.op1_value]); } } if( strcmp(instruction.mnemonic,"ORRS") == 0) { if(instruction.op1_type=='R') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { r[instruction.op1_value]=ORR(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { r[instruction.op1_value]=ORR(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=ORR(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=ORR(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } if(instruction.op1_type=='N') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { ORR(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { ORR(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { ORR(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { ORR(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } } if( strcmp(instruction.mnemonic,"SUBS") == 0) { if(instruction.op1_type== 'R' ) { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { r[instruction.op1_value]=SUB(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { r[instruction.op1_value]=SUB(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=SUB(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { r[instruction.op1_value]=SUB(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } if(instruction.op1_type=='N') { if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' )) { SUB(r[instruction.op2_value],r[instruction.op3_value],&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' )) { SUB(instruction.op2_value,r[instruction.op3_value],&bandera); } if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' )) { SUB(r[instruction.op2_value],instruction.op3_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { SUB(instruction.op2_value,instruction.op3_value,&bandera); } mostrar(r[instruction.op1_value]); } } if(strcmp(instruction.mnemonic,"CMP") == 0) { if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' )) { SUB(r[instruction.op1_value],r[instruction.op2_value],&bandera); } if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' )) { SUB(instruction.op1_value,r[instruction.op2_value],&bandera); } if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' )) { SUB(r[instruction.op1_value],instruction.op2_value,&bandera); } if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' )) { SUB(instruction.op1_value,instruction.op2_value,&bandera); } } // decodificacion funciones branch if(strcmp(instruction.mnemonic,"B")==0) { if(instruction.op1_type=='#') { *codificacion=(28<<11)+(instruction.op1_value); B(&PC,instruction.op1_value); } } if(strcmp(instruction.mnemonic,"BEQ")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(instruction.op1_value); BEQ(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BNE")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(1<<8)+(instruction.op1_value); BNE(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BCS")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(2<<8)+(instruction.op1_value); BCS(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BCC")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(3<<8)+(instruction.op1_value); BCC(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BMI")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(4<<8)+(instruction.op1_value); BMI(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BPL")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(5<<8)+(instruction.op1_value); BPL(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BVS")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(6<<8)+(instruction.op1_value); BVS(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BVC")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(2<<7)+(instruction.op1_value); BVC(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BHI")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(8<<8)+(instruction.op1_value); BHI(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BLS")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(9<<8)+(instruction.op1_value); BLS(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BGE")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(10<<8)+(instruction.op1_value); BGE(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BLT")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(11<<8)+(instruction.op1_value); BLT(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BGT")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(12<<8)+(instruction.op1_value); BGT(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BLE")==0) { if(instruction.op1_type=='#') { *codificacion=(13<<11)+(13<<8)+(instruction.op1_value); BLE(&PC,instruction.op1_value,&bandera); } } if(strcmp(instruction.mnemonic,"BL")==0) { if(instruction.op1_type=='#') { *codificacion=(31<<11)+(2047&instruction.op1_value+(((1<<31)&instruction.op1_value)>>20)); BL(&PC,instruction.op1_value,&LR); } }
void JitArm::lXX(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITLoadStoreOff) u32 a = inst.RA, b = inst.RB, d = inst.RD; s32 offset = inst.SIMM_16; u32 accessSize = 0; s32 offsetReg = -1; bool zeroA = true; bool update = false; bool signExtend = false; bool reverse = false; bool fastmem = false; switch (inst.OPCD) { case 31: switch (inst.SUBOP10) { case 55: // lwzux zeroA = false; update = true; case 23: // lwzx accessSize = 32; offsetReg = b; break; case 119: //lbzux zeroA = false; update = true; case 87: // lbzx accessSize = 8; offsetReg = b; break; case 311: // lhzux zeroA = false; update = true; case 279: // lhzx accessSize = 16; offsetReg = b; break; case 375: // lhaux zeroA = false; update = true; case 343: // lhax accessSize = 16; signExtend = true; offsetReg = b; break; case 534: // lwbrx accessSize = 32; reverse = true; break; case 790: // lhbrx accessSize = 16; reverse = true; break; } break; case 33: // lwzu zeroA = false; update = true; case 32: // lwz fastmem = true; accessSize = 32; break; case 35: // lbzu zeroA = false; update = true; case 34: // lbz fastmem = true; accessSize = 8; break; case 41: // lhzu zeroA = false; update = true; case 40: // lhz fastmem = true; accessSize = 16; break; case 43: // lhau zeroA = false; update = true; case 42: // lha signExtend = true; accessSize = 16; break; } // Check for exception before loading ARMReg rA = gpr.GetReg(false); LDR(rA, R9, PPCSTATE_OFF(Exceptions)); CMP(rA, EXCEPTION_DSI); FixupBranch DoNotLoad = B_CC(CC_EQ); SafeLoadToReg(fastmem, d, zeroA ? a ? a : -1 : a, offsetReg, accessSize, offset, signExtend, reverse); if (update) { rA = gpr.GetReg(false); ARMReg RA = gpr.R(a); if (offsetReg == -1) MOVI2R(rA, offset); else MOV(RA, gpr.R(offsetReg)); ADD(RA, RA, rA); } SetJumpTarget(DoNotLoad); // LWZ idle skipping if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSkipIdle && inst.OPCD == 32 && (inst.hex & 0xFFFF0000) == 0x800D0000 && (Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 || (SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) && Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8) { ARMReg RD = gpr.R(d); gpr.Flush(); fpr.Flush(); // if it's still 0, we can wait until the next event TST(RD, RD); FixupBranch noIdle = B_CC(CC_NEQ); rA = gpr.GetReg(); MOVI2R(rA, (u32)&PowerPC::OnIdle); MOVI2R(R0, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16); BL(rA); gpr.Unlock(rA); WriteExceptionExit(); SetJumpTarget(noIdle); //js.compilerPC += 8; return; } }
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { const dt_iop_rawprepare_data_t *const d = (dt_iop_rawprepare_data_t *)piece->data; // fprintf(stderr, "roi in %d %d %d %d\n", roi_in->x, roi_in->y, roi_in->width, roi_in->height); // fprintf(stderr, "roi out %d %d %d %d\n", roi_out->x, roi_out->y, roi_out->width, roi_out->height); const float scale = roi_in->scale / piece->iscale; const int csx = (int)roundf((float)d->x * scale), csy = (int)roundf((float)d->y * scale); if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && piece->pipe->filters) { // raw mosaic const uint16_t *const in = (const uint16_t *const)ivoid; float *const out = (float *const)ovoid; #ifdef _OPENMP #pragma omp parallel for SIMD() default(none) schedule(static) collapse(2) #endif for(int j = 0; j < roi_out->height; j++) { for(int i = 0; i < roi_out->width; i++) { const size_t pin = (size_t)(roi_in->width * (j + csy) + csx) + i; const size_t pout = (size_t)j * roi_out->width + i; const int id = BL(roi_out, d, j, i); out[pout] = (in[pin] - d->sub[id]) / d->div[id]; } } piece->pipe->filters = dt_rawspeed_crop_dcraw_filters(piece->pipe->filters, csx, csy); adjust_xtrans_filters(piece->pipe->xtrans, csx, csy); } else { // pre-downsampled buffer that needs black/white scaling const float *const in = (const float *const)ivoid; float *const out = (float *const)ovoid; const float sub = d->sub[0], div = d->div[0]; const int ch = piece->colors; #ifdef _OPENMP #pragma omp parallel for SIMD() default(none) schedule(static) collapse(3) #endif for(int j = 0; j < roi_out->height; j++) { for(int i = 0; i < roi_out->width; i++) { for(int c = 0; c < ch; c++) { const size_t pin = (size_t)ch * (roi_in->width * (j + csy) + csx + i) + c; const size_t pout = (size_t)ch * (j * roi_out->width + i) + c; out[pout] = (in[pin] - sub) / div; } } } } }
void JitArm::SafeStoreFromReg(bool fastmem, s32 dest, u32 value, s32 regOffset, int accessSize, s32 offset) { if (Core::g_CoreStartupParameter.bFastmem && fastmem) { ARMReg RA; ARMReg RB; ARMReg RS = gpr.R(value); if (dest != -1) RA = gpr.R(dest); if (regOffset != -1) { RB = gpr.R(regOffset); MOV(R10, RB); NOP(1); } else MOVI2R(R10, (u32)offset, false); if (dest != -1) ADD(R10, R10, RA); else NOP(1); MOV(R12, RS); UnsafeStoreFromReg(R10, R12, accessSize, 0); return; } ARMReg rA = gpr.GetReg(); ARMReg rB = gpr.GetReg(); ARMReg rC = gpr.GetReg(); ARMReg RA; ARMReg RB; if (dest != -1) RA = gpr.R(dest); if (regOffset != -1) RB = gpr.R(regOffset); ARMReg RS = gpr.R(value); switch (accessSize) { case 32: MOVI2R(rA, (u32)&Memory::Write_U32); break; case 16: MOVI2R(rA, (u32)&Memory::Write_U16); break; case 8: MOVI2R(rA, (u32)&Memory::Write_U8); break; } MOV(rB, RS); if (regOffset == -1) MOVI2R(rC, offset); else MOV(rC, RB); if (dest != -1) ADD(rC, rC, RA); PUSH(4, R0, R1, R2, R3); MOV(R0, rB); MOV(R1, rC); BL(rA); POP(4, R0, R1, R2, R3); gpr.Unlock(rA, rB, rC); }
void process_sse2(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { const dt_iop_rawprepare_data_t *const d = (dt_iop_rawprepare_data_t *)piece->data; // fprintf(stderr, "roi in %d %d %d %d\n", roi_in->x, roi_in->y, roi_in->width, roi_in->height); // fprintf(stderr, "roi out %d %d %d %d\n", roi_out->x, roi_out->y, roi_out->width, roi_out->height); const float scale = roi_in->scale / piece->iscale; const int csx = (int)roundf((float)d->x * scale), csy = (int)roundf((float)d->y * scale); if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && piece->pipe->filters) { // raw mosaic #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) #endif for(int j = 0; j < roi_out->height; j++) { const uint16_t *in = ((uint16_t *)ivoid) + ((size_t)roi_in->width * (j + csy) + csx); float *out = ((float *)ovoid) + (size_t)roi_out->width * j; int i = 0; // FIXME: figure alignment! !!! replace with for !!! while((!dt_is_aligned(in, 16) || !dt_is_aligned(out, 16)) && (i < roi_out->width)) { const int id = BL(roi_out, d, j, i); *out = (((float)(*in)) - d->sub[id]) / d->div[id]; i++; in++; out++; } const __m128 sub = _mm_set_ps(d->sub[BL(roi_out, d, j, i + 3)], d->sub[BL(roi_out, d, j, i + 2)], d->sub[BL(roi_out, d, j, i + 1)], d->sub[BL(roi_out, d, j, i)]); const __m128 div = _mm_set_ps(d->div[BL(roi_out, d, j, i + 3)], d->div[BL(roi_out, d, j, i + 2)], d->div[BL(roi_out, d, j, i + 1)], d->div[BL(roi_out, d, j, i)]); // process aligned pixels with SSE for(; i < roi_out->width - (8 - 1); i += 8, in += 8) { const __m128i input = _mm_load_si128((__m128i *)in); __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0)); __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0)); __m128 flo = _mm_cvtepi32_ps(ilo); __m128 fhi = _mm_cvtepi32_ps(ihi); flo = _mm_div_ps(_mm_sub_ps(flo, sub), div); fhi = _mm_div_ps(_mm_sub_ps(fhi, sub), div); _mm_stream_ps(out, flo); out += 4; _mm_stream_ps(out, fhi); out += 4; } // process the rest for(; i < roi_out->width; i++, in++, out++) { const int id = BL(roi_out, d, j, i); *out = MAX(0.0f, ((float)(*in)) - d->sub[id]) / d->div[id]; } } piece->pipe->filters = dt_rawspeed_crop_dcraw_filters(piece->pipe->filters, csx, csy); adjust_xtrans_filters(piece->pipe->xtrans, csx, csy); } else { // pre-downsampled buffer that needs black/white scaling const __m128 sub = _mm_load_ps(d->sub), div = _mm_load_ps(d->div); #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) #endif for(int j = 0; j < roi_out->height; j++) { const float *in = ((float *)ivoid) + (size_t)4 * (roi_in->width * (j + csy) + csx); float *out = ((float *)ovoid) + (size_t)4 * roi_out->width * j; // process aligned pixels with SSE for(int i = 0; i < roi_out->width; i++, in += 4, out += 4) { const __m128 input = _mm_load_ps(in); const __m128 scaled = _mm_div_ps(_mm_sub_ps(input, sub), div); _mm_stream_ps(out, scaled); } } } _mm_sfence(); }
void decodeInstruction(instruction_t instruction, uint32_t *dir_reg, char *dir_flags, uint8_t *SRAM, uint16_t *dec) { uint8_t *R_activos=instruction.registers_list; /* Comparacion de mnemonic y Llamado de las funciones */ if( strcmp(instruction.mnemonic,"ADC") == 0 || strcmp(instruction.mnemonic,"ADCS") == 0){ dir_reg[PC]++; *dec=16704; *dec=*dec|instruction.op3_value<<3|instruction.op1_value; dir_reg[instruction.op1_value]=ADC(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); } if( strcmp(instruction.mnemonic,"ADDS") == 0 || strcmp(instruction.mnemonic,"ADD") == 0){ dir_reg[PC]++; if(instruction.op2_type=='S'){ *dec=45056; dir_reg[SP]=ADD(dir_reg[SP],instruction.op3_value,dir_flags); *dec=*dec|instruction.op3_value;} else if(instruction.op3_type=='#'){ *dec=7168; *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value; dir_reg[instruction.op1_value]=ADD(dir_reg[instruction.op2_value], instruction.op3_value,dir_flags); mvprintw(4,20,"%X",*dec);} else{ *dec=6144; *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value; dir_reg[instruction.op1_value]=ADD(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);} } if( strcmp(instruction.mnemonic,"AND") == 0 || strcmp(instruction.mnemonic,"ANDS") == 0){ dir_reg[PC]++; *dec=16384; if(instruction.op3_type=='#'){ dir_reg[instruction.op1_value]=AND(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);} else dir_reg[instruction.op1_value]=AND(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); } if( strcmp(instruction.mnemonic,"ASR") == 0 || strcmp(instruction.mnemonic,"ASRS") == 0){ dir_reg[PC]++; if(instruction.op3_type=='#'){ *dec=4096; *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value; dir_reg[instruction.op1_value]=ASR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);} else{ *dec=16640; *dec=*dec|instruction.op3_value<<3|instruction.op1_value; dir_reg[instruction.op1_value]=ASR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags);} } if( strcmp(instruction.mnemonic,"BICS") == 0 || strcmp(instruction.mnemonic,"BICS") == 0){ dir_reg[PC]++; if(instruction.op3_type=='#') dir_reg[instruction.op1_value]=BIC(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags); else{ *dec=17280; dir_reg[instruction.op1_value]=BIC(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); *dec=*dec|instruction.op3_value<<3|instruction.op1_value;} } if( strcmp(instruction.mnemonic,"CMN" ) == 0 || strcmp(instruction.mnemonic,"CMNS") == 0){ dir_reg[PC]++; CMN(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value],dir_flags); *dec=17088; *dec=*dec|instruction.op2_value<<3|instruction.op1_value; mvprintw(4,20,"%X",*dec); } if( strcmp(instruction.mnemonic,"CMP") == 0 || strcmp(instruction.mnemonic,"CMPS") == 0){ dir_reg[PC]++; CMP(dir_reg[instruction.op1_value],dir_reg[instruction.op2_value],dir_flags); *dec=17024; *dec=*dec|instruction.op2_value<<3|instruction.op1_value; mvprintw(4,20,"%X",*dec); } if( strcmp(instruction.mnemonic,"EOR") == 0 || strcmp(instruction.mnemonic,"EORS") == 0){ dir_reg[PC]++; *dec=16448; if(instruction.op3_type=='#') dir_reg[instruction.op1_value]=EOR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags); else dir_reg[instruction.op1_value]=EOR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); } if( strcmp(instruction.mnemonic,"LSLS") == 0 || strcmp(instruction.mnemonic,"LSL") == 0){ dir_reg[PC]++; if(instruction.op3_type=='#'){ *dec=0; dir_reg[instruction.op1_value]=LSL(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} else{ *dec=16512; dir_reg[instruction.op1_value]=LSL(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); *dec=*dec|instruction.op3_value<<3|instruction.op1_value;} } if( strcmp(instruction.mnemonic,"LSRS") == 0 || strcmp(instruction.mnemonic,"LSR") == 0){ dir_reg[PC]++; if(instruction.op3_type=='#'){ *dec=2048; dir_reg[instruction.op1_value]=LSR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} else{ *dec=16576; dir_reg[instruction.op1_value]=LSR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); *dec=*dec|instruction.op3_value<<3|instruction.op1_value;} } if( strcmp(instruction.mnemonic,"MOV") == 0 || strcmp(instruction.mnemonic,"MOVS") == 0){ dir_reg[PC]++; if(instruction.op2_type=='#'){ *dec=8192; dir_reg[instruction.op1_value]=MOV(instruction.op2_value,dir_flags); *dec=*dec|instruction.op1_value<<8|instruction.op2_value;} else{ *dec=0; dir_reg[instruction.op1_value]=MOV(dir_reg[instruction.op2_value],dir_flags); *dec=*dec|instruction.op2_value<<3|instruction.op1_value;} } if( strcmp(instruction.mnemonic,"MUL") == 0 || strcmp(instruction.mnemonic,"MULS") == 0){ dir_reg[PC]++; *dec=17216; if(instruction.op3_type=='#'){ dir_reg[instruction.op1_value]=MUL(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);} else{ dir_reg[instruction.op1_value]=MUL(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); *dec=*dec|instruction.op2_value<<3|instruction.op1_value;} } if( strcmp(instruction.mnemonic,"MVN") == 0 || strcmp(instruction.mnemonic,"MVNS") == 0){ dir_reg[PC]++; *dec=17344; dir_reg[instruction.op1_value]=MVN(dir_reg[instruction.op2_value], dir_flags); *dec=*dec|instruction.op2_value<<3|instruction.op1_value; } if( strcmp(instruction.mnemonic,"ORR") == 0 || strcmp(instruction.mnemonic,"ORRS") == 0){ dir_reg[PC]++; *dec=17152; if(instruction.op3_type=='#'){ dir_reg[instruction.op1_value]=ORR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);} else{ dir_reg[instruction.op1_value]=ORR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); *dec=*dec|instruction.op3_value<<3|instruction.op1_value;} } if( strcmp(instruction.mnemonic,"REV") == 0 || strcmp(instruction.mnemonic,"REVS") == 0){ dir_reg[PC]++; *dec=47616; dir_reg[instruction.op1_value]=REV(dir_reg[instruction.op2_value]); *dec=*dec|instruction.op2_value<<3|instruction.op1_value; } if( strcmp(instruction.mnemonic,"REVG") == 0 || strcmp(instruction.mnemonic,"REVGS") == 0){ dir_reg[PC]++; *dec=47680; dir_reg[instruction.op1_value]=REVG(dir_reg[instruction.op2_value]); *dec=*dec|instruction.op2_value<<3|instruction.op1_value; } if( strcmp(instruction.mnemonic,"REVSH") == 0 || strcmp(instruction.mnemonic,"REVSHS") == 0){ dir_reg[PC]++; *dec=47808; dir_reg[instruction.op1_value]=REVSH(dir_reg[instruction.op2_value]); *dec=*dec|instruction.op2_value<<3|instruction.op1_value; } if( strcmp(instruction.mnemonic,"ROR") == 0 || strcmp(instruction.mnemonic,"RORS") == 0){ dir_reg[PC]++; *dec=16832; if(instruction.op3_type=='#'){ dir_reg[instruction.op1_value]=ROR(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags);} else{ dir_reg[instruction.op1_value]=ROR(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); *dec=*dec|instruction.op3_value<<3|instruction.op1_value;} } if( strcmp(instruction.mnemonic,"RSB") == 0 || strcmp(instruction.mnemonic,"RSBS") == 0){ dir_reg[PC]++; *dec=16690; dir_reg[instruction.op1_value]=RSB(dir_reg[instruction.op2_value], dir_flags); *dec=*dec|instruction.op2_value<<3|instruction.op1_value; } if( strcmp(instruction.mnemonic,"SBC") == 0 || strcmp(instruction.mnemonic,"SBCS") == 0){ dir_reg[PC]++; *dec=16768; SBC(dir_reg[instruction.op1_value],dir_reg[instruction.op2_value], dir_flags); *dec=*dec|instruction.op2_value<<3|instruction.op1_value; } if( strcmp(instruction.mnemonic,"SUBS") == 0 || strcmp(instruction.mnemonic,"SUB") == 0){ dir_reg[PC]++; if(instruction.op2_type=='S'){ *dec=45184; dir_reg[SP]=SUB(dir_reg[SP],instruction.op3_value,dir_flags); *dec=*dec|instruction.op3_value;} else if(instruction.op3_type=='#'){ *dec=7680; dir_reg[instruction.op1_value]=SUB(dir_reg[instruction.op2_value],instruction.op3_value,dir_flags); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} else{ *dec=6656; dir_reg[instruction.op1_value]=SUB(dir_reg[instruction.op2_value],dir_reg[instruction.op3_value],dir_flags); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} } if( strcmp(instruction.mnemonic,"TST") == 0 || strcmp(instruction.mnemonic,"TSTS") == 0){ dir_reg[PC]++; *dec=16896; TST(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], dir_flags); *dec=*dec|instruction.op2_value<<3|instruction.op1_value; } if( strcmp(instruction.mnemonic,"NOP") == 0 ){ NOP(dir_reg); *dec=48896; } if( strcmp(instruction.mnemonic,"B") == 0 ){ *dec=57344; *dec=*dec|instruction.op1_value; B(instruction.op1_value, dir_reg); } if( strcmp(instruction.mnemonic,"BL") == 0 ){ *dec=0; BL(instruction.op1_value, dir_reg); } if( strcmp(instruction.mnemonic,"BX") == 0 ){ *dec=18176; BX(dir_reg); } if( strcmp(instruction.mnemonic,"BEQ") == 0 ){ *dec=0; BEQ(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BNE") == 0 ){ *dec=0; BNE(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BCS") == 0 ){ *dec=0; BCS(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BCC") == 0 ){ *dec=0; BCC(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BMI") == 0 ){ *dec=0; BMI(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BPL") == 0 ){ *dec=0; BPL(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BVS") == 0 ){ *dec=0; BVS(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BVC") == 0 ){ *dec=0; BVC(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BHI") == 0 ){ *dec=0; BHI(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BLS") == 0 ){ *dec=0; BLS(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BGE") == 0 ){ *dec=0; BGE(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BLT") == 0 ){ *dec=0; BLT(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BGT") == 0 ){ *dec=0; BGT(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BLE") == 0 ){ *dec=0; BLE(instruction.op1_value, dir_reg, dir_flags); } if( strcmp(instruction.mnemonic,"BAL") == 0 ){ *dec=0; BAL(instruction.op1_value, dir_reg); } if(strcmp(instruction.mnemonic,"PUSH")==0){ dir_reg[PC]++; *dec=46080; PUSH(SRAM, dir_reg,R_activos); } if(strcmp(instruction.mnemonic,"POP")==0){ dir_reg[PC]++; *dec=48128; POP(SRAM,dir_reg,R_activos); } data=(uint8_t)dir_reg[instruction.op1_value]; if(strcmp(instruction.mnemonic,"LDR")==0){ dir_reg[PC]++; if(instruction.op2_type=='=' && instruction.op3_type=='N'){ *dec=0; dir_reg[instruction.op1_value]=instruction.op2_value;} else if(instruction.op2_type=='S'){ *dec=38912; dir_reg[instruction.op1_value]=LDR(dir_reg[SP], instruction.op3_value<<2, SRAM); *dec=*dec|instruction.op3_value|instruction.op1_value<<8;} else if(instruction.op3_type=='#' || instruction.op3_type=='N'){ *dec=26624; if((dir_reg[instruction.op2_value]+(instruction.op3_value<<2))>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(instruction.op3_value<<2)), &data,Read); else dir_reg[instruction.op1_value]=LDR(dir_reg[instruction.op2_value], instruction.op3_value<<2, SRAM); *dec=*dec|instruction.op3_value<<6|instruction.op2_value|instruction.op1_value;} else{ *dec=22528; if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read); else dir_reg[instruction.op1_value]=LDR(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} } if(strcmp(instruction.mnemonic,"LDRB")==0){ dir_reg[PC]++; if(instruction.op3_type=='#' || instruction.op3_type=='N'){ *dec=30720; if((dir_reg[instruction.op2_value]+instruction.op3_value)>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+instruction.op3_value), &data,Read); else dir_reg[instruction.op1_value]=LDRB(dir_reg[instruction.op2_value], instruction.op3_value, SRAM); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} else{ *dec=23552; if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read); else dir_reg[instruction.op1_value]=LDRB(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} } if(strcmp(instruction.mnemonic,"LDRH")==0){ dir_reg[PC]++; if(instruction.op3_type=='#' || instruction.op3_type=='N'){ *dec=34816; if((dir_reg[instruction.op2_value]+(instruction.op3_value<<1))>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(instruction.op3_value<<1)), &data,Read); else dir_reg[instruction.op1_value]=LDRH(dir_reg[instruction.op2_value], instruction.op3_value<<1, SRAM); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} else{ *dec=23040; if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read); else dir_reg[instruction.op1_value]=LDRH(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} } if(strcmp(instruction.mnemonic,"LDRSB")==0){ dir_reg[PC]++; *dec=22016; *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value; if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read); else dir_reg[instruction.op1_value]=LDRSB(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM); } if(strcmp(instruction.mnemonic,"LDRSH")==0){ dir_reg[PC]++; *dec=24064; *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value; if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value]), &data,Read); else dir_reg[instruction.op1_value]=LDRSH(dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM); } if(strcmp(instruction.mnemonic,"STR")==0){ dir_reg[PC]++; if(instruction.op2_type=='S'){ *dec=38912; STR(dir_reg[instruction.op1_value],dir_reg[SP], instruction.op3_value<<2, SRAM); *dec=*dec|instruction.op3_value|instruction.op1_value<<8;} else if(instruction.op3_type=='#' || instruction.op3_type=='N'){ *dec=24576; if((dir_reg[instruction.op2_value]+(instruction.op3_value<<2))>=0x40000000){ IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(instruction.op3_value<<2)), &data,Write);} else{ STR(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], instruction.op3_value<<2, SRAM);} *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value; mvprintw(1,3,"Hola");} else{ *dec=20480; if((dir_reg[instruction.op2_value]+dir_reg[instruction.op2_value])>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(dir_reg[instruction.op3_value])), &data,Write); else{ STR(dir_reg[instruction.op1_value], instruction.op2_value, instruction.op3_value, SRAM);} *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} } if(strcmp(instruction.mnemonic,"STRB")==0){ dir_reg[PC]++; if(instruction.op3_type=='#' || instruction.op3_type=='N'){ *dec=28672; if(dir_reg[instruction.op2_value]+instruction.op3_value>=0x40000000){ IOAccess((uint8_t)(dir_reg[instruction.op2_value]+instruction.op3_value), &data,Write);} else{ STRB(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], instruction.op3_value, SRAM);} *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} else{ *dec=21504; if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000){ IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(dir_reg[instruction.op3_value])), &data,Write);} else{ STRB(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM);} *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} } if(strcmp(instruction.mnemonic,"STRH")==0){ dir_reg[PC]++; if(instruction.op3_type=='#' || instruction.op3_type=='N'){ *dec=32768; if(((dir_reg[instruction.op2_value])+(instruction.op3_value<<1))>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(instruction.op3_value<<1)),&data,Write); else STRH(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], instruction.op3_value<<1, SRAM); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} else{ *dec=20992; if((dir_reg[instruction.op2_value]+dir_reg[instruction.op3_value])>=0x40000000) IOAccess((uint8_t)(dir_reg[instruction.op2_value]+(dir_reg[instruction.op3_value])), &data,Write); else STRH(dir_reg[instruction.op1_value], dir_reg[instruction.op2_value], dir_reg[instruction.op3_value], SRAM); *dec=*dec|instruction.op3_value<<6|instruction.op2_value<<3|instruction.op1_value;} } }
GString GetSVGText(int diffIndex, double intervalduration = 1, double peakMargin = 1.2) { std::stringstream out; vector<int> dataPoints = GetDataPoints(diffIndex, intervalduration); auto peak_it = std::max_element(dataPoints.begin(), dataPoints.end()); float peakf = *peak_it; double peak = *peak_it * peakMargin; out << "<svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\">\n"; auto ptIdx = 0; float ImageHeight = CfgValNPS("GraphHeight", 300); float GraphYOffset = CfgValNPS("GraphYOffs", 50); float GraphXOffset = CfgValNPS("GraphXOffs", 100); float IntervalWidth = 10; float GraphWidth = dataPoints.size() * IntervalWidth; float RealGraphWidth = CfgValNPS("Width", 1000); float XRatio = RealGraphWidth / GraphWidth; Vec2 BL(GraphXOffset, GraphYOffset + ImageHeight); Vec2 BR(RealGraphWidth, 0); Vec2 TL(GraphXOffset, GraphYOffset); BR += BL; GString DiffAuth = Song->GetDifficulty(diffIndex)->Author; if (!DiffAuth.length()) DiffAuth = "an anonymous charter"; float avgNPS = Song->GetDifficulty(diffIndex)->TotalScoringObjects / Song->GetDifficulty(diffIndex)->Duration; out << Utility::Format("<text x=\"%d\" y=\"%d\" fill=\"black\">%s - %s (%s) by %s (Max NPS: %.2f/Avg NPS: %.2f)</text>\n", 20, 20, Song->SongName, Song->SongAuthor, Song->GetDifficulty(diffIndex)->Name, DiffAuth, peakf / intervalduration, avgNPS); out << Utility::Format("\t<line x1 = \"%d\" y1 = \"%d\" x2 = \"%d\" y2 = \"%d\" style = \"stroke:rgb(0,0,0);stroke-width:4\"/>\n", BL.x, BL.y, BR.x, BR.y); out << Utility::Format("\t<line x1 = \"%d\" y1 = \"%d\" x2 = \"%d\" y2 = \"%d\" style = \"stroke:rgb(0,0,0);stroke-width:4\"/>\n", TL.x, TL.y, BL.x, BL.y); auto ptAmt = 5; for (auto i = 1; i <= ptAmt; i++) { float X = (BL.x - GraphXOffset / 2); float Y = (BL.y - i * (ImageHeight / ptAmt / peakMargin)); float Value = (peakf * i / ptAmt / intervalduration); out << Utility::Format("\t<text x=\"%d\" y=\"%d\" fill=\"black\">%.2f</text>\n", X, Y, Value); out << Utility::Format("\t<line x1 = \"%d\" y1 = \"%d\" x2 = \"%d\" y2 = \"%d\" style = \"stroke:rgb(0,0,0);stroke-width:0.5\"/>\n", X, Y, GraphXOffset + RealGraphWidth, Y); } for (auto point : dataPoints) { double relativeFreq = point / peak; double relFreqNext; int x1, y1, x2, y2; if (ptIdx + 1 < dataPoints.size()) { relFreqNext = dataPoints[ptIdx + 1] / peak; } else relFreqNext = 0; x1 = IntervalWidth * ptIdx * XRatio + GraphXOffset; y1 = ImageHeight - ImageHeight * relativeFreq + GraphYOffset; x2 = IntervalWidth * (ptIdx + 1) * XRatio + GraphXOffset; y2 = ImageHeight - ImageHeight * relFreqNext + GraphYOffset; out << Utility::Format("\t<line x1 = \"%d\" y1 = \"%d\" x2 = \"%d\" y2 = \"%d\" style = \"stroke:rgb(255,0,0);stroke-width:2\"/>\n", x1, y1, x2, y2); ptIdx++; } out << "</svg>"; return out.str(); }
void PHILinkItem::html( const PHIRequest *req, QByteArray &out, QByteArray &script, const QByteArray &indent ) const { PHILabelItem::html( req, out, script, indent ); QByteArray arr=data( DUrl ).toByteArray(); script+=BL( "$$link('" )+id()+BL( "'," ); if ( arr.isEmpty() ) { script+=BL( "undefined" ); } else { script+=BL( "function(){" ); if ( arr.startsWith( "javascript:" ) ) { arr.remove( 0, 11 ); script+=arr; } else { arr.replace( '\'', BL( "\\'" ) ); script+=BL( "phi.href('" )+arr+BL( "');" ); } script+='}'; } script+=BL( ",'" )+cssColor( realColor() )+BL( "','" ); if ( realBackgroundColor()!=QColor( Qt::transparent ) ) script+=cssColor( realBackgroundColor() ); script+=BL( "','" )+cssColor( realHoverColor() )+BL( "','" ); if ( realHoverBgColor()!=QColor( Qt::transparent ) ) script+=cssColor( realHoverBgColor() ); script+=BL( "')" ); if ( arr.isEmpty() ) script+=BL( ";\n" ); else script+=BL( ".cursor('pointer');\n" ); }
transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) { uint32_t offsets[8] = {0, 4*N, 2*N, 6*N, N, 5*N, 7*N, 3*N}; uint32_t offsets_o[8] = {0, 4*N, 2*N, 6*N, 7*N, 3*N, N, 5*N}; int32_t pAddr = 0; int32_t pN = 0; int32_t pLUT = 0; insns_t *fp; insns_t *start; insns_t *x_4_addr; insns_t *x_8_addr; uint32_t loop_count; int count; ptrdiff_t len; size_t *ps; size_t *pps; count = ffts_tree_count(N, leaf_N, 0) + 1; ps = pps = malloc(2 * count * sizeof(*ps)); if (!ps) { return NULL; } ffts_elaborate_tree(&pps, N, leaf_N, 0); pps[0] = 0; pps[1] = 0; pps = ps; #ifdef HAVE_SSE if (sign < 0) { p->constants = (const void*) sse_constants; } else { p->constants = (const void*) sse_constants_inv; } #endif fp = (insns_t*) p->transform_base; /* generate base cases */ x_4_addr = generate_size4_base_case(&fp, sign); x_8_addr = generate_size8_base_case(&fp, sign); #ifdef __arm__ start = generate_prologue(&fp, p); #ifdef HAVE_NEON memcpy(fp, neon_ee, neon_oo - neon_ee); if (sign < 0) { fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; } fp += (neon_oo - neon_ee) / 4; #else memcpy(fp, vfp_e, vfp_o - vfp_e); if (sign > 0) { fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; } fp += (vfp_o - vfp_e) / 4; #endif #else /* generate functions */ start = generate_prologue(&fp, p); loop_count = 4 * p->i0; generate_leaf_init(&fp, loop_count); if (ffts_ctzl(N) & 1) { generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0); if (p->i1) { loop_count += 4 * p->i1; generate_leaf_oo(&fp, loop_count, offsets_o, 7); } loop_count += 4; generate_leaf_oe(&fp, offsets_o); } else { generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8); loop_count += 4; generate_leaf_eo(&fp, offsets); if (p->i1) { loop_count += 4 * p->i1; generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7); } } if (p->i1) { uint32_t offsets_oe[8] = {7*N, 3*N, N, 5*N, 0, 4*N, 6*N, 2*N}; loop_count += 4 * p->i1; /* align loop/jump destination */ #ifdef _M_X64 x86_mov_reg_imm(fp, X86_EBX, loop_count); #else x86_mov_reg_imm(fp, X86_ECX, loop_count); ffts_align_mem16(&fp, 9); #endif generate_leaf_ee(&fp, offsets_oe, 0); } generate_transform_init(&fp); /* generate subtransform calls */ count = 2; while (pps[0]) { size_t ws_is; if (!pN) { #ifdef _M_X64 x86_mov_reg_imm(fp, X86_EBX, pps[0]); #else x86_mov_reg_imm(fp, X86_ECX, pps[0] / 4); #endif } else { int offset = (4 * pps[1]) - pAddr; if (offset) { #ifdef _M_X64 x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8); #else x64_alu_reg_imm_size(fp, X86_ADD, X64_RDX, offset, 8); #endif } if (pps[0] > leaf_N && pps[0] - pN) { int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN); #ifdef _M_X64 if (factor > 0) { x86_shift_reg_imm(fp, X86_SHL, X86_EBX, factor); } else { x86_shift_reg_imm(fp, X86_SHR, X86_EBX, -factor); } #else if (factor > 0) { x86_shift_reg_imm(fp, X86_SHL, X86_ECX, factor); } else { x86_shift_reg_imm(fp, X86_SHR, X86_ECX, -factor); } #endif } } ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1]; if (ws_is != pLUT) { int offset = (int) (ws_is - pLUT); #ifdef _M_X64 x64_alu_reg_imm_size(fp, X86_ADD, X64_R9, offset, 8); #else x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8); #endif } if (pps[0] == 2 * leaf_N) { x64_call_code(fp, x_4_addr); } else { x64_call_code(fp, x_8_addr); } pAddr = 4 * pps[1]; if (pps[0] > leaf_N) { pN = pps[0]; } pLUT = ws_is;//LUT_offset(pps[0], leafN); //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); count += 4; pps += 2; } #endif #ifdef __arm__ #ifdef HAVE_NEON if (ffts_ctzl(N) & 1) { ADDI(&fp, 2, 7, 0); ADDI(&fp, 7, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 8, 0); ADDI(&fp, 8, 10, 0); ADDI(&fp, 10, 2, 0); if(p->i1) { MOVI(&fp, 11, p->i1); memcpy(fp, neon_oo, neon_eo - neon_oo); if(sign < 0) { fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; } fp += (neon_eo - neon_oo) / 4; } *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; memcpy(fp, neon_oe, neon_end - neon_oe); if(sign < 0) { fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000; } fp += (neon_end - neon_oe) / 4; } else { *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; memcpy(fp, neon_eo, neon_oe - neon_eo); if(sign < 0) { fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000; fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000; } fp += (neon_oe - neon_eo) / 4; ADDI(&fp, 2, 7, 0); ADDI(&fp, 7, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 8, 0); ADDI(&fp, 8, 10, 0); ADDI(&fp, 10, 2, 0); if(p->i1) { MOVI(&fp, 11, p->i1); memcpy(fp, neon_oo, neon_eo - neon_oo); if(sign < 0) { fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; } fp += (neon_eo - neon_oo) / 4; } } if(p->i1) { ADDI(&fp, 2, 3, 0); ADDI(&fp, 3, 7, 0); ADDI(&fp, 7, 2, 0); ADDI(&fp, 2, 4, 0); ADDI(&fp, 4, 8, 0); ADDI(&fp, 8, 2, 0); ADDI(&fp, 2, 5, 0); ADDI(&fp, 5, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 6, 0); ADDI(&fp, 6, 10, 0); ADDI(&fp, 10, 2, 0); ADDI(&fp, 2, 9, 0); ADDI(&fp, 9, 10, 0); ADDI(&fp, 10, 2, 0); *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; MOVI(&fp, 11, p->i1); memcpy(fp, neon_ee, neon_oo - neon_ee); if(sign < 0) { fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; } fp += (neon_oo - neon_ee) / 4; } #else ADDI(&fp, 2, 7, 0); ADDI(&fp, 7, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 8, 0); ADDI(&fp, 8, 10, 0); ADDI(&fp, 10, 2, 0); MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); memcpy(fp, vfp_o, vfp_x4 - vfp_o); if(sign > 0) { fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040; fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040; } fp += (vfp_x4 - vfp_o) / 4; ADDI(&fp, 2, 3, 0); ADDI(&fp, 3, 7, 0); ADDI(&fp, 7, 2, 0); ADDI(&fp, 2, 4, 0); ADDI(&fp, 4, 8, 0); ADDI(&fp, 8, 2, 0); ADDI(&fp, 2, 5, 0); ADDI(&fp, 5, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 6, 0); ADDI(&fp, 6, 10, 0); ADDI(&fp, 10, 2, 0); ADDI(&fp, 2, 9, 0); ADDI(&fp, 9, 10, 0); ADDI(&fp, 10, 2, 0); *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1); memcpy(fp, vfp_e, vfp_o - vfp_e); if(sign > 0) { fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; } fp += (vfp_o - vfp_e) / 4; #endif *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12 //ADDI(&fp, 2, 1, 0); MOVI(&fp, 1, 0); // args: r0 - out // r1 - N // r2 - ws // ADDI(&fp, 3, 1, 0); // put N into r3 for counter count = 2; while(pps[0]) { // fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); if(!pN) { MOVI(&fp, 1, pps[0]); } else { if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); } if (p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT) { ADDI(&fp, 2, 2, p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT); } if(pps[0] == 2 * leaf_N) { *fp = BL(fp+2, x_4_addr); fp++; } else if(!pps[2]) { //uint32_t *x_8_t_addr = fp; #ifdef HAVE_NEON memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); if(sign < 0) { fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; } fp += (neon_ee - neon_x8_t) / 4; //*fp++ = BL(fp+2, x_8_t_addr); #else *fp = BL(fp+2, x_8_addr); fp++; #endif } else { *fp = BL(fp+2, x_8_addr); fp++; } pAddr = pps[1] * 4; pN = pps[0]; pLUT = p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN); // fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); count += 4; pps += 2; } *fp++ = 0xecbd8b10; *fp++ = POP_LR(); count++; #else generate_epilogue(&fp); #endif // *fp++ = B(14); count++; //for(int i=0;i<(neon_x8 - neon_x4)/4;i++) // fprintf(stderr, "%08x\n", x_4_addr[i]); //fprintf(stderr, "\n"); //for(int i=0;i<count;i++) //fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp)); free(ps); #if defined(_MSC_VER) #pragma warning(push) /* disable type cast warning from data pointer to function pointer */ #pragma warning(disable : 4055) #endif return (transform_func_t) start; #if defined(_MSC_VER) #pragma warning(pop) #endif }