inline void SolveAfterCholesky ( UpperOrLower uplo, Orientation orientation, const DistMatrix<F>& A, DistMatrix<F>& B ) { #ifndef RELEASE PushCallStack("SolveAfterLU"); if( A.Grid() != B.Grid() ) throw std::logic_error("{A,B} must be distributed over the same grid"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( A.Height() != B.Height() ) throw std::logic_error("A and B must be the same height"); #endif if( B.Width() == 1 ) { if( uplo == LOWER ) { if( orientation == TRANSPOSE ) Conj( B ); Trsv( LOWER, NORMAL, NON_UNIT, A, B ); Trsv( LOWER, ADJOINT, NON_UNIT, A, B ); if( orientation == TRANSPOSE ) Conj( B ); } else { if( orientation == TRANSPOSE ) Conj( B ); Trsv( UPPER, ADJOINT, NON_UNIT, A, B ); Trsv( UPPER, NORMAL, NON_UNIT, A, B ); if( orientation == TRANSPOSE ) Conj( B ); } } else { if( uplo == LOWER ) { if( orientation == TRANSPOSE ) Conj( B ); Trsm( LEFT, LOWER, NORMAL, NON_UNIT, F(1), A, B ); Trsm( LEFT, LOWER, ADJOINT, NON_UNIT, F(1), A, B ); if( orientation == TRANSPOSE ) Conj( B ); } else { if( orientation == TRANSPOSE ) Conj( B ); Trsm( LEFT, UPPER, ADJOINT, NON_UNIT, F(1), A, B ); Trsm( LEFT, UPPER, NORMAL, NON_UNIT, F(1), A, B ); if( orientation == TRANSPOSE ) Conj( B ); } } #ifndef RELEASE PopCallStack(); #endif }
void SolveAfter ( Orientation orientation, const Matrix<F>& A, const Matrix<F>& householderScalars, const Matrix<Base<F>>& signature, const Matrix<F>& B, Matrix<F>& X ) { DEBUG_CSE const Int m = A.Height(); const Int n = A.Width(); if( m > n ) LogicError("Must have full row rank"); // TODO: Add scaling auto AL = A( IR(0,m), IR(0,m) ); if( orientation == NORMAL ) { if( m != B.Height() ) LogicError("A and B do not conform"); // Copy B into X X.Resize( n, B.Width() ); auto XT = X( IR(0,m), ALL ); auto XB = X( IR(m,n), ALL ); XT = B; Zero( XB ); // Solve against L (checking for singularities) Trsm( LEFT, LOWER, NORMAL, NON_UNIT, F(1), AL, XT, true ); // Apply Q' to X lq::ApplyQ( LEFT, ADJOINT, A, householderScalars, signature, X ); } else // orientation in {TRANSPOSE,ADJOINT} { if( n != B.Height() ) LogicError("A and B do not conform"); // Copy B into X X = B; if( orientation == TRANSPOSE ) Conjugate( X ); // Apply Q to X lq::ApplyQ( LEFT, NORMAL, A, householderScalars, signature, X ); // Shrink X to its new height X.Resize( m, X.Width() ); // Solve against L' (check for singularities) Trsm( LEFT, LOWER, ADJOINT, NON_UNIT, F(1), AL, X, true ); if( orientation == TRANSPOSE ) Conjugate( X ); } }
inline void SolveAfter ( UpperOrLower uplo, Orientation orientation, const DistMatrix<F>& A, DistMatrix<F>& B ) { #ifndef RELEASE CallStackEntry entry("cholesky::SolveAfter"); if( A.Grid() != B.Grid() ) LogicError("{A,B} must be distributed over the same grid"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( A.Height() != B.Height() ) LogicError("A and B must be the same height"); #endif if( B.Width() == 1 ) { if( uplo == LOWER ) { if( orientation == TRANSPOSE ) Conjugate( B ); Trsv( LOWER, NORMAL, NON_UNIT, A, B ); Trsv( LOWER, ADJOINT, NON_UNIT, A, B ); if( orientation == TRANSPOSE ) Conjugate( B ); } else { if( orientation == TRANSPOSE ) Conjugate( B ); Trsv( UPPER, ADJOINT, NON_UNIT, A, B ); Trsv( UPPER, NORMAL, NON_UNIT, A, B ); if( orientation == TRANSPOSE ) Conjugate( B ); } } else { if( uplo == LOWER ) { if( orientation == TRANSPOSE ) Conjugate( B ); Trsm( LEFT, LOWER, NORMAL, NON_UNIT, F(1), A, B ); Trsm( LEFT, LOWER, ADJOINT, NON_UNIT, F(1), A, B ); if( orientation == TRANSPOSE ) Conjugate( B ); } else { if( orientation == TRANSPOSE ) Conjugate( B ); Trsm( LEFT, UPPER, ADJOINT, NON_UNIT, F(1), A, B ); Trsm( LEFT, UPPER, NORMAL, NON_UNIT, F(1), A, B ); if( orientation == TRANSPOSE ) Conjugate( B ); } } }
void LowerBlocked( Matrix<F>& A, Matrix<F>& householderScalars ) { DEBUG_CSE const Int n = A.Height(); householderScalars.Resize( Max(n-1,0), 1 ); Matrix<F> UB1, V01, VB1, G11; const Int bsize = Blocksize(); for( Int k=0; k<n-1; k+=bsize ) { const Int nb = Min(bsize,n-1-k); const Range<Int> ind0( 0, k ), ind1( k, k+nb ), indB( k, n ), indR( k, n ), ind2( k+nb, n ); auto ABR = A( indB, indR ); auto A22 = A( ind2, ind2 ); auto householderScalars1 = householderScalars( ind1, ALL ); UB1.Resize( n-k, nb ); VB1.Resize( n-k, nb ); G11.Resize( nb, nb ); hessenberg::LowerPanel( ABR, householderScalars1, UB1, VB1, G11 ); auto AB0 = A( indB, ind0 ); auto A2R = A( ind2, indR ); auto U21 = UB1( IR(nb,END), ALL ); auto V21 = VB1( IR(nb,END), ALL ); // AB0 := AB0 - (UB1 inv(G11)^H UB1^H AB0) // = AB0 - (UB1 ((AB0^H UB1) inv(G11))^H) // ------------------------------------------- Gemm( ADJOINT, NORMAL, F(1), AB0, UB1, V01 ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, F(1), G11, V01 ); Gemm( NORMAL, ADJOINT, F(-1), UB1, V01, F(1), AB0 ); // A2R := (A2R - U21 inv(G11)^H VB1^H)(I - UB1 inv(G11) UB1^H) // ----------------------------------------------------------- // A2R := A2R - U21 inv(G11)^H VB1^H // (note: VB1 is overwritten) Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, F(1), G11, VB1 ); Gemm( NORMAL, ADJOINT, F(-1), U21, VB1, F(1), A2R ); // A2R := A2R - ((A2R UB1) inv(G11)) UB1^H Gemm( NORMAL, NORMAL, F(1), A2R, UB1, F(0), V21 ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, F(1), G11, V21 ); Gemm( NORMAL, ADJOINT, F(-1), V21, UB1, F(1), A2R ); } }
inline void SolveAfterLU ( Orientation orientation, const DistMatrix<F>& A, const DistMatrix<int,VC,STAR>& p, DistMatrix<F>& B ) { #ifndef RELEASE PushCallStack("SolveAfterLU"); if( A.Grid() != B.Grid() || A.Grid() != p.Grid() ) throw std::logic_error("{A,B} must be distributed over the same grid"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( A.Height() != B.Height() ) throw std::logic_error("A and B must be the same height"); if( A.Height() != p.Height() ) throw std::logic_error("A and p must be the same height"); #endif if( B.Width() == 1 ) { if( orientation == NORMAL ) { ApplyRowPivots( B, p ); Trsv( LOWER, NORMAL, UNIT, A, B ); Trsv( UPPER, NORMAL, NON_UNIT, A, B ); } else { Trsv( UPPER, orientation, NON_UNIT, A, B ); Trsv( LOWER, orientation, UNIT, A, B ); ApplyInverseRowPivots( B, p ); } } else { if( orientation == NORMAL ) { ApplyRowPivots( B, p ); Trsm( LEFT, LOWER, NORMAL, UNIT, F(1), A, B ); Trsm( LEFT, UPPER, NORMAL, NON_UNIT, F(1), A, B ); } else { Trsm( LEFT, UPPER, orientation, NON_UNIT, F(1), A, B ); Trsm( LEFT, LOWER, orientation, UNIT, F(1), A, B ); ApplyInverseRowPivots( B, p ); } } #ifndef RELEASE PopCallStack(); #endif }
// Blocked in-place sweep down the diagonal of the square matrix A. Each step
// exposes a 3x3 partition around the diagonal block A11, factors A11 with
// Cholesky( UPPER, ... ), updates A00, A01, A02, A12 and A22, and finally
// overwrites A11 through TriangularInverse followed by Trtrmm.
// NOTE(review): the call-stack tag places this in hpd_inverse, so A is
// presumably an HPD matrix whose upper triangle is replaced by that of its
// inverse -- confirm.
//
// Debug builds (RELEASE undefined) throw std::logic_error if A is not square.
inline void
CholeskyUVar2( Matrix<F>& A )
{
#ifndef RELEASE
    PushCallStack("hpd_inverse::CholeskyUVar2");
    if( A.Height() != A.Width() )
        throw std::logic_error("Nonsquare matrices cannot be triangular");
#endif
    // Quadrant views, followed by the 3x3 repartitioned views
    Matrix<F>
        ATL, ATR,  A00, A01, A02,
        ABL, ABR,  A10, A11, A12,
                   A20, A21, A22;

    // Start with an empty top-left quadrant and grow it block by block
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        //--------------------------------------------------------------------//
        // Factor the diagonal block
        Cholesky( UPPER, A11 );

        // Scale the neighbouring panels by the new factor
        Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, F(1), A11, A01 );
        Trsm( LEFT, UPPER, ADJOINT, NON_UNIT, F(1), A11, A12 );

        // Rank-k style updates of the surrounding blocks
        Herk( UPPER, NORMAL, F(1), A01, F(1), A00 );
        Gemm( NORMAL, NORMAL, F(-1), A01, A12, F(1), A02 );
        Herk( UPPER, ADJOINT, F(-1), A12, F(1), A22 );

        // Second pass of triangular solves on the panels
        Trsm( RIGHT, UPPER, ADJOINT, NON_UNIT, F(1), A11, A01 );
        Trsm( LEFT, UPPER, NORMAL, NON_UNIT, F(-1), A11, A12 );

        // Finish the diagonal block
        TriangularInverse( UPPER, NON_UNIT, A11 );
        Trtrmm( ADJOINT, UPPER, A11 );
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
inline void RLHF( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE CallStackEntry entry("apply_packed_reflectors::RLHF"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> ALeft; Matrix<R> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; LockedView ( HPan, H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); View( ALeft, A, 0, 0, A.Height(), HPanWidth ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonal( RIGHT, offset, HPanCopy, R(1) ); Syrk( UPPER, NORMAL, R(1), HPanCopy, SInv ); HalveMainDiagonal( SInv ); Gemm( NORMAL, TRANSPOSE, R(1), ALeft, HPanCopy, Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( NORMAL, NORMAL, R(-1), Z, HPanCopy, R(1), ALeft ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } }
inline typename Base<F>::type LogDetDivergence ( UpperOrLower uplo, const DistMatrix<F>& A, const DistMatrix<F>& B ) { #ifndef RELEASE PushCallStack("LogDetDivergence"); #endif if( A.Grid() != B.Grid() ) throw std::logic_error("A and B must use the same grid"); if( A.Height() != A.Width() || B.Height() != B.Width() || A.Height() != B.Height() ) throw std::logic_error ("A and B must be square matrices of the same size"); typedef typename Base<F>::type R; const int n = A.Height(); const Grid& g = A.Grid(); DistMatrix<F> ACopy( A ); DistMatrix<F> BCopy( B ); Cholesky( uplo, ACopy ); Cholesky( uplo, BCopy ); if( uplo == LOWER ) { Trtrsm( LEFT, uplo, NORMAL, NON_UNIT, F(1), BCopy, ACopy ); } else { MakeTrapezoidal( LEFT, uplo, 0, ACopy ); Trsm( LEFT, uplo, NORMAL, NON_UNIT, F(1), BCopy, ACopy ); } MakeTrapezoidal( LEFT, uplo, 0, ACopy ); const R frobNorm = Norm( ACopy, FROBENIUS_NORM ); R logDet; R localLogDet(0); DistMatrix<F,MD,STAR> d(g); ACopy.GetDiagonal( d ); if( d.InDiagonal() ) { const int nLocalDiag = d.LocalHeight(); for( int iLocal=0; iLocal<nLocalDiag; ++iLocal ) { const R delta = RealPart(d.GetLocal(iLocal,0)); localLogDet += 2*Log(delta); } } mpi::AllReduce( &localLogDet, &logDet, 1, mpi::SUM, g.VCComm() ); const R logDetDiv = frobNorm*frobNorm - logDet - R(n); #ifndef RELEASE PopCallStack(); #endif return logDetDiv; }
void BackwardMany ( const DistMatrix<F,VC,STAR>& L, DistMatrix<F,VC,STAR>& X, bool conjugate=false ) { // TODO: Replace this with modified inline code? const Orientation orientation = ( conjugate ? ADJOINT : TRANSPOSE ); Trsm( LEFT, LOWER, orientation, UNIT, F(1), L, X, false, TRSM_SMALL ); }
// Blocked in-place sweep over the square matrix U, moving from the
// bottom-right corner up the diagonal. For each diagonal block U11 the panels
// U01 and U12 are solved against U11, U02 is updated with their product, and
// U11 itself is handed to TriangularInverseUVar3Unb.
// NOTE(review): by its name this inverts an upper-triangular matrix in place
// -- confirm.
//
// `diag` is forwarded unchanged to every triangular kernel.
// Debug builds (RELEASE undefined) throw std::logic_error if U is not square.
inline void
TriangularInverseUVar3( UnitOrNonUnit diag, Matrix<F>& U )
{
#ifndef RELEASE
    PushCallStack("internal::TriangularInverseUVar3");
    if( U.Height() != U.Width() )
        throw std::logic_error("Nonsquare matrices cannot be triangular");
#endif
    // Quadrant views, followed by the 3x3 repartitioned views
    Matrix<F>
        UTL, UTR,  U00, U01, U02,
        UBL, UBR,  U10, U11, U12,
                   U20, U21, U22;

    // Start with an empty bottom-right quadrant and grow it block by block
    PartitionUpDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    while( UBR.Height() < U.Height() )
    {
        RepartitionUpDiagonal
        ( UTL, /**/ UTR,  U00, U01, /**/ U02,
               /**/       U10, U11, /**/ U12,
         /*************/ /******************/
          UBL, /**/ UBR,  U20, U21, /**/ U22 );

        //--------------------------------------------------------------------//
        // U01 := -U01 inv(U11)
        Trsm( RIGHT, UPPER, NORMAL, diag, F(-1), U11, U01 );
        // U02 := U02 + U01 U12
        Gemm( NORMAL, NORMAL, F(1), U01, U12, F(1), U02 );
        // U12 := inv(U11) U12
        Trsm( LEFT, UPPER, NORMAL, diag, F(1), U11, U12 );
        // Finish the diagonal block with the unblocked kernel
        TriangularInverseUVar3Unb( diag, U11 );
        //--------------------------------------------------------------------//

        SlidePartitionUpDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
// Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; }
// Blocked in-place sweep down the diagonal of A (A need not be square: the
// loop stops once either dimension is exhausted). Each step factors the
// diagonal block with internal::LUUnb, solves the panels below and to the
// right against it, and updates the trailing block A22.
// No pivoting calls appear in this routine.
inline void
LU( Matrix<F>& A )
{
#ifndef RELEASE
    PushCallStack("LU");
#endif
    // Quadrant views, followed by the 3x3 repartitioned views
    Matrix<F>
        ATL, ATR,  A00, A01, A02,
        ABL, ABR,  A10, A11, A12,
                   A20, A21, A22;

    // Start with an empty top-left quadrant and grow it block by block
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    while( ATL.Height() < A.Height() && ATL.Width() < A.Width() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        //--------------------------------------------------------------------//
        // Factor the diagonal block
        internal::LUUnb( A11 );
        // A21 := A21 inv(triu(A11))
        Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, F(1), A11, A21 );
        // A12 := inv(unit tril(A11)) A12
        Trsm( LEFT, LOWER, NORMAL, UNIT, F(1), A11, A12 );
        // A22 := A22 - A21 A12
        Gemm( NORMAL, NORMAL, F(-1), A21, A12, F(1), A22 );
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
inline void SolveAfterLU( Orientation orientation, const Matrix<F>& A, Matrix<F>& B ) { #ifndef RELEASE PushCallStack("SolveAfterLU"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( A.Height() != B.Height() ) throw std::logic_error("A and B must be the same height"); #endif if( B.Width() == 1 ) { if( orientation == NORMAL ) { Trsv( LOWER, NORMAL, UNIT, A, B ); Trsv( UPPER, NORMAL, NON_UNIT, A, B ); } else { Trsv( UPPER, orientation, NON_UNIT, A, B ); Trsv( LOWER, orientation, UNIT, A, B ); } } else { if( orientation == NORMAL ) { Trsm( LEFT, LOWER, NORMAL, UNIT, F(1), A, B ); Trsm( LEFT, UPPER, NORMAL, NON_UNIT, F(1), A, B ); } else { Trsm( LEFT, UPPER, orientation, NON_UNIT, F(1), A, B ); Trsm( LEFT, LOWER, orientation, UNIT, F(1), A, B ); } } #ifndef RELEASE PopCallStack(); #endif }
// Runs RowEchelon on the pair (A,B) and then overwrites B by solving against
// the non-unit upper triangle of A. Both A and B are modified.
// A single-column B is solved with Trsv, wider right-hand sides with Trsm.
//
// Debug builds (RELEASE undefined) raise LogicError if A is not square or if
// A and B differ in height.
inline void
GaussianElimination( Matrix<F>& A, Matrix<F>& B )
{
#ifndef RELEASE
    CallStackEntry entry("GaussianElimination");
    if( A.Height() != A.Width() )
        LogicError("A must be square");
    if( A.Height() != B.Height() )
        LogicError("A and B must be the same height");
#endif
    // Forward phase
    RowEchelon( A, B );

    // Back substitution
    if( B.Width() != 1 )
    {
        Trsm( LEFT, UPPER, NORMAL, NON_UNIT, F(1), A, B );
    }
    else
    {
        Trsv( UPPER, NORMAL, NON_UNIT, A, B );
    }
}
// Returns  ||T||_F^2 - logDet - n  for two n x n matrices, where
//   * copies of A and B are each factored with Cholesky( uplo, ... ),
//   * T is the `uplo` trapezoid of the factor of A after a left solve against
//     the factor of B (Trtrsm for LOWER; MakeTrapezoidal + Trsm otherwise),
//   * logDet is the sum over the diagonal of T of 2*Log(RealPart(entry)).
// A and B themselves are not modified.
//
// Throws std::logic_error (in every build) if A and B are not square matrices
// of the same size.
inline typename Base<F>::type
LogDetDivergence( UpperOrLower uplo, const Matrix<F>& A, const Matrix<F>& B )
{
#ifndef RELEASE
    PushCallStack("LogDetDivergence");
#endif
    if( A.Height() != A.Width() || B.Height() != B.Width() ||
        A.Height() != B.Height() )
        throw std::logic_error
        ("A and B must be square matrices of the same size");

    typedef typename Base<F>::type R;
    const int n = A.Height();

    // Factor private copies so the inputs stay untouched
    Matrix<F> ACopy( A );
    Matrix<F> BCopy( B );
    Cholesky( uplo, ACopy );
    Cholesky( uplo, BCopy );

    // ACopy := inv(factor of B) * (factor of A)
    if( uplo != LOWER )
    {
        MakeTrapezoidal( LEFT, uplo, 0, ACopy );
        Trsm( LEFT, uplo, NORMAL, NON_UNIT, F(1), BCopy, ACopy );
    }
    else
    {
        Trtrsm( LEFT, uplo, NORMAL, NON_UNIT, F(1), BCopy, ACopy );
    }
    MakeTrapezoidal( LEFT, uplo, 0, ACopy );
    const R frobNorm = Norm( ACopy, FROBENIUS_NORM );

    // Accumulate 2*log of each diagonal entry
    Matrix<F> d;
    ACopy.GetDiagonal( d );
    R logDet(0);
    for( int j=0; j<n; ++j )
    {
        logDet += 2*Log( RealPart(d.Get(j,0)) );
    }

    const R logDetDiv = frobNorm*frobNorm - logDet - R(n);
#ifndef RELEASE
    PopCallStack();
#endif
    return logDetDiv;
}
// Blocked in-place sweep down the diagonal of the square matrix A. Each step
// first updates the diagonal block A11 and the panel A21 with the already
// processed blocks to their left (A10, A20), then factors A11 with
// CholeskyLVar3Unb and solves A21 against the adjoint of its lower triangle.
// NOTE(review): by its name this is a lower Cholesky factorization variant --
// confirm.
//
// Debug builds (RELEASE undefined) throw std::logic_error if A is not square.
inline void
CholeskyLVar2( Matrix<F>& A )
{
#ifndef RELEASE
    PushCallStack("internal::CholeskyLVar2");
    if( A.Height() != A.Width() )
        throw std::logic_error
        ("Can only compute Cholesky factor of square matrices");
#endif
    // Quadrant views, followed by the 3x3 repartitioned views
    Matrix<F>
        ATL, ATR,  A00, A01, A02,
        ABL, ABR,  A10, A11, A12,
                   A20, A21, A22;

    // Start with an empty top-left quadrant and grow it block by block
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        //--------------------------------------------------------------------//
        // A11 := A11 - A10 A10^H
        Herk( LOWER, NORMAL, F(-1), A10, F(1), A11 );
        // Factor the diagonal block
        CholeskyLVar3Unb( A11 );
        // A21 := A21 - A20 A10^H
        Gemm( NORMAL, ADJOINT, F(-1), A20, A10, F(1), A21 );
        // A21 := A21 inv(tril(A11))^H
        Trsm( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), A11, A21 );
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
inline void BusingerGolub ( AbstractDistMatrix<F>& APre, DistPermutation& Omega, AbstractDistMatrix<F>& Z, const QRCtrl<Base<F>>& ctrl ) { EL_DEBUG_CSE typedef Base<F> Real; DistMatrixReadWriteProxy<F,F,MC,MR> AProx( APre ); auto& A = AProx.Get(); auto ctrlCopy = ctrl; const Int m = A.Height(); const Int n = A.Width(); const Real eps = limits::Epsilon<Real>(); // Demand that we will be able to apply inv(R_L) to R_R by ensuring that // the minimum singular value is sufficiently (relatively) large ctrlCopy.adaptive = true; if( ctrl.boundRank ) { ctrlCopy.tol = Max(ctrl.tol,eps*ctrl.maxRank); } else { ctrlCopy.tol = Max(ctrl.tol,eps*Min(m,n)); } // Perform an adaptive pivoted QR factorization DistMatrix<F,MD,STAR> householderScalars(A.Grid()); DistMatrix<Base<F>,MD,STAR> signature(A.Grid()); QR( A, householderScalars, signature, Omega, ctrlCopy ); const Int numSteps = householderScalars.Height(); auto RL = A( IR(0,numSteps), IR(0,numSteps) ); auto RR = A( IR(0,numSteps), IR(numSteps,n) ); Copy( RR, Z ); Trsm( LEFT, UPPER, NORMAL, NON_UNIT, F(1), RL, Z ); }
// Blocked in-place sweep down the diagonal of the square matrix A. Each step
// factors the diagonal block A11 with cholesky::UVar3Unb, solves the panel A12
// to its right against the adjoint of A11's upper triangle, and applies the
// resulting rank-k downdate to the trailing block A22.
// NOTE(review): by its name this is an upper Cholesky factorization variant --
// confirm.
//
// Debug builds (RELEASE undefined) throw std::logic_error if A is not square.
inline void
UVar3( Matrix<F>& A )
{
#ifndef RELEASE
    CallStackEntry entry("cholesky::UVar3");
    if( A.Height() != A.Width() )
        throw std::logic_error
        ("Can only compute Cholesky factor of square matrices");
#endif
    // Quadrant views, followed by the 3x3 repartitioned views
    Matrix<F>
        ATL, ATR,  A00, A01, A02,
        ABL, ABR,  A10, A11, A12,
                   A20, A21, A22;

    // Start with everything in the bottom-right quadrant and shrink it
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    while( ABR.Height() > 0 )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        //--------------------------------------------------------------------//
        // Factor the diagonal block
        cholesky::UVar3Unb( A11 );
        // A12 := inv(triu(A11))^H A12
        Trsm( LEFT, UPPER, ADJOINT, NON_UNIT, F(1), A11, A12 );
        // A22 := A22 - A12^H A12
        Herk( UPPER, ADJOINT, F(-1), A12, F(1), A22 );
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );
    }
}
inline void BusingerGolub ( Matrix<F>& A, Permutation& Omega, Matrix<F>& Z, const QRCtrl<Base<F>>& ctrl ) { EL_DEBUG_CSE typedef Base<F> Real; auto ctrlCopy = ctrl; const Int m = A.Height(); const Int n = A.Width(); const Real eps = limits::Epsilon<Real>(); // Demand that we will be able to apply inv(R_L) to R_R by ensuring that // the minimum singular value is sufficiently (relatively) large ctrlCopy.adaptive = true; if( ctrl.boundRank ) { ctrlCopy.tol = Max(ctrl.tol,eps*ctrl.maxRank); } else { ctrlCopy.tol = Max(ctrl.tol,eps*Min(m,n)); } // Perform the pivoted QR factorization Matrix<F> householderScalars; Matrix<Base<F>> signature; QR( A, householderScalars, signature, Omega, ctrlCopy ); const Int numSteps = householderScalars.Height(); // Now form a minimizer of || RL Z - RR ||_2 via pseudo triangular solves auto RL = A( IR(0,numSteps), IR(0,numSteps) ); auto RR = A( IR(0,numSteps), IR(numSteps,n) ); Z = RR; Trsm( LEFT, UPPER, NORMAL, NON_UNIT, F(1), RL, Z ); }
// Blocked in-place update of X from the lower-triangular matrix L.
//
// The lower trapezoid of X is first scaled by alpha. Then, for each diagonal
// block of at most Blocksize() rows:
//   * the block row of X left of the diagonal (X10) is solved against L11,
//   * the diagonal block X11 is handled by trstrm::LLNUnb,
//   * the blocks of X below (X20, X21) are updated with L21 times the results,
//     using only the lower trapezoid of X11 for the X21 update.
// `checkIfSingular` is forwarded to Trsm.
// NOTE(review): by its name this is the Left / Lower / Normal case of a
// triangular solve with a triangular right-hand side -- confirm.
void LLN
( UnitOrNonUnit diag,
  F alpha,
  const Matrix<F>& L,
        Matrix<F>& X,
  bool checkIfSingular=true )
{
    DEBUG_CSE
    const Int n = L.Height();
    const Int blockSize = Blocksize();

    // Holds the lower trapezoid of the current diagonal block of X
    Matrix<F> Z11;

    ScaleTrapezoid( alpha, LOWER, X );
    for( Int start=0; start<n; start+=blockSize )
    {
        const Int width = Min(blockSize,n-start);
        const Int stop = start + width;

        const Range<Int> ind0( 0, start );
        const Range<Int> ind1( start, stop );
        const Range<Int> ind2( stop, n );

        auto L11 = L( ind1, ind1 );
        auto L21 = L( ind2, ind1 );

        auto X10 = X( ind1, ind0 );
        auto X11 = X( ind1, ind1 );
        auto X20 = X( ind2, ind0 );
        auto X21 = X( ind2, ind1 );

        // X10 := inv(L11) X10
        Trsm( LEFT, LOWER, NORMAL, diag, F(1), L11, X10, checkIfSingular );
        // Diagonal block: triangular-triangular solve
        trstrm::LLNUnb( diag, F(1), L11, X11 );
        // X20 := X20 - L21 X10
        Gemm( NORMAL, NORMAL, F(-1), L21, X10, F(1), X20 );
        // X21 := X21 - L21 tril(X11)
        Z11 = X11;
        MakeTrapezoidal( LOWER, Z11 );
        Gemm( NORMAL, NORMAL, F(-1), L21, Z11, F(1), X21 );
    }
}
inline void ApplyPackedReflectorsLUVF ( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUVF"); if( offset < 0 || offset > H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> AT, A0, ATop, AB, A1, A2; Matrix<R> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanHeight = H01.Height() + H11.Height(); const int HPanOffset = std::min( H11.Width(), std::max(offset-H00.Width(),0) ); const int HPanWidth = H11.Width()-HPanOffset; HPan.LockedView( H, 0, H00.Width()+HPanOffset, HPanHeight, HPanWidth ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); ATop.View2x1( A0, A1 ); Zeros( HPan.Width(), ATop.Width(), Z ); Zeros( HPan.Width(), HPan.Width(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, UPPER, offset, HPanCopy ); SetDiagonalToOne( RIGHT, offset, HPanCopy ); Syrk( LOWER, TRANSPOSE, R(1), HPanCopy, R(0), SInv ); HalveMainDiagonal( SInv ); Gemm( TRANSPOSE, NORMAL, R(1), HPanCopy, ATop, R(0), Z ); Trsm( LEFT, LOWER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( NORMAL, NORMAL, R(-1), HPanCopy, Z, R(1), ATop ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } #ifndef RELEASE 
PopCallStack(); #endif }
inline void ApplyPackedReflectorsLUVF ( Conjugation conjugation, int offset, const Matrix<Complex<R> >& H, const Matrix<Complex<R> >& t, Matrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUVF"); if( offset < 0 || offset > H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); #endif typedef Complex<R> C; Matrix<C> HTL, HTR, H00, H01, H02, HPan, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<C> AT, A0, ATop, AB, A1, A2; Matrix<C> tT, t0, tB, t1, t2; Matrix<C> HPanCopy; Matrix<C> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanHeight = H01.Height() + H11.Height(); const int HPanOffset = std::min( H11.Width(), std::max(offset-H00.Width(),0) ); const int HPanWidth = H11.Width()-HPanOffset; HPan.LockedView( H, 0, H00.Width()+HPanOffset, HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanWidth ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); ATop.View2x1( A0, A1 ); Zeros( HPan.Width(), ATop.Width(), Z ); Zeros( HPan.Width(), HPan.Width(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, UPPER, offset, HPanCopy ); SetDiagonalToOne( RIGHT, offset, HPanCopy ); Herk( LOWER, ADJOINT, C(1), HPanCopy, C(0), SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( ADJOINT, NORMAL, C(1), HPanCopy, ATop, C(0), Z ); Trsm( LEFT, LOWER, NORMAL, NON_UNIT, C(1), SInv, Z ); Gemm( 
NORMAL, NORMAL, C(-1), HPanCopy, Z, C(1), ATop ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TwoSidedTrsmUVar1( UnitOrNonUnit diag, Matrix<F>& A, const Matrix<F>& U ) { #ifndef RELEASE CallStackEntry entry("internal::TwoSidedTrsmUVar1"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( U.Height() != U.Width() ) LogicError("Triangular matrices must be square"); if( A.Height() != U.Height() ) LogicError("A and U must be the same size"); #endif // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<F> UTL, UTR, U00, U01, U02, UBL, UBR, U10, U11, U12, U20, U21, U22; // Temporary products Matrix<F> Y01; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); //--------------------------------------------------------------------// // Y01 := A00 U01 Zeros( Y01, A01.Height(), A01.Width() ); Hemm( LEFT, UPPER, F(1), A00, U01, F(0), Y01 ); // A01 := inv(U00)' A01 Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U00, A01 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A11 := A11 - (U01' A01 + A01' U01) Her2k( UPPER, ADJOINT, F(-1), U01, A01, F(1), A11 ); // A11 := inv(U11)' A11 inv(U11) TwoSidedTrsmUUnb( diag, A11, U11 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A01 := A01 inv(U11) Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U11, A01 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, 
/*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } }
inline void TwoSidedTrsmUVar1 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U ) { #ifndef RELEASE CallStackEntry entry("internal::TwoSidedTrsmUVar1"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( U.Height() != U.Width() ) LogicError("Triangular matrices must be square"); if( A.Height() != U.Height() ) LogicError("A and U must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,VC, STAR> A01_VC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,VC, STAR> U01_VC_STAR(g); DistMatrix<F,VR, STAR> U01_VR_STAR(g); DistMatrix<F,STAR,MR > U01Adj_STAR_MR(g); DistMatrix<F,STAR,STAR> X11_STAR_STAR(g); DistMatrix<F,MR, MC > Z01_MR_MC(g); DistMatrix<F,MC, STAR> Z01_MC_STAR(g); DistMatrix<F,MR, STAR> Z01_MR_STAR(g); DistMatrix<F> Y01(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); A01_VC_STAR.AlignWith( A01 ); U01_MC_STAR.AlignWith( A00 ); U01_VR_STAR.AlignWith( A00 ); U01_VC_STAR.AlignWith( A00 ); U01Adj_STAR_MR.AlignWith( A00 ); Y01.AlignWith( A01 ); Z01_MR_MC.AlignWith( A01 ); Z01_MC_STAR.AlignWith( A00 ); Z01_MR_STAR.AlignWith( A00 ); //--------------------------------------------------------------------// // Y01 := A00 U01 U01_MC_STAR = 
U01; U01_VR_STAR = U01_MC_STAR; U01Adj_STAR_MR.AdjointFrom( U01_VR_STAR ); Zeros( Z01_MC_STAR, A01.Height(), A01.Width() ); Zeros( Z01_MR_STAR, A01.Height(), A01.Width() ); LocalSymmetricAccumulateLU ( ADJOINT, F(1), A00, U01_MC_STAR, U01Adj_STAR_MR, Z01_MC_STAR, Z01_MR_STAR ); Z01_MR_MC.SumScatterFrom( Z01_MR_STAR ); Y01 = Z01_MR_MC; Y01.SumScatterUpdate( F(1), Z01_MC_STAR ); // A01 := inv(U00)' A01 // // This is the bottleneck because A01 only has blocksize columns Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U00, A01 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A11 := A11 - (U01' A01 + A01' U01) A01_VC_STAR = A01; U01_VC_STAR = U01_MC_STAR; Zeros( X11_STAR_STAR, A11.Height(), A11.Width() ); Her2k ( UPPER, ADJOINT, F(-1), A01_VC_STAR.Matrix(), U01_VC_STAR.Matrix(), F(0), X11_STAR_STAR.Matrix() ); A11.SumScatterUpdate( F(1), X11_STAR_STAR ); // A11 := inv(U11)' A11 inv(U11) A11_STAR_STAR = A11; U11_STAR_STAR = U11; LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR ); A11 = A11_STAR_STAR; // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A01 := A01 inv(U11) A01_VC_STAR = A01; LocalTrsm ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, A01_VC_STAR ); A01 = A01_VC_STAR; //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } }
inline void RUVF ( Conjugation conjugation, Int offset, const Matrix<F>& H, const Matrix<F>& t, Matrix<F>& A ) { #ifndef RELEASE CallStackEntry cse("apply_packed_reflectors::RUVF"); // TODO: Proper dimension checks if( t.Height() != H.DiagonalLength(offset) ) LogicError("t must be the same length as H's offset diag"); #endif Matrix<F> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<F> ALeft; Matrix<F> tT, t0, tB, t1, t2; Matrix<F> SInv, Z; LockedPartitionDownOffsetDiagonal ( offset, H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2 ); LockedView2x1( HPan, H01, H11 ); View( ALeft, A, 0, 0, A.Height(), HPan.Height() ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( UPPER, HPanCopy, 0, RIGHT ); SetDiagonal( HPanCopy, F(1), 0, RIGHT ); Herk( UPPER, ADJOINT, F(1), HPanCopy, SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( NORMAL, NORMAL, F(1), ALeft, HPanCopy, Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, F(1), SInv, Z ); Gemm( NORMAL, ADJOINT, F(-1), Z, HPanCopy, F(1), ALeft ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); } }
inline void RLHF ( Conjugation conjugation, int offset, const Matrix<Complex<R> >& H, const Matrix<Complex<R> >& t, Matrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLHF"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); #endif typedef Complex<R> C; Matrix<C> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<C> ALeft; Matrix<C> tT, t0, tB, t1, t2; Matrix<C> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; LockedView ( HPan, H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanHeight ); View( ALeft, A, 0, 0, A.Height(), HPanWidth ); Zeros( ALeft.Height(), HPan.Height(), Z ); Zeros( HPan.Height(), HPan.Height(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonal( RIGHT, offset, HPanCopy, C(1) ); Herk( UPPER, NORMAL, C(1), HPanCopy, C(0), SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( NORMAL, ADJOINT, C(1), ALeft, HPanCopy, C(0), Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, C(1), SInv, Z ); Gemm( NORMAL, NORMAL, C(-1), Z, HPanCopy, C(1), ALeft ); 
//--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TwoSidedTrsmUVar5( UnitOrNonUnit diag, Matrix<F>& A, const Matrix<F>& U ) { #ifndef RELEASE PushCallStack("internal::TwoSidedTrsmUVar5"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( U.Height() != U.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != U.Height() ) throw std::logic_error("A and U must be the same size"); #endif // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<F> UTL, UTR, U00, U01, U02, UBL, UBR, U10, U11, U12, U20, U21, U22; // Temporary products Matrix<F> Y12; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); //--------------------------------------------------------------------// // A11 := inv(U11)' A11 inv(U11) TwoSidedTrsmUUnb( diag, A11, U11 ); // Y12 := A11 U12 Zeros( A12.Height(), A12.Width(), Y12 ); Hemm( LEFT, UPPER, F(1), A11, U12, F(0), Y12 ); // A12 := inv(U11)' A12 Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U11, A12 ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A22 := A22 - (A12' U12 + U12' A12) Her2k( UPPER, ADJOINT, F(-1), A12, U12, F(1), A22 ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A12 := A12 inv(U22) Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U22, A12 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ 
U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TwoSidedTrsmUVar5 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U ) { #ifndef RELEASE PushCallStack("internal::TwoSidedTrsmUVar5"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( U.Height() != U.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != U.Height() ) throw std::logic_error("A and U must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,MC > A12_STAR_MC(g); DistMatrix<F,STAR,MR > A12_STAR_MR(g); DistMatrix<F,STAR,VC > A12_STAR_VC(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MC > U12_STAR_MC(g); DistMatrix<F,STAR,MR > U12_STAR_MR(g); DistMatrix<F,STAR,VC > U12_STAR_VC(g); DistMatrix<F,STAR,VR > U12_STAR_VR(g); DistMatrix<F,STAR,VR > Y12_STAR_VR(g); DistMatrix<F> Y12(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); A12_STAR_MC.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); A12_STAR_VC.AlignWith( A22 ); A12_STAR_VR.AlignWith( A22 ); U12_STAR_MC.AlignWith( A22 ); U12_STAR_MR.AlignWith( A22 ); U12_STAR_VC.AlignWith( A22 ); U12_STAR_VR.AlignWith( A22 ); Y12.AlignWith( A12 ); Y12_STAR_VR.AlignWith( A12 ); 
//--------------------------------------------------------------------// // A11 := inv(U11)' A11 inv(U11) U11_STAR_STAR = U11; A11_STAR_STAR = A11; LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR ); A11 = A11_STAR_STAR; // Y12 := A11 U12 U12_STAR_VR = U12; Y12_STAR_VR.ResizeTo( A12.Height(), A12.Width() ); Hemm ( LEFT, UPPER, F(1), A11_STAR_STAR.LocalMatrix(), U12_STAR_VR.LocalMatrix(), F(0), Y12_STAR_VR.LocalMatrix() ); Y12 = Y12_STAR_VR; // A12 := inv(U11)' A12 A12_STAR_VR = A12; LocalTrsm ( LEFT, UPPER, ADJOINT, diag, F(1), U11_STAR_STAR, A12_STAR_VR ); A12 = A12_STAR_VR; // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A22 := A22 - (A12' U12 + U12' A12) A12_STAR_VR = A12; A12_STAR_VC = A12_STAR_VR; U12_STAR_VC = U12_STAR_VR; A12_STAR_MC = A12_STAR_VC; U12_STAR_MC = U12_STAR_VC; A12_STAR_MR = A12_STAR_VR; U12_STAR_MR = U12_STAR_VR; LocalTrr2k ( UPPER, ADJOINT, ADJOINT, F(-1), U12_STAR_MC, A12_STAR_MR, A12_STAR_MC, U12_STAR_MR, F(1), A22 ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A12 := A12 inv(U22) // // This is the bottleneck because A12 only has blocksize rows Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U22, A12 ); //--------------------------------------------------------------------// A12_STAR_MC.FreeAlignments(); A12_STAR_MR.FreeAlignments(); A12_STAR_VC.FreeAlignments(); A12_STAR_VR.FreeAlignments(); U12_STAR_MC.FreeAlignments(); U12_STAR_MR.FreeAlignments(); U12_STAR_VC.FreeAlignments(); U12_STAR_VR.FreeAlignments(); Y12.FreeAlignments(); Y12_STAR_VR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void LU( Matrix<F>& A, Matrix<int>& p ) { #ifndef RELEASE CallStackEntry entry("LU"); if( p.Viewing() && (p.Height() != std::min(A.Height(),A.Width()) || p.Width() != 1) ) throw std::logic_error ("p must be a vector of the same height as the min dimension of A"); #endif if( !p.Viewing() ) p.ResizeTo( std::min(A.Height(),A.Width()), 1 ); // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABRL, ABRR, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<int> pT, p0, pB, p1, p2; // Pivot composition std::vector<int> image, preimage; // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDown ( p, pT, pB, 0 ); while( ATL.Height() < A.Height() && ATL.Width() < A.Width() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); RepartitionDown ( pT, p0, /**/ /**/ p1, pB, p2 ); PartitionRight( ABR, ABRL, ABRR, A11.Width() ); const int pivotOffset = A01.Height(); //--------------------------------------------------------------------// lu::Panel( ABRL, p1, pivotOffset ); ComposePivots( p1, pivotOffset, image, preimage ); ApplyRowPivots( ABL, image, preimage ); ApplyRowPivots( ABRR, image, preimage ); Trsm( LEFT, LOWER, NORMAL, UNIT, F(1), A11, A12 ); Gemm( NORMAL, NORMAL, F(-1), A21, A12, F(1), A22 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlidePartitionDown ( pT, p0, p1, /**/ /**/ pB, p2 ); } }
inline void LLVF( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::LLVF"); if( offset > 0 || offset < -H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Height() != A.Height() ) throw std::logic_error ("Height of transforms must equal height of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> AT, A0, AB, A1, A2; Matrix<R> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); int HPanHeight = H11.Height() + H21.Height(); int HPanWidth = std::min( H11.Width(), std::max(HPanHeight+offset,0) ); LockedView( HPan, H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); Zeros( HPanWidth, AB.Width(), Z ); Zeros( HPanWidth, HPanWidth, SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, LOWER, offset, HPanCopy ); SetDiagonal( LEFT, offset, HPanCopy, R(1) ); Syrk( LOWER, TRANSPOSE, R(1), HPanCopy, R(0), SInv ); HalveMainDiagonal( SInv ); Gemm( TRANSPOSE, NORMAL, R(1), HPanCopy, AB, R(0), Z ); Trsm( LEFT, LOWER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( NORMAL, NORMAL, R(-1), HPanCopy, Z, R(1), AB ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } #ifndef RELEASE PopCallStack(); #endif }