inline void RLVB ( Conjugation conjugation, int offset, const Matrix<Complex<R> >& H, const Matrix<Complex<R> >& t, Matrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLVB"); if( offset > 0 || offset < -H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Height() != A.Width() ) throw std::logic_error ("Height of transforms must equal width of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); #endif typedef Complex<R> C; Matrix<C> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<C> ARight; Matrix<C> tT, t0, tB, t1, t2; Matrix<C> SInv, Z; LockedPartitionUpDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionUp ( t, tT, tB, 0 ); while( HBR.Height() < H.Height() && HBR.Width() < H.Width() ) { LockedRepartitionUpDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); const int HPanHeight = H11.Height() + H21.Height(); const int HPanWidth = std::min( H11.Width(), std::max(HPanHeight+offset,0) ); const int leftover = A.Width()-HPanHeight; LockedView( HPan, H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); LockedRepartitionUp ( tT, t0, t1, /**/ /**/ tB, t2, HPanWidth ); View( ARight, A, 0, leftover, A.Height(), HPanHeight ); Zeros( ARight.Height(), HPan.Width(), Z ); Zeros( HPan.Width(), HPan.Width(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, LOWER, offset, HPanCopy ); SetDiagonal( LEFT, offset, HPanCopy, C(1) ); Herk( LOWER, ADJOINT, C(1), HPanCopy, C(0), SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( NORMAL, NORMAL, C(1), ARight, HPanCopy, C(0), Z ); Trsm( RIGHT, LOWER, NORMAL, NON_UNIT, C(1), SInv, Z ); Gemm( NORMAL, ADJOINT, C(-1), Z, HPanCopy, C(1), ARight ); 
//--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); SlideLockedPartitionUp ( tT, t0, /**/ /**/ t1, tB, t2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void RLVB( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLVB"); if( offset > 0 || offset < -H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Height() != A.Width() ) throw std::logic_error ("Height of transforms must equal width of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> ARight; Matrix<R> SInv, Z; LockedPartitionUpDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); while( HBR.Height() < H.Height() && HBR.Width() < H.Width() ) { LockedRepartitionUpDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); const int HPanHeight = H11.Height() + H21.Height(); const int HPanWidth = std::min( H11.Width(), std::max(HPanHeight+offset,0) ); const int leftover = A.Width()-HPanHeight; LockedView( HPan, H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); View( ARight, A, 0, leftover, A.Height(), HPanHeight ); Zeros( ARight.Height(), HPanWidth, Z ); Zeros( HPanWidth, HPanWidth, SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, LOWER, offset, HPanCopy ); SetDiagonal( LEFT, offset, HPanCopy, R(1) ); Syrk( LOWER, TRANSPOSE, R(1), HPanCopy, R(0), SInv ); HalveMainDiagonal( SInv ); Gemm( NORMAL, NORMAL, R(1), ARight, HPanCopy, R(0), Z ); Trsm( RIGHT, LOWER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( NORMAL, TRANSPOSE, R(-1), Z, HPanCopy, R(1), ARight ); //--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void RowEchelon( Matrix<F>& A, Matrix<F>& B ) { #ifndef RELEASE CallStackEntry entry("RowEchelon"); if( A.Height() != B.Height() ) LogicError("A and B must be the same height"); #endif // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, APan, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<F> BT, B0, BB, B1, B2; Matrix<Int> p1; // Pivot composition std::vector<Int> image, preimage; // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDown ( B, BT, BB, 0 ); while( ATL.Height() < A.Height() && ATL.Width() < A.Width() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); RepartitionDown ( BT, B0, /**/ /**/ B1, BB, B2 ); View2x1 ( APan, A12, A22 ); //--------------------------------------------------------------------// lu::Panel( APan, p1, A00.Height() ); ComposePivots( p1, A00.Height(), image, preimage ); ApplyRowPivots( BB, image, preimage ); Trsm( LEFT, LOWER, NORMAL, UNIT, F(1), A11, A12 ); Trsm( LEFT, LOWER, NORMAL, UNIT, F(1), A11, B1 ); Gemm( NORMAL, NORMAL, F(-1), A21, A12, F(1), A22 ); Gemm( NORMAL, NORMAL, F(-1), A21, B1, F(1), B2 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlidePartitionDown ( BT, B0, B1, /**/ /**/ BB, B2 ); } }
// Runs the QDWH-style iteration on A in place and reports iteration counts.
//
// A         : m x n matrix with m >= n (LogicError otherwise); overwritten by
//             the iterate
// sMinUpper : used only to seed the running scalar `ell` as
//             sMinUpper / sqrt(n)
// ctrl      : supplies the iteration cap (maxIts) and the column-pivoting
//             switch forwarded to the QR control structure
//
// Each sweep derives the scalars a, b, c from `ell`, then either
//   * c > 100 : stacks sqrt(c) A on top of an identity, orthogonalises the
//               stack with qr::ExplicitUnitary and recombines the halves, or
//   * else    : forms I + c A^H A, Cholesky-factors it and applies two
//               triangular solves to a copy of A.
// The loop stops once the Frobenius norm of the change in A is at most
// tol^(1/3) and |1 - ell| <= tol (tol = 5 eps), or after ctrl.maxIts sweeps.
//
// Returns the QDWHInfo counters (numIts, numQRIts, numCholIts).
QDWHInfo
QDWHInner( Matrix<F>& A, Base<F> sMinUpper, const QDWHCtrl& ctrl )
{
    EL_DEBUG_CSE
    typedef Base<F> Real;
    typedef Complex<Real> Cpx;

    const Int m = A.Height();
    const Int n = A.Width();
    const Real oneThird = Real(1)/Real(3);
    if( m < n )
        LogicError("Height cannot be less than width");

    QDWHInfo info;

    QRCtrl<Real> qrCtrl;
    qrCtrl.colPiv = ctrl.colPiv;

    const Real eps = limits::Epsilon<Real>();
    const Real tol = 5*eps;
    const Real cubeRootTol = Pow(tol,oneThird);

    // Running scalar that the recurrence drives towards one
    Real ell = sMinUpper / Sqrt(Real(n));

    Matrix<F> ALast, ATemp, C;

    // Work space for the QR branch: top block receives the scaled iterate,
    // bottom block the identity.
    Matrix<F> Q( m+n, n );
    auto QT = Q( IR(0,m  ), ALL );
    auto QB = Q( IR(m,END), ALL );

    while( info.numIts < ctrl.maxIts )
    {
        ALast = A;

        Real ellSq;
        Cpx dd, sqd;
        if( Abs(1-ell) < tol )
        {
            ellSq = 1;
            dd = 0;
            sqd = 1;
        }
        else
        {
            ellSq = ell*ell;
            dd = Pow( 4*(1-ellSq)/(ellSq*ellSq), oneThird );
            sqd = Sqrt( Real(1)+dd );
        }

        const Cpx arg = Real(8) - Real(4)*dd + Real(8)*(2-ellSq)/(ellSq*sqd);
        const Real a = (sqd + Sqrt(arg)/Real(2)).real();
        const Real b = (a-1)*(a-1)/4;
        const Real c = a+b-1;
        const Real alpha = a-b/c;
        const Real beta = b/c;

        ell = ell*(a+b*ellSq)/(1+c*ellSq);

        if( c > 100 )
        {
            // QR-based step
            QT = A;
            QT *= Sqrt(c);
            MakeIdentity( QB );
            qr::ExplicitUnitary( Q, true, qrCtrl );
            Gemm( NORMAL, ADJOINT, F(alpha/Sqrt(c)), QT, QB, F(beta), A );
            ++info.numQRIts;
        }
        else
        {
            // Cholesky-based step (taken when c is small)
            Identity( C, n, n );
            Herk( LOWER, ADJOINT, c, A, Real(1), C );
            Cholesky( LOWER, C );
            ATemp = A;
            Trsm( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), C, ATemp );
            Trsm( RIGHT, LOWER, NORMAL, NON_UNIT, F(1), C, ATemp );
            A *= beta;
            Axpy( alpha, ATemp, A );
            ++info.numCholIts;
        }
        ++info.numIts;

        // Size of the last update
        ALast -= A;
        const Real frobNormADiff = FrobeniusNorm( ALast );

        const bool converged =
            ( frobNormADiff <= cubeRootTol && Abs(1-ell) <= tol );
        if( converged )
            break;
    }
    return info;
}
inline void LUHF ( Conjugation conjugation, Int offset, const Matrix<F>& H, const Matrix<F>& t, Matrix<F>& A ) { #ifndef RELEASE CallStackEntry cse("apply_packed_reflectors::LUHF"); // TODO: Proper dimension checks if( t.Height() != H.DiagonalLength(offset) ) LogicError("t must be the same length as H's offset diag"); #endif Matrix<F> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<F> AT, A0, AB, A1, A2; Matrix<F> tT, t0, tB, t1, t2; Matrix<F> SInv, Z; LockedPartitionDownOffsetDiagonal ( offset, H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2 ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2, H11.Height() ); LockedView1x2( HPan, H11, H12 ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTriangular( UPPER, HPanCopy ); SetDiagonal( HPanCopy, F(1) ); Herk( LOWER, NORMAL, F(1), HPanCopy, SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( NORMAL, NORMAL, F(1), HPanCopy, AB, Z ); Trsm( LEFT, LOWER, NORMAL, NON_UNIT, F(1), SInv, Z ); Gemm( ADJOINT, NORMAL, F(-1), HPanCopy, Z, F(1), AB ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } }
inline void TwoSidedTrsmUVar4( UnitOrNonUnit diag, Matrix<F>& A, const Matrix<F>& U ) { #ifndef RELEASE PushCallStack("internal::TwoSidedTrsmUVar4"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( U.Height() != U.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != U.Height() ) throw std::logic_error("A and U must be the same size"); #endif // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<F> UTL, UTR, U00, U01, U02, UBL, UBR, U10, U11, U12, U20, U21, U22; // Temporary products Matrix<F> Y12; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); //--------------------------------------------------------------------// // A01 := A01 inv(U11) Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U11, A01 ); // A11 := inv(U11)' A11 inv(U11) TwoSidedTrsmUUnb( diag, A11, U11 ); // A02 := A02 - A01 U12 Gemm( NORMAL, NORMAL, F(-1), A01, U12, F(1), A02 ); // Y12 := A11 U12 Zeros( A12.Height(), A12.Width(), Y12 ); Hemm( LEFT, UPPER, F(1), A11, U12, F(0), Y12 ); // A12 := inv(U11)' A12 Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U11, A12 ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A22 := A22 - (A12' U12 + U12' A12) Her2k( UPPER, ADJOINT, F(-1), A12, U12, F(1), A22 ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); 
SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /**********************************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrtrsmLLN ( UnitOrNonUnit diag, F alpha, const Matrix<F>& L, Matrix<F>& X, bool checkIfSingular=true ) { #ifndef RELEASE CallStackEntry entry("internal::TrtrsmLLN"); #endif // Matrix views Matrix<F> LTL, LTR, L00, L01, L02, LBL, LBR, L10, L11, L12, L20, L21, L22; Matrix<F> XTL, XTR, X00, X01, X02, XBL, XBR, X10, X11, X12, X20, X21, X22; Matrix<F> Z11; // Start the algorithm ScaleTrapezoid( alpha, LOWER, X ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionDownDiagonal ( X, XTL, XTR, XBL, XBR, 0 ); while( XBR.Height() > 0 ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); RepartitionDownDiagonal ( XTL, /**/ XTR, X00, /**/ X01, X02, /*************/ /******************/ /**/ X10, /**/ X11, X12, XBL, /**/ XBR, X20, /**/ X21, X22 ); //--------------------------------------------------------------------// Trsm( LEFT, LOWER, NORMAL, diag, F(1), L11, X10, checkIfSingular ); TrtrsmLLNUnb( diag, F(1), L11, X11 ); Gemm( NORMAL, NORMAL, F(-1), L21, X10, F(1), X20 ); Z11 = X11; MakeTriangular( LOWER, Z11 ); Gemm( NORMAL, NORMAL, F(-1), L21, Z11, F(1), X21 ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlidePartitionDownDiagonal ( XTL, /**/ XTR, X00, X01, /**/ X02, /**/ X10, X11, /**/ X12, /*************/ /******************/ XBL, /**/ XBR, X20, X21, /**/ X22 ); } }
void TransformColumns ( const Matrix<F>& V, DistMatrix<F,MC,MR,BLOCK>& A ) { DEBUG_CSE const Int width = A.Width(); const Grid& grid = A.Grid(); const Int blockWidth = A.BlockWidth(); const Int firstBlockWidth = blockWidth - A.RowCut(); if( width <= firstBlockWidth || grid.Width() == 1 ) { if( grid.Col() == A.ColOwner(0) ) { // This process row can locally update its portion of A TransformColumns( V, A.Matrix() ); } } else if( width <= firstBlockWidth + blockWidth ) { const int firstCol = A.ColOwner( 0 ); const int secondCol = A.ColOwner( firstBlockWidth ); if( grid.Col() == firstCol ) { // // Replace A with // // | ALeft, ARight | | VLeft, VRight |, // // where ALeft is owned by this process column and ARight by the // next. // // Partition space for the combined matrix Matrix<F> ACombine( A.LocalHeight(), width ); auto ALeft = ACombine( ALL, IR(0,firstBlockWidth) ); auto ARight = ACombine( ALL, IR(firstBlockWidth,END) ); // Copy our portion into the combined matrix ALeft = A.LockedMatrix(); // Exchange the data El::SendRecv( ALeft, ARight, A.RowComm(), secondCol, secondCol ); // Form our portion of the result auto VLeft = V( ALL, IR(0,firstBlockWidth) ); Gemm( NORMAL, NORMAL, F(1), ACombine, VLeft, A.Matrix() ); } else if( grid.Col() == secondCol ) { // // Replace A with // // | ALeft, ARight | | VLeft, VRight |, // // where ALeft is owned by the previous process column and ARight // by this one. // // Partition space for the combined matrix Matrix<F> ACombine( A.LocalHeight(), width ); auto ALeft = ACombine( ALL, IR(0,firstBlockWidth) ); auto ARight = ACombine( ALL, IR(firstBlockWidth,END) ); // Copy our portion into the combined matrix ARight = A.LockedMatrix(); // Exchange the data El::SendRecv( ARight, ALeft, A.RowComm(), firstCol, firstCol ); // Form our portion of the result auto VRight = V( ALL, IR(firstBlockWidth,END) ); Gemm( NORMAL, NORMAL, F(1), ACombine, VRight, A.Matrix() ); } } else { // Fall back to the entire process column interacting. 
// TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,MC,STAR,BLOCK> A_MC_STAR( A ); Matrix<F> ALocCopy( A_MC_STAR.Matrix() ); Gemm( NORMAL, NORMAL, F(1), ALocCopy, V, A_MC_STAR.Matrix() ); A = A_MC_STAR; } }
inline void TwoSidedTrsmUVar2( UnitOrNonUnit diag, Matrix<F>& A, const Matrix<F>& U ) { #ifndef RELEASE CallStackEntry entry("internal::TwoSidedTrsmUVar2"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( U.Height() != U.Width() ) LogicError("Triangular matrices must be square"); if( A.Height() != U.Height() ) LogicError("A and U must be the same size"); #endif // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<F> UTL, UTR, U00, U01, U02, UBL, UBR, U10, U11, U12, U20, U21, U22; // Temporary products Matrix<F> Y01; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); //--------------------------------------------------------------------// // Y01 := A00 U01 Zeros( Y01, A01.Height(), A01.Width() ); Hemm( LEFT, UPPER, F(1), A00, U01, F(0), Y01 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A11 := A11 - (U01' A01 + A01' U01) Her2k( UPPER, ADJOINT, F(-1), U01, A01, F(1), A11 ); // A11 := inv(U11)' A11 inv(U11) TwoSidedTrsmUUnb( diag, A11, U11 ); // A12 := A12 - A02' U01 Gemm( ADJOINT, NORMAL, F(-1), A02, U01, F(1), A12 ); // A12 := inv(U11)' A12 Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U11, A12 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A01 := A01 inv(U11) Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U11, A01 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); 
SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } }
inline void ApplyPackedReflectorsRLHF ( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsRLHF"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> ALeft; Matrix<R> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; HPan.LockedView( H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); ALeft.View( A, 0, 0, A.Height(), HPanWidth ); Zeros( ALeft.Height(), HPan.Height(), Z ); Zeros( HPan.Height(), HPan.Height(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonalToOne( RIGHT, offset, HPanCopy ); Syrk( UPPER, NORMAL, R(1), HPanCopy, R(0), SInv ); HalveMainDiagonal( SInv ); Gemm( NORMAL, TRANSPOSE, R(1), ALeft, HPanCopy, R(0), Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( NORMAL, NORMAL, R(-1), Z, HPanCopy, R(1), ALeft ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } #ifndef RELEASE PopCallStack(); #endif }
void Tikhonov ( Orientation orientation, const ElementalMatrix<F>& APre, const ElementalMatrix<F>& BPre, const ElementalMatrix<F>& G, ElementalMatrix<F>& XPre, TikhonovAlg alg ) { DEBUG_CSE DistMatrixReadProxy<F,F,MC,MR> AProx( APre ), BProx( BPre ); DistMatrixWriteProxy<F,F,MC,MR> XProx( XPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& X = XProx.Get(); const bool normal = ( orientation==NORMAL ); const Int m = ( normal ? A.Height() : A.Width() ); const Int n = ( normal ? A.Width() : A.Height() ); if( G.Width() != n ) LogicError("Tikhonov matrix was the wrong width"); if( orientation == TRANSPOSE && IsComplex<F>::value ) LogicError("Transpose version of complex Tikhonov not yet supported"); if( m >= n ) { DistMatrix<F> Z(A.Grid()); if( alg == TIKHONOV_CHOLESKY ) { if( orientation == NORMAL ) Herk( LOWER, ADJOINT, Base<F>(1), A, Z ); else Herk( LOWER, NORMAL, Base<F>(1), A, Z ); Herk( LOWER, ADJOINT, Base<F>(1), G, Base<F>(1), Z ); Cholesky( LOWER, Z ); } else { const Int mG = G.Height(); Zeros( Z, m+mG, n ); auto ZT = Z( IR(0,m), IR(0,n) ); auto ZB = Z( IR(m,m+mG), IR(0,n) ); if( orientation == NORMAL ) ZT = A; else Adjoint( A, ZT ); ZB = G; qr::ExplicitTriang( Z ); } if( orientation == NORMAL ) Gemm( ADJOINT, NORMAL, F(1), A, B, X ); else Gemm( NORMAL, NORMAL, F(1), A, B, X ); cholesky::SolveAfter( LOWER, NORMAL, Z, X ); } else { LogicError("This case not yet supported"); } }
inline void Cannon_NN ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE CallStackEntry entry("gemm::Cannon_NN"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) LogicError("{A,B,C} must have the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal matrices: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; LogicError( msg.str() ); } #endif const Grid& g = A.Grid(); if( g.Height() != g.Width() ) LogicError("Process grid must be square for Cannon's"); if( C.ColAlignment() != A.ColAlignment() || C.RowAlignment() != B.RowAlignment() ) LogicError("C is not properly aligned"); const Int row = g.Row(); const Int col = g.Col(); const Int pSqrt = g.Height(); mpi::Comm rowComm = g.RowComm(); mpi::Comm colComm = g.ColComm(); if( A.Width() % pSqrt != 0 ) LogicError("For now, width(A) must be integer multiple of sqrt(p)"); // Begin by scaling our local portion of C Scale( beta, C ); // Load the initial A and B packages (may want to transpose B...) 
const Int localHeightA = A.LocalHeight(); const Int localHeightB = B.LocalHeight(); const Int localWidthA = A.LocalWidth(); const Int localWidthB = B.LocalWidth(); Matrix<T> pkgA(localHeightA,localWidthA,localHeightA), pkgB(localHeightB,localWidthB,localHeightB); for( Int jLoc=0; jLoc<localWidthA; ++jLoc ) MemCopy ( pkgA.Buffer(0,jLoc), A.LockedBuffer(0,jLoc), localHeightA ); for( Int jLoc=0; jLoc<localWidthB; ++jLoc ) MemCopy ( pkgB.Buffer(0,jLoc), B.LockedBuffer(0,jLoc), localHeightB ); // Perform the initial circular shifts so that our A and B packages align const Int rowShiftA = A.RowShift(); const Int colShiftB = B.ColShift(); const Int leftInitA = (col+pSqrt-colShiftB) % pSqrt; const Int rightInitA = (col+colShiftB) % pSqrt; const Int aboveInitB = (row+pSqrt-rowShiftA) % pSqrt; const Int belowInitB = (row+rowShiftA) % pSqrt; const Int pkgSizeA = localHeightA*localWidthA; const Int pkgSizeB = localHeightB*localWidthB; mpi::SendRecv( pkgA.Buffer(), pkgSizeA, leftInitA, rightInitA, rowComm ); mpi::SendRecv( pkgB.Buffer(), pkgSizeB, aboveInitB, belowInitB, colComm ); // Now begin the data flow const Int aboveRow = (row+pSqrt-1) % pSqrt; const Int belowRow = (row+1) % pSqrt; const Int leftCol = (col+pSqrt-1) % pSqrt; const Int rightCol = (col+1) % pSqrt; for( Int q=0; q<pSqrt; ++q ) { Gemm( NORMAL, NORMAL, alpha, pkgA, pkgB, T(1), C.Matrix() ); if( q != pSqrt-1 ) { mpi::SendRecv ( pkgA.Buffer(), pkgSizeA, leftCol, rightCol, rowComm ); mpi::SendRecv ( pkgB.Buffer(), pkgSizeB, aboveRow, belowRow, colComm ); } } }
int QDWH ( Matrix<F>& A, typename Base<F>::type lowerBound, typename Base<F>::type upperBound ) { #ifndef RELEASE PushCallStack("QDWH"); #endif typedef typename Base<F>::type R; const int height = A.Height(); const int width = A.Width(); const R oneHalf = R(1)/R(2); const R oneThird = R(1)/R(3); if( height < width ) throw std::logic_error("Height cannot be less than width"); const R epsilon = lapack::MachineEpsilon<R>(); const R tol = 5*epsilon; const R cubeRootTol = Pow(tol,oneThird); // Form the first iterate Scale( 1/upperBound, A ); int numIts=0; R frobNormADiff; Matrix<F> ALast; Matrix<F> Q( height+width, width ); Matrix<F> QT, QB; PartitionDown( Q, QT, QB, height ); Matrix<F> C; Matrix<F> ATemp; do { ++numIts; ALast = A; R L2; Complex<R> dd, sqd; if( Abs(1-lowerBound) < tol ) { L2 = 1; dd = 0; sqd = 1; } else { L2 = lowerBound*lowerBound; dd = Pow( 4*(1-L2)/(L2*L2), oneThird ); sqd = Sqrt( 1+dd ); } const Complex<R> arg = 8 - 4*dd + 8*(2-L2)/(L2*sqd); const R a = (sqd + Sqrt( arg )/2).real; const R b = (a-1)*(a-1)/4; const R c = a+b-1; const Complex<R> alpha = a-b/c; const Complex<R> beta = b/c; lowerBound = lowerBound*(a+b*L2)/(1+c*L2); if( c > 100 ) { // // The standard QR-based algorithm // QT = A; Scale( Sqrt(c), QT ); MakeIdentity( QB ); ExplicitQR( Q ); Gemm( NORMAL, ADJOINT, alpha/Sqrt(c), QT, QB, beta, A ); } else { // // Use faster Cholesky-based algorithm since A is well-conditioned // Identity( width, width, C ); Herk( LOWER, ADJOINT, F(c), A, F(1), C ); Cholesky( LOWER, C ); ATemp = A; Trsm( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), C, ATemp ); Trsm( RIGHT, LOWER, NORMAL, NON_UNIT, F(1), C, ATemp ); Scale( beta, A ); Axpy( alpha, ATemp, A ); } Axpy( F(-1), A, ALast ); frobNormADiff = Norm( ALast, FROBENIUS_NORM ); } while( frobNormADiff > cubeRootTol || Abs(1-lowerBound) > tol ); #ifndef RELEASE PopCallStack(); #endif return numIts; }
inline void ApplyPackedReflectorsLUHB ( Conjugation conjugation, int offset, const Matrix<Complex<R> >& H, const Matrix<Complex<R> >& t, Matrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUHB"); if( offset < 0 || offset > H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); #endif typedef Complex<R> C; Matrix<C> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<C> ABottom; Matrix<C> tT, t0, tB, t1, t2; Matrix<C> SInv, Z; LockedPartitionUpDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionUp ( t, tT, tB, 0 ); while( HBR.Height() < H.Height() && HBR.Width() < H.Width() ) { LockedRepartitionUpDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); const int HPanWidth = H11.Width() + H12.Width(); const int HPanHeight = std::min( H11.Height(), std::max(HPanWidth-offset,0) ); const int leftover = A.Height()-HPanWidth; HPan.LockedView( H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); LockedRepartitionUp ( tT, t0, t1, /**/ /**/ tB, t2, HPanHeight ); ABottom.View( A, leftover, 0, HPanWidth, A.Width() ); Zeros( HPanHeight, ABottom.Width(), Z ); Zeros( HPanHeight, HPanHeight, SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, UPPER, offset, HPanCopy ); SetDiagonalToOne( LEFT, offset, HPanCopy ); Herk( UPPER, NORMAL, C(1), HPanCopy, C(0), SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( NORMAL, NORMAL, C(1), HPanCopy, ABottom, C(0), Z ); Trsm( LEFT, UPPER, NORMAL, NON_UNIT, C(1), SInv, Z ); Gemm( ADJOINT, NORMAL, C(-1), HPanCopy, Z, C(1), ABottom ); 
//--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); SlideLockedPartitionUp ( tT, t0, /**/ /**/ t1, tB, t2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void Inverse( Matrix<F>& A ) { #ifndef RELEASE PushCallStack("Inverse"); if( A.Height() != A.Width() ) throw std::logic_error("Cannot invert non-square matrices"); #endif Matrix<int> p; LU( A, p ); TriangularInverse( UPPER, NON_UNIT, A ); // Solve inv(A) L = inv(U) for inv(A) Matrix<F> ATL, ATR, ABL, ABR; Matrix<F> A00, A01, A02, A10, A11, A12, A20, A21, A22; Matrix<F> A1, A2; Matrix<F> L11, L21; PartitionUpDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); while( ABR.Height() < A.Height() ) { RepartitionUpDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); View( A1, A, 0, A00.Width(), A.Height(), A01.Width() ); View( A2, A, 0, A00.Width()+A01.Width(), A.Height(), A02.Width() ); //--------------------------------------------------------------------// // Copy out L1 L11 = A11; L21 = A21; // Zero the strictly lower triangular portion of A1 MakeTrapezoidal( LEFT, UPPER, 0, A11 ); Zero( A21 ); // Perform the lazy update of A1 Gemm( NORMAL, NORMAL, F(-1), A2, L21, F(1), A1 ); // Solve against this diagonal block of L11 Trsm( RIGHT, LOWER, NORMAL, UNIT, F(1), L11, A1 ); //--------------------------------------------------------------------// SlidePartitionUpDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /*******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); } // inv(A) := inv(A) P ApplyInverseColumnPivots( A, p ); #ifndef RELEASE PopCallStack(); #endif }
inline void ApplyPackedReflectorsLUHB ( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUHB"); if( offset < 0 || offset > H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> ABottom; Matrix<R> SInv, Z; LockedPartitionUpDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); while( HBR.Height() < H.Height() && HBR.Width() < H.Width() ) { LockedRepartitionUpDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); const int HPanWidth = H11.Width() + H12.Width(); const int HPanHeight = std::min( H11.Height(), std::max(HPanWidth-offset,0) ); const int leftover = A.Height()-HPanWidth; HPan.LockedView( H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); ABottom.View( A, leftover, 0, HPanWidth, A.Width() ); Zeros( HPanHeight, ABottom.Width(), Z ); Zeros( HPanHeight, HPanHeight, SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, UPPER, offset, HPanCopy ); SetDiagonalToOne( LEFT, offset, HPanCopy ); Syrk( UPPER, NORMAL, R(1), HPanCopy, R(0), SInv ); HalveMainDiagonal( SInv ); Gemm( NORMAL, NORMAL, R(1), HPanCopy, ABottom, R(0), Z ); Trsm( LEFT, UPPER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( TRANSPOSE, NORMAL, R(-1), HPanCopy, Z, R(1), ABottom ); //--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); } #ifndef RELEASE PopCallStack(); #endif }
bool NnlsBlockpivot(const DenseMatrix<T>& LHS, const DenseMatrix<T>& RHS, DenseMatrix<T>& X, // input as xinit DenseMatrix<T>& Y) // gradX { // Solve (LHS)*X = RHS for X by block principal pivoting. Matrix LHS // is assumed to be symmetric positive definite. const int PBAR = 3; const unsigned int WIDTH = RHS.Width(); const unsigned int HEIGHT = RHS.Height(); const unsigned int MAX_ITER = HEIGHT*5; BitMatrix passive_set = (X > T(0)); std::vector<unsigned int> tmp_indices(WIDTH); for (unsigned int i=0; i<WIDTH; ++i) tmp_indices[i] = i; MakeZeros(X); if (!BppSolveNormalEqNoGroup(tmp_indices, passive_set, LHS, RHS, X)) return false; // Y = LHS * X - RHS Gemm(NORMAL, NORMAL, T(1), LHS, X, T(0), Y); Axpy( T(-1), RHS, Y); std::vector<int> P(WIDTH, PBAR), Ninf(WIDTH, HEIGHT+1); BitMatrix nonopt_set = (Y < T(0)) & ~passive_set; BitMatrix infeas_set = (X < T(0)) & passive_set; std::vector<int> col_sums(WIDTH); std::vector<int> not_good(WIDTH); nonopt_set.SumColumns(not_good); infeas_set.SumColumns(col_sums); not_good += col_sums; BitMatrix not_opt_cols = (not_good > 0); BitMatrix not_opt_mask; std::vector<unsigned int> non_opt_col_indices(WIDTH); not_opt_cols.Find(non_opt_col_indices); DenseMatrix<double> RHSsub(HEIGHT, WIDTH); DenseMatrix<double> Xsub(HEIGHT, WIDTH); DenseMatrix<double> Ysub(HEIGHT, WIDTH); unsigned int iter = 0; while (!non_opt_col_indices.empty()) { // exit if not getting anywhere if (iter >= MAX_ITER) return false; UpdatePassiveSet(passive_set, PBAR, HEIGHT, not_opt_cols, nonopt_set, infeas_set, not_good, P, Ninf); // equivalent of repmat(NotOptCols, HEIGHT, 1) not_opt_mask = MatrixFromColumnMask(not_opt_cols, HEIGHT); // Setup for the normal equation solver by extracting submatrices // from RHS and X. The normal equation solver will extract further // subproblems from RHSsub and Xsub and write all updated values // back into RHSsub and Xsub. 
RHS.SubmatrixFromCols(RHSsub, non_opt_col_indices); X.SubmatrixFromCols(Xsub, non_opt_col_indices); if (!BppSolveNormalEqNoGroup(non_opt_col_indices, passive_set, LHS, RHSsub, Xsub)) return false; ZeroizeSmallValues(Xsub, 1.0e-12); // compute Ysub = LHS * Xsub - RHSsub Ysub.Resize(RHSsub.Height(), RHSsub.Width()); Gemm(NORMAL, NORMAL, T(1), LHS, Xsub, T(0), Ysub); Axpy( T(-1), RHSsub, Ysub); // update Y and X using the new values in Ysub and Xsub OverwriteCols(Y, Ysub, non_opt_col_indices, non_opt_col_indices.size()); OverwriteCols(X, Xsub, non_opt_col_indices, non_opt_col_indices.size()); ZeroizeSmallValues(X, 1.0e-12); ZeroizeSmallValues(Y, 1.0e-12); // Check optimality - BppUpdateSets does the equivalent of the next two lines. // nonopt_set = not_opt_mask & (Y < T(0)) & ~passive_set; // infeas_set = not_opt_mask & (X < T(0)) & passive_set; BppUpdateSets(nonopt_set, infeas_set, not_opt_mask, X, Y, passive_set); nonopt_set.SumColumns(not_good); infeas_set.SumColumns(col_sums); not_good += col_sums; not_opt_cols = (not_good > 0); not_opt_cols.Find(non_opt_col_indices); ++iter; } return true; }
void TransformRows ( const Matrix<F>& V, DistMatrix<F,MC,MR,BLOCK>& A ) { DEBUG_CSE const Int height = A.Height(); const Grid& grid = A.Grid(); const Int blockHeight = A.BlockHeight(); const Int firstBlockHeight = blockHeight - A.ColCut(); if( height <= firstBlockHeight || grid.Height() == 1 ) { if( grid.Row() == A.RowOwner(0) ) { // This process row can locally update its portion of A TransformRows( V, A.Matrix() ); } } else if( height <= firstBlockHeight + blockHeight ) { const int firstRow = A.RowOwner( 0 ); const int secondRow = A.RowOwner( firstBlockHeight ); if( grid.Row() == firstRow ) { // // Replace A with // // | VLeft, VRight |' | ATop |, // | ABottom | // // where ATop is owned by this process row and ABottom by the next. // auto VLeft = V( ALL, IR(0,firstBlockHeight) ); // Partition space for the combined matrix Matrix<F> ACombine( height, A.LocalWidth() ); auto ATop = ACombine( IR(0,firstBlockHeight), ALL ); auto ABottom = ACombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix ATop = A.LockedMatrix(); // Exchange the data El::SendRecv( ATop, ABottom, A.ColComm(), secondRow, secondRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), VLeft, ACombine, A.Matrix() ); } else if( grid.Row() == secondRow ) { // // Replace A with // // | VLeft, VRight |' | ATop |, // | ABottom | // // where ATop is owned by the previous process row and ABottom by // this one. 
// auto VRight = V( ALL, IR(firstBlockHeight,END) ); // Partition space for the combined matrix Matrix<F> ACombine( height, A.LocalWidth() ); auto ATop = ACombine( IR(0,firstBlockHeight), ALL ); auto ABottom = ACombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix ABottom = A.LockedMatrix(); // Exchange the data El::SendRecv( ABottom, ATop, A.ColComm(), firstRow, firstRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), VRight, ACombine, A.Matrix() ); } } else { // Fall back to the entire process column interacting. // TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,STAR,MR,BLOCK> A_STAR_MR( A ); Matrix<F> ALocCopy( A_STAR_MR.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), V, ALocCopy, A_STAR_MR.Matrix() ); A = A_STAR_MR; } }
bool NnlsHals(const MatrixType<T>& A, DenseMatrix<T>& W, DenseMatrix<T>& H, const T tol, const bool verbose, const unsigned int max_iter) { unsigned int n = A.Width(); unsigned int k = W.Width(); if (static_cast<unsigned int>(W.Height()) != static_cast<unsigned int>(A.Height())) throw std::logic_error("NnlsHals: W and A must have identical height"); if (static_cast<unsigned int>(H.Width()) != static_cast<unsigned int>(A.Width())) throw std::logic_error("NnlsHals: H and A must have identical width"); if (H.Height() != W.Width()) throw std::logic_error("NnlsHals: non-conformant W and H"); DenseMatrix<T> WtW(k, k), WtA(k, n), WtWH_r(1, n), gradH(k, n); if (verbose) std::cout << "\nRunning NNLS solver..." << std::endl; // compute W'W and W'A for the normal equations Gemm(TRANSPOSE, NORMAL, T(1.0), W, W, T(0.0), WtW); Gemm(TRANSPOSE, NORMAL, T(1.0), W, A, T(0.0), WtA); bool success = false; T pg0 = T(0), pg; for (unsigned int i=0; i<max_iter; ++i) { // compute the new matrix H UpdateH_Hals(H, WtWH_r, WtW, WtA); // compute gradH = WtW*H - WtA Gemm(NORMAL, NORMAL, T(1.0), WtW, H, T(0.0), gradH); Axpy( T(-1.0), WtA, gradH); // compute progress metric if (0 == i) { pg0 = ProjectedGradientNorm(gradH, H); if (verbose) ReportProgress(i+1, T(1.0)); continue; } else { pg = ProjectedGradientNorm(gradH, H); } if (verbose) ReportProgress(i+1, pg/pg0); // check progress vs. desired tolerance if (pg < tol * pg0) { success = true; NormalizeAndScale<T>(W, H); break; } } if (!success) std::cerr << "NNLS solver reached iteration limit." << std::endl; return success; }
inline void TwoSidedTrsmLVar2( UnitOrNonUnit diag, Matrix<F>& A, const Matrix<F>& L ) { #ifndef RELEASE PushCallStack("internal::TwoSidedTrsmLVar2"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( L.Height() != L.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != L.Height() ) throw std::logic_error("A and L must be the same size"); #endif // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<F> LTL, LTR, L00, L01, L02, LBL, LBR, L10, L11, L12, L20, L21, L22; // Temporary products Matrix<F> X11; Matrix<F> Y10; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); //--------------------------------------------------------------------// // Y10 := L10 A00 Zeros( L10.Height(), A00.Width(), Y10 ); Hemm( RIGHT, LOWER, F(1), A00, L10, F(0), Y10 ); // A10 := A10 - 1/2 Y10 Axpy( F(-1)/F(2), Y10, A10 ); // A11 := A11 - (A10 L10' + L10 A10') Her2k( LOWER, NORMAL, F(-1), A10, L10, F(1), A11 ); // A11 := inv(L11) A11 inv(L11)' TwoSidedTrsmLUnb( diag, A11, L11 ); // A21 := A21 - A20 L10' Gemm( NORMAL, ADJOINT, F(-1), A20, L10, F(1), A21 ); // A21 := A21 inv(L11)' Trsm( RIGHT, LOWER, ADJOINT, diag, F(1), L11, A21 ); // A10 := A10 - 1/2 Y10 Axpy( F(-1)/F(2), Y10, A10 ); // A10 := inv(L11) A10 Trsm( LEFT, LOWER, NORMAL, diag, F(1), L11, A10 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, 
A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /**********************************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void LU( Matrix<F>& A, Matrix<int>& p ) { #ifndef RELEASE PushCallStack("LU"); if( p.Viewing() && (std::min(A.Height(),A.Width()) != p.Height() || p.Width() != 1) ) throw std::logic_error ("p must be a vector of the same height as the min dimension of A."); #endif if( !p.Viewing() ) p.ResizeTo( std::min(A.Height(),A.Width()), 1 ); // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABRL, ABRR, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<int> pT, p0, pB, p1, p2; // Pivot composition std::vector<int> image, preimage; // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDown ( p, pT, pB, 0 ); while( ATL.Height() < A.Height() && ATL.Width() < A.Width() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); RepartitionDown ( pT, p0, /**/ /**/ p1, pB, p2 ); PartitionRight( ABR, ABRL, ABRR, A11.Width() ); const int pivotOffset = A01.Height(); //--------------------------------------------------------------------// internal::PanelLU( ABRL, p1, pivotOffset ); internal::ComposePanelPivots( p1, pivotOffset, image, preimage ); ApplyRowPivots( ABL, image, preimage ); ApplyRowPivots( ABRR, image, preimage ); Trsm( LEFT, LOWER, NORMAL, UNIT, F(1), A11, A12 ); Gemm( NORMAL, NORMAL, F(-1), A21, A12, F(1), A22 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlidePartitionDown ( pT, p0, p1, /**/ /**/ pB, p2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void LU( Matrix<F>& A, Matrix<Int>& p ) { #ifndef RELEASE CallStackEntry entry("LU"); #endif p.ResizeTo( Min(A.Height(),A.Width()), 1 ); // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABRL, ABRR, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<Int> pT, p0, pB, p1, p2; // Pivot composition std::vector<Int> image, preimage; // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDown ( p, pT, pB, 0 ); while( ATL.Height() < A.Height() && ATL.Width() < A.Width() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); RepartitionDown ( pT, p0, /**/ /**/ p1, pB, p2 ); PartitionRight( ABR, ABRL, ABRR, A11.Width() ); const Int pivotOffset = A01.Height(); //--------------------------------------------------------------------// lu::Panel( ABRL, p1, pivotOffset ); ComposePivots( p1, pivotOffset, image, preimage ); ApplyRowPivots( ABL, image, preimage ); ApplyRowPivots( ABRR, image, preimage ); Trsm( LEFT, LOWER, NORMAL, UNIT, F(1), A11, A12 ); Gemm( NORMAL, NORMAL, F(-1), A21, A12, F(1), A22 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlidePartitionDown ( pT, p0, p1, /**/ /**/ pB, p2 ); } }
int Halley ( DistMatrix<F>& A, typename Base<F>::type upperBound ) { #ifndef RELEASE PushCallStack("Halley"); #endif typedef typename Base<F>::type R; const Grid& g = A.Grid(); const int height = A.Height(); const int width = A.Width(); const R oneHalf = R(1)/R(2); const R oneThird = R(1)/R(3); if( height < width ) throw std::logic_error("Height cannot be less than width"); const R epsilon = lapack::MachineEpsilon<R>(); const R tol = 5*epsilon; const R cubeRootTol = Pow(tol,oneThird); const R a = 3; const R b = 1; const R c = 3; // Form the first iterate Scale( 1/upperBound, A ); int numIts=0; R frobNormADiff; DistMatrix<F> ALast( g ); DistMatrix<F> Q( height+width, width, g ); DistMatrix<F> QT(g), QB(g); PartitionDown( Q, QT, QB, height ); DistMatrix<F> C( g ); DistMatrix<F> ATemp( g ); do { if( numIts > 100 ) throw std::runtime_error("Halley iteration did not converge"); ++numIts; ALast = A; // TODO: Come up with a test for when we can use the Cholesky approach if( true ) { // // The standard QR-based algorithm // QT = A; Scale( Sqrt(c), QT ); MakeIdentity( QB ); ExplicitQR( Q ); Gemm( NORMAL, ADJOINT, F(a-b/c)/Sqrt(c), QT, QB, F(b/c), A ); } else { // // Use faster Cholesky-based algorithm since A is well-conditioned // Identity( width, width, C ); Herk( LOWER, ADJOINT, F(c), A, F(1), C ); Cholesky( LOWER, C ); ATemp = A; Trsm( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), C, ATemp ); Trsm( RIGHT, LOWER, NORMAL, NON_UNIT, F(1), C, ATemp ); Scale( b/c, A ); Axpy( a-b/c, ATemp, A ); } Axpy( F(-1), A, ALast ); frobNormADiff = Norm( ALast, FROBENIUS_NORM ); } while( frobNormADiff > cubeRootTol ); #ifndef RELEASE PopCallStack(); #endif return numIts; }