void LUNMedium ( UnitOrNonUnit diag, const AbstractDistMatrix<F>& UPre, AbstractDistMatrix<F>& XPre, bool checkIfSingular ) { EL_DEBUG_CSE const Int m = XPre.Height(); const Int bsize = Blocksize(); const Grid& g = UPre.Grid(); DistMatrixReadProxy<F,F,MC,MR> UProx( UPre ); DistMatrixReadWriteProxy<F,F,MC,MR> XProx( XPre ); auto& U = UProx.GetLocked(); auto& X = XProx.Get(); DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,MR, STAR> X1Trans_MR_STAR(g); const Int kLast = LastOffset( m, bsize ); for( Int k=kLast; k>=0; k-=bsize ) { const Int nb = Min(bsize,m-k); const Range<Int> ind0( 0, k ), ind1( k, k+nb ); auto U01 = U( ind0, ind1 ); auto U11 = U( ind1, ind1 ); auto X0 = X( ind0, ALL ); auto X1 = X( ind1, ALL ); U11_STAR_STAR = U11; // U11[* ,* ] <- U11[MC,MR] X1Trans_MR_STAR.AlignWith( X0 ); Transpose( X1, X1Trans_MR_STAR ); // X1^T[MR,* ] := X1^T[MR,* ] U11^-T[* ,* ] // = (U11^-1[* ,* ] X1[* ,MR])^T LocalTrsm ( RIGHT, UPPER, TRANSPOSE, diag, F(1), U11_STAR_STAR, X1Trans_MR_STAR, checkIfSingular ); Transpose( X1Trans_MR_STAR, X1 ); U01_MC_STAR.AlignWith( X0 ); U01_MC_STAR = U01; // U01[MC,* ] <- U01[MC,MR] // X0[MC,MR] -= U01[MC,* ] X1[* ,MR] LocalGemm ( NORMAL, TRANSPOSE, F(-1), U01_MC_STAR, X1Trans_MR_STAR, F(1), X0 ); } }
void LUNLarge ( UnitOrNonUnit diag, const AbstractDistMatrix<F>& UPre, AbstractDistMatrix<F>& XPre, bool checkIfSingular ) { EL_DEBUG_CSE const Int m = XPre.Height(); const Int bsize = Blocksize(); const Grid& g = UPre.Grid(); DistMatrixReadProxy<F,F,MC,MR> UProx( UPre ); DistMatrixReadWriteProxy<F,F,MC,MR> XProx( XPre ); auto& U = UProx.GetLocked(); auto& X = XProx.Get(); DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MR > X1_STAR_MR(g); DistMatrix<F,STAR,VR > X1_STAR_VR(g); const Int kLast = LastOffset( m, bsize ); for( Int k=kLast; k>=0; k-=bsize ) { const Int nb = Min(bsize,m-k); const Range<Int> ind0( 0, k ), ind1( k, k+nb ); auto U01 = U( ind0, ind1 ); auto U11 = U( ind1, ind1 ); auto X0 = X( ind0, ALL ); auto X1 = X( ind1, ALL ); U11_STAR_STAR = U11; // U11[* ,* ] <- U11[MC,MR] X1_STAR_VR = X1; // X1[* ,VR] <- X1[MC,MR] // X1[* ,VR] := U11^-1[* ,* ] X1[* ,VR] LocalTrsm ( LEFT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, X1_STAR_VR, checkIfSingular ); X1_STAR_MR.AlignWith( X0 ); X1_STAR_MR = X1_STAR_VR; // X1[* ,MR] <- X1[* ,VR] X1 = X1_STAR_MR; // X1[MC,MR] <- X1[* ,MR] U01_MC_STAR.AlignWith( X0 ); U01_MC_STAR = U01; // U01[MC,* ] <- U01[MC,MR] // X0[MC,MR] -= U01[MC,* ] X1[* ,MR] LocalGemm( NORMAL, NORMAL, F(-1), U01_MC_STAR, X1_STAR_MR, F(1), X0 ); } }
void BackwardSingle ( const DistMatrix<F,VC,STAR>& L, DistMatrix<F,VC,STAR>& X, bool conjugate=false ) { const Grid& g = L.Grid(); const Orientation orientation = ( conjugate ? ADJOINT : TRANSPOSE ); DistMatrix<F,STAR,STAR> D(g), L11_STAR_STAR(g), Z1_STAR_STAR(g); FormDiagonalBlocks( L, D, conjugate ); const Int m = L.Height(); const Int n = L.Width(); const Int numRHS = X.Width(); const Int bsize = Blocksize(); const Int kLast = LastOffset( n, bsize ); for( Int k=kLast; k>=0; k-=bsize ) { const Int nb = Min(bsize,n-k); const Range<Int> ind1(k,k+nb), ind2(k+nb,m); auto L11Trans_STAR_STAR = D( IR(0,nb), ind1 ); auto L21 = L( ind2, ind1 ); auto X1 = X( ind1, IR(0,numRHS) ); auto X2 = X( ind2, IR(0,numRHS) ); // X1 -= L21' X2 LocalGemm( orientation, NORMAL, F(-1), L21, X2, Z1_STAR_STAR ); axpy::util::UpdateWithLocalData( F(1), X1, Z1_STAR_STAR ); El::AllReduce( Z1_STAR_STAR, X1.DistComm() ); // X1 := L11^-1 X1 LocalTrsm ( LEFT, UPPER, NORMAL, UNIT, F(1), L11Trans_STAR_STAR, Z1_STAR_STAR ); X1 = Z1_STAR_STAR; } }
inline void TrsmLUNLarge ( UnitOrNonUnit diag, F alpha, const DistMatrix<F>& U, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLUNLarge"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MR > X1_STAR_MR(g); DistMatrix<F,STAR,VR > X1_STAR_VR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); U01_MC_STAR.AlignWith( X0 ); X1_STAR_MR.AlignWith( X0 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; // U11[* ,* ] <- U11[MC,MR] X1_STAR_VR = X1; // X1[* ,VR] <- X1[MC,MR] // X1[* ,VR] := U11^-1[* ,* ] X1[* ,VR] LocalTrsm ( LEFT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, X1_STAR_VR, checkIfSingular ); X1_STAR_MR = X1_STAR_VR; // X1[* ,MR] <- X1[* ,VR] X1 = X1_STAR_MR; // X1[MC,MR] <- X1[* ,MR] U01_MC_STAR = U01; // U01[MC,* ] <- U01[MC,MR] // X0[MC,MR] -= U01[MC,* ] X1[* ,MR] LocalGemm( NORMAL, NORMAL, F(-1), U01_MC_STAR, X1_STAR_MR, F(1), X0 ); //--------------------------------------------------------------------// U01_MC_STAR.FreeAlignments(); X1_STAR_MR.FreeAlignments(); SlideLockedPartitionUpDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmLUNSmall ( UnitOrNonUnit diag, F alpha, const DistMatrix<F,VC,STAR>& U, DistMatrix<F,VC,STAR>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLUNSmall"); if( U.Grid() != X.Grid() ) throw std::logic_error ("U and X must be distributed over the same grid"); if( U.Height() != U.Width() || U.Width() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrsmLUN: \n" << " U ~ " << U.Height() << " x " << U.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str() ); } if( U.ColAlignment() != X.ColAlignment() ) throw std::logic_error("U and X are assumed to be aligned"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F,VC,STAR> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F,VC,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,STAR> X1_STAR_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; // U11[* ,* ] <- U11[VC,* ] X1_STAR_STAR = X1; // X1[* ,* ] <- X1[VC,* ] // X1[* ,* ] := U11^-1[* ,* ] X1[* ,* ] LocalTrsm ( LEFT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, X1_STAR_STAR, checkIfSingular ); X1 = X1_STAR_STAR; // X0[VC,* ] -= U01[VC,* ] X1[* ,* ] LocalGemm( NORMAL, NORMAL, F(-1), U01, X1_STAR_STAR, F(1), X0 ); //--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void ApplyPackedReflectorsLUVF ( Conjugation conjugation, int offset, const DistMatrix<Complex<R> >& H, const DistMatrix<Complex<R>,MD,STAR>& t, DistMatrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUVF"); if( H.Grid() != t.Grid() || t.Grid() != A.Grid() ) throw std::logic_error ("{H,t,A} must be distributed over the same grid"); if( offset < 0 || offset > H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); if( !t.AlignedWithDiagonal( H, offset ) ) throw std::logic_error("t must be aligned with H's 'offset' diagonal"); #endif typedef Complex<R> C; const Grid& g = H.Grid(); DistMatrix<C> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<C> AT(g), A0(g), ATop(g), AB(g), A1(g), A2(g); DistMatrix<C,MD,STAR> tT(g), t0(g), tB(g), t1(g), t2(g); DistMatrix<C> HPanCopy(g); DistMatrix<C,VC, STAR> HPan_VC_STAR(g); DistMatrix<C,MC, STAR> HPan_MC_STAR(g); DistMatrix<C,STAR,STAR> t1_STAR_STAR(g); DistMatrix<C,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<C,STAR,MR > Z_STAR_MR(g); DistMatrix<C,STAR,VR > Z_STAR_VR(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanHeight = H01.Height() + H11.Height(); const int HPanOffset = std::min( H11.Width(), std::max(offset-H00.Width(),0) ); const int HPanWidth = H11.Width()-HPanOffset; HPan.LockedView( H, 0, H00.Width()+HPanOffset, HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanWidth ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); ATop.View2x1( A0, A1 ); HPan_MC_STAR.AlignWith( ATop ); Z_STAR_MR.AlignWith( ATop ); Z_STAR_VR.AlignWith( ATop ); Zeros( HPan.Width(), ATop.Width(), Z_STAR_MR ); Zeros( HPan.Width(), HPan.Width(), SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, UPPER, offset, HPanCopy ); SetDiagonalToOne( RIGHT, offset, HPanCopy ); HPan_VC_STAR = HPanCopy; Herk ( LOWER, ADJOINT, C(1), HPan_VC_STAR.LockedLocalMatrix(), C(0), SInv_STAR_STAR.LocalMatrix() ); SInv_STAR_STAR.SumOverGrid(); t1_STAR_STAR = t1; FixDiagonal( conjugation, t1_STAR_STAR, SInv_STAR_STAR ); HPan_MC_STAR = HPanCopy; LocalGemm( ADJOINT, NORMAL, C(1), HPan_MC_STAR, ATop, C(0), Z_STAR_MR ); Z_STAR_VR.SumScatterFrom( Z_STAR_MR ); LocalTrsm ( LEFT, LOWER, NORMAL, NON_UNIT, C(1), SInv_STAR_STAR, Z_STAR_VR ); Z_STAR_MR = Z_STAR_VR; LocalGemm( NORMAL, NORMAL, C(-1), HPan_MC_STAR, Z_STAR_MR, C(1), ATop ); //--------------------------------------------------------------------// HPan_MC_STAR.FreeAlignments(); Z_STAR_MR.FreeAlignments(); Z_STAR_VR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmRUT ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F>& U, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmRUT"); if( orientation == NORMAL ) throw std::logic_error("TrsmRUT expects a (Conjugate)Transpose option"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F> XL(g), XR(g), X0(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,VR, STAR> U01_VR_STAR(g); DistMatrix<F,STAR,MR > U01AdjOrTrans_STAR_MR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,VC, STAR> X1_VC_STAR(g); DistMatrix<F,STAR,MC > X1Trans_STAR_MC(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionLeft( X, XL, XR, 0 ); while( XL.Width() > 0 ) { LockedRepartitionUpDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); RepartitionLeft ( XL, /**/ XR, X0, X1, /**/ X2 ); X1_VC_STAR.AlignWith( X0 ); X1Trans_STAR_MC.AlignWith( X0 ); U01_VR_STAR.AlignWith( X0 ); U01AdjOrTrans_STAR_MR.AlignWith( X0 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; X1_VC_STAR = X1; LocalTrsm ( RIGHT, UPPER, orientation, diag, F(1), U11_STAR_STAR, X1_VC_STAR, checkIfSingular ); X1Trans_STAR_MC.TransposeFrom( X1_VC_STAR ); X1.TransposeFrom( X1Trans_STAR_MC ); U01_VR_STAR = U01; if( orientation == ADJOINT ) U01AdjOrTrans_STAR_MR.AdjointFrom( U01_VR_STAR ); else U01AdjOrTrans_STAR_MR.TransposeFrom( U01_VR_STAR ); // X0[MC,MR] -= X1[MC,* ] (U01[MR,* ])^(T/H) // = X1^T[* ,MC] (U01^(T/H))[* ,MR] LocalGemm ( TRANSPOSE, NORMAL, F(-1), X1Trans_STAR_MC, U01AdjOrTrans_STAR_MR, F(1), X0 ); //--------------------------------------------------------------------// X1_VC_STAR.FreeAlignments(); X1Trans_STAR_MC.FreeAlignments(); U01_VR_STAR.FreeAlignments(); U01AdjOrTrans_STAR_MR.FreeAlignments(); SlideLockedPartitionUpDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); SlidePartitionLeft ( XL, /**/ XR, X0, /**/ X1, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void CholeskyUVar3( DistMatrix<F>& A ) { #ifndef RELEASE PushCallStack("internal::CholeskyUVar3"); if( A.Height() != A.Width() ) throw std::logic_error ("Can only compute Cholesky factor of square matrices"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); // Temporary matrix distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,MC > A12_STAR_MC(g); DistMatrix<F,STAR,MR > A12_STAR_MR(g); // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); while( ABR.Height() > 0 ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); A12_STAR_MC.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); A12_STAR_VR.AlignWith( A22 ); //--------------------------------------------------------------------// A11_STAR_STAR = A11; LocalCholesky( UPPER, A11_STAR_STAR ); A11 = A11_STAR_STAR; A12_STAR_VR = A12; LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, F(1), A11_STAR_STAR, A12_STAR_VR ); A12_STAR_MC = A12_STAR_VR; A12_STAR_MR = A12_STAR_VR; LocalTrrk ( UPPER, ADJOINT, F(-1), A12_STAR_MC, A12_STAR_MR, F(1), A22 ); A12 = A12_STAR_MR; //--------------------------------------------------------------------// A12_STAR_MC.FreeAlignments(); A12_STAR_MR.FreeAlignments(); A12_STAR_VR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void HPDInverseLVar2( DistMatrix<F>& A ) { #ifndef RELEASE PushCallStack("internal::HPDInverseLVar2"); if( A.Height() != A.Width() ) throw std::logic_error("Nonsquare matrices cannot be triangular"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,VR > A10_STAR_VR(g); DistMatrix<F,VC, STAR> A21_VC_STAR(g); DistMatrix<F,STAR,MC > A10_STAR_MC(g); DistMatrix<F,STAR,MR > A10_STAR_MR(g); DistMatrix<F,STAR,MC > A21Trans_STAR_MC(g); DistMatrix<F,VR, STAR> A21_VR_STAR(g); DistMatrix<F,STAR,MR > A21Adj_STAR_MR(g); // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); A10_STAR_VR.AlignWith( A00 ); A21_VC_STAR.AlignWith( A20 ); A10_STAR_MC.AlignWith( A00 ); A10_STAR_MR.AlignWith( A00 ); A21Trans_STAR_MC.AlignWith( A20 ); A21_VR_STAR.AlignWith( A22 ); A21Adj_STAR_MR.AlignWith( A22 ); //--------------------------------------------------------------------// A11_STAR_STAR = A11; LocalCholesky( LOWER, A11_STAR_STAR ); A10_STAR_VR = A10; LocalTrsm ( LEFT, LOWER, NORMAL, NON_UNIT, F(1), A11_STAR_STAR, A10_STAR_VR ); A21_VC_STAR = A21; LocalTrsm ( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), A11_STAR_STAR, A21_VC_STAR ); A10_STAR_MC = A10_STAR_VR; A10_STAR_MR = A10_STAR_VR; LocalTrrk ( LOWER, ADJOINT, F(1), A10_STAR_MC, A10_STAR_MR, F(1), A00 ); A21Trans_STAR_MC.TransposeFrom( A21_VC_STAR ); LocalGemm ( TRANSPOSE, NORMAL, F(-1), A21Trans_STAR_MC, A10_STAR_MR, F(1), A20 ); A21_VR_STAR = A21_VC_STAR; A21Adj_STAR_MR.AdjointFrom( A21_VR_STAR ); LocalTrrk ( LOWER, TRANSPOSE, F(-1), A21Trans_STAR_MC, A21Adj_STAR_MR, F(1), A22 ); LocalTrsm ( LEFT, LOWER, ADJOINT, NON_UNIT, F(1), A11_STAR_STAR, A10_STAR_VR ); LocalTrsm ( RIGHT, LOWER, NORMAL, NON_UNIT, F(-1), A11_STAR_STAR, A21_VC_STAR ); LocalTriangularInverse( LOWER, NON_UNIT, A11_STAR_STAR ); LocalTrtrmm( ADJOINT, LOWER, A11_STAR_STAR ); A11 = A11_STAR_STAR; A10 = A10_STAR_VR; A21 = A21_VC_STAR; //--------------------------------------------------------------------// A10_STAR_VR.FreeAlignments(); A21_VC_STAR.FreeAlignments(); A10_STAR_MC.FreeAlignments(); A10_STAR_MR.FreeAlignments(); A21Trans_STAR_MC.FreeAlignments(); A21_VR_STAR.FreeAlignments(); A21Adj_STAR_MR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void RUVF ( Conjugation conjugation, Int offset, const DistMatrix<F>& H, const DistMatrix<F,MD,STAR>& t, DistMatrix<F>& A ) { #ifndef RELEASE CallStackEntry cse("apply_packed_reflectors::RUVF"); if( H.Grid() != t.Grid() || t.Grid() != A.Grid() ) LogicError("{H,t,A} must be distributed over the same grid"); // TODO: Proper dimension checks if( t.Height() != H.DiagonalLength(offset) ) LogicError("t must be the same length as H's offset diag"); if( !t.AlignedWithDiagonal( H, offset ) ) LogicError("t must be aligned with H's 'offset' diagonal"); #endif const Grid& g = H.Grid(); DistMatrix<F> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<F> ALeft(g); DistMatrix<F,MD,STAR> tT(g), t0(g), tB(g), t1(g), t2(g); DistMatrix<F,VC, STAR> HPan_VC_STAR(g); DistMatrix<F,MR, STAR> HPan_MR_STAR(g); DistMatrix<F,STAR,STAR> t1_STAR_STAR(g); DistMatrix<F,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<F,STAR,MC > ZAdj_STAR_MC(g); DistMatrix<F,STAR,VC > ZAdj_STAR_VC(g); LockedPartitionDownOffsetDiagonal ( offset, H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2 ); LockedView2x1( HPan, H01, H11 ); View( ALeft, A, 0, 0, A.Height(), HPan.Height() ); HPan_MR_STAR.AlignWith( ALeft ); ZAdj_STAR_MC.AlignWith( ALeft ); ZAdj_STAR_VC.AlignWith( ALeft ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( UPPER, HPanCopy, 0, RIGHT ); SetDiagonal( HPanCopy, F(1), 0, RIGHT ); HPan_VC_STAR = HPanCopy; Zeros( SInv_STAR_STAR, HPan.Width(), HPan.Width() ); Herk ( UPPER, ADJOINT, F(1), HPan_VC_STAR.LockedMatrix(), F(0), SInv_STAR_STAR.Matrix() ); SInv_STAR_STAR.SumOverGrid(); t1_STAR_STAR = t1; FixDiagonal( conjugation, t1_STAR_STAR, SInv_STAR_STAR ); HPan_MR_STAR = HPan_VC_STAR; LocalGemm( ADJOINT, ADJOINT, F(1), HPan_MR_STAR, ALeft, ZAdj_STAR_MC ); ZAdj_STAR_VC.SumScatterFrom( ZAdj_STAR_MC ); LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, F(1), SInv_STAR_STAR, ZAdj_STAR_VC ); ZAdj_STAR_MC = ZAdj_STAR_VC; LocalGemm ( ADJOINT, ADJOINT, F(-1), ZAdj_STAR_MC, HPan_MR_STAR, F(1), ALeft ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); } }
inline void ApplyPackedReflectorsLUHB ( int offset, const DistMatrix<R>& H, DistMatrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUHB"); if( H.Grid() != A.Grid() ) throw std::logic_error("{H,A} must be distributed over the same grid"); if( offset < 0 || offset > H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); #endif const Grid& g = H.Grid(); DistMatrix<R> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<R> ABottom(g); DistMatrix<R,STAR,VR > HPan_STAR_VR(g); DistMatrix<R,STAR,MC > HPan_STAR_MC(g); DistMatrix<R,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<R,STAR,MR > Z_STAR_MR(g); DistMatrix<R,STAR,VR > Z_STAR_VR(g); LockedPartitionUpDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); while( HBR.Height() < H.Height() && HBR.Width() < H.Width() ) { LockedRepartitionUpDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); const int HPanWidth = H11.Width() + H12.Width(); const int HPanHeight = std::min( H11.Height(), std::max(HPanWidth-offset,0) ); const int leftover = A.Height()-HPanWidth; HPan.LockedView( H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); ABottom.View( A, leftover, 0, HPanWidth, A.Width() ); HPan_STAR_MC.AlignWith( ABottom ); Z_STAR_MR.AlignWith( ABottom ); Z_STAR_VR.AlignWith( ABottom ); Zeros( HPanHeight, ABottom.Width(), Z_STAR_MR ); Zeros( HPanHeight, HPanHeight, SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, UPPER, offset, HPanCopy ); SetDiagonalToOne( LEFT, offset, HPanCopy ); HPan_STAR_VR = HPanCopy; Syrk ( UPPER, NORMAL, R(1), HPan_STAR_VR.LockedLocalMatrix(), R(0), SInv_STAR_STAR.LocalMatrix() ); SInv_STAR_STAR.SumOverGrid(); HalveMainDiagonal( SInv_STAR_STAR ); HPan_STAR_MC = HPan_STAR_VR; LocalGemm ( NORMAL, NORMAL, R(1), HPan_STAR_MC, ABottom, R(0), Z_STAR_MR ); Z_STAR_VR.SumScatterFrom( Z_STAR_MR ); LocalTrsm ( LEFT, UPPER, NORMAL, NON_UNIT, R(1), SInv_STAR_STAR, Z_STAR_VR ); Z_STAR_MR = Z_STAR_VR; LocalGemm ( TRANSPOSE, NORMAL, R(-1), HPan_STAR_MC, Z_STAR_MR, R(1), ABottom ); //--------------------------------------------------------------------// HPan_STAR_MC.FreeAlignments(); Z_STAR_MR.FreeAlignments(); Z_STAR_VR.FreeAlignments(); SlideLockedPartitionUpDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmRUN ( UnitOrNonUnit diag, F alpha, const DistMatrix<F>& U, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmRUN"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F> XL(g), XR(g), X0(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MR > U12_STAR_MR(g); DistMatrix<F,VC, STAR> X1_VC_STAR(g); DistMatrix<F,STAR,MC > X1Trans_STAR_MC(g); // Start the algorithm Scale( alpha, X ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionRight( X, XL, XR, 0 ); while( XR.Width() > 0 ) { LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); X1_VC_STAR.AlignWith( X2 ); X1Trans_STAR_MC.AlignWith( X2 ); U12_STAR_MR.AlignWith( X2 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; X1_VC_STAR = X1; LocalTrsm ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, X1_VC_STAR, checkIfSingular ); X1Trans_STAR_MC.TransposeFrom( X1_VC_STAR ); X1.TransposeFrom( X1Trans_STAR_MC ); U12_STAR_MR = U12; // X2[MC,MR] -= X1[MC,* ] U12[* ,MR] // = X1^T[* ,MC] U12[* ,MR] LocalGemm ( TRANSPOSE, NORMAL, F(-1), X1Trans_STAR_MC, U12_STAR_MR, F(1), X2 ); //--------------------------------------------------------------------// X1_VC_STAR.FreeAlignments(); X1Trans_STAR_MC.FreeAlignments(); U12_STAR_MR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmRLT ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F>& L, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE CallStackEntry entry("internal::TrsmRLT"); if( orientation == NORMAL ) LogicError("TrsmRLT expects a (Conjugate)Transpose option"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<F> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F> XL(g), XR(g), X0(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,VR, STAR> L21_VR_STAR(g); DistMatrix<F,STAR,MR > L21AdjOrTrans_STAR_MR(g); DistMatrix<F,VC, STAR> X1_VC_STAR(g); DistMatrix<F,STAR,MC > X1Trans_STAR_MC(g); // Start the algorithm Scale( alpha, X ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionRight( X, XL, XR, 0 ); while( XR.Width() > 0 ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); X1_VC_STAR.AlignWith( X2 ); X1Trans_STAR_MC.AlignWith( X2 ); L21_VR_STAR.AlignWith( X2 ); L21AdjOrTrans_STAR_MR.AlignWith( X2 ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; X1_VC_STAR = X1; LocalTrsm ( RIGHT, LOWER, orientation, diag, F(1), L11_STAR_STAR, X1_VC_STAR, checkIfSingular ); X1Trans_STAR_MC.TransposeFrom( X1_VC_STAR ); X1.TransposeFrom( X1Trans_STAR_MC ); L21_VR_STAR = L21; if( orientation == ADJOINT ) L21AdjOrTrans_STAR_MR.AdjointFrom( L21_VR_STAR ); else L21AdjOrTrans_STAR_MR.TransposeFrom( L21_VR_STAR ); // X2[MC,MR] -= X1[MC,*] (L21[MR,*])^(T/H) // = X1^T[* ,MC] (L21^(T/H))[*,MR] LocalGemm ( TRANSPOSE, NORMAL, F(-1), X1Trans_STAR_MC, L21AdjOrTrans_STAR_MR, F(1), X2 ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } }
inline void TwoSidedTrsmLVar2 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& L ) { #ifndef RELEASE PushCallStack("internal::TwoSidedTrsmLVar2"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( L.Height() != L.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != L.Height() ) throw std::logic_error("A and L must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); // Temporary distributions DistMatrix<F,MR, STAR> A10Adj_MR_STAR(g); DistMatrix<F,STAR,VR > A10_STAR_VR(g); DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,VC, STAR> A21_VC_STAR(g); DistMatrix<F,MR, STAR> F10Adj_MR_STAR(g); DistMatrix<F,MR, STAR> L10Adj_MR_STAR(g); DistMatrix<F,VC, STAR> L10Adj_VC_STAR(g); DistMatrix<F,STAR,MC > L10_STAR_MC(g); DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,MC, STAR> X11_MC_STAR(g); DistMatrix<F,MC, STAR> X21_MC_STAR(g); DistMatrix<F,MC, STAR> Y10Adj_MC_STAR(g); DistMatrix<F,MR, MC > Y10Adj_MR_MC(g); DistMatrix<F> X11(g); DistMatrix<F> Y10Adj(g); Matrix<F> Y10Local; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); A10Adj_MR_STAR.AlignWith( L10 ); F10Adj_MR_STAR.AlignWith( A00 ); L10Adj_MR_STAR.AlignWith( A00 ); L10Adj_VC_STAR.AlignWith( A00 ); L10_STAR_MC.AlignWith( A00 ); X11.AlignWith( A11 ); X11_MC_STAR.AlignWith( L10 ); X21_MC_STAR.AlignWith( A20 ); Y10Adj_MC_STAR.AlignWith( A00 ); Y10Adj_MR_MC.AlignWith( A10 ); //--------------------------------------------------------------------// // Y10 := L10 A00 L10Adj_MR_STAR.AdjointFrom( L10 ); L10Adj_VC_STAR = L10Adj_MR_STAR; L10_STAR_MC.AdjointFrom( L10Adj_VC_STAR ); Y10Adj_MC_STAR.ResizeTo( A10.Width(), A10.Height() ); F10Adj_MR_STAR.ResizeTo( A10.Width(), A10.Height() ); Zero( Y10Adj_MC_STAR ); Zero( F10Adj_MR_STAR ); LocalSymmetricAccumulateRL ( ADJOINT, F(1), A00, L10_STAR_MC, L10Adj_MR_STAR, Y10Adj_MC_STAR, F10Adj_MR_STAR ); Y10Adj.SumScatterFrom( Y10Adj_MC_STAR ); Y10Adj_MR_MC = Y10Adj; Y10Adj_MR_MC.SumScatterUpdate( F(1), F10Adj_MR_STAR ); Adjoint( Y10Adj_MR_MC.LockedLocalMatrix(), Y10Local ); // X11 := A10 L10' X11_MC_STAR.ResizeTo( A11.Height(), A11.Width() ); LocalGemm ( NORMAL, NORMAL, F(1), A10, L10Adj_MR_STAR, F(0), X11_MC_STAR ); // A10 := A10 - Y10 Axpy( F(-1), Y10Local, A10.LocalMatrix() ); A10Adj_MR_STAR.AdjointFrom( A10 ); // A11 := A11 - (X11 + L10 A10') = A11 - (A10 L10' + L10 A10') LocalGemm ( NORMAL, NORMAL, F(1), L10, A10Adj_MR_STAR, F(1), X11_MC_STAR ); X11.SumScatterFrom( X11_MC_STAR ); MakeTrapezoidal( LEFT, LOWER, 0, X11 ); Axpy( F(-1), X11, A11 ); // A10 := inv(L11) A10 L11_STAR_STAR = L11; A10_STAR_VR.AdjointFrom( A10Adj_MR_STAR ); LocalTrsm ( LEFT, LOWER, NORMAL, diag, F(1), L11_STAR_STAR, A10_STAR_VR ); A10 = A10_STAR_VR; // A11 := inv(L11) A11 inv(L11)' A11_STAR_STAR = A11; LocalTwoSidedTrsm( LOWER, diag, A11_STAR_STAR, L11_STAR_STAR ); A11 = A11_STAR_STAR; // A21 := A21 - A20 L10' X21_MC_STAR.ResizeTo( A21.Height(), A21.Width() ); LocalGemm ( NORMAL, NORMAL, F(1), A20, L10Adj_MR_STAR, F(0), X21_MC_STAR ); A21.SumScatterUpdate( F(-1), X21_MC_STAR ); // A21 := A21 inv(L11)' A21_VC_STAR = A21; LocalTrsm ( RIGHT, LOWER, ADJOINT, diag, F(1), L11_STAR_STAR, A21_VC_STAR ); A21 = A21_VC_STAR; //--------------------------------------------------------------------// A10Adj_MR_STAR.FreeAlignments(); F10Adj_MR_STAR.FreeAlignments(); L10Adj_MR_STAR.FreeAlignments(); L10Adj_VC_STAR.FreeAlignments(); L10_STAR_MC.FreeAlignments(); X11.FreeAlignments(); X11_MC_STAR.FreeAlignments(); X21_MC_STAR.FreeAlignments(); Y10Adj_MC_STAR.FreeAlignments(); Y10Adj_MR_MC.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /**********************************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void LUHF ( Conjugation conjugation, Int offset, const DistMatrix<F>& H, const DistMatrix<F,MD,STAR>& t, DistMatrix<F>& A ) { #ifndef RELEASE CallStackEntry cse("apply_packed_reflectors::LUHF"); if( H.Grid() != t.Grid() || t.Grid() != A.Grid() ) LogicError("{H,t,A} must be distributed over the same grid"); // TODO: Proper dimension checks if( t.Height() != H.DiagonalLength(offset) ) LogicError("t must be the same length as H's offset diag"); if( !t.AlignedWithDiagonal( H, offset ) ) LogicError("t must be aligned with H's offset diagonal"); #endif const Grid& g = H.Grid(); DistMatrix<F> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<F> AT(g), A0(g), AB(g), A1(g), A2(g); DistMatrix<F,MD,STAR> tT(g), t0(g), tB(g), t1(g), t2(g); DistMatrix<F,STAR,VR > HPan_STAR_VR(g); DistMatrix<F,STAR,MC > HPan_STAR_MC(g); DistMatrix<F,STAR,STAR> t1_STAR_STAR(g); DistMatrix<F,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<F,STAR,MR > Z_STAR_MR(g); DistMatrix<F,STAR,VR > Z_STAR_VR(g); LockedPartitionDownOffsetDiagonal ( offset, H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2 ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2, H11.Height() ); LockedView1x2( HPan, H11, H12 ); HPan_STAR_MC.AlignWith( AB ); Z_STAR_MR.AlignWith( AB ); Z_STAR_VR.AlignWith( AB ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTriangular( UPPER, HPanCopy ); SetDiagonal( HPanCopy, F(1) ); HPan_STAR_VR = HPanCopy; Zeros( SInv_STAR_STAR, HPan.Height(), HPan.Height() ); Herk ( LOWER, NORMAL, F(1), HPan_STAR_VR.LockedMatrix(), F(0), SInv_STAR_STAR.Matrix() ); SInv_STAR_STAR.SumOverGrid(); t1_STAR_STAR = t1; FixDiagonal( conjugation, t1_STAR_STAR, SInv_STAR_STAR ); HPan_STAR_MC = HPan_STAR_VR; LocalGemm( NORMAL, NORMAL, F(1), HPan_STAR_MC, AB, Z_STAR_MR ); Z_STAR_VR.SumScatterFrom( Z_STAR_MR ); LocalTrsm ( LEFT, LOWER, NORMAL, NON_UNIT, F(1), SInv_STAR_STAR, Z_STAR_VR ); Z_STAR_MR = Z_STAR_VR; LocalGemm( ADJOINT, NORMAL, F(-1), HPan_STAR_MC, Z_STAR_MR, F(1), AB ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } }
void LLN ( UnitOrNonUnit diag, F alpha, const AbstractDistMatrix<F>& LPre, AbstractDistMatrix<F>& XPre, bool checkIfSingular ) { DEBUG_CSE const Int n = LPre.Height(); const Int bsize = Blocksize(); const Grid& g = LPre.Grid(); DistMatrixReadProxy<F,F,MC,MR> LProx( LPre ); DistMatrixReadWriteProxy<F,F,MC,MR> XProx( XPre ); auto& L = LProx.GetLocked(); auto& X = XProx.Get(); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g), X11_STAR_STAR(g); DistMatrix<F,MC, STAR> L21_MC_STAR(g); DistMatrix<F,STAR,MR > X10_STAR_MR(g), X11_STAR_MR(g); DistMatrix<F,STAR,VR > X10_STAR_VR(g); ScaleTrapezoid( alpha, LOWER, X ); for( Int k=0; k<n; k+=bsize ) { const Int nb = Min(bsize,n-k); const Range<Int> ind0( 0, k ), ind1( k, k+nb ), ind2( k+nb, n ); auto L11 = L( ind1, ind1 ); auto L21 = L( ind2, ind1 ); auto X10 = X( ind1, ind0 ); auto X11 = X( ind1, ind1 ); auto X20 = X( ind2, ind0 ); auto X21 = X( ind2, ind1 ); L11_STAR_STAR = L11; X11_STAR_STAR = X11; X10_STAR_VR = X10; LocalTrsm ( LEFT, LOWER, NORMAL, diag, F(1), L11_STAR_STAR, X10_STAR_VR, checkIfSingular ); Trstrm ( LEFT, LOWER, NORMAL, diag, F(1), L11_STAR_STAR, X11_STAR_STAR, checkIfSingular ); X11 = X11_STAR_STAR; X11_STAR_MR.AlignWith( X21 ); X11_STAR_MR = X11_STAR_STAR; MakeTrapezoidal( LOWER, X11_STAR_MR ); X10_STAR_MR.AlignWith( X20 ); X10_STAR_MR = X10_STAR_VR; X10 = X10_STAR_MR; L21_MC_STAR.AlignWith( X20 ); L21_MC_STAR = L21; LocalGemm ( NORMAL, NORMAL, F(-1), L21_MC_STAR, X10_STAR_MR, F(1), X20 ); LocalGemm ( NORMAL, NORMAL, F(-1), L21_MC_STAR, X11_STAR_MR, F(1), X21 ); } }
inline void RLHF ( int offset, const DistMatrix<R>& H, DistMatrix<R>& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLHF"); if( H.Grid() != A.Grid() ) throw std::logic_error("{H,A} must be distributed over the same grid"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); #endif const Grid& g = H.Grid(); DistMatrix<R> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<R> ALeft(g); DistMatrix<R,STAR,VR > HPan_STAR_VR(g); DistMatrix<R,STAR,MR > HPan_STAR_MR(g); DistMatrix<R,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<R,STAR,MC > ZTrans_STAR_MC(g); DistMatrix<R,STAR,VC > ZTrans_STAR_VC(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; LockedView ( HPan, H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); View( ALeft, A, 0, 0, A.Height(), HPanWidth ); HPan_STAR_MR.AlignWith( ALeft ); ZTrans_STAR_MC.AlignWith( ALeft ); ZTrans_STAR_VC.AlignWith( ALeft ); Zeros( HPan.Height(), ALeft.Height(), ZTrans_STAR_MC ); Zeros( HPan.Height(), HPan.Height(), SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonal( RIGHT, offset, HPanCopy, R(1) ); HPan_STAR_VR = HPanCopy; Syrk ( UPPER, NORMAL, R(1), HPan_STAR_VR.LockedMatrix(), R(0), SInv_STAR_STAR.Matrix() ); SInv_STAR_STAR.SumOverGrid(); HalveMainDiagonal( SInv_STAR_STAR ); HPan_STAR_MR = HPan_STAR_VR; LocalGemm ( NORMAL, TRANSPOSE, R(1), HPan_STAR_MR, ALeft, R(0), ZTrans_STAR_MC ); ZTrans_STAR_VC.SumScatterFrom( ZTrans_STAR_MC ); LocalTrsm ( LEFT, UPPER, TRANSPOSE, NON_UNIT, R(1), SInv_STAR_STAR, ZTrans_STAR_VC ); ZTrans_STAR_MC = ZTrans_STAR_VC; LocalGemm ( TRANSPOSE, NORMAL, R(-1), ZTrans_STAR_MC, HPan_STAR_MR, R(1), ALeft ); //--------------------------------------------------------------------// HPan_STAR_MR.FreeAlignments(); ZTrans_STAR_MC.FreeAlignments(); ZTrans_STAR_VC.FreeAlignments(); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmLLTLarge ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F>& L, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLLTLarge"); if( orientation == NORMAL ) throw std::logic_error("TrsmLLT expects a (Conjugate)Transpose option"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<F> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,MC > L10_STAR_MC(g); DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,MR > X1_STAR_MR(g); DistMatrix<F,STAR,VR > X1_STAR_VR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); L10_STAR_MC.AlignWith( X0 ); X1_STAR_MR.AlignWith( X0 ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; // L11[* ,* ] <- L11[MC,MR] X1_STAR_VR = X1; // X1[* ,VR] <- X1[MC,MR] // X1[* ,VR] := L11^-[T/H][* ,* ] X1[* ,VR] LocalTrsm ( LEFT, LOWER, orientation, diag, F(1), L11_STAR_STAR, X1_STAR_VR, checkIfSingular ); X1_STAR_MR = X1_STAR_VR; // X1[* ,MR] <- X1[* ,VR] X1 = X1_STAR_MR; // X1[MC,MR] <- X1[* ,MR] L10_STAR_MC = L10; // L10[* ,MC] <- L10[MC,MR] // X0[MC,MR] -= (L10[* ,MC])^(T/H) X1[* ,MR] // = L10^[T/H][MC,* ] X1[* ,MR] LocalGemm ( orientation, NORMAL, F(-1), L10_STAR_MC, X1_STAR_MR, F(1), X0 ); //--------------------------------------------------------------------// L10_STAR_MC.FreeAlignments(); X1_STAR_MR.FreeAlignments(); SlideLockedPartitionUpDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void RLHF ( Conjugation conjugation, int offset, const DistMatrix<Complex<R> >& H, const DistMatrix<Complex<R>,MD,STAR>& t, DistMatrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLHF"); if( H.Grid() != t.Grid() || t.Grid() != A.Grid() ) throw std::logic_error ("{H,t,A} must be distributed over the same grid"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); if( !t.AlignedWithDiagonal( H, offset ) ) throw std::logic_error("t must be aligned with H's 'offset' diagonal"); #endif typedef Complex<R> C; const Grid& g = H.Grid(); DistMatrix<C> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<C> ALeft(g); DistMatrix<C,MD,STAR> tT(g), t0(g), tB(g), t1(g), t2(g); DistMatrix<C,STAR,VR > HPan_STAR_VR(g); DistMatrix<C,STAR,MR > HPan_STAR_MR(g); DistMatrix<C,STAR,STAR> t1_STAR_STAR(g); DistMatrix<C,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<C,STAR,MC > ZAdj_STAR_MC(g); DistMatrix<C,STAR,VC > ZAdj_STAR_VC(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; LockedView ( HPan, H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanHeight ); View( ALeft, A, 0, 0, A.Height(), HPanWidth ); HPan_STAR_MR.AlignWith( ALeft ); ZAdj_STAR_MC.AlignWith( ALeft ); ZAdj_STAR_VC.AlignWith( ALeft ); Zeros( HPan.Height(), ALeft.Height(), ZAdj_STAR_MC ); Zeros( HPan.Height(), HPan.Height(), SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonal( RIGHT, offset, HPanCopy, C(1) ); HPan_STAR_VR = HPanCopy; Herk ( UPPER, NORMAL, C(1), HPan_STAR_VR.LockedMatrix(), C(0), SInv_STAR_STAR.Matrix() ); SInv_STAR_STAR.SumOverGrid(); t1_STAR_STAR = t1; FixDiagonal( conjugation, t1_STAR_STAR, SInv_STAR_STAR ); HPan_STAR_MR = HPan_STAR_VR; LocalGemm ( NORMAL, ADJOINT, C(1), HPan_STAR_MR, ALeft, C(0), ZAdj_STAR_MC ); ZAdj_STAR_VC.SumScatterFrom( ZAdj_STAR_MC ); LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, C(1), SInv_STAR_STAR, ZAdj_STAR_VC ); ZAdj_STAR_MC = ZAdj_STAR_VC; LocalGemm ( ADJOINT, NORMAL, C(-1), ZAdj_STAR_MC, HPan_STAR_MR, C(1), ALeft ); //--------------------------------------------------------------------// HPan_STAR_MR.FreeAlignments(); ZAdj_STAR_MC.FreeAlignments(); ZAdj_STAR_VC.FreeAlignments(); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TwoSidedTrsmLVar5 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& L ) { #ifndef RELEASE CallStackEntry entry("internal::TwoSidedTrsmLVar5"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( L.Height() != L.Width() ) LogicError("Triangular matrices must be square"); if( A.Height() != L.Height() ) LogicError("A and L must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,MC, STAR> A21_MC_STAR(g); DistMatrix<F,VC, STAR> A21_VC_STAR(g); DistMatrix<F,VR, STAR> A21_VR_STAR(g); DistMatrix<F,STAR,MR > A21Adj_STAR_MR(g); DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,MC, STAR> L21_MC_STAR(g); DistMatrix<F,VC, STAR> L21_VC_STAR(g); DistMatrix<F,VR, STAR> L21_VR_STAR(g); DistMatrix<F,STAR,MR > L21Adj_STAR_MR(g); DistMatrix<F,VC, STAR> Y21_VC_STAR(g); DistMatrix<F> Y21(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); A21_MC_STAR.AlignWith( A22 ); A21_VC_STAR.AlignWith( A22 ); A21_VR_STAR.AlignWith( A22 ); A21Adj_STAR_MR.AlignWith( A22 ); L21_MC_STAR.AlignWith( A22 ); L21_VC_STAR.AlignWith( A22 ); L21_VR_STAR.AlignWith( A22 ); L21Adj_STAR_MR.AlignWith( A22 ); Y21.AlignWith( A21 ); Y21_VC_STAR.AlignWith( A22 ); //--------------------------------------------------------------------// // A11 := inv(L11) A11 inv(L11)' L11_STAR_STAR = L11; A11_STAR_STAR = A11; LocalTwoSidedTrsm( LOWER, diag, A11_STAR_STAR, L11_STAR_STAR ); A11 = A11_STAR_STAR; // Y21 := L21 A11 L21_VC_STAR = L21; Zeros( Y21_VC_STAR, A21.Height(), A21.Width() ); Hemm ( RIGHT, LOWER, F(1), A11_STAR_STAR.Matrix(), L21_VC_STAR.Matrix(), F(0), Y21_VC_STAR.Matrix() ); Y21 = Y21_VC_STAR; // A21 := A21 inv(L11)' A21_VC_STAR = A21; LocalTrsm ( RIGHT, LOWER, ADJOINT, diag, F(1), L11_STAR_STAR, A21_VC_STAR ); A21 = A21_VC_STAR; // A21 := A21 - 1/2 Y21 Axpy( F(-1)/F(2), Y21, A21 ); // A22 := A22 - (L21 A21' + A21 L21') A21_MC_STAR = A21; L21_MC_STAR = L21; A21_VC_STAR = A21_MC_STAR; A21_VR_STAR = A21_VC_STAR; L21_VR_STAR = L21_VC_STAR; A21Adj_STAR_MR.AdjointFrom( A21_VR_STAR ); L21Adj_STAR_MR.AdjointFrom( L21_VR_STAR ); LocalTrr2k ( LOWER, F(-1), L21_MC_STAR, A21Adj_STAR_MR, A21_MC_STAR, L21Adj_STAR_MR, F(1), A22 ); // A21 := A21 - 1/2 Y21 Axpy( F(-1)/F(2), Y21, A21 ); // A21 := inv(L22) A21 // // This is the bottleneck because A21 only has blocksize columns Trsm( LEFT, LOWER, NORMAL, diag, F(1), L22, A21 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /**********************************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); } }
inline void UVar3( UnitOrNonUnit diag, DistMatrix<F>& U ) { #ifndef RELEASE CallStackEntry entry("triangular_inverse::UVar3"); if( U.Height() != U.Width() ) LogicError("Nonsquare matrices cannot be triangular"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,VC, STAR> U01_VC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,VR > U12_STAR_VR(g); DistMatrix<F,STAR,MC > U01Trans_STAR_MC(g); DistMatrix<F,MR, STAR> U12Trans_MR_STAR(g); // Start the algorithm PartitionUpDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( UBR.Height() < U.Height() ) { RepartitionUpDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); U01Trans_STAR_MC.AlignWith( U02 ); U12Trans_MR_STAR.AlignWith( U02 ); //--------------------------------------------------------------------// U01_VC_STAR = U01; U11_STAR_STAR = U11; LocalTrsm ( RIGHT, UPPER, NORMAL, diag, F(-1), U11_STAR_STAR, U01_VC_STAR ); // We transpose before the communication to avoid cache-thrashing // in the unpacking stage. U12Trans_MR_STAR.TransposeFrom( U12 ); U01Trans_STAR_MC.TransposeFrom( U01_VC_STAR ); LocalGemm ( TRANSPOSE, TRANSPOSE, F(1), U01Trans_STAR_MC, U12Trans_MR_STAR, F(1), U02 ); U01.TransposeFrom( U01Trans_STAR_MC ); U12_STAR_VR.TransposeFrom( U12Trans_MR_STAR ); LocalTrsm ( LEFT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, U12_STAR_VR ); LocalTriangularInverse( UPPER, diag, U11_STAR_STAR ); U11 = U11_STAR_STAR; U12 = U12_STAR_VR; //--------------------------------------------------------------------// SlidePartitionUpDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); } }
inline void TrsmLUNMedium ( UnitOrNonUnit diag, F alpha, const DistMatrix<F>& U, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE CallStackEntry entry("internal::TrsmLUNMedium"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,MR, STAR> X1Trans_MR_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); U01_MC_STAR.AlignWith( X0 ); X1Trans_MR_STAR.AlignWith( X0 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; // U11[* ,* ] <- U11[MC,MR] X1Trans_MR_STAR.TransposeFrom( X1 ); // X1[* ,MR] <- X1[MC,MR] // X1[* ,MR] := U11^-1[* ,* ] X1[* ,MR] // // X1^T[MR,* ] := X1^T[MR,* ] U11^-T[* ,* ] LocalTrsm ( RIGHT, UPPER, TRANSPOSE, diag, F(1), U11_STAR_STAR, X1Trans_MR_STAR, checkIfSingular ); X1.TransposeFrom( X1Trans_MR_STAR ); U01_MC_STAR = U01; // U01[MC,* ] <- U01[MC,MR] // X0[MC,MR] -= U01[MC,* ] X1[* ,MR] LocalGemm ( NORMAL, TRANSPOSE, F(-1), U01_MC_STAR, X1Trans_MR_STAR, F(1), X0 ); //--------------------------------------------------------------------// U01_MC_STAR.FreeAlignments(); X1Trans_MR_STAR.FreeAlignments(); SlideLockedPartitionUpDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } }
inline void TrsmLLTSmall ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F,STAR,VR>& L, DistMatrix<F,VR,STAR>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLLTSmall"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( orientation == NORMAL ) throw std::logic_error("TrsmLLT expects a (Conjugate)Transpose option"); if( L.Height() != L.Width() || L.Height() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrsmLLT: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( L.RowAlignment() != X.ColAlignment() ) throw std::logic_error("L and X must be aligned"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<F,STAR,VR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F,VR,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,STAR> X1_STAR_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; // L11[* ,* ] <- L11[* ,VR] X1_STAR_STAR = X1; // X1[* ,* ] <- X1[VR,* ] // X1[* ,* ] := L11^-[T/H][* ,* ] X1[* ,* ] LocalTrsm ( LEFT, LOWER, orientation, diag, F(1), L11_STAR_STAR, X1_STAR_STAR, checkIfSingular ); X1 = X1_STAR_STAR; // X0[VR,* ] -= L10[* ,VR]^(T/H) X1[* ,* ] LocalGemm( orientation, NORMAL, F(-1), L10, X1_STAR_STAR, F(1), X0 ); //--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TwoSidedTrsmUVar4 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U ) { #ifndef RELEASE CallStackEntry entry("internal::TwoSidedTrsmUVar4"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( U.Height() != U.Width() ) LogicError("Triangular matrices must be square"); if( A.Height() != U.Height() ) LogicError("A and U must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,VC, STAR> A01_VC_STAR(g); DistMatrix<F,STAR,MC > A01Trans_STAR_MC(g); DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,VC > A12_STAR_VC(g); DistMatrix<F,STAR,MC > A12_STAR_MC(g); DistMatrix<F,STAR,MR > A12_STAR_MR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,MR, STAR> U12Trans_MR_STAR(g); DistMatrix<F,VR, STAR> U12Trans_VR_STAR(g); DistMatrix<F,STAR,VR > U12_STAR_VR(g); DistMatrix<F,STAR,VC > U12_STAR_VC(g); DistMatrix<F,STAR,MC > U12_STAR_MC(g); DistMatrix<F,STAR,VR > Y12_STAR_VR(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); A01_VC_STAR.AlignWith( A02 ); A01Trans_STAR_MC.AlignWith( A02 ); A12_STAR_VR.AlignWith( A22 ); A12_STAR_VC.AlignWith( A22 ); A12_STAR_MC.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); U12Trans_MR_STAR.AlignWith( A02 ); U12Trans_VR_STAR.AlignWith( A02 ); U12_STAR_VR.AlignWith( A02 ); U12_STAR_VC.AlignWith( A22 ); U12_STAR_MC.AlignWith( A22 ); Y12_STAR_VR.AlignWith( A12 ); //--------------------------------------------------------------------// // A01 := A01 inv(U11) A01_VC_STAR = A01; U11_STAR_STAR = U11; LocalTrsm ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, A01_VC_STAR ); A01 = A01_VC_STAR; // A11 := inv(U11)' A11 inv(U11) A11_STAR_STAR = A11; LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR ); A11 = A11_STAR_STAR; // A02 := A02 - A01 U12 A01Trans_STAR_MC.TransposeFrom( A01_VC_STAR ); U12Trans_MR_STAR.TransposeFrom( U12 ); LocalGemm ( TRANSPOSE, TRANSPOSE, F(-1), A01Trans_STAR_MC, U12Trans_MR_STAR, F(1), A02 ); // Y12 := A11 U12 U12Trans_VR_STAR = U12Trans_MR_STAR; Zeros( U12_STAR_VR, A12.Height(), A12.Width() ); Transpose( U12Trans_VR_STAR.Matrix(), U12_STAR_VR.Matrix() ); Zeros( Y12_STAR_VR, A12.Height(), A12.Width() ); Hemm ( LEFT, UPPER, F(1), A11_STAR_STAR.Matrix(), U12_STAR_VR.Matrix(), F(0), Y12_STAR_VR.Matrix() ); // A12 := inv(U11)' A12 A12_STAR_VR = A12; LocalTrsm ( LEFT, UPPER, ADJOINT, diag, F(1), U11_STAR_STAR, A12_STAR_VR ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12_STAR_VR, A12_STAR_VR ); // A22 := A22 - (A12' U12 + U12' A12) A12_STAR_MR = A12_STAR_VR; A12_STAR_VC = A12_STAR_VR; U12_STAR_VC = U12_STAR_VR; A12_STAR_MC = A12_STAR_VC; U12_STAR_MC = U12_STAR_VC; LocalTrr2k ( UPPER, ADJOINT, TRANSPOSE, ADJOINT, F(-1), A12_STAR_MC, U12Trans_MR_STAR, U12_STAR_MC, A12_STAR_MR, F(1), A22 ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12_STAR_VR, A12_STAR_VR ); A12 = A12_STAR_VR; //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /**********************************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } }
inline void TwoSidedTrsmUVar1 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U ) { #ifndef RELEASE CallStackEntry entry("internal::TwoSidedTrsmUVar1"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( U.Height() != U.Width() ) LogicError("Triangular matrices must be square"); if( A.Height() != U.Height() ) LogicError("A and U must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,VC, STAR> A01_VC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,VC, STAR> U01_VC_STAR(g); DistMatrix<F,VR, STAR> U01_VR_STAR(g); DistMatrix<F,STAR,MR > U01Adj_STAR_MR(g); DistMatrix<F,STAR,STAR> X11_STAR_STAR(g); DistMatrix<F,MR, MC > Z01_MR_MC(g); DistMatrix<F,MC, STAR> Z01_MC_STAR(g); DistMatrix<F,MR, STAR> Z01_MR_STAR(g); DistMatrix<F> Y01(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); A01_VC_STAR.AlignWith( A01 ); U01_MC_STAR.AlignWith( A00 ); U01_VR_STAR.AlignWith( A00 ); U01_VC_STAR.AlignWith( A00 ); U01Adj_STAR_MR.AlignWith( A00 ); Y01.AlignWith( A01 ); Z01_MR_MC.AlignWith( A01 ); Z01_MC_STAR.AlignWith( A00 ); Z01_MR_STAR.AlignWith( A00 ); //--------------------------------------------------------------------// // Y01 := A00 U01 U01_MC_STAR = U01; U01_VR_STAR = U01_MC_STAR; U01Adj_STAR_MR.AdjointFrom( U01_VR_STAR ); Zeros( Z01_MC_STAR, A01.Height(), A01.Width() ); Zeros( Z01_MR_STAR, A01.Height(), A01.Width() ); LocalSymmetricAccumulateLU ( ADJOINT, F(1), A00, U01_MC_STAR, U01Adj_STAR_MR, Z01_MC_STAR, Z01_MR_STAR ); Z01_MR_MC.SumScatterFrom( Z01_MR_STAR ); Y01 = Z01_MR_MC; Y01.SumScatterUpdate( F(1), Z01_MC_STAR ); // A01 := inv(U00)' A01 // // This is the bottleneck because A01 only has blocksize columns Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U00, A01 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A11 := A11 - (U01' A01 + A01' U01) A01_VC_STAR = A01; U01_VC_STAR = U01_MC_STAR; Zeros( X11_STAR_STAR, A11.Height(), A11.Width() ); Her2k ( UPPER, ADJOINT, F(-1), A01_VC_STAR.Matrix(), U01_VC_STAR.Matrix(), F(0), X11_STAR_STAR.Matrix() ); A11.SumScatterUpdate( F(1), X11_STAR_STAR ); // A11 := inv(U11)' A11 inv(U11) A11_STAR_STAR = A11; U11_STAR_STAR = U11; LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR ); A11 = A11_STAR_STAR; // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A01 := A01 inv(U11) A01_VC_STAR = A01; LocalTrsm ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, A01_VC_STAR ); A01 = A01_VC_STAR; //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } }
inline void LU( DistMatrix<F>& A, DistMatrix<int,VC,STAR>& p ) { #ifndef RELEASE CallStackEntry entry("LU"); if( A.Grid() != p.Grid() ) throw std::logic_error("{A,p} must be distributed over the same grid"); if( p.Viewing() && (std::min(A.Height(),A.Width()) != p.Height() || p.Width() != 1) ) throw std::logic_error ("p must be a vector of the same height as the min dimension of A."); #endif const Grid& g = A.Grid(); if( !p.Viewing() ) p.ResizeTo( std::min(A.Height(),A.Width()), 1 ); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), AB(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<int,VC,STAR> pT(g), p0(g), pB(g), p1(g), p2(g); // Temporary distributions DistMatrix<F, STAR,STAR> A11_STAR_STAR(g); DistMatrix<F, MC, STAR> A21_MC_STAR(g); DistMatrix<F, STAR,VR > A12_STAR_VR(g); DistMatrix<F, STAR,MR > A12_STAR_MR(g); DistMatrix<int,STAR,STAR> p1_STAR_STAR(g); // Pivot composition std::vector<int> image, preimage; // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDown ( p, pT, pB, 0 ); while( ATL.Height() < A.Height() && ATL.Width() < A.Width() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); RepartitionDown ( pT, p0, /**/ /**/ p1, pB, p2 ); View1x2( AB, ABL, ABR ); const int pivotOffset = A01.Height(); A12_STAR_VR.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); A21_MC_STAR.AlignWith( A22 ); A11_STAR_STAR.ResizeTo( A11.Height(), A11.Width() ); p1_STAR_STAR.ResizeTo( p1.Height(), 1 ); //--------------------------------------------------------------------// A21_MC_STAR = A21; A11_STAR_STAR = A11; lu::Panel( A11_STAR_STAR, A21_MC_STAR, p1_STAR_STAR, pivotOffset ); ComposePivots( p1_STAR_STAR, pivotOffset, image, preimage ); ApplyRowPivots( AB, image, preimage ); // Perhaps we should give up perfectly distributing this operation since // it's total contribution is only O(n^2) A12_STAR_VR = A12; LocalTrsm ( LEFT, LOWER, NORMAL, UNIT, F(1), A11_STAR_STAR, A12_STAR_VR ); A12_STAR_MR = A12_STAR_VR; LocalGemm( NORMAL, NORMAL, F(-1), A21_MC_STAR, A12_STAR_MR, F(1), A22 ); A11 = A11_STAR_STAR; A12 = A12_STAR_MR; A21 = A21_MC_STAR; p1 = p1_STAR_STAR; //--------------------------------------------------------------------// A12_STAR_VR.FreeAlignments(); A12_STAR_MR.FreeAlignments(); A21_MC_STAR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlidePartitionDown ( pT, p0, p1, /**/ /**/ pB, p2 ); } }
inline void ApplyPackedReflectorsLUVF ( int offset, const DistMatrix<R>& H, DistMatrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUVF"); if( H.Grid() != A.Grid() ) throw std::logic_error("{H,A} must be distributed over the same grid"); if( offset < 0 || offset > H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); #endif const Grid& g = H.Grid(); DistMatrix<R> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<R> AT(g), A0(g), ATop(g), AB(g), A1(g), A2(g); DistMatrix<R> HPanCopy(g); DistMatrix<R,VC, STAR> HPan_VC_STAR(g); DistMatrix<R,MC, STAR> HPan_MC_STAR(g); DistMatrix<R,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<R,STAR,MR > Z_STAR_MR(g); DistMatrix<R,STAR,VR > Z_STAR_VR(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanHeight = H01.Height() + H11.Height(); const int HPanOffset = std::min( H11.Width(), std::max(offset-H00.Width(),0) ); const int HPanWidth = H11.Width()-HPanOffset; HPan.LockedView( H, 0, H00.Width()+HPanOffset, HPanHeight, HPanWidth ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); ATop.View2x1( A0, A1 ); HPan_MC_STAR.AlignWith( ATop ); Z_STAR_MR.AlignWith( ATop ); Z_STAR_VR.AlignWith( ATop ); Zeros( HPan.Width(), ATop.Width(), Z_STAR_MR ); Zeros( HPan.Width(), HPan.Width(), SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, UPPER, offset, HPanCopy ); SetDiagonalToOne( RIGHT, offset, HPanCopy ); HPan_VC_STAR = HPanCopy; Syrk ( LOWER, TRANSPOSE, R(1), HPan_VC_STAR.LockedLocalMatrix(), R(0), SInv_STAR_STAR.LocalMatrix() ); SInv_STAR_STAR.SumOverGrid(); HalveMainDiagonal( SInv_STAR_STAR ); HPan_MC_STAR = HPanCopy; LocalGemm ( TRANSPOSE, NORMAL, R(1), HPan_MC_STAR, ATop, R(0), Z_STAR_MR ); Z_STAR_VR.SumScatterFrom( Z_STAR_MR ); LocalTrsm ( LEFT, LOWER, NORMAL, NON_UNIT, R(1), SInv_STAR_STAR, Z_STAR_VR ); Z_STAR_MR = Z_STAR_VR; LocalGemm( NORMAL, NORMAL, R(-1), HPan_MC_STAR, Z_STAR_MR, R(1), ATop ); //--------------------------------------------------------------------// HPan_MC_STAR.FreeAlignments(); Z_STAR_MR.FreeAlignments(); Z_STAR_VR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void LU( DistMatrix<F>& A ) { #ifndef RELEASE CallStackEntry entry("LU"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,MC, STAR> A21_MC_STAR(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,MR > A12_STAR_MR(g); // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); while( ATL.Height() < A.Height() && ATL.Width() < A.Width() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); A12_STAR_VR.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); A21_MC_STAR.AlignWith( A22 ); A11_STAR_STAR.ResizeTo( A11.Height(), A11.Width() ); //--------------------------------------------------------------------// A11_STAR_STAR = A11; LocalLU( A11_STAR_STAR ); A11 = A11_STAR_STAR; A21_MC_STAR = A21; LocalTrsm ( RIGHT, UPPER, NORMAL, NON_UNIT, F(1), A11_STAR_STAR, A21_MC_STAR ); A21 = A21_MC_STAR; // Perhaps we should give up perfectly distributing this operation since // it's total contribution is only O(n^2) A12_STAR_VR = A12; LocalTrsm ( LEFT, LOWER, NORMAL, UNIT, F(1), A11_STAR_STAR, A12_STAR_VR ); A12_STAR_MR = A12_STAR_VR; LocalGemm( NORMAL, NORMAL, F(-1), A21_MC_STAR, A12_STAR_MR, F(1), A22 ); A12 = A12_STAR_MR; //--------------------------------------------------------------------// A12_STAR_VR.FreeAlignments(); A12_STAR_MR.FreeAlignments(); A21_MC_STAR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); } }
inline void LVar2( DistMatrix<F>& A ) { #ifndef RELEASE CallStackEntry entry("cholesky::LVar2"); if( A.Height() != A.Width() ) LogicError("Can only compute Cholesky factor of square matrices"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); // Temporary distributions DistMatrix<F,MR, STAR> A10Adj_MR_STAR(g); DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,VC, STAR> A21_VC_STAR(g); DistMatrix<F,MC, STAR> X11_MC_STAR(g); DistMatrix<F,MC, STAR> X21_MC_STAR(g); // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); A10Adj_MR_STAR.AlignWith( A10 ); X11_MC_STAR.AlignWith( A10 ); X21_MC_STAR.AlignWith( A20 ); //--------------------------------------------------------------------// A10Adj_MR_STAR.AdjointFrom( A10 ); LocalGemm( NORMAL, NORMAL, F(1), A10, A10Adj_MR_STAR, X11_MC_STAR ); A11.SumScatterUpdate( F(-1), X11_MC_STAR ); A11_STAR_STAR = A11; LocalCholesky( LOWER, A11_STAR_STAR ); A11 = A11_STAR_STAR; LocalGemm( NORMAL, NORMAL, F(1), A20, A10Adj_MR_STAR, X21_MC_STAR ); A21.SumScatterUpdate( F(-1), X21_MC_STAR ); A21_VC_STAR = A21; LocalTrsm ( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), A11_STAR_STAR, A21_VC_STAR ); A21 = A21_VC_STAR; //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); } }
inline void TwoSidedTrsmUVar5 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U ) { #ifndef RELEASE PushCallStack("internal::TwoSidedTrsmUVar5"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( U.Height() != U.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != U.Height() ) throw std::logic_error("A and U must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,MC > A12_STAR_MC(g); DistMatrix<F,STAR,MR > A12_STAR_MR(g); DistMatrix<F,STAR,VC > A12_STAR_VC(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MC > U12_STAR_MC(g); DistMatrix<F,STAR,MR > U12_STAR_MR(g); DistMatrix<F,STAR,VC > U12_STAR_VC(g); DistMatrix<F,STAR,VR > U12_STAR_VR(g); DistMatrix<F,STAR,VR > Y12_STAR_VR(g); DistMatrix<F> Y12(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); A12_STAR_MC.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); A12_STAR_VC.AlignWith( A22 ); A12_STAR_VR.AlignWith( A22 ); U12_STAR_MC.AlignWith( A22 ); U12_STAR_MR.AlignWith( A22 ); U12_STAR_VC.AlignWith( A22 ); U12_STAR_VR.AlignWith( A22 ); Y12.AlignWith( A12 ); Y12_STAR_VR.AlignWith( A12 ); //--------------------------------------------------------------------// // A11 := inv(U11)' A11 inv(U11) U11_STAR_STAR = U11; A11_STAR_STAR = A11; LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR ); A11 = A11_STAR_STAR; // Y12 := A11 U12 U12_STAR_VR = U12; Y12_STAR_VR.ResizeTo( A12.Height(), A12.Width() ); Hemm ( LEFT, UPPER, F(1), A11_STAR_STAR.LocalMatrix(), U12_STAR_VR.LocalMatrix(), F(0), Y12_STAR_VR.LocalMatrix() ); Y12 = Y12_STAR_VR; // A12 := inv(U11)' A12 A12_STAR_VR = A12; LocalTrsm ( LEFT, UPPER, ADJOINT, diag, F(1), U11_STAR_STAR, A12_STAR_VR ); A12 = A12_STAR_VR; // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A22 := A22 - (A12' U12 + U12' A12) A12_STAR_VR = A12; A12_STAR_VC = A12_STAR_VR; U12_STAR_VC = U12_STAR_VR; A12_STAR_MC = A12_STAR_VC; U12_STAR_MC = U12_STAR_VC; A12_STAR_MR = A12_STAR_VR; U12_STAR_MR = U12_STAR_VR; LocalTrr2k ( UPPER, ADJOINT, ADJOINT, F(-1), U12_STAR_MC, A12_STAR_MR, A12_STAR_MC, U12_STAR_MR, F(1), A22 ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A12 := A12 inv(U22) // // This is the bottleneck because A12 only has blocksize rows Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U22, A12 ); //--------------------------------------------------------------------// A12_STAR_MC.FreeAlignments(); A12_STAR_MR.FreeAlignments(); A12_STAR_VC.FreeAlignments(); A12_STAR_VR.FreeAlignments(); U12_STAR_MC.FreeAlignments(); U12_STAR_MR.FreeAlignments(); U12_STAR_VC.FreeAlignments(); U12_STAR_VR.FreeAlignments(); Y12.FreeAlignments(); Y12_STAR_VR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } #ifndef RELEASE PopCallStack(); #endif }