inline void TrmmLLNA ( UnitOrNonUnit diag, T alpha, const DistMatrix<T>& L, DistMatrix<T>& X ) { #ifndef RELEASE PushCallStack("internal::TrmmLLNA"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( L.Height() != L.Width() || L.Width() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrmmLLNA: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = L.Grid(); DistMatrix<T> XL(g), XR(g), X0(g), X1(g), X2(g); DistMatrix<T,VR, STAR> X1_VR_STAR(g); DistMatrix<T,STAR,MR > X1Trans_STAR_MR(g); DistMatrix<T,MC, STAR> Z1_MC_STAR(g); X1_VR_STAR.AlignWith( L ); X1Trans_STAR_MR.AlignWith( L ); Z1_MC_STAR.AlignWith( L ); PartitionRight( X, XL, XR, 0 ); while( XL.Width() < X.Width() ) { RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); Zeros( X1.Height(), X1.Width(), Z1_MC_STAR ); //--------------------------------------------------------------------// X1_VR_STAR = X1; X1Trans_STAR_MR.TransposeFrom( X1_VR_STAR ); LocalTrmmAccumulateLLN ( TRANSPOSE, diag, alpha, L, X1Trans_STAR_MR, Z1_MC_STAR ); X1.SumScatterFrom( Z1_MC_STAR ); //--------------------------------------------------------------------// SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void GemmNNA ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::GemmNNA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal GemmNNA: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,MR> B1Trans_STAR_MR(g); DistMatrix<T,MC,STAR> D1_MC_STAR(g); B1_VR_STAR.AlignWith( A ); B1Trans_STAR_MR.AlignWith( A ); D1_MC_STAR.AlignWith( A ); // Start the algorithm Scale( beta, C ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); Zeros( C1.Height(), C1.Width(), D1_MC_STAR ); //--------------------------------------------------------------------// B1_VR_STAR = B1; B1Trans_STAR_MR.TransposeFrom( B1_VR_STAR ); // D1[MC,*] := alpha A[MC,MR] B1[MR,*] LocalGemm ( NORMAL, TRANSPOSE, alpha, A, B1Trans_STAR_MR, T(0), D1_MC_STAR ); // C1[MC,MR] += scattered result of D1[MC,*] summed over grid rows C1.SumScatterUpdate( T(1), D1_MC_STAR ); //--------------------------------------------------------------------// SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmRLT ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F>& L, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE CallStackEntry entry("internal::TrsmRLT"); if( orientation == NORMAL ) LogicError("TrsmRLT expects a (Conjugate)Transpose option"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<F> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F> XL(g), XR(g), X0(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,VR, STAR> L21_VR_STAR(g); DistMatrix<F,STAR,MR > L21AdjOrTrans_STAR_MR(g); DistMatrix<F,VC, STAR> X1_VC_STAR(g); DistMatrix<F,STAR,MC > X1Trans_STAR_MC(g); // Start the algorithm Scale( alpha, X ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionRight( X, XL, XR, 0 ); while( XR.Width() > 0 ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); X1_VC_STAR.AlignWith( X2 ); X1Trans_STAR_MC.AlignWith( X2 ); L21_VR_STAR.AlignWith( X2 ); L21AdjOrTrans_STAR_MR.AlignWith( X2 ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; X1_VC_STAR = X1; LocalTrsm ( RIGHT, LOWER, orientation, diag, F(1), L11_STAR_STAR, X1_VC_STAR, checkIfSingular ); X1Trans_STAR_MC.TransposeFrom( X1_VC_STAR ); X1.TransposeFrom( X1Trans_STAR_MC ); L21_VR_STAR = L21; if( orientation == ADJOINT ) L21AdjOrTrans_STAR_MR.AdjointFrom( L21_VR_STAR ); else L21AdjOrTrans_STAR_MR.TransposeFrom( L21_VR_STAR ); // X2[MC,MR] -= X1[MC,*] (L21[MR,*])^(T/H) // = X1^T[* ,MC] (L21^(T/H))[*,MR] LocalGemm ( TRANSPOSE, NORMAL, F(-1), X1Trans_STAR_MC, L21AdjOrTrans_STAR_MR, F(1), X2 ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } }
void ApplyPackedReflectorsRUHF ( Conjugation conjugation, int offset, const DistMatrix<Complex<R> >& H, const DistMatrix<Complex<R>,MD,STAR>& t, DistMatrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsRUHF"); if( H.Grid() != t.Grid() || t.Grid() != A.Grid() ) throw std::logic_error ("{H,t,A} must be distributed over the same grid"); if( offset < 0 || offset > H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Length of transforms must equal width of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); if( !t.AlignedWithDiagonal( H, offset ) ) throw std::logic_error("t must be aligned with H's 'offset' diagonal"); #endif typedef Complex<R> C; const Grid& g = H.Grid(); DistMatrix<C> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<C> AL(g), AR(g), A0(g), A1(g), A2(g); DistMatrix<C,MD,STAR> tT(g), t0(g), tB(g), t1(g), t2(g); DistMatrix<C,STAR,VR > HPan_STAR_VR(g); DistMatrix<C,STAR,MR > HPan_STAR_MR(g); DistMatrix<C,STAR,STAR> t1_STAR_STAR(g); DistMatrix<C,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<C,STAR,MC > ZAdj_STAR_MC(g); DistMatrix<C,STAR,VC > ZAdj_STAR_VC(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionRight ( A, AL, AR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H11.Width() + H12.Width(); const int HPanHeight = std::min( H11.Height(), std::max(HPanWidth-offset,0) ); LockedView( HPan, H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanHeight ); RepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2 ); HPan_STAR_MR.AlignWith( AR ); ZAdj_STAR_MC.AlignWith( AR ); ZAdj_STAR_VC.AlignWith( AR ); Zeros( HPanHeight, AR.Height(), ZAdj_STAR_MC ); Zeros( HPanHeight, HPanHeight, SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, UPPER, offset, HPanCopy ); SetDiagonalToOne( LEFT, offset, HPanCopy ); HPan_STAR_VR = HPanCopy; Herk ( UPPER, NORMAL, C(1), HPan_STAR_VR.LockedLocalMatrix(), C(0), SInv_STAR_STAR.LocalMatrix() ); SInv_STAR_STAR.SumOverGrid(); t1_STAR_STAR = t1; FixDiagonal( conjugation, t1_STAR_STAR, SInv_STAR_STAR ); HPan_STAR_MR = HPan_STAR_VR; LocalGemm ( NORMAL, ADJOINT, C(1), HPan_STAR_MR, AR, C(0), ZAdj_STAR_MC ); ZAdj_STAR_VC.SumScatterFrom( ZAdj_STAR_MC ); LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, C(1), SInv_STAR_STAR, ZAdj_STAR_VC ); ZAdj_STAR_MC = ZAdj_STAR_VC; LocalGemm ( ADJOINT, NORMAL, C(-1), ZAdj_STAR_MC, HPan_STAR_MR, C(1), AR ); //--------------------------------------------------------------------// HPan_STAR_MR.FreeAlignments(); ZAdj_STAR_MC.FreeAlignments(); ZAdj_STAR_VC.FreeAlignments(); SlidePartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void ApplyPackedReflectorsRUHF ( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsRUHF"); if( offset < 0 || offset > H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Length of transforms must equal width of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> AL, AR, A0, A1, A2; Matrix<R> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); PartitionRight( A, AL, AR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /**************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); RepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2 ); const int HPanWidth = H11.Width() + H12.Width(); const int HPanHeight = std::min( H11.Height(), std::max(HPanWidth-offset,0) ); LockedView( HPan, H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); Zeros( AR.Height(), HPanHeight, Z ); Zeros( HPanHeight, HPanHeight, SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, UPPER, offset, HPanCopy ); SetDiagonalToOne( LEFT, offset, HPanCopy ); Syrk( UPPER, NORMAL, R(1), HPanCopy, R(0), SInv ); HalveMainDiagonal( SInv ); Gemm( NORMAL, TRANSPOSE, R(1), AR, HPanCopy, R(0), Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( NORMAL, NORMAL, R(-1), Z, HPanCopy, R(1), AR ); //--------------------------------------------------------------------// SlidePartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } #ifndef RELEASE PopCallStack(); #endif }
void ApplyPackedReflectorsRUHF ( Conjugation conjugation, int offset, const Matrix<Complex<R> >& H, const Matrix<Complex<R> >& t, Matrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsRUHF"); if( offset < 0 || offset > H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Length of transforms must equal width of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); #endif typedef Complex<R> C; Matrix<C> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<C> AL, AR, A0, A1, A2; Matrix<C> tT, t0, tB, t1, t2; Matrix<C> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionRight ( A, AL, AR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H11.Width() + H12.Width(); const int HPanHeight = std::min( H11.Height(), std::max(HPanWidth-offset,0) ); LockedView( HPan, H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanHeight ); RepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2 ); Zeros( AR.Height(), HPanHeight, Z ); Zeros( HPanHeight, HPanHeight, SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, UPPER, offset, HPanCopy ); SetDiagonalToOne( LEFT, offset, HPanCopy ); Herk( UPPER, NORMAL, C(1), HPanCopy, C(0), SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( NORMAL, ADJOINT, C(1), AR, HPanCopy, C(0), Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, C(1), SInv, Z ); Gemm( NORMAL, NORMAL, C(-1), Z, HPanCopy, C(1), AR ); //--------------------------------------------------------------------// SlidePartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void internal::GemmTNA ( Orientation orientationOfA, T alpha, const DistMatrix<T,MC,MR>& A, const DistMatrix<T,MC,MR>& B, T beta, DistMatrix<T,MC,MR>& C ) { #ifndef RELEASE PushCallStack("internal::GemmTNA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( orientationOfA == NORMAL ) throw std::logic_error("GemmTNA assumes A is (Conjugate)Transposed"); if( A.Width() != C.Height() || B.Width() != C.Width() || A.Height() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal GemmTNA: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T,MC,MR> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T,MC,MR> CL(g), CR(g), C0(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,MC,STAR> B1_MC_STAR(g); DistMatrix<T,MR,STAR> D1_MR_STAR(g); DistMatrix<T,MR,MC > D1_MR_MC(g); DistMatrix<T,MC,MR > D1(g); // Start the algorithm Scal( beta, C ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); B1_MC_STAR.AlignWith( A ); D1_MR_STAR.AlignWith( A ); D1_MR_STAR.ResizeTo( C1.Height(), C1.Width() ); D1.AlignWith( C1 ); //--------------------------------------------------------------------// B1_MC_STAR = B1; // B1[MC,*] <- B1[MC,MR] // D1[MR,*] := alpha (A1[MC,MR])^T B1[MC,*] // = alpha (A1^T)[MR,MC] B1[MC,*] internal::LocalGemm ( orientationOfA, NORMAL, alpha, A, B1_MC_STAR, (T)0, D1_MR_STAR ); // C1[MC,MR] += scattered & transposed D1[MR,*] summed over grid cols D1_MR_MC.SumScatterFrom( D1_MR_STAR ); D1 = D1_MR_MC; Axpy( (T)1, D1, C1 ); //--------------------------------------------------------------------// B1_MC_STAR.FreeAlignments(); D1_MR_STAR.FreeAlignments(); D1.FreeAlignments(); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void SymmLLA ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::SymmLLA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); #endif const Grid& g = A.Grid(); DistMatrix<T> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g); DistMatrix<T,MC,STAR> B1_MC_STAR(g); DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,MR> B1Trans_STAR_MR(g); DistMatrix<T> Z1(g); DistMatrix<T,MC,STAR> Z1_MC_STAR(g); DistMatrix<T,MR,STAR> Z1_MR_STAR(g); DistMatrix<T,MR,MC > Z1_MR_MC(g); B1_MC_STAR.AlignWith( A ); B1_VR_STAR.AlignWith( A ); B1Trans_STAR_MR.AlignWith( A ); Z1_MC_STAR.AlignWith( A ); Z1_MR_STAR.AlignWith( A ); Scale( beta, C ); LockedPartitionRight ( B, BL, BR, 0 ); PartitionRight ( C, CL, CR, 0 ); while( CL.Width() < C.Width() ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); Z1.AlignWith( C1 ); Zeros( C1.Height(), C1.Width(), Z1_MC_STAR ); Zeros( C1.Height(), C1.Width(), Z1_MR_STAR ); //--------------------------------------------------------------------// B1_MC_STAR = B1; B1_VR_STAR = B1_MC_STAR; B1Trans_STAR_MR.TransposeFrom( B1_VR_STAR ); LocalSymmetricAccumulateLL ( TRANSPOSE, alpha, A, B1_MC_STAR, B1Trans_STAR_MR, Z1_MC_STAR, Z1_MR_STAR ); Z1_MR_MC.SumScatterFrom( Z1_MR_STAR ); Z1 = Z1_MR_MC; Z1.SumScatterUpdate( T(1), Z1_MC_STAR ); Axpy( T(1), Z1, C1 ); //--------------------------------------------------------------------// Z1.FreeAlignments(); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
void RLVF ( Conjugation conjugation, Int offset, const DistMatrix<F>& H, const DistMatrix<F,MD,STAR>& t, DistMatrix<F>& A ) { #ifndef RELEASE CallStackEntry cse("apply_packed_reflectors::RLVF"); if( H.Grid() != t.Grid() || t.Grid() != A.Grid() ) LogicError("{H,t,A} must be distributed over the same grid"); // TODO: Proper dimension checks if( t.Height() != H.DiagonalLength(offset) ) LogicError("t must be the same length as H's offset diag"); if( !t.AlignedWithDiagonal( H, offset ) ) LogicError("t must be aligned with H's offset diagonal"); #endif const Grid& g = H.Grid(); DistMatrix<F> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<F> AL(g), AR(g), A0(g), A1(g), A2(g); DistMatrix<F,MD,STAR> tT(g), t0(g), tB(g), t1(g), t2(g); DistMatrix<F,VC, STAR> HPan_VC_STAR(g); DistMatrix<F,MR, STAR> HPan_MR_STAR(g); DistMatrix<F,STAR,STAR> t1_STAR_STAR(g); DistMatrix<F,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<F,STAR,MC > ZAdj_STAR_MC(g); DistMatrix<F,STAR,VC > ZAdj_STAR_VC(g); LockedPartitionDownOffsetDiagonal ( offset, H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionRight( A, AL, AR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2 ); RepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2, H11.Height() ); LockedView2x1( HPan, H11, H21 ); HPan_MR_STAR.AlignWith( AR ); ZAdj_STAR_MC.AlignWith( AR ); ZAdj_STAR_VC.AlignWith( AR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTriangular( LOWER, HPanCopy ); SetDiagonal( HPanCopy, F(1) ); HPan_VC_STAR = HPanCopy; Zeros( SInv_STAR_STAR, HPan.Width(), HPan.Width() ); Herk ( UPPER, ADJOINT, F(1), HPan_VC_STAR.LockedMatrix(), F(0), SInv_STAR_STAR.Matrix() ); SInv_STAR_STAR.SumOverGrid(); t1_STAR_STAR = t1; FixDiagonal( conjugation, t1_STAR_STAR, SInv_STAR_STAR ); HPan_MR_STAR = HPan_VC_STAR; LocalGemm( ADJOINT, ADJOINT, F(1), HPan_MR_STAR, AR, ZAdj_STAR_MC ); ZAdj_STAR_VC.SumScatterFrom( ZAdj_STAR_MC ); LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, F(1), SInv_STAR_STAR, ZAdj_STAR_VC ); ZAdj_STAR_MC = ZAdj_STAR_VC; LocalGemm ( ADJOINT, ADJOINT, F(-1), ZAdj_STAR_MC, HPan_MR_STAR, F(1), AR ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlidePartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); } }
inline void TrsmRUN ( UnitOrNonUnit diag, F alpha, const DistMatrix<F>& U, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmRUN"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F> XL(g), XR(g), X0(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MR > U12_STAR_MR(g); DistMatrix<F,VC, STAR> X1_VC_STAR(g); DistMatrix<F,STAR,MC > X1Trans_STAR_MC(g); // Start the algorithm Scale( alpha, X ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionRight( X, XL, XR, 0 ); while( XR.Width() > 0 ) { LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); X1_VC_STAR.AlignWith( X2 ); X1Trans_STAR_MC.AlignWith( X2 ); U12_STAR_MR.AlignWith( X2 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; X1_VC_STAR = X1; LocalTrsm ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, X1_VC_STAR, checkIfSingular ); X1Trans_STAR_MC.TransposeFrom( X1_VC_STAR ); X1.TransposeFrom( X1Trans_STAR_MC ); U12_STAR_MR = U12; // X2[MC,MR] -= X1[MC,* ] U12[* ,MR] // = X1^T[* ,MC] U12[* ,MR] LocalGemm ( TRANSPOSE, NORMAL, F(-1), X1Trans_STAR_MC, U12_STAR_MR, F(1), X2 ); //--------------------------------------------------------------------// X1_VC_STAR.FreeAlignments(); X1Trans_STAR_MC.FreeAlignments(); U12_STAR_MR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void PanelLU ( DistMatrix<F, STAR,STAR>& A, DistMatrix<F, MC, STAR>& B, DistMatrix<int,STAR,STAR>& p, int pivotOffset ) { #ifndef RELEASE PushCallStack("internal::PanelLU"); if( A.Grid() != p.Grid() || p.Grid() != B.Grid() ) throw std::logic_error ("Matrices must be distributed over the same grid"); if( A.Width() != B.Width() ) throw std::logic_error("A and B must be the same width"); if( A.Height() != p.Height() || p.Width() != 1 ) throw std::logic_error("p must be a vector that conforms with A"); #endif const Grid& g = A.Grid(); const int r = g.Height(); const int colShift = B.ColShift(); const int colAlignment = B.ColAlignment(); // Matrix views DistMatrix<F,STAR,STAR> ATL(g), ATR(g), A00(g), a01(g), A02(g), ABL(g), ABR(g), a10(g), alpha11(g), a12(g), A20(g), a21(g), A22(g); DistMatrix<F,MC,STAR> BL(g), BR(g), B0(g), b1(g), B2(g); const int width = A.Width(); const int numBytes = (width+1)*sizeof(F)+sizeof(int); std::vector<byte> sendData(numBytes); std::vector<byte> recvData(numBytes); // Extract pointers to send and recv data // TODO: Think of how to make this safer with respect to alignment issues F* sendBufFloat = (F*)&sendData[0]; F* recvBufFloat = (F*)&recvData[0]; int* sendBufInt = (int*)&sendData[(width+1)*sizeof(F)]; int* recvBufInt = (int*)&recvData[(width+1)*sizeof(F)]; // Start the algorithm PushBlocksizeStack( 1 ); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionRight( B, BL, BR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ a01, A02, /*************/ /**********************/ /**/ a10, /**/ alpha11, a12, ABL, /**/ ABR, A20, /**/ a21, A22 ); RepartitionRight ( BL, /**/ BR, B0, /**/ b1, B2 ); //--------------------------------------------------------------------// const int currentRow = a01.Height(); // Store the index/value of the pivot candidate in A F pivot = alpha11.GetLocal(0,0); int pivotRow = currentRow; for( int i=0; i<a21.Height(); ++i ) { F value = a21.GetLocal(i,0); if( FastAbs(value) > FastAbs(pivot) ) { pivot = value; pivotRow = currentRow + i + 1; } } // Update the pivot candidate to include local data from B for( int i=0; i<B.LocalHeight(); ++i ) { F value = b1.GetLocal(i,0); if( FastAbs(value) > FastAbs(pivot) ) { pivot = value; pivotRow = A.Height() + colShift + i*r; } } // Fill the send buffer with: // [ pivotValue | pivot row data | pivotRow ] if( pivotRow < A.Height() ) { sendBufFloat[0] = A.GetLocal(pivotRow,a10.Width()); const int ALDim = A.LocalLDim(); const F* ABuffer = A.LocalBuffer(pivotRow,0); for( int j=0; j<width; ++j ) sendBufFloat[j+1] = ABuffer[j*ALDim]; } else { const int localRow = ((pivotRow-A.Height())-colShift)/r; sendBufFloat[0] = b1.GetLocal(localRow,0); const int BLDim = B.LocalLDim(); const F* BBuffer = B.LocalBuffer(localRow,0); for( int j=0; j<width; ++j ) sendBufFloat[j+1] = BBuffer[j*BLDim]; } *sendBufInt = pivotRow; // Communicate to establish the pivot information mpi::AllReduce ( &sendData[0], &recvData[0], numBytes, PivotOp<F>(), g.ColComm() ); // Update the pivot vector pivotRow = *recvBufInt; p.SetLocal(currentRow,0,pivotRow+pivotOffset); // Copy the current row into the pivot row if( pivotRow < A.Height() ) { const int ALDim = A.LocalLDim(); F* ASetBuffer = A.LocalBuffer(pivotRow,0); const F* AGetBuffer = A.LocalBuffer(currentRow,0); for( int j=0; j<width; ++j ) ASetBuffer[j*ALDim] = AGetBuffer[j*ALDim]; } else { const int ownerRank = (colAlignment+(pivotRow-A.Height())) % r; if( g.Row() == ownerRank ) { const int localRow = ((pivotRow-A.Height())-colShift) / r; const int ALDim = A.LocalLDim(); const int BLDim = B.LocalLDim(); F* BBuffer = B.LocalBuffer(localRow,0); const F* ABuffer = A.LocalBuffer(currentRow,0); for( int j=0; j<width; ++j ) BBuffer[j*BLDim] = ABuffer[j*ALDim]; } } // Copy the pivot row into the current row { F* ABuffer = A.LocalBuffer(currentRow,0); const int ALDim = A.LocalLDim(); for( int j=0; j<width; ++j ) ABuffer[j*ALDim] = recvBufFloat[j+1]; } // Now we can perform the update of the current panel const F alpha = alpha11.GetLocal(0,0); if( alpha == F(0) ) throw SingularMatrixException(); const F alpha11Inv = F(1) / alpha; Scale( alpha11Inv, a21.LocalMatrix() ); Scale( alpha11Inv, b1.LocalMatrix() ); Geru( F(-1), a21.LocalMatrix(), a12.LocalMatrix(), A22.LocalMatrix() ); Geru( F(-1), b1.LocalMatrix(), a12.LocalMatrix(), B2.LocalMatrix() ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, a01, /**/ A02, /**/ a10, alpha11, /**/ a12, /*************/ /**********************/ ABL, /**/ ABR, A20, a21, /**/ A22 ); SlidePartitionRight ( BL, /**/ BR, B0, b1, /**/ B2 ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void TrmmRLNCOld ( UnitOrNonUnit diag, T alpha, const DistMatrix<T>& L, DistMatrix<T>& X ) { #ifndef RELEASE PushCallStack("internal::TrmmRLNCOld"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( L.Height() != L.Width() || X.Width() != L.Height() ) { std::ostringstream msg; msg << "Nonconformal TrmmRLNC: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<T> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<T> XL(g), XR(g), X0(g), X1(g), X2(g); // Temporary distributions DistMatrix<T,STAR,STAR> L11_STAR_STAR(g); DistMatrix<T,MR, STAR> L21_MR_STAR(g); DistMatrix<T,VC, STAR> X1_VC_STAR(g); DistMatrix<T,MC, STAR> D1_MC_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionRight( X, XL, XR, 0 ); while( XR.Width() > 0 ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); L21_MR_STAR.AlignWith( X2 ); D1_MC_STAR.AlignWith( X1 ); Zeros( X1.Height(), X1.Width(), D1_MC_STAR ); //--------------------------------------------------------------------// X1_VC_STAR = X1; L11_STAR_STAR = L11; LocalTrmm ( RIGHT, LOWER, NORMAL, diag, T(1), L11_STAR_STAR, X1_VC_STAR ); X1 = X1_VC_STAR; L21_MR_STAR = L21; LocalGemm( NORMAL, NORMAL, T(1), X2, L21_MR_STAR, T(0), D1_MC_STAR ); X1.SumScatterUpdate( T(1), D1_MC_STAR ); //--------------------------------------------------------------------// L21_MR_STAR.FreeAlignments(); D1_MC_STAR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void HemmRUC ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::HemmRUC"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error("{A,B,C} must be distributed on the same grid"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> ATL(g), ATR(g), A00(g), A01(g), A02(g), AColPan(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), ARowPan(g), A20(g), A21(g), A22(g); DistMatrix<T> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g), CLeft(g), CRight(g); // Temporary distributions DistMatrix<T,MC,STAR> B1_MC_STAR(g); DistMatrix<T,VR, STAR> AColPan_VR_STAR(g); DistMatrix<T,STAR,MR > AColPanAdj_STAR_MR(g); DistMatrix<T,MR, STAR> ARowPanAdj_MR_STAR(g); B1_MC_STAR.AlignWith( C ); // Start the algorithm Scale( beta, C ); LockedPartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( CR.Width() > 0 ) { LockedRepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); ARowPan.LockedView1x2( A11, A12 ); AColPan.LockedView2x1 ( A01, A11 ); CLeft.View1x2( C0, C1 ); CRight.View1x2( C1, C2 ); AColPan_VR_STAR.AlignWith( CLeft ); AColPanAdj_STAR_MR.AlignWith( CLeft ); ARowPanAdj_MR_STAR.AlignWith( CRight ); //--------------------------------------------------------------------// B1_MC_STAR = B1; AColPan_VR_STAR = AColPan; AColPanAdj_STAR_MR.AdjointFrom( AColPan_VR_STAR ); ARowPanAdj_MR_STAR.AdjointFrom( ARowPan ); MakeTrapezoidal( LEFT, LOWER, 0, ARowPanAdj_MR_STAR ); MakeTrapezoidal( RIGHT, LOWER, -1, AColPanAdj_STAR_MR ); LocalGemm ( NORMAL, ADJOINT, alpha, B1_MC_STAR, ARowPanAdj_MR_STAR, T(1), CRight ); LocalGemm ( NORMAL, NORMAL, alpha, B1_MC_STAR, AColPanAdj_STAR_MR, T(1), CLeft ); //--------------------------------------------------------------------// AColPan_VR_STAR.FreeAlignments(); AColPanAdj_STAR_MR.FreeAlignments(); ARowPanAdj_MR_STAR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void internal::TrsvLN ( UnitOrNonUnit diag, const DistMatrix<F,MC,MR>& L, DistMatrix<F,MC,MR>& x ) { #ifndef RELEASE PushCallStack("internal::TrsvLN"); if( L.Grid() != x.Grid() ) throw std::logic_error("{L,x} must be distributed over the same grid"); if( L.Height() != L.Width() ) throw std::logic_error("L must be square"); if( x.Width() != 1 && x.Height() != 1 ) throw std::logic_error("x must be a vector"); const int xLength = ( x.Width() == 1 ? x.Height() : x.Width() ); if( L.Width() != xLength ) throw std::logic_error("Nonconformal TrsvLN"); #endif const Grid& g = L.Grid(); if( x.Width() == 1 ) { // Matrix views DistMatrix<F,MC,MR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F,MC,MR> xT(g), x0(g), xB(g), x1(g), x2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,STAR> x1_STAR_STAR(g); DistMatrix<F,MR, STAR> x1_MR_STAR(g); DistMatrix<F,MC, STAR> z2_MC_STAR(g); // Start the algorithm LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionDown ( x, xT, xB, 0 ); while( xB.Height() > 0 ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); RepartitionDown ( xT, x0, /**/ /**/ x1, xB, x2 ); x1_MR_STAR.AlignWith( L21 ); z2_MC_STAR.AlignWith( L21 ); z2_MC_STAR.ResizeTo( x2.Height(), 1 ); //----------------------------------------------------------------// x1_STAR_STAR = x1; L11_STAR_STAR = L11; Trsv ( LOWER, NORMAL, diag, L11_STAR_STAR.LockedLocalMatrix(), x1_STAR_STAR.LocalMatrix() ); x1 = x1_STAR_STAR; x1_MR_STAR = x1_STAR_STAR; Gemv ( NORMAL, (F)-1, L21.LockedLocalMatrix(), x1_MR_STAR.LockedLocalMatrix(), (F)0, z2_MC_STAR.LocalMatrix() ); x2.SumScatterUpdate( (F)1, z2_MC_STAR ); //----------------------------------------------------------------// x1_MR_STAR.FreeAlignments(); z2_MC_STAR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlidePartitionDown ( xT, x0, x1, /**/ /**/ xB, x2 ); } } else { // Matrix views DistMatrix<F,MC,MR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F,MC,MR> xL(g), xR(g), x0(g), x1(g), x2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,STAR> x1_STAR_STAR(g); DistMatrix<F,STAR,MR > x1_STAR_MR(g); DistMatrix<F,STAR,MC > z2_STAR_MC(g); DistMatrix<F,MR, MC > z2_MR_MC(g); DistMatrix<F,MC, MR > z2(g); // Start the algorithm LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionRight( x, xL, xR, 0 ); while( xR.Width() > 0 ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); RepartitionRight ( xL, /**/ xR, x0, /**/ x1, x2 ); x1_STAR_MR.AlignWith( L21 ); z2_STAR_MC.AlignWith( L21 ); z2.AlignWith( x2 ); z2_STAR_MC.ResizeTo( 1, x2.Width() ); //----------------------------------------------------------------// x1_STAR_STAR = x1; L11_STAR_STAR = L11; Trsv ( LOWER, NORMAL, diag, L11_STAR_STAR.LockedLocalMatrix(), x1_STAR_STAR.LocalMatrix() ); x1 = x1_STAR_STAR; x1_STAR_MR = x1_STAR_STAR; Gemv ( NORMAL, (F)-1, L21.LockedLocalMatrix(), x1_STAR_MR.LockedLocalMatrix(), (F)0, z2_STAR_MC.LocalMatrix() ); z2_MR_MC.SumScatterFrom( z2_STAR_MC ); z2 = z2_MR_MC; Axpy( (F)1, z2, x2 ); //----------------------------------------------------------------// x1_STAR_MR.FreeAlignments(); z2_STAR_MC.FreeAlignments(); z2.FreeAlignments(); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlidePartitionRight ( xL, /**/ xR, x0, x1, /**/ x2 ); } } #ifndef RELEASE PopCallStack(); #endif }
inline void GemmNNDot ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::GemmNNDot"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal GemmNNDot: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); if( A.Height() > B.Width() ) { // Matrix views DistMatrix<T> AT(g), AB(g), A0(g), A1(g), A2(g); DistMatrix<T> BL(g), B0(g), BR(g), B1(g), B2(g); DistMatrix<T> CT(g), C0(g), C1L(g), C1R(g), CB(g), C1(g), C10(g), C11(g), C12(g), C2(g); // Temporary distributions DistMatrix<T,STAR,VC> A1_STAR_VC(g); DistMatrix<T,VC,STAR> B1_VC_STAR(g); DistMatrix<T,STAR,STAR> C11_STAR_STAR(g); // Star the algorithm Scale( beta, C ); LockedPartitionDown ( A, AT, AB, 0 ); PartitionDown ( C, CT, CB, 0 ); while( AB.Height() > 0 ) { LockedRepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); RepartitionDown ( CT, C0, /**/ /**/ C1, CB, C2 ); A1_STAR_VC = A1; B1_VC_STAR.AlignWith( A1_STAR_VC ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C1, C1L, C1R, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( C1L, /**/ C1R, C10, /**/ C11, C12 ); Zeros( C11.Height(), C11.Width(), C11_STAR_STAR ); //------------------------------------------------------------// B1_VC_STAR = B1; LocalGemm ( NORMAL, NORMAL, alpha, A1_STAR_VC, B1_VC_STAR, T(0), C11_STAR_STAR ); C11.SumScatterUpdate( T(1), C11_STAR_STAR ); //------------------------------------------------------------// SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( C1L, /**/ C1R, C10, C11, /**/ C12 ); } B1_VC_STAR.FreeAlignments(); SlideLockedPartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); SlidePartitionDown ( CT, C0, C1, /**/ /**/ CB, C2 ); } } else { // Matrix views DistMatrix<T> AT(g), AB(g), A0(g), A1(g), A2(g); DistMatrix<T> BL(g), B0(g), BR(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C1T(g), C01(g), C0(g), C1(g), C2(g), C1B(g), C11(g), C21(g); // Temporary distributions DistMatrix<T,STAR,VR> A1_STAR_VR(g); DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,STAR> C11_STAR_STAR(g); // Star the algorithm Scale( beta, C ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); B1_VR_STAR = B1; A1_STAR_VR.AlignWith( B1_VR_STAR ); LockedPartitionDown ( A, AT, AB, 0 ); PartitionDown ( C1, C1T, C1B, 0 ); while( AB.Height() > 0 ) { LockedRepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); RepartitionDown ( C1T, C01, /***/ /***/ C11, C1B, C21 ); Zeros( C11.Height(), C11.Width(), C11_STAR_STAR ); //------------------------------------------------------------// A1_STAR_VR = A1; LocalGemm ( NORMAL, NORMAL, alpha, A1_STAR_VR, B1_VR_STAR, T(0), C11_STAR_STAR ); C11.SumScatterUpdate( T(1), C11_STAR_STAR ); //------------------------------------------------------------// SlideLockedPartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); SlidePartitionDown ( C1T, C01, C11, /***/ /***/ C1B, C21 ); } A1_STAR_VR.FreeAlignments(); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } } #ifndef RELEASE PopCallStack(); #endif }
inline void GemmTTA ( Orientation orientationOfA, Orientation orientationOfB, T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::GemmTTA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( orientationOfA == NORMAL || orientationOfB == NORMAL ) throw std::logic_error ("GemmTTA expects A and B to be (Conjugate)Transposed"); if( A.Width() != C.Height() || B.Height() != C.Width() || A.Height() != B.Width() ) { std::ostringstream msg; msg << "Nonconformal GemmTTA: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> BT(g), B0(g), BB(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,STAR,MC > B1_STAR_MC(g); DistMatrix<T,MR, STAR> D1_MR_STAR(g); DistMatrix<T,MR, MC > D1_MR_MC(g); DistMatrix<T> D1(g); B1_STAR_MC.AlignWith( A ); D1_MR_STAR.AlignWith( A ); // Start the algorithm Scale( beta, C ); LockedPartitionDown ( B, BT, BB, 0 ); PartitionRight( C, CL, CR, 0 ); while( BB.Height() > 0 ) { LockedRepartitionDown ( BT, B0, /**/ /**/ B1, BB, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); D1.AlignWith( C1 ); Zeros( C1.Height(), C1.Width(), D1_MR_STAR ); //--------------------------------------------------------------------// B1_STAR_MC = B1; // B1[*,MC] <- B1[MC,MR] // D1[MR,*] := alpha (A[MC,MR])^T (B1[*,MC])^T // = alpha (A^T)[MR,MC] (B1^T)[MC,*] LocalGemm ( orientationOfA, orientationOfB, alpha, A, B1_STAR_MC, T(0), D1_MR_STAR ); // C1[MC,MR] += scattered & transposed D1[MR,*] summed over grid cols D1_MR_MC.SumScatterFrom( D1_MR_STAR ); D1 = D1_MR_MC; Axpy( T(1), D1, C1 ); //--------------------------------------------------------------------// D1.FreeAlignments(); SlideLockedPartitionDown ( BT, B0, B1, /**/ /**/ BB, B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
void RLVF ( Conjugation conjugation, Int offset, const Matrix<F>& H, const Matrix<F>& t, Matrix<F>& A ) { #ifndef RELEASE CallStackEntry cse("apply_packed_reflectors::RLVF"); // TODO: Proper dimension checks if( t.Height() != H.DiagonalLength(offset) ) LogicError("t must be the same length as H's offset diag"); #endif Matrix<F> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<F> AL, AR, A0, A1, A2; Matrix<F> tT, t0, tB, t1, t2; Matrix<F> SInv, Z; LockedPartitionDownOffsetDiagonal ( offset, H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionRight( A, AL, AR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2 ); RepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2, H11.Height() ); LockedView2x1( HPan, H11, H21 ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTriangular( LOWER, HPanCopy ); SetDiagonal( HPanCopy, F(1) ); Herk( UPPER, ADJOINT, F(1), HPanCopy, SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( NORMAL, NORMAL, F(1), AR, HPanCopy, Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, F(1), SInv, Z ); Gemm( NORMAL, ADJOINT, F(-1), Z, HPanCopy, F(1), AR ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlidePartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); } }
inline void TrmmLLTA ( Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T>& L, DistMatrix<T>& X ) { #ifndef RELEASE PushCallStack("internal::TrmmLLTA"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( orientation == NORMAL ) throw std::logic_error ("TrmmLLTA expects a (Conjugate)Transpose option"); if( L.Height() != L.Width() || L.Height() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrmmLLTA: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<T> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<T> XL(g), XR(g), X0(g), X1(g), X2(g); DistMatrix<T,MC,STAR> X1_MC_STAR(g); DistMatrix<T,MR,STAR> Z1_MR_STAR(g); DistMatrix<T,MR,MC > Z1_MR_MC(g); X1_MC_STAR.AlignWith( L ); Z1_MR_STAR.AlignWith( L ); PartitionRight( X, XL, XR, 0 ); while( XL.Width() < X.Width() ) { RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); Zeros( X1.Height(), X1.Width(), Z1_MR_STAR ); //--------------------------------------------------------------------// X1_MC_STAR = X1; LocalTrmmAccumulateLLT ( orientation, diag, alpha, L, X1_MC_STAR, Z1_MR_STAR ); Z1_MR_MC.SumScatterFrom( Z1_MR_STAR ); X1 = Z1_MR_MC; //--------------------------------------------------------------------// SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void ApplyPackedReflectorsRUHF ( int offset, const DistMatrix<R>& H, DistMatrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsRUHF"); if( H.Grid() != A.Grid() ) throw std::logic_error("{H,A} must be distributed over the same grid"); if( offset < 0 || offset > H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Length of transforms must equal width of target matrix"); #endif const Grid& g = H.Grid(); DistMatrix<R> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<R> AL(g), AR(g), A0(g), A1(g), A2(g); DistMatrix<R,STAR,VR > HPan_STAR_VR(g); DistMatrix<R,STAR,MR > HPan_STAR_MR(g); DistMatrix<R,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<R,STAR,MC > ZTrans_STAR_MC(g); DistMatrix<R,STAR,VC > ZTrans_STAR_VC(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); PartitionRight( A, AL, AR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /**************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); RepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2 ); const int HPanWidth = H11.Width() + H12.Width(); const int HPanHeight = std::min( H11.Height(), std::max(HPanWidth-offset,0) ); LockedView( HPan, H, H00.Height(), H00.Width(), HPanHeight, HPanWidth ); HPan_STAR_MR.AlignWith( AR ); ZTrans_STAR_MC.AlignWith( AR ); ZTrans_STAR_VC.AlignWith( AR ); Zeros( HPanHeight, AR.Height(), ZTrans_STAR_MC ); Zeros( HPanHeight, HPanHeight, SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( LEFT, UPPER, offset, HPanCopy ); SetDiagonalToOne( LEFT, offset, HPanCopy ); HPan_STAR_VR = HPanCopy; Syrk ( UPPER, NORMAL, R(1), HPan_STAR_VR.LockedLocalMatrix(), R(0), SInv_STAR_STAR.LocalMatrix() ); SInv_STAR_STAR.SumOverGrid(); HalveMainDiagonal( SInv_STAR_STAR ); HPan_STAR_MR = HPan_STAR_VR; LocalGemm ( NORMAL, TRANSPOSE, R(1), HPan_STAR_MR, AR, R(0), ZTrans_STAR_MC ); ZTrans_STAR_VC.SumScatterFrom( ZTrans_STAR_MC ); LocalTrsm ( LEFT, UPPER, TRANSPOSE, NON_UNIT, R(1), SInv_STAR_STAR, ZTrans_STAR_VC ); ZTrans_STAR_MC = ZTrans_STAR_VC; LocalGemm ( TRANSPOSE, NORMAL, R(-1), ZTrans_STAR_MC, HPan_STAR_MR, R(1), AR ); //--------------------------------------------------------------------// HPan_STAR_MR.FreeAlignments(); ZTrans_STAR_MC.FreeAlignments(); ZTrans_STAR_VC.FreeAlignments(); SlidePartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } #ifndef RELEASE PopCallStack(); #endif }