inline void internal::HegstRLVar3( DistMatrix<F,MC,MR>& A, const DistMatrix<F,MC,MR>& L ) { #ifndef RELEASE PushCallStack("internal::HegstRLVar4"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( L.Height() != L.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != L.Height() ) throw std::logic_error("A and L must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F,MC,MR> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F,MC,MR> YTL(g), YTR(g), Y00(g), Y01(g), Y02(g), YBL(g), YBR(g), Y10(g), Y11(g), Y12(g), Y20(g), Y21(g), Y22(g); DistMatrix<F,MC,MR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); // Temporary distributions DistMatrix<F,STAR,MR > A11_STAR_MR(g); DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,VC, STAR> A21_VC_STAR(g); DistMatrix<F,STAR,VR > A10_STAR_VR(g); DistMatrix<F,STAR,MR > A10_STAR_MR(g); DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,VR > L10_STAR_VR(g); DistMatrix<F,STAR,MR > L10_STAR_MR(g); DistMatrix<F,MC, STAR> L21_MC_STAR(g); DistMatrix<F,STAR,STAR> X11_STAR_STAR(g); DistMatrix<F,MC, STAR> X21_MC_STAR(g); DistMatrix<F,MC, STAR> Z21_MC_STAR(g); // We will use an entire extra matrix as temporary storage. If this is not // acceptable, use HegstRLVar4 instead. DistMatrix<F,MC,MR> Y(g); Y.AlignWith( A ); Y.ResizeTo( A.Height(), A.Width() ); Zero( Y ); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDownDiagonal ( Y, YTL, YTR, YBL, YBR, 0 ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); RepartitionDownDiagonal ( YTL, /**/ YTR, Y00, /**/ Y01, Y02, /*************/ /******************/ /**/ Y10, /**/ Y11, Y12, YBL, /**/ YBR, Y20, /**/ Y21, Y22 ); LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); A11_STAR_MR.AlignWith( Y21 ); A21_VC_STAR.AlignWith( A21 ); A10_STAR_VR.AlignWith( A10 ); A10_STAR_MR.AlignWith( A10 ); L10_STAR_VR.AlignWith( A10 ); L10_STAR_MR.AlignWith( A10 ); L21_MC_STAR.AlignWith( Y21 ); X21_MC_STAR.AlignWith( A20 ); Z21_MC_STAR.AlignWith( L20 ); //--------------------------------------------------------------------// // A10 := A10 - 1/2 Y10 Axpy( (F)-0.5, Y10, A10 ); // A11 := A11 - (A10 L10' + L10 A10') A10_STAR_VR = A10; L10_STAR_VR = L10; X11_STAR_STAR.ResizeTo( A11.Height(), A11.Width() ); Her2k ( LOWER, NORMAL, (F)1, A10_STAR_VR.LocalMatrix(), L10_STAR_VR.LocalMatrix(), (F)0, X11_STAR_STAR.LocalMatrix() ); MakeTrapezoidal( LEFT, LOWER, 0, X11_STAR_STAR ); A11.SumScatterUpdate( (F)-1, X11_STAR_STAR ); // A11 := inv(L11) A11 inv(L11)' A11_STAR_STAR = A11; L11_STAR_STAR = L11; internal::LocalHegst( RIGHT, LOWER, A11_STAR_STAR, L11_STAR_STAR ); A11 = A11_STAR_STAR; // A21 := A21 - A20 L10' L10_STAR_MR = L10_STAR_VR; X21_MC_STAR.ResizeTo( A21.Height(), A21.Width() ); internal::LocalGemm ( NORMAL, ADJOINT, (F)1, A20, L10_STAR_MR, (F)0, X21_MC_STAR ); A21.SumScatterUpdate( (F)-1, X21_MC_STAR ); // A21 := A21 inv(L11)' A21_VC_STAR = A21; internal::LocalTrsm ( RIGHT, LOWER, ADJOINT, NON_UNIT, (F)1, L11_STAR_STAR, A21_VC_STAR ); A21 = A21_VC_STAR; // A10 := A10 - 1/2 Y10 Axpy( (F)-0.5, Y10, A10 ); // A10 := inv(L11) A10 A10_STAR_VR = A10; internal::LocalTrsm ( LEFT, LOWER, NORMAL, NON_UNIT, (F)1, L11_STAR_STAR, A10_STAR_VR ); // Y20 := Y20 + L21 A10 A10_STAR_MR = A10_STAR_VR; A10 = A10_STAR_MR; L21_MC_STAR = L21; internal::LocalGemm ( NORMAL, NORMAL, (F)1, L21_MC_STAR, A10_STAR_MR, (F)1, Y20 ); // Y21 := L21 A11 // // Symmetrize A11[* ,* ] by copying the lower triangle into the upper // so that we can call a local gemm instead of worrying about // reproducing a hemm with nonsymmetric local matrices. { const int height = A11_STAR_STAR.LocalHeight(); const int ldim = A11_STAR_STAR.LocalLDim(); F* A11Buffer = A11_STAR_STAR.LocalBuffer(); for( int i=1; i<height; ++i ) for( int j=0; j<i; ++j ) A11Buffer[j+i*ldim] = Conj(A11Buffer[i+j*ldim]); } A11_STAR_MR = A11_STAR_STAR; internal::LocalGemm ( NORMAL, NORMAL, (F)1, L21_MC_STAR, A11_STAR_MR, (F)0, Y21 ); // Y21 := Y21 + L20 A10' Z21_MC_STAR.ResizeTo( A21.Height(), A21.Width() ); internal::LocalGemm ( NORMAL, ADJOINT, (F)1, L20, A10_STAR_MR, (F)0, Z21_MC_STAR ); Y21.SumScatterUpdate( (F)1, Z21_MC_STAR ); //--------------------------------------------------------------------// A11_STAR_MR.FreeAlignments(); A21_VC_STAR.FreeAlignments(); A10_STAR_VR.FreeAlignments(); A10_STAR_MR.FreeAlignments(); L10_STAR_VR.FreeAlignments(); L10_STAR_MR.FreeAlignments(); L21_MC_STAR.FreeAlignments(); X21_MC_STAR.FreeAlignments(); Z21_MC_STAR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlidePartitionDownDiagonal ( YTL, /**/ YTR, Y00, Y01, /**/ Y02, /**/ Y10, Y11, /**/ Y12, /*************/ /******************/ YBL, /**/ YBR, Y20, Y21, /**/ Y22 ); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /**********************************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TwoSidedTrsmUVar5 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U ) { #ifndef RELEASE PushCallStack("internal::TwoSidedTrsmUVar5"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( U.Height() != U.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != U.Height() ) throw std::logic_error("A and U must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,MC > A12_STAR_MC(g); DistMatrix<F,STAR,MR > A12_STAR_MR(g); DistMatrix<F,STAR,VC > A12_STAR_VC(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MC > U12_STAR_MC(g); DistMatrix<F,STAR,MR > U12_STAR_MR(g); DistMatrix<F,STAR,VC > U12_STAR_VC(g); DistMatrix<F,STAR,VR > U12_STAR_VR(g); DistMatrix<F,STAR,VR > Y12_STAR_VR(g); DistMatrix<F> Y12(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); A12_STAR_MC.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); A12_STAR_VC.AlignWith( A22 ); A12_STAR_VR.AlignWith( A22 ); U12_STAR_MC.AlignWith( A22 ); U12_STAR_MR.AlignWith( A22 ); U12_STAR_VC.AlignWith( A22 ); U12_STAR_VR.AlignWith( A22 ); Y12.AlignWith( A12 ); Y12_STAR_VR.AlignWith( A12 ); //--------------------------------------------------------------------// // A11 := inv(U11)' A11 inv(U11) U11_STAR_STAR = U11; A11_STAR_STAR = A11; LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR ); A11 = A11_STAR_STAR; // Y12 := A11 U12 U12_STAR_VR = U12; Y12_STAR_VR.ResizeTo( A12.Height(), A12.Width() ); Hemm ( LEFT, UPPER, F(1), A11_STAR_STAR.LocalMatrix(), U12_STAR_VR.LocalMatrix(), F(0), Y12_STAR_VR.LocalMatrix() ); Y12 = Y12_STAR_VR; // A12 := inv(U11)' A12 A12_STAR_VR = A12; LocalTrsm ( LEFT, UPPER, ADJOINT, diag, F(1), U11_STAR_STAR, A12_STAR_VR ); A12 = A12_STAR_VR; // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A22 := A22 - (A12' U12 + U12' A12) A12_STAR_VR = A12; A12_STAR_VC = A12_STAR_VR; U12_STAR_VC = U12_STAR_VR; A12_STAR_MC = A12_STAR_VC; U12_STAR_MC = U12_STAR_VC; A12_STAR_MR = A12_STAR_VR; U12_STAR_MR = U12_STAR_VR; LocalTrr2k ( UPPER, ADJOINT, ADJOINT, F(-1), U12_STAR_MC, A12_STAR_MR, A12_STAR_MC, U12_STAR_MR, F(1), A22 ); // A12 := A12 - 1/2 Y12 Axpy( F(-1)/F(2), Y12, A12 ); // A12 := A12 inv(U22) // // This is the bottleneck because A12 only has blocksize rows Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U22, A12 ); //--------------------------------------------------------------------// A12_STAR_MC.FreeAlignments(); A12_STAR_MR.FreeAlignments(); A12_STAR_VC.FreeAlignments(); A12_STAR_VR.FreeAlignments(); U12_STAR_MC.FreeAlignments(); U12_STAR_MR.FreeAlignments(); U12_STAR_VC.FreeAlignments(); U12_STAR_VR.FreeAlignments(); Y12.FreeAlignments(); Y12_STAR_VR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void internal::HegstLUVar2( DistMatrix<F,MC,MR>& A, const DistMatrix<F,MC,MR>& U ) { #ifndef RELEASE PushCallStack("internal::HegstLUVar2"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( U.Height() != U.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != U.Height() ) throw std::logic_error("A and U must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F,MC,MR> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F,MC,MR> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,VC, STAR> A01_VC_STAR(g); DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MC > U12_STAR_MC(g); DistMatrix<F,STAR,VR > U12_STAR_VR(g); DistMatrix<F,MR, STAR> U12Adj_MR_STAR(g); DistMatrix<F,VC, STAR> U12Adj_VC_STAR(g); DistMatrix<F,MC, STAR> X01_MC_STAR(g); DistMatrix<F,STAR,STAR> X11_STAR_STAR(g); DistMatrix<F,MC, MR > Y12(g); DistMatrix<F,MC, MR > Z12Adj(g); DistMatrix<F,MR, MC > Z12Adj_MR_MC(g); DistMatrix<F,MC, STAR> Z12Adj_MC_STAR(g); DistMatrix<F,MR, STAR> Z12Adj_MR_STAR(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); A12_STAR_VR.AlignWith( A12 ); U12_STAR_MC.AlignWith( A22 ); U12_STAR_VR.AlignWith( A12 ); U12Adj_MR_STAR.AlignWith( A22 ); U12Adj_VC_STAR.AlignWith( A22 ); X01_MC_STAR.AlignWith( A01 ); Y12.AlignWith( A12 ); Z12Adj.AlignWith( A12 ); Z12Adj_MR_MC.AlignWith( A12 ); Z12Adj_MC_STAR.AlignWith( A22 ); Z12Adj_MR_STAR.AlignWith( A22 ); //--------------------------------------------------------------------// // A01 := A01 U11' U11_STAR_STAR = U11; A01_VC_STAR = A01; internal::LocalTrmm ( RIGHT, UPPER, ADJOINT, NON_UNIT, (F)1, U11_STAR_STAR, A01_VC_STAR ); A01 = A01_VC_STAR; // A01 := A01 + A02 U12' U12Adj_MR_STAR.AdjointFrom( U12 ); X01_MC_STAR.ResizeTo( A01.Height(), A01.Width() ); internal::LocalGemm ( NORMAL, NORMAL, (F)1, A02, U12Adj_MR_STAR, (F)0, X01_MC_STAR ); A01.SumScatterUpdate( (F)1, X01_MC_STAR ); // Y12 := U12 A22 U12Adj_VC_STAR = U12Adj_MR_STAR; U12_STAR_MC.AdjointFrom( U12Adj_VC_STAR ); Z12Adj_MC_STAR.ResizeTo( A12.Width(), A12.Height() ); Z12Adj_MR_STAR.ResizeTo( A12.Width(), A12.Height() ); Zero( Z12Adj_MC_STAR ); Zero( Z12Adj_MR_STAR ); internal::LocalSymmetricAccumulateRU ( ADJOINT, (F)1, A22, U12_STAR_MC, U12Adj_MR_STAR, Z12Adj_MC_STAR, Z12Adj_MR_STAR ); Z12Adj.SumScatterFrom( Z12Adj_MC_STAR ); Z12Adj_MR_MC = Z12Adj; Z12Adj_MR_MC.SumScatterUpdate( (F)1, Z12Adj_MR_STAR ); Y12.ResizeTo( A12.Height(), A12.Width() ); Adjoint( Z12Adj_MR_MC.LockedLocalMatrix(), Y12.LocalMatrix() ); // A12 := U11 A12 A12_STAR_VR = A12; U11_STAR_STAR = U11; internal::LocalTrmm ( LEFT, UPPER, NORMAL, NON_UNIT, (F)1, U11_STAR_STAR, A12_STAR_VR ); A12 = A12_STAR_VR; // A12 := A12 + 1/2 Y12 Axpy( (F)0.5, Y12, A12 ); // A11 := U11 A11 U11' A11_STAR_STAR = A11; internal::LocalHegst( LEFT, UPPER, A11_STAR_STAR, U11_STAR_STAR ); A11 = A11_STAR_STAR; // A11 := A11 + (A12 U12' + U12 A12') A12_STAR_VR = A12; U12_STAR_VR = U12; X11_STAR_STAR.ResizeTo( A11.Height(), A11.Width() ); Her2k ( UPPER, NORMAL, (F)1, A12_STAR_VR.LocalMatrix(), U12_STAR_VR.LocalMatrix(), (F)0, X11_STAR_STAR.LocalMatrix() ); A11.SumScatterUpdate( (F)1, X11_STAR_STAR ); // A12 := A12 + 1/2 Y12 Axpy( (F)0.5, Y12, A12 ); //--------------------------------------------------------------------// A12_STAR_VR.FreeAlignments(); U12_STAR_MC.FreeAlignments(); U12_STAR_VR.FreeAlignments(); U12Adj_MR_STAR.FreeAlignments(); U12Adj_VC_STAR.FreeAlignments(); X01_MC_STAR.FreeAlignments(); Y12.FreeAlignments(); Z12Adj.FreeAlignments(); Z12Adj_MR_MC.FreeAlignments(); Z12Adj_MC_STAR.FreeAlignments(); Z12Adj_MR_STAR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } #ifndef RELEASE PopCallStack(); #endif }