// Two-sided triangular solve with an upper-triangular factor, variant 1,
// for DistMatrix operands.
//
// A and U are traversed down their diagonals in lock-step as 3x3 block
// partitions.  Going by the per-step update comments in the loop body, the
// routine presumably overwrites the upper triangle of A with
// inv(U)' A inv(U) -- TODO confirm against the dispatching caller.
// In each iteration this variant reads A00, U00, U01 and U11 and writes only
// A01 and A11.
//
// Parameters:
//   diag - forwarded unchanged to Trsm, LocalTrsm and LocalTwoSidedTrsm.
//   A    - square matrix, updated in place through its block views.
//   U    - square matrix of the same size as A; it is const and is only
//          partitioned with the Locked* routines.
//
// Unless RELEASE is defined, the shapes of A and U are checked first and any
// violation is reported through LogicError.
//
// NOTE(review): the declaration of the scalar type F is not visible in this
// block (presumably a template parameter) -- confirm.
inline void
TwoSidedTrsmUVar1
( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TwoSidedTrsmUVar1");
    if( A.Height() != A.Width() )
        LogicError("A must be square");
    if( U.Height() != U.Width() )
        LogicError("Triangular matrices must be square");
    if( A.Height() != U.Height() )
        LogicError("A and U must be the same size");
#endif
    const Grid& g = A.Grid();

    // Matrix views: the four quadrants and the 3x3 blocks of A and of U
    DistMatrix<F>
        ATL(g), ATR(g),  A00(g), A01(g), A02(g),
        ABL(g), ABR(g),  A10(g), A11(g), A12(g),
                         A20(g), A21(g), A22(g);
    DistMatrix<F>
        UTL(g), UTR(g),  U00(g), U01(g), U02(g),
        UBL(g), UBR(g),  U10(g), U11(g), U12(g),
                         U20(g), U21(g), U22(g);

    // Temporary distributions (the name suffix repeats the distribution pair
    // given in the template arguments)
    DistMatrix<F,STAR,STAR> A11_STAR_STAR(g);
    DistMatrix<F,VC,  STAR> A01_VC_STAR(g);
    DistMatrix<F,STAR,STAR> U11_STAR_STAR(g);
    DistMatrix<F,MC,  STAR> U01_MC_STAR(g);
    DistMatrix<F,VC,  STAR> U01_VC_STAR(g);
    DistMatrix<F,VR,  STAR> U01_VR_STAR(g);
    DistMatrix<F,STAR,MR  > U01Adj_STAR_MR(g);
    DistMatrix<F,STAR,STAR> X11_STAR_STAR(g);
    DistMatrix<F,MR,  MC  > Z01_MR_MC(g);
    DistMatrix<F,MC,  STAR> Z01_MC_STAR(g);
    DistMatrix<F,MR,  STAR> Z01_MR_STAR(g);
    DistMatrix<F> Y01(g);

    // The trailing 0 is presumably the initial size of the top-left block;
    // the loop below runs until ATL is as tall as A.
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    LockedPartitionDownDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        LockedRepartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );

        // Align every temporary with the block of A it is combined with below
        A01_VC_STAR.AlignWith( A01 );
        U01_MC_STAR.AlignWith( A00 );
        U01_VR_STAR.AlignWith( A00 );
        U01_VC_STAR.AlignWith( A00 );
        U01Adj_STAR_MR.AlignWith( A00 );
        Y01.AlignWith( A01 );
        Z01_MR_MC.AlignWith( A01 );
        Z01_MC_STAR.AlignWith( A00 );
        Z01_MR_STAR.AlignWith( A00 );
        //--------------------------------------------------------------------//
        // Y01 := A00 U01
        //
        // The accumulation produces two partial results, Z01_MC_STAR and
        // Z01_MR_STAR (both zero-initialised with the shape of A01); Y01 is
        // the scatter-sum of the second plus the scatter-sum of the first.
        U01_MC_STAR = U01;
        U01_VR_STAR = U01_MC_STAR;
        U01Adj_STAR_MR.AdjointFrom( U01_VR_STAR );
        Zeros( Z01_MC_STAR, A01.Height(), A01.Width() );
        Zeros( Z01_MR_STAR, A01.Height(), A01.Width() );
        LocalSymmetricAccumulateLU
        ( ADJOINT,
          F(1), A00, U01_MC_STAR, U01Adj_STAR_MR, Z01_MC_STAR, Z01_MR_STAR );
        Z01_MR_MC.SumScatterFrom( Z01_MR_STAR );
        Y01 = Z01_MR_MC;
        Y01.SumScatterUpdate( F(1), Z01_MC_STAR );

        // A01 := inv(U00)' A01
        //
        // This is the bottleneck because A01 only has blocksize columns
        Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U00, A01 );

        // A01 := A01 - 1/2 Y01
        // (first half of the Y01 correction; the second half follows the
        // A11 update)
        Axpy( F(-1)/F(2), Y01, A01 );

        // A11 := A11 - (U01' A01 + A01' U01)
        //
        // Her2k works on the .Matrix() members (presumably the process-local
        // data) with alpha = -1 and beta = 0; the result is then summed into
        // A11.
        A01_VC_STAR = A01;
        U01_VC_STAR = U01_MC_STAR;
        Zeros( X11_STAR_STAR, A11.Height(), A11.Width() );
        Her2k
        ( UPPER, ADJOINT,
          F(-1), A01_VC_STAR.Matrix(), U01_VC_STAR.Matrix(),
          F(0), X11_STAR_STAR.Matrix() );
        A11.SumScatterUpdate( F(1), X11_STAR_STAR );

        // A11 := inv(U11)' A11 inv(U11)
        // (performed on [STAR,STAR] copies, then written back)
        A11_STAR_STAR = A11;
        U11_STAR_STAR = U11;
        LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR );
        A11 = A11_STAR_STAR;

        // A01 := A01 - 1/2 Y01
        // (second half of the Y01 correction)
        Axpy( F(-1)/F(2), Y01, A01 );

        // A01 := A01 inv(U11)
        // (reuses the U11_STAR_STAR copy made for the A11 update)
        A01_VC_STAR = A01;
        LocalTrsm
        ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, A01_VC_STAR );
        A01 = A01_VC_STAR;
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );

        SlideLockedPartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, U01, /**/ U02,
               /**/       U10, U11, /**/ U12,
         /*************/ /******************/
          UBL, /**/ UBR,  U20, U21, /**/ U22 );
    }
}
// Two-sided triangular solve with an upper-triangular factor, variant 5,
// for DistMatrix operands.
//
// A and U are traversed down their diagonals in lock-step as 3x3 block
// partitions.  Going by the per-step update comments in the loop body, the
// routine presumably overwrites the upper triangle of A with
// inv(U)' A inv(U) -- TODO confirm against the dispatching caller.
// In each iteration this variant reads U11, U12 and U22 and writes A11, A12
// and A22.
//
// Parameters:
//   diag - forwarded unchanged to Trsm, LocalTrsm and LocalTwoSidedTrsm.
//   A    - square matrix, updated in place through its block views.
//   U    - square matrix of the same size as A; it is const and is only
//          partitioned with the Locked* routines.
//
// Unless RELEASE is defined, the shapes of A and U are checked first and a
// std::logic_error is thrown on any violation.
//
// NOTE(review): PopCallStack() is only reached on normal return; the
// logic_error throws below leave without popping the entry pushed by
// PushCallStack().
// NOTE(review): the declaration of the scalar type F is not visible in this
// block (presumably a template parameter) -- confirm.
inline void
TwoSidedTrsmUVar5
( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U )
{
#ifndef RELEASE
    PushCallStack("internal::TwoSidedTrsmUVar5");
    if( A.Height() != A.Width() )
        throw std::logic_error("A must be square");
    if( U.Height() != U.Width() )
        throw std::logic_error("Triangular matrices must be square");
    if( A.Height() != U.Height() )
        throw std::logic_error("A and U must be the same size");
#endif
    const Grid& g = A.Grid();

    // Matrix views: the four quadrants and the 3x3 blocks of A and of U
    DistMatrix<F>
        ATL(g), ATR(g),  A00(g), A01(g), A02(g),
        ABL(g), ABR(g),  A10(g), A11(g), A12(g),
                         A20(g), A21(g), A22(g);
    DistMatrix<F>
        UTL(g), UTR(g),  U00(g), U01(g), U02(g),
        UBL(g), UBR(g),  U10(g), U11(g), U12(g),
                         U20(g), U21(g), U22(g);

    // Temporary distributions (the name suffix repeats the distribution pair
    // given in the template arguments)
    DistMatrix<F,STAR,STAR> A11_STAR_STAR(g);
    DistMatrix<F,STAR,MC  > A12_STAR_MC(g);
    DistMatrix<F,STAR,MR  > A12_STAR_MR(g);
    DistMatrix<F,STAR,VC  > A12_STAR_VC(g);
    DistMatrix<F,STAR,VR  > A12_STAR_VR(g);
    DistMatrix<F,STAR,STAR> U11_STAR_STAR(g);
    DistMatrix<F,STAR,MC  > U12_STAR_MC(g);
    DistMatrix<F,STAR,MR  > U12_STAR_MR(g);
    DistMatrix<F,STAR,VC  > U12_STAR_VC(g);
    DistMatrix<F,STAR,VR  > U12_STAR_VR(g);
    DistMatrix<F,STAR,VR  > Y12_STAR_VR(g);
    DistMatrix<F> Y12(g);

    // The trailing 0 is presumably the initial size of the top-left block;
    // the loop below runs until ATL is as tall as A.
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    LockedPartitionDownDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        LockedRepartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );

        // Align every temporary with the block of A it is combined with
        // below; the alignments are released again at the end of the
        // iteration.
        A12_STAR_MC.AlignWith( A22 );
        A12_STAR_MR.AlignWith( A22 );
        A12_STAR_VC.AlignWith( A22 );
        A12_STAR_VR.AlignWith( A22 );
        U12_STAR_MC.AlignWith( A22 );
        U12_STAR_MR.AlignWith( A22 );
        U12_STAR_VC.AlignWith( A22 );
        U12_STAR_VR.AlignWith( A22 );
        Y12.AlignWith( A12 );
        Y12_STAR_VR.AlignWith( A12 );
        //--------------------------------------------------------------------//
        // A11 := inv(U11)' A11 inv(U11)
        // (performed on [STAR,STAR] copies, then written back)
        U11_STAR_STAR = U11;
        A11_STAR_STAR = A11;
        LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR );
        A11 = A11_STAR_STAR;

        // Y12 := A11 U12
        //
        // Hemm works on the LocalMatrix() members with beta = 0, using the
        // already-updated A11_STAR_STAR; Y12_STAR_VR is given the shape of
        // A12 first.
        U12_STAR_VR = U12;
        Y12_STAR_VR.ResizeTo( A12.Height(), A12.Width() );
        Hemm
        ( LEFT, UPPER,
          F(1), A11_STAR_STAR.LocalMatrix(), U12_STAR_VR.LocalMatrix(),
          F(0), Y12_STAR_VR.LocalMatrix() );
        Y12 = Y12_STAR_VR;

        // A12 := inv(U11)' A12
        A12_STAR_VR = A12;
        LocalTrsm
        ( LEFT, UPPER, ADJOINT, diag, F(1), U11_STAR_STAR, A12_STAR_VR );
        A12 = A12_STAR_VR;

        // A12 := A12 - 1/2 Y12
        // (first half of the Y12 correction; the second half follows the
        // A22 update)
        Axpy( F(-1)/F(2), Y12, A12 );

        // A22 := A22 - (A12' U12 + U12' A12)
        //
        // The [STAR,MC] and [STAR,MR] operands of LocalTrr2k are derived from
        // the [STAR,VR] copies of A12 and U12 (the MC ones via [STAR,VC]).
        A12_STAR_VR = A12;
        A12_STAR_VC = A12_STAR_VR;
        U12_STAR_VC = U12_STAR_VR;
        A12_STAR_MC = A12_STAR_VC;
        U12_STAR_MC = U12_STAR_VC;
        A12_STAR_MR = A12_STAR_VR;
        U12_STAR_MR = U12_STAR_VR;
        LocalTrr2k
        ( UPPER, ADJOINT, ADJOINT,
          F(-1), U12_STAR_MC, A12_STAR_MR,
                 A12_STAR_MC, U12_STAR_MR,
          F(1), A22 );

        // A12 := A12 - 1/2 Y12
        // (second half of the Y12 correction)
        Axpy( F(-1)/F(2), Y12, A12 );

        // A12 := A12 inv(U22)
        //
        // This is the bottleneck because A12 only has blocksize rows
        Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U22, A12 );
        //--------------------------------------------------------------------//
        // Release the alignments set at the top of this iteration
        A12_STAR_MC.FreeAlignments();
        A12_STAR_MR.FreeAlignments();
        A12_STAR_VC.FreeAlignments();
        A12_STAR_VR.FreeAlignments();
        U12_STAR_MC.FreeAlignments();
        U12_STAR_MR.FreeAlignments();
        U12_STAR_VC.FreeAlignments();
        U12_STAR_VR.FreeAlignments();
        Y12.FreeAlignments();
        Y12_STAR_VR.FreeAlignments();

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );

        SlideLockedPartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, U01, /**/ U02,
               /**/       U10, U11, /**/ U12,
         /*************/ /******************/
          UBL, /**/ UBR,  U20, U21, /**/ U22 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
// Two-sided triangular solve with an upper-triangular factor, variant 4,
// for DistMatrix operands.
//
// A and U are traversed down their diagonals in lock-step as 3x3 block
// partitions.  Going by the per-step update comments in the loop body, the
// routine presumably overwrites the upper triangle of A with
// inv(U)' A inv(U) -- TODO confirm against the dispatching caller.
// In each iteration this variant reads U11 and U12 and writes A01, A11, A02,
// A12 and A22.
//
// Parameters:
//   diag - forwarded unchanged to LocalTrsm and LocalTwoSidedTrsm.
//   A    - square matrix, updated in place through its block views.
//   U    - square matrix of the same size as A; it is const and is only
//          partitioned with the Locked* routines.
//
// Unless RELEASE is defined, the shapes of A and U are checked first and any
// violation is reported through LogicError.
//
// NOTE(review): the declaration of the scalar type F is not visible in this
// block (presumably a template parameter) -- confirm.
inline void
TwoSidedTrsmUVar4
( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TwoSidedTrsmUVar4");
    if( A.Height() != A.Width() )
        LogicError("A must be square");
    if( U.Height() != U.Width() )
        LogicError("Triangular matrices must be square");
    if( A.Height() != U.Height() )
        LogicError("A and U must be the same size");
#endif
    const Grid& g = A.Grid();

    // Matrix views: the four quadrants and the 3x3 blocks of A and of U
    DistMatrix<F>
        ATL(g), ATR(g),  A00(g), A01(g), A02(g),
        ABL(g), ABR(g),  A10(g), A11(g), A12(g),
                         A20(g), A21(g), A22(g);
    DistMatrix<F>
        UTL(g), UTR(g),  U00(g), U01(g), U02(g),
        UBL(g), UBR(g),  U10(g), U11(g), U12(g),
                         U20(g), U21(g), U22(g);

    // Temporary distributions (the name suffix repeats the distribution pair
    // given in the template arguments)
    DistMatrix<F,VC,  STAR> A01_VC_STAR(g);
    DistMatrix<F,STAR,MC  > A01Trans_STAR_MC(g);
    DistMatrix<F,STAR,STAR> A11_STAR_STAR(g);
    DistMatrix<F,STAR,VR  > A12_STAR_VR(g);
    DistMatrix<F,STAR,VC  > A12_STAR_VC(g);
    DistMatrix<F,STAR,MC  > A12_STAR_MC(g);
    DistMatrix<F,STAR,MR  > A12_STAR_MR(g);
    DistMatrix<F,STAR,STAR> U11_STAR_STAR(g);
    DistMatrix<F,MR,  STAR> U12Trans_MR_STAR(g);
    DistMatrix<F,VR,  STAR> U12Trans_VR_STAR(g);
    DistMatrix<F,STAR,VR  > U12_STAR_VR(g);
    DistMatrix<F,STAR,VC  > U12_STAR_VC(g);
    DistMatrix<F,STAR,MC  > U12_STAR_MC(g);
    DistMatrix<F,STAR,VR  > Y12_STAR_VR(g);

    // The trailing 0 is presumably the initial size of the top-left block;
    // the loop below runs until ATL is as tall as A.
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    LockedPartitionDownDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        LockedRepartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );

        // Align every temporary with the block of A it is combined with below
        A01_VC_STAR.AlignWith( A02 );
        A01Trans_STAR_MC.AlignWith( A02 );
        A12_STAR_VR.AlignWith( A22 );
        A12_STAR_VC.AlignWith( A22 );
        A12_STAR_MC.AlignWith( A22 );
        A12_STAR_MR.AlignWith( A22 );
        U12Trans_MR_STAR.AlignWith( A02 );
        U12Trans_VR_STAR.AlignWith( A02 );
        U12_STAR_VR.AlignWith( A02 );
        U12_STAR_VC.AlignWith( A22 );
        U12_STAR_MC.AlignWith( A22 );
        Y12_STAR_VR.AlignWith( A12 );
        //--------------------------------------------------------------------//
        // A01 := A01 inv(U11)
        A01_VC_STAR = A01;
        U11_STAR_STAR = U11;
        LocalTrsm
        ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, A01_VC_STAR );
        A01 = A01_VC_STAR;

        // A11 := inv(U11)' A11 inv(U11)
        // (performed on [STAR,STAR] copies, then written back)
        A11_STAR_STAR = A11;
        LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR );
        A11 = A11_STAR_STAR;

        // A02 := A02 - A01 U12
        //
        // Both operands are passed as transposes and LocalGemm is told to
        // transpose them back; the A01 operand is the already-updated
        // A01_VC_STAR.
        A01Trans_STAR_MC.TransposeFrom( A01_VC_STAR );
        U12Trans_MR_STAR.TransposeFrom( U12 );
        LocalGemm
        ( TRANSPOSE, TRANSPOSE,
          F(-1), A01Trans_STAR_MC, U12Trans_MR_STAR, F(1), A02 );

        // Y12 := A11 U12
        //
        // U12_STAR_VR is obtained by transposing the .Matrix() member of
        // U12Trans_VR_STAR, i.e. from the transposed copy already formed for
        // the A02 update.  Hemm then works on the .Matrix() members with
        // beta = 0.  Y12 is kept only in [STAR,VR] form in this variant.
        U12Trans_VR_STAR = U12Trans_MR_STAR;
        Zeros( U12_STAR_VR, A12.Height(), A12.Width() );
        Transpose( U12Trans_VR_STAR.Matrix(), U12_STAR_VR.Matrix() );
        Zeros( Y12_STAR_VR, A12.Height(), A12.Width() );
        Hemm
        ( LEFT, UPPER,
          F(1), A11_STAR_STAR.Matrix(), U12_STAR_VR.Matrix(),
          F(0), Y12_STAR_VR.Matrix() );

        // A12 := inv(U11)' A12
        // (from here on A12 is updated in its [STAR,VR] copy and only
        // written back at the end of the iteration)
        A12_STAR_VR = A12;
        LocalTrsm
        ( LEFT, UPPER, ADJOINT, diag, F(1), U11_STAR_STAR, A12_STAR_VR );

        // A12 := A12 - 1/2 Y12
        // (first half of the Y12 correction)
        Axpy( F(-1)/F(2), Y12_STAR_VR, A12_STAR_VR );

        // A22 := A22 - (A12' U12 + U12' A12)
        //
        // U12 enters the first product through U12Trans_MR_STAR with
        // TRANSPOSE, and the second through U12_STAR_MC with ADJOINT.
        A12_STAR_MR = A12_STAR_VR;
        A12_STAR_VC = A12_STAR_VR;
        U12_STAR_VC = U12_STAR_VR;
        A12_STAR_MC = A12_STAR_VC;
        U12_STAR_MC = U12_STAR_VC;
        LocalTrr2k
        ( UPPER, ADJOINT, TRANSPOSE, ADJOINT,
          F(-1), A12_STAR_MC, U12Trans_MR_STAR,
                 U12_STAR_MC, A12_STAR_MR,
          F(1), A22 );

        // A12 := A12 - 1/2 Y12
        // (second half of the Y12 correction, then write A12 back)
        Axpy( F(-1)/F(2), Y12_STAR_VR, A12_STAR_VR );
        A12 = A12_STAR_VR;
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );

        SlideLockedPartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, U01, /**/ U02,
               /**/       U10, U11, /**/ U12,
         /**********************************/
          UBL, /**/ UBR,  U20, U21, /**/ U22 );
    }
}
// Two-sided triangular solve with a lower-triangular factor, variant 2,
// for DistMatrix operands.
//
// A and L are traversed down their diagonals in lock-step as 3x3 block
// partitions.  Going by the per-step update comments in the loop body, the
// routine presumably overwrites the lower triangle of A with
// inv(L) A inv(L)' -- TODO confirm against the dispatching caller.
// In each iteration this variant reads A00, A20, L10 and L11 and writes A10,
// A11 and A21.
//
// Parameters:
//   diag - forwarded unchanged to LocalTrsm and LocalTwoSidedTrsm.
//   A    - square matrix, updated in place through its block views.
//   L    - square matrix of the same size as A; it is const and is only
//          partitioned with the Locked* routines.
//
// Unless RELEASE is defined, the shapes of A and L are checked first and a
// std::logic_error is thrown on any violation.
//
// NOTE(review): PopCallStack() is only reached on normal return; the
// logic_error throws below leave without popping the entry pushed by
// PushCallStack().
// NOTE(review): the declaration of the scalar type F is not visible in this
// block (presumably a template parameter) -- confirm.
inline void
TwoSidedTrsmLVar2
( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& L )
{
#ifndef RELEASE
    PushCallStack("internal::TwoSidedTrsmLVar2");
    if( A.Height() != A.Width() )
        throw std::logic_error("A must be square");
    if( L.Height() != L.Width() )
        throw std::logic_error("Triangular matrices must be square");
    if( A.Height() != L.Height() )
        throw std::logic_error("A and L must be the same size");
#endif
    const Grid& g = A.Grid();

    // Matrix views: the four quadrants and the 3x3 blocks of A and of L
    DistMatrix<F>
        ATL(g), ATR(g),  A00(g), A01(g), A02(g),
        ABL(g), ABR(g),  A10(g), A11(g), A12(g),
                         A20(g), A21(g), A22(g);
    DistMatrix<F>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);

    // Temporary distributions (the name suffix repeats the distribution pair
    // given in the template arguments)
    DistMatrix<F,MR,  STAR> A10Adj_MR_STAR(g);
    DistMatrix<F,STAR,VR  > A10_STAR_VR(g);
    DistMatrix<F,STAR,STAR> A11_STAR_STAR(g);
    DistMatrix<F,VC,  STAR> A21_VC_STAR(g);
    DistMatrix<F,MR,  STAR> F10Adj_MR_STAR(g);
    DistMatrix<F,MR,  STAR> L10Adj_MR_STAR(g);
    DistMatrix<F,VC,  STAR> L10Adj_VC_STAR(g);
    DistMatrix<F,STAR,MC  > L10_STAR_MC(g);
    DistMatrix<F,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<F,MC,  STAR> X11_MC_STAR(g);
    DistMatrix<F,MC,  STAR> X21_MC_STAR(g);
    DistMatrix<F,MC,  STAR> Y10Adj_MC_STAR(g);
    DistMatrix<F,MR,  MC  > Y10Adj_MR_MC(g);
    DistMatrix<F> X11(g);
    DistMatrix<F> Y10Adj(g);
    // Plain (non-distributed-type) matrix that receives the adjoint of the
    // local part of Y10Adj_MR_MC
    Matrix<F> Y10Local;

    // The trailing 0 is presumably the initial size of the top-left block;
    // the loop below runs until ATL is as tall as A.
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    LockedPartitionDownDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        LockedRepartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        // Align every temporary with the block it is combined with below;
        // the alignments are released again at the end of the iteration.
        A10Adj_MR_STAR.AlignWith( L10 );
        F10Adj_MR_STAR.AlignWith( A00 );
        L10Adj_MR_STAR.AlignWith( A00 );
        L10Adj_VC_STAR.AlignWith( A00 );
        L10_STAR_MC.AlignWith( A00 );
        X11.AlignWith( A11 );
        X11_MC_STAR.AlignWith( L10 );
        X21_MC_STAR.AlignWith( A20 );
        Y10Adj_MC_STAR.AlignWith( A00 );
        Y10Adj_MR_MC.AlignWith( A10 );
        //--------------------------------------------------------------------//
        // Y10 := L10 A00
        //
        // Formed as its adjoint.  The accumulation produces two partial
        // results, Y10Adj_MC_STAR and F10Adj_MR_STAR (both zeroed with the
        // shape of A10'); they are scatter-summed into Y10Adj_MR_MC, and the
        // adjoint of its local matrix is stored in Y10Local.
        L10Adj_MR_STAR.AdjointFrom( L10 );
        L10Adj_VC_STAR = L10Adj_MR_STAR;
        L10_STAR_MC.AdjointFrom( L10Adj_VC_STAR );
        Y10Adj_MC_STAR.ResizeTo( A10.Width(), A10.Height() );
        F10Adj_MR_STAR.ResizeTo( A10.Width(), A10.Height() );
        Zero( Y10Adj_MC_STAR );
        Zero( F10Adj_MR_STAR );
        LocalSymmetricAccumulateRL
        ( ADJOINT,
          F(1), A00, L10_STAR_MC, L10Adj_MR_STAR,
          Y10Adj_MC_STAR, F10Adj_MR_STAR );
        Y10Adj.SumScatterFrom( Y10Adj_MC_STAR );
        Y10Adj_MR_MC = Y10Adj;
        Y10Adj_MR_MC.SumScatterUpdate( F(1), F10Adj_MR_STAR );
        Adjoint( Y10Adj_MR_MC.LockedLocalMatrix(), Y10Local );

        // X11 := A10 L10'
        // (uses A10 before the Y10 correction; beta = 0 overwrites
        // X11_MC_STAR)
        X11_MC_STAR.ResizeTo( A11.Height(), A11.Width() );
        LocalGemm
        ( NORMAL, NORMAL, F(1), A10, L10Adj_MR_STAR, F(0), X11_MC_STAR );

        // A10 := A10 - Y10
        //
        // Applied directly to A10's local matrix; Y10Adj_MR_MC was aligned
        // with A10 above, so Y10Local presumably matches it entry for entry
        // -- TODO confirm.
        Axpy( F(-1), Y10Local, A10.LocalMatrix() );
        A10Adj_MR_STAR.AdjointFrom( A10 );

        // A11 := A11 - (X11 + L10 A10') = A11 - (A10 L10' + L10 A10')
        //
        // The second product is accumulated onto X11_MC_STAR (beta = 1), and
        // only the lower-trapezoidal part of the summed X11 is subtracted.
        LocalGemm
        ( NORMAL, NORMAL, F(1), L10, A10Adj_MR_STAR, F(1), X11_MC_STAR );
        X11.SumScatterFrom( X11_MC_STAR );
        MakeTrapezoidal( LEFT, LOWER, 0, X11 );
        Axpy( F(-1), X11, A11 );

        // A10 := inv(L11) A10
        // (A10_STAR_VR is rebuilt from the adjoint copy made above)
        L11_STAR_STAR = L11;
        A10_STAR_VR.AdjointFrom( A10Adj_MR_STAR );
        LocalTrsm
        ( LEFT, LOWER, NORMAL, diag, F(1), L11_STAR_STAR, A10_STAR_VR );
        A10 = A10_STAR_VR;

        // A11 := inv(L11) A11 inv(L11)'
        // (performed on [STAR,STAR] copies, then written back)
        A11_STAR_STAR = A11;
        LocalTwoSidedTrsm( LOWER, diag, A11_STAR_STAR, L11_STAR_STAR );
        A11 = A11_STAR_STAR;

        // A21 := A21 - A20 L10'
        X21_MC_STAR.ResizeTo( A21.Height(), A21.Width() );
        LocalGemm
        ( NORMAL, NORMAL, F(1), A20, L10Adj_MR_STAR, F(0), X21_MC_STAR );
        A21.SumScatterUpdate( F(-1), X21_MC_STAR );

        // A21 := A21 inv(L11)'
        A21_VC_STAR = A21;
        LocalTrsm
        ( RIGHT, LOWER, ADJOINT, diag, F(1), L11_STAR_STAR, A21_VC_STAR );
        A21 = A21_VC_STAR;
        //--------------------------------------------------------------------//
        // Release the alignments set at the top of this iteration
        A10Adj_MR_STAR.FreeAlignments();
        F10Adj_MR_STAR.FreeAlignments();
        L10Adj_MR_STAR.FreeAlignments();
        L10Adj_VC_STAR.FreeAlignments();
        L10_STAR_MC.FreeAlignments();
        X11.FreeAlignments();
        X11_MC_STAR.FreeAlignments();
        X21_MC_STAR.FreeAlignments();
        Y10Adj_MC_STAR.FreeAlignments();
        Y10Adj_MR_MC.FreeAlignments();

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );

        SlideLockedPartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, L01, /**/ L02,
               /**/       L10, L11, /**/ L12,
         /**********************************/
          LBL, /**/ LBR,  L20, L21, /**/ L22 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
// Two-sided triangular solve with a lower-triangular factor, variant 5,
// for DistMatrix operands.
//
// A and L are traversed down their diagonals in lock-step as 3x3 block
// partitions.  Going by the per-step update comments in the loop body, the
// routine presumably overwrites the lower triangle of A with
// inv(L) A inv(L)' -- TODO confirm against the dispatching caller.
// In each iteration this variant reads L11, L21 and L22 and writes A11, A21
// and A22.
//
// Parameters:
//   diag - forwarded unchanged to Trsm, LocalTrsm and LocalTwoSidedTrsm.
//   A    - square matrix, updated in place through its block views.
//   L    - square matrix of the same size as A; it is const and is only
//          partitioned with the Locked* routines.
//
// Unless RELEASE is defined, the shapes of A and L are checked first and any
// violation is reported through LogicError.
//
// NOTE(review): the declaration of the scalar type F is not visible in this
// block (presumably a template parameter) -- confirm.
inline void
TwoSidedTrsmLVar5
( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& L )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TwoSidedTrsmLVar5");
    if( A.Height() != A.Width() )
        LogicError("A must be square");
    if( L.Height() != L.Width() )
        LogicError("Triangular matrices must be square");
    if( A.Height() != L.Height() )
        LogicError("A and L must be the same size");
#endif
    const Grid& g = A.Grid();

    // Matrix views: the four quadrants and the 3x3 blocks of A and of L
    DistMatrix<F>
        ATL(g), ATR(g),  A00(g), A01(g), A02(g),
        ABL(g), ABR(g),  A10(g), A11(g), A12(g),
                         A20(g), A21(g), A22(g);
    DistMatrix<F>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);

    // Temporary distributions (the name suffix repeats the distribution pair
    // given in the template arguments)
    DistMatrix<F,STAR,STAR> A11_STAR_STAR(g);
    DistMatrix<F,MC,  STAR> A21_MC_STAR(g);
    DistMatrix<F,VC,  STAR> A21_VC_STAR(g);
    DistMatrix<F,VR,  STAR> A21_VR_STAR(g);
    DistMatrix<F,STAR,MR  > A21Adj_STAR_MR(g);
    DistMatrix<F,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<F,MC,  STAR> L21_MC_STAR(g);
    DistMatrix<F,VC,  STAR> L21_VC_STAR(g);
    DistMatrix<F,VR,  STAR> L21_VR_STAR(g);
    DistMatrix<F,STAR,MR  > L21Adj_STAR_MR(g);
    DistMatrix<F,VC,  STAR> Y21_VC_STAR(g);
    DistMatrix<F> Y21(g);

    // The trailing 0 is presumably the initial size of the top-left block;
    // the loop below runs until ATL is as tall as A.
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    LockedPartitionDownDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        LockedRepartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        // Align every temporary with the block of A it is combined with below
        A21_MC_STAR.AlignWith( A22 );
        A21_VC_STAR.AlignWith( A22 );
        A21_VR_STAR.AlignWith( A22 );
        A21Adj_STAR_MR.AlignWith( A22 );
        L21_MC_STAR.AlignWith( A22 );
        L21_VC_STAR.AlignWith( A22 );
        L21_VR_STAR.AlignWith( A22 );
        L21Adj_STAR_MR.AlignWith( A22 );
        Y21.AlignWith( A21 );
        Y21_VC_STAR.AlignWith( A22 );
        //--------------------------------------------------------------------//
        // A11 := inv(L11) A11 inv(L11)'
        // (performed on [STAR,STAR] copies, then written back)
        L11_STAR_STAR = L11;
        A11_STAR_STAR = A11;
        LocalTwoSidedTrsm( LOWER, diag, A11_STAR_STAR, L11_STAR_STAR );
        A11 = A11_STAR_STAR;

        // Y21 := L21 A11
        //
        // Hemm works on the .Matrix() members with beta = 0, using the
        // already-updated A11_STAR_STAR.
        L21_VC_STAR = L21;
        Zeros( Y21_VC_STAR, A21.Height(), A21.Width() );
        Hemm
        ( RIGHT, LOWER,
          F(1), A11_STAR_STAR.Matrix(), L21_VC_STAR.Matrix(),
          F(0), Y21_VC_STAR.Matrix() );
        Y21 = Y21_VC_STAR;

        // A21 := A21 inv(L11)'
        A21_VC_STAR = A21;
        LocalTrsm
        ( RIGHT, LOWER, ADJOINT, diag, F(1), L11_STAR_STAR, A21_VC_STAR );
        A21 = A21_VC_STAR;

        // A21 := A21 - 1/2 Y21
        // (first half of the Y21 correction; the second half follows the
        // A22 update)
        Axpy( F(-1)/F(2), Y21, A21 );

        // A22 := A22 - (L21 A21' + A21 L21')
        //
        // The adjoint [STAR,MR] operands are derived from [VR,STAR] copies,
        // which in turn come from the [VC,STAR] copies of A21 and L21.
        A21_MC_STAR = A21;
        L21_MC_STAR = L21;
        A21_VC_STAR = A21_MC_STAR;
        A21_VR_STAR = A21_VC_STAR;
        L21_VR_STAR = L21_VC_STAR;
        A21Adj_STAR_MR.AdjointFrom( A21_VR_STAR );
        L21Adj_STAR_MR.AdjointFrom( L21_VR_STAR );
        LocalTrr2k
        ( LOWER,
          F(-1), L21_MC_STAR, A21Adj_STAR_MR,
                 A21_MC_STAR, L21Adj_STAR_MR,
          F(1), A22 );

        // A21 := A21 - 1/2 Y21
        // (second half of the Y21 correction)
        Axpy( F(-1)/F(2), Y21, A21 );

        // A21 := inv(L22) A21
        //
        // This is the bottleneck because A21 only has blocksize columns
        Trsm( LEFT, LOWER, NORMAL, diag, F(1), L22, A21 );
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );

        SlideLockedPartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, L01, /**/ L02,
               /**/       L10, L11, /**/ L12,
         /**********************************/
          LBL, /**/ LBR,  L20, L21, /**/ L22 );
    }
}
// Two-sided triangular solve with an upper-triangular factor, variant 2,
// for DistMatrix operands.
//
// A and U are traversed down their diagonals in lock-step as 3x3 block
// partitions.  Going by the per-step update comments in the loop body, the
// routine presumably overwrites the upper triangle of A with
// inv(U)' A inv(U) -- TODO confirm against the dispatching caller.
// In each iteration this variant reads A00, A02, U01 and U11 and writes A01,
// A11 and A12.
//
// Parameters:
//   diag - forwarded unchanged to LocalTrsm and LocalTwoSidedTrsm.
//   A    - square matrix, updated in place through its block views.
//   U    - square matrix of the same size as A; it is const and is only
//          partitioned with the Locked* routines.
//
// Unless RELEASE is defined, the shapes of A and U are checked first and any
// violation is reported through LogicError.
//
// NOTE(review): the declaration of the scalar type F is not visible in this
// block (presumably a template parameter) -- confirm.
inline void
TwoSidedTrsmUVar2
( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TwoSidedTrsmUVar2");
    if( A.Height() != A.Width() )
        LogicError("A must be square");
    if( U.Height() != U.Width() )
        LogicError("Triangular matrices must be square");
    if( A.Height() != U.Height() )
        LogicError("A and U must be the same size");
#endif
    const Grid& g = A.Grid();

    // Matrix views: the four quadrants and the 3x3 blocks of A and of U
    DistMatrix<F>
        ATL(g), ATR(g),  A00(g), A01(g), A02(g),
        ABL(g), ABR(g),  A10(g), A11(g), A12(g),
                         A20(g), A21(g), A22(g);
    DistMatrix<F>
        UTL(g), UTR(g),  U00(g), U01(g), U02(g),
        UBL(g), UBR(g),  U10(g), U11(g), U12(g),
                         U20(g), U21(g), U22(g);

    // Temporary distributions (the name suffix repeats the distribution pair
    // given in the template arguments)
    DistMatrix<F,MC,  STAR> A01_MC_STAR(g);
    DistMatrix<F,VC,  STAR> A01_VC_STAR(g);
    DistMatrix<F,STAR,STAR> A11_STAR_STAR(g);
    DistMatrix<F,STAR,VR  > A12_STAR_VR(g);
    DistMatrix<F,MC,  STAR> F01_MC_STAR(g);
    DistMatrix<F,MC,  STAR> U01_MC_STAR(g);
    DistMatrix<F,VR,  STAR> U01_VR_STAR(g);
    DistMatrix<F,STAR,MR  > U01Adj_STAR_MR(g);
    DistMatrix<F,STAR,STAR> U11_STAR_STAR(g);
    DistMatrix<F,STAR,MR  > X11_STAR_MR(g);
    DistMatrix<F,MR,  STAR> X12Adj_MR_STAR(g);
    DistMatrix<F,MR,  MC  > X12Adj_MR_MC(g);
    DistMatrix<F,MR,  MC  > Y01_MR_MC(g);
    DistMatrix<F,MR,  STAR> Y01_MR_STAR(g);
    DistMatrix<F> X11(g);
    DistMatrix<F> Y01(g);
    // Plain (non-distributed-type) matrix that receives the adjoint of
    // X12Adj_MR_MC.LockedMatrix()
    Matrix<F> X12Local;

    // The trailing 0 is presumably the initial size of the top-left block;
    // the loop below runs until ATL is as tall as A.
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    LockedPartitionDownDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ A01, A02,
         /*************/ /******************/
               /**/       A10, /**/ A11, A12,
          ABL, /**/ ABR,  A20, /**/ A21, A22 );

        LockedRepartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );

        // Align every temporary with the block it is combined with below
        A01_MC_STAR.AlignWith( U01 );
        Y01.AlignWith( A01 );
        Y01_MR_STAR.AlignWith( A00 );
        U01_MC_STAR.AlignWith( A00 );
        U01_VR_STAR.AlignWith( A00 );
        U01Adj_STAR_MR.AlignWith( A00 );
        X11_STAR_MR.AlignWith( U01 );
        X11.AlignWith( A11 );
        X12Adj_MR_STAR.AlignWith( A02 );
        X12Adj_MR_MC.AlignWith( A12 );
        F01_MC_STAR.AlignWith( A00 );
        //--------------------------------------------------------------------//
        // Y01 := A00 U01
        //
        // The accumulation produces two partial results, F01_MC_STAR and
        // Y01_MR_STAR (both zero-initialised with the shape of A01); Y01 is
        // the scatter-sum of the second plus the scatter-sum of the first.
        U01_MC_STAR = U01;
        U01_VR_STAR = U01_MC_STAR;
        U01Adj_STAR_MR.AdjointFrom( U01_VR_STAR );
        Zeros( Y01_MR_STAR, A01.Height(), A01.Width() );
        Zeros( F01_MC_STAR, A01.Height(), A01.Width() );
        LocalSymmetricAccumulateLU
        ( ADJOINT,
          F(1), A00, U01_MC_STAR, U01Adj_STAR_MR, F01_MC_STAR, Y01_MR_STAR );
        Y01_MR_MC.SumScatterFrom( Y01_MR_STAR );
        Y01 = Y01_MR_MC;
        Y01.SumScatterUpdate( F(1), F01_MC_STAR );

        // X11 := U01' A01
        // (uses A01 before the Y01 correction)
        LocalGemm( ADJOINT, NORMAL, F(1), U01_MC_STAR, A01, X11_STAR_MR );

        // A01 := A01 - Y01
        Axpy( F(-1), Y01, A01 );
        A01_MC_STAR = A01;

        // A11 := A11 - triu(X11 + A01' U01) = A11 - (U01' A01 + A01' U01)
        //
        // The second product is accumulated onto X11_STAR_MR (beta = 1), and
        // only the upper triangle of the summed X11 is subtracted.
        LocalGemm( ADJOINT, NORMAL, F(1), A01_MC_STAR, U01, F(1), X11_STAR_MR );
        X11.SumScatterFrom( X11_STAR_MR );
        MakeTriangular( UPPER, X11 );
        Axpy( F(-1), X11, A11 );

        // A01 := A01 inv(U11)
        U11_STAR_STAR = U11;
        A01_VC_STAR = A01_MC_STAR;
        LocalTrsm
        ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, A01_VC_STAR );
        A01 = A01_VC_STAR;

        // A11 := inv(U11)' A11 inv(U11)
        // (performed on [STAR,STAR] copies, then written back)
        A11_STAR_STAR = A11;
        LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR );
        A11 = A11_STAR_STAR;

        // A12 := A12 - U01' A02
        //
        // The product is formed as its adjoint, X12Adj := A02' U01, and
        // scatter-summed into X12Adj_MR_MC.  The adjoint of its
        // LockedMatrix() is then subtracted from A12.Matrix() directly;
        // X12Adj_MR_MC was aligned with A12 above, so the two presumably
        // match entry for entry -- TODO confirm.
        LocalGemm( ADJOINT, NORMAL, F(1), A02, U01_MC_STAR, X12Adj_MR_STAR );
        X12Adj_MR_MC.SumScatterFrom( X12Adj_MR_STAR );
        Adjoint( X12Adj_MR_MC.LockedMatrix(), X12Local );
        Axpy( F(-1), X12Local, A12.Matrix() );

        // A12 := inv(U11)' A12
        A12_STAR_VR = A12;
        LocalTrsm
        ( LEFT, UPPER, ADJOINT, diag, F(1), U11_STAR_STAR, A12_STAR_VR );
        A12 = A12_STAR_VR;
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, A01, /**/ A02,
               /**/       A10, A11, /**/ A12,
         /*************/ /******************/
          ABL, /**/ ABR,  A20, A21, /**/ A22 );

        SlideLockedPartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, U01, /**/ U02,
               /**/       U10, U11, /**/ U12,
         /*************/ /******************/
          UBL, /**/ UBR,  U20, U21, /**/ U22 );
    }
}