inline void Syr2kUT ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C, bool conjugate=false ) { #ifndef RELEASE CallStackEntry entry("internal::Syr2kUT"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Width() != C.Height() || A.Width() != C.Width() || B.Width() != C.Height() || B.Width() != C.Width() || A.Height() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal Syr2kUT:\n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); const Orientation orientation = ( conjugate ? ADJOINT : TRANSPOSE ); // Matrix views DistMatrix<T> AT(g), A0(g), AB(g), A1(g), A2(g); DistMatrix<T> BT(g), B0(g), BB(g), B1(g), B2(g); // Temporary distributions DistMatrix<T,MR, STAR> A1Trans_MR_STAR(g); DistMatrix<T,MR, STAR> B1Trans_MR_STAR(g); DistMatrix<T,STAR,VR > A1_STAR_VR(g); DistMatrix<T,STAR,VR > B1_STAR_VR(g); DistMatrix<T,STAR,MC > A1_STAR_MC(g); DistMatrix<T,STAR,MC > B1_STAR_MC(g); A1Trans_MR_STAR.AlignWith( C ); B1Trans_MR_STAR.AlignWith( C ); A1_STAR_MC.AlignWith( C ); B1_STAR_MC.AlignWith( C ); // Start the algorithm ScaleTrapezoid( beta, LEFT, UPPER, 0, C ); LockedPartitionDown ( A, AT, AB, 0 ); LockedPartitionDown ( B, BT, BB, 0 ); while( AB.Height() > 0 ) { LockedRepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); LockedRepartitionDown ( BT, B0, /**/ /**/ B1, BB, B2 ); //--------------------------------------------------------------------// A1Trans_MR_STAR.TransposeFrom( A1 ); A1_STAR_VR.TransposeFrom( A1Trans_MR_STAR ); A1_STAR_MC = A1_STAR_VR; B1Trans_MR_STAR.TransposeFrom( B1 ); B1_STAR_VR.TransposeFrom( B1Trans_MR_STAR ); B1_STAR_MC = B1_STAR_VR; LocalTrr2k ( UPPER, orientation, TRANSPOSE, orientation, TRANSPOSE, alpha, A1_STAR_MC, B1Trans_MR_STAR, B1_STAR_MC, A1Trans_MR_STAR, T(1), C ); //--------------------------------------------------------------------// SlideLockedPartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); SlideLockedPartitionDown ( BT, B0, B1, /**/ /**/ BB, B2 ); } }
inline void GemmTTA ( Orientation orientationOfA, Orientation orientationOfB, T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::GemmTTA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( orientationOfA == NORMAL || orientationOfB == NORMAL ) throw std::logic_error ("GemmTTA expects A and B to be (Conjugate)Transposed"); if( A.Width() != C.Height() || B.Height() != C.Width() || A.Height() != B.Width() ) { std::ostringstream msg; msg << "Nonconformal GemmTTA: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> BT(g), B0(g), BB(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,STAR,MC > B1_STAR_MC(g); DistMatrix<T,MR, STAR> D1_MR_STAR(g); DistMatrix<T,MR, MC > D1_MR_MC(g); DistMatrix<T> D1(g); B1_STAR_MC.AlignWith( A ); D1_MR_STAR.AlignWith( A ); // Start the algorithm Scale( beta, C ); LockedPartitionDown ( B, BT, BB, 0 ); PartitionRight( C, CL, CR, 0 ); while( BB.Height() > 0 ) { LockedRepartitionDown ( BT, B0, /**/ /**/ B1, BB, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); D1.AlignWith( C1 ); Zeros( C1.Height(), C1.Width(), D1_MR_STAR ); //--------------------------------------------------------------------// B1_STAR_MC = B1; // B1[*,MC] <- B1[MC,MR] // D1[MR,*] := alpha (A[MC,MR])^T (B1[*,MC])^T // = alpha (A^T)[MR,MC] (B1^T)[MC,*] LocalGemm ( orientationOfA, orientationOfB, alpha, A, B1_STAR_MC, T(0), D1_MR_STAR ); // C1[MC,MR] += scattered & transposed D1[MR,*] summed over grid cols D1_MR_MC.SumScatterFrom( D1_MR_STAR ); D1 = D1_MR_MC; Axpy( T(1), D1, C1 ); //--------------------------------------------------------------------// D1.FreeAlignments(); SlideLockedPartitionDown ( BT, B0, B1, /**/ /**/ BB, B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void HemmRUA ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::HemmRUA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); #endif const Grid& g = A.Grid(); DistMatrix<T> BT(g), B0(g), BB(g), B1(g), B2(g); DistMatrix<T> CT(g), C0(g), CB(g), C1(g), C2(g); DistMatrix<T,MR, STAR> B1Adj_MR_STAR(g); DistMatrix<T,VC, STAR> B1Adj_VC_STAR(g); DistMatrix<T,STAR,MC > B1_STAR_MC(g); DistMatrix<T,MC, STAR> Z1Adj_MC_STAR(g); DistMatrix<T,MR, STAR> Z1Adj_MR_STAR(g); DistMatrix<T,MR, MC > Z1Adj_MR_MC(g); DistMatrix<T> Z1Adj(g); B1Adj_MR_STAR.AlignWith( A ); B1Adj_VC_STAR.AlignWith( A ); B1_STAR_MC.AlignWith( A ); Z1Adj_MC_STAR.AlignWith( A ); Z1Adj_MR_STAR.AlignWith( A ); Matrix<T> Z1Local; Scale( beta, C ); LockedPartitionDown ( B, BT, BB, 0 ); PartitionDown ( C, CT, CB, 0 ); while( CT.Height() < C.Height() ) { LockedRepartitionDown ( BT, B0, /**/ /**/ B1, BB, B2 ); RepartitionDown ( CT, C0, /**/ /**/ C1, CB, C2 ); Z1Adj_MR_MC.AlignWith( C1 ); Zeros( C1.Width(), C1.Height(), Z1Adj_MC_STAR ); Zeros( C1.Width(), C1.Height(), Z1Adj_MR_STAR ); //--------------------------------------------------------------------// B1Adj_MR_STAR.AdjointFrom( B1 ); B1Adj_VC_STAR = B1Adj_MR_STAR; B1_STAR_MC.AdjointFrom( B1Adj_VC_STAR ); LocalSymmetricAccumulateRU ( ADJOINT, alpha, A, B1_STAR_MC, B1Adj_MR_STAR, Z1Adj_MC_STAR, Z1Adj_MR_STAR ); Z1Adj.SumScatterFrom( Z1Adj_MC_STAR ); Z1Adj_MR_MC = Z1Adj; Z1Adj_MR_MC.SumScatterUpdate( T(1), Z1Adj_MR_STAR ); Adjoint( Z1Adj_MR_MC.LockedLocalMatrix(), Z1Local ); Axpy( T(1), Z1Local, C1.LocalMatrix() ); //--------------------------------------------------------------------// Z1Adj_MR_MC.FreeAlignments(); SlideLockedPartitionDown ( BT, B0, B1, /**/ /**/ BB, B2 ); SlidePartitionDown ( CT, C0, C1, /**/ /**/ CB, C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void LocalSymmetricAccumulateRU ( Orientation orientation, T alpha, const DistMatrix<T,MC, MR >& A, const DistMatrix<T,STAR,MC >& B_STAR_MC, const DistMatrix<T,MR, STAR>& BAdjOrTrans_MR_STAR, DistMatrix<T,MC, STAR>& ZAdjOrTrans_MC_STAR, DistMatrix<T,MR, STAR>& ZAdjOrTrans_MR_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalSymmetricAccumulateRU"); if( A.Grid() != B_STAR_MC.Grid() || B_STAR_MC.Grid() != BAdjOrTrans_MR_STAR.Grid() || BAdjOrTrans_MR_STAR.Grid() != ZAdjOrTrans_MC_STAR.Grid() || ZAdjOrTrans_MC_STAR.Grid() != ZAdjOrTrans_MR_STAR.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Height() != A.Width() || A.Height() != B_STAR_MC.Width() || A.Height() != BAdjOrTrans_MR_STAR.Height() || A.Height() != ZAdjOrTrans_MC_STAR.Height() || A.Height() != ZAdjOrTrans_MR_STAR.Height() || B_STAR_MC.Height() != BAdjOrTrans_MR_STAR.Width() || BAdjOrTrans_MR_STAR.Width() != ZAdjOrTrans_MC_STAR.Width() || ZAdjOrTrans_MC_STAR.Width() != ZAdjOrTrans_MR_STAR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalSymmetricAccumulateRU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B[* ,MC] ~ " << B_STAR_MC.Height() << " x " << B_STAR_MC.Width() << "\n" << " B^H/T[MR,* ] ~ " << BAdjOrTrans_MR_STAR.Height() << " x " << BAdjOrTrans_MR_STAR.Width() << "\n" << " Z^H/T[MC,* ] ~ " << ZAdjOrTrans_MC_STAR.Height() << " x " << ZAdjOrTrans_MC_STAR.Width() << "\n" << " Z^H/T[MR,* ] ~ " << ZAdjOrTrans_MR_STAR.Height() << " x " << ZAdjOrTrans_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( B_STAR_MC.RowAlignment() != A.ColAlignment() || BAdjOrTrans_MR_STAR.ColAlignment() != A.RowAlignment() || ZAdjOrTrans_MC_STAR.ColAlignment() != A.ColAlignment() || ZAdjOrTrans_MR_STAR.ColAlignment() != A.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<T> D11(g); DistMatrix<T,STAR,MC> BL_STAR_MC(g), BR_STAR_MC(g), B0_STAR_MC(g), B1_STAR_MC(g), B2_STAR_MC(g); DistMatrix<T,MR,STAR> BTAdjOrTrans_MR_STAR(g), B0AdjOrTrans_MR_STAR(g), BBAdjOrTrans_MR_STAR(g), B1AdjOrTrans_MR_STAR(g), B2AdjOrTrans_MR_STAR(g); DistMatrix<T,MC,STAR> ZTAdjOrTrans_MC_STAR(g), Z0AdjOrTrans_MC_STAR(g), ZBAdjOrTrans_MC_STAR(g), Z1AdjOrTrans_MC_STAR(g), Z2AdjOrTrans_MC_STAR(g); DistMatrix<T,MR,STAR> ZBAdjOrTrans_MR_STAR(g), Z0AdjOrTrans_MR_STAR(g), ZTAdjOrTrans_MR_STAR(g), Z1AdjOrTrans_MR_STAR(g), Z2AdjOrTrans_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionRight( B_STAR_MC, BL_STAR_MC, BR_STAR_MC, 0 ); LockedPartitionDown ( BAdjOrTrans_MR_STAR, BTAdjOrTrans_MR_STAR, BBAdjOrTrans_MR_STAR, 0 ); PartitionDown ( ZAdjOrTrans_MC_STAR, ZTAdjOrTrans_MC_STAR, ZBAdjOrTrans_MC_STAR, 0 ); PartitionDown ( ZAdjOrTrans_MR_STAR, ZTAdjOrTrans_MR_STAR, ZBAdjOrTrans_MR_STAR, 0 ); while( ATL.Height() < A.Height() ) { LockedRepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionRight ( BL_STAR_MC, /**/ BR_STAR_MC, B0_STAR_MC, /**/ B1_STAR_MC, B2_STAR_MC ); LockedRepartitionDown ( BTAdjOrTrans_MR_STAR, B0AdjOrTrans_MR_STAR, /********************/ /********************/ B1AdjOrTrans_MR_STAR, BBAdjOrTrans_MR_STAR, B2AdjOrTrans_MR_STAR ); RepartitionDown ( ZTAdjOrTrans_MC_STAR, Z0AdjOrTrans_MC_STAR, /********************/ /********************/ Z1AdjOrTrans_MC_STAR, ZBAdjOrTrans_MC_STAR, Z2AdjOrTrans_MC_STAR ); RepartitionDown ( ZTAdjOrTrans_MR_STAR, Z0AdjOrTrans_MR_STAR, /********************/ /********************/ Z1AdjOrTrans_MR_STAR, ZBAdjOrTrans_MR_STAR, Z2AdjOrTrans_MR_STAR ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// D11 = A11; MakeTrapezoidal( LEFT, UPPER, 0, D11 ); LocalGemm ( orientation, orientation, alpha, D11, B1_STAR_MC, T(1), Z1AdjOrTrans_MR_STAR ); MakeTrapezoidal( LEFT, UPPER, 1, D11 ); LocalGemm ( NORMAL, NORMAL, alpha, D11, B1AdjOrTrans_MR_STAR, T(1), Z1AdjOrTrans_MC_STAR ); LocalGemm ( orientation, orientation, alpha, A12, B1_STAR_MC, T(1), Z2AdjOrTrans_MR_STAR ); LocalGemm ( NORMAL, NORMAL, alpha, A12, B2AdjOrTrans_MR_STAR, T(1), Z1AdjOrTrans_MC_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionRight ( BL_STAR_MC, /**/ BR_STAR_MC, B0_STAR_MC, B1_STAR_MC, /**/ B2_STAR_MC ); SlideLockedPartitionDown ( BTAdjOrTrans_MR_STAR, B0AdjOrTrans_MR_STAR, B1AdjOrTrans_MR_STAR, /********************/ /********************/ BBAdjOrTrans_MR_STAR, B2AdjOrTrans_MR_STAR ); SlidePartitionDown ( ZTAdjOrTrans_MC_STAR, Z0AdjOrTrans_MC_STAR, Z1AdjOrTrans_MC_STAR, /********************/ /********************/ ZBAdjOrTrans_MC_STAR, Z2AdjOrTrans_MC_STAR ); SlidePartitionDown ( ZTAdjOrTrans_MR_STAR, Z0AdjOrTrans_MR_STAR, Z1AdjOrTrans_MR_STAR, /********************/ /********************/ ZBAdjOrTrans_MR_STAR, Z2AdjOrTrans_MR_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void SymmRUA ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C, bool conjugate=false ) { #ifndef RELEASE PushCallStack("internal::SymmRUA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); #endif const Grid& g = A.Grid(); const Orientation orientation = ( conjugate ? ADJOINT : TRANSPOSE ); DistMatrix<T> BT(g), B0(g), BB(g), B1(g), B2(g); DistMatrix<T> CT(g), C0(g), CB(g), C1(g), C2(g); DistMatrix<T,MR, STAR> B1Trans_MR_STAR(g); DistMatrix<T,VC, STAR> B1Trans_VC_STAR(g); DistMatrix<T,STAR,MC > B1_STAR_MC(g); DistMatrix<T,MC, STAR> Z1Trans_MC_STAR(g); DistMatrix<T,MR, STAR> Z1Trans_MR_STAR(g); DistMatrix<T,MC, MR > Z1Trans(g); DistMatrix<T,MR, MC > Z1Trans_MR_MC(g); B1Trans_MR_STAR.AlignWith( A ); B1Trans_VC_STAR.AlignWith( A ); B1_STAR_MC.AlignWith( A ); Z1Trans_MC_STAR.AlignWith( A ); Z1Trans_MR_STAR.AlignWith( A ); Matrix<T> Z1Local; Scale( beta, C ); LockedPartitionDown ( B, BT, BB, 0 ); PartitionDown ( C, CT, CB, 0 ); while( CT.Height() < C.Height() ) { LockedRepartitionDown ( BT, B0, /**/ /**/ B1, BB, B2 ); RepartitionDown ( CT, C0, /**/ /**/ C1, CB, C2 ); Z1Trans_MR_MC.AlignWith( C1 ); Zeros( C1.Width(), C1.Height(), Z1Trans_MC_STAR ); Zeros( C1.Width(), C1.Height(), Z1Trans_MR_STAR ); //--------------------------------------------------------------------// B1Trans_MR_STAR.TransposeFrom( B1, conjugate ); B1Trans_VC_STAR = B1Trans_MR_STAR; B1_STAR_MC.TransposeFrom( B1Trans_VC_STAR, conjugate ); LocalSymmetricAccumulateRU ( orientation, alpha, A, B1_STAR_MC, B1Trans_MR_STAR, Z1Trans_MC_STAR, Z1Trans_MR_STAR ); Z1Trans.SumScatterFrom( Z1Trans_MC_STAR ); Z1Trans_MR_MC = Z1Trans; Z1Trans_MR_MC.SumScatterUpdate( T(1), Z1Trans_MR_STAR ); Transpose( Z1Trans_MR_MC.LockedMatrix(), Z1Local, conjugate ); Axpy( T(1), Z1Local, C1.Matrix() ); //--------------------------------------------------------------------// Z1Trans_MR_MC.FreeAlignments(); SlideLockedPartitionDown ( BT, B0, B1, /**/ /**/ BB, B2 ); SlidePartitionDown ( CT, C0, C1, /**/ /**/ CB, C2 ); } #ifndef RELEASE PopCallStack(); #endif }