inline void CheckInput ( const DistMatrix<T,MC, STAR>& A, const DistMatrix<T,STAR,MR >& B, const DistMatrix<T,MC, MR >& C ) { if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("A, B, and C must be distributed over the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() || A.Height() != B.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalTrrk: \n" << " A[MC,* ] ~ " << A.Height() << " x " << A.Width() << "\n" << " B[* ,MR] ~ " << B.Height() << " x " << B.Width() << "\n" << " C[MC,MR] ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( A.ColAlignment() != C.ColAlignment() || B.RowAlignment() != C.RowAlignment() ) { std::ostringstream msg; msg << "Misaligned LocalTrrk: \n" << " A[MC,* ] ~ " << A.ColAlignment() << "\n" << " B[* ,MR] ~ " << B.RowAlignment() << "\n" << " C[MC,MR] ~ " << C.ColAlignment() << " , " << C.RowAlignment() << "\n"; throw std::logic_error( msg.str().c_str() ); } }
inline F Reflector( DistMatrix<F>& chi, DistMatrix<F>& x ) { #ifndef RELEASE CallStackEntry entry("Reflector"); if( chi.Grid() != x.Grid() ) LogicError("chi and x must be distributed over the same grid"); if( chi.Height() != 1 || chi.Width() != 1 ) LogicError("chi must be a scalar"); if( x.Height() != 1 && x.Width() != 1 ) LogicError("x must be a vector"); #endif const Grid& g = x.Grid(); F tau; if( x.Width() == 1 && x.RowAlignment() == chi.RowAlignment() ) { if( g.Col() == x.RowAlignment() ) tau = reflector::Col( chi, x ); mpi::Broadcast( tau, x.RowAlignment(), g.RowComm() ); } else { if( g.Row() == x.ColAlignment() ) tau = reflector::Row( chi, x ); mpi::Broadcast( tau, x.ColAlignment(), g.ColComm() ); } return tau; }
const DistMatrix<T,STAR,STAR>& DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,STAR,VR>& A ) { #ifndef RELEASE CallStackEntry entry("[* ,* ] = [* ,VR]"); this->AssertNotLocked(); this->AssertSameGrid( A.Grid() ); #endif const elem::Grid& g = this->Grid(); this->ResizeTo( A.Height(), A.Width() ); if( !this->Participating() ) return *this; const Int p = g.Size(); const Int height = this->Height(); const Int width = this->Width(); const Int localWidthOfA = A.LocalWidth(); const Int maxLocalWidth = MaxLength(width,p); const Int portionSize = mpi::Pad( height*maxLocalWidth ); T* buffer = this->auxMemory_.Require( (p+1)*portionSize ); T* sendBuf = &buffer[0]; T* recvBuf = &buffer[portionSize]; // Pack const Int ALDim = A.LDim(); const T* ABuf = A.LockedBuffer(); PARALLEL_FOR for( Int jLoc=0; jLoc<localWidthOfA; ++jLoc ) MemCopy( &sendBuf[jLoc*height], &ABuf[jLoc*ALDim], height ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, g.VRComm() ); // Unpack T* thisBuf = this->Buffer(); const Int thisLDim = this->LDim(); const Int rowAlignmentOfA = A.RowAlignment(); OUTER_PARALLEL_FOR for( Int k=0; k<p; ++k ) { const T* data = &recvBuf[k*portionSize]; const Int rowShift = Shift_( k, rowAlignmentOfA, p ); const Int localWidth = Length_( width, rowShift, p ); INNER_PARALLEL_FOR for( Int jLoc=0; jLoc<localWidth; ++jLoc ) MemCopy ( &thisBuf[(rowShift+jLoc*p)*thisLDim], &data[jLoc*height], height ); } this->auxMemory_.Release(); return *this; }
inline void CheckInput ( Orientation orientationOfA, Orientation orientationOfB, const DistMatrix<T,STAR,MC >& A, const DistMatrix<T,MR, STAR>& B, const DistMatrix<T,MC, MR >& C ) { if( orientationOfA == NORMAL ) throw std::logic_error("A[* ,MC] must be (Conjugate)Transpose'd"); if( orientationOfB == NORMAL ) throw std::logic_error("B[MR,* ] must be (Conjugate)Transpose'd"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("A, B, and C must be distributed over the same grid"); if( A.Width() != C.Height() || B.Height() != C.Width() || A.Height() != B.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal LocalTrrk: \n" << " A[* ,MC] ~ " << A.Height() << " x " << A.Width() << "\n" << " B[MR,* ] ~ " << B.Height() << " x " << B.Width() << "\n" << " C[MC,MR] ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( A.RowAlignment() != C.ColAlignment() || B.ColAlignment() != C.RowAlignment() ) { std::ostringstream msg; msg << "Misaligned LocalTrrk: \n" << " A[* ,MC] ~ " << A.RowAlignment() << "\n" << " B[MR,* ] ~ " << B.ColAlignment() << "\n" << " C[MC,MR] ~ " << C.ColAlignment() << " , " << C.RowAlignment() << "\n"; throw std::logic_error( msg.str().c_str() ); } }
inline void DiagonalScale ( LeftOrRight side, Orientation orientation, const DistMatrix<typename Base<T>::type,U,V>& d, DistMatrix<T,W,Z>& X ) { #ifndef RELEASE PushCallStack("DiagonalScale"); #endif typedef typename Base<T>::type R; if( side == LEFT ) { if( U == W && V == STAR && d.ColAlignment() == X.ColAlignment() ) { DiagonalScale( LEFT, orientation, d.LockedMatrix(), X.Matrix() ); } else { DistMatrix<R,W,STAR> d_W_STAR( X.Grid() ); d_W_STAR = d; DiagonalScale ( LEFT, orientation, d_W_STAR.LockedMatrix(), X.Matrix() ); } } else { if( U == Z && V == STAR && d.ColAlignment() == X.RowAlignment() ) { DiagonalScale( RIGHT, orientation, d.LockedMatrix(), X.Matrix() ); } else { DistMatrix<R,Z,STAR> d_Z_STAR( X.Grid() ); d_Z_STAR = d; DiagonalScale ( RIGHT, orientation, d_Z_STAR.LockedMatrix(), X.Matrix() ); } } #ifndef RELEASE PopCallStack(); #endif }
inline void DistMatrix<T,MD,STAR,Int>::AlignWithDiagonal ( const DistMatrix<S,MR,MC,N>& A, Int offset ) { #ifndef RELEASE PushCallStack("[MD,* ]::AlignWithDiagonal([MR,MC])"); this->AssertFreeColAlignment(); this->AssertSameGrid( A ); #endif const elem::Grid& g = this->Grid(); const Int r = g.Height(); const Int c = g.Width(); const Int lcm = g.LCM(); const Int colAlignment = A.ColAlignment(); const Int rowAlignment = A.RowAlignment(); this->Empty(); Int owner; if( offset >= 0 ) { const Int ownerRow = rowAlignment; const Int ownerCol = (colAlignment + offset) % c; owner = ownerRow + r*ownerCol; } else { const Int ownerRow = (rowAlignment-offset) % r; const Int ownerCol = colAlignment; owner = ownerRow + r*ownerCol; } this->diagPath_ = g.DiagPath(owner); this->colAlignment_ = g.DiagPathRank(owner); this->constrainedColAlignment_ = true; if( this->Participating() ) this->colShift_ = (g.DiagPathRank()+lcm-this->colAlignment_) % lcm; else this->colShift_ = 0; #ifndef RELEASE PopCallStack(); #endif }
inline bool DistMatrix<T,MD,STAR,Int>::AlignedWithDiagonal ( const DistMatrix<S,MR,MC,N>& A, Int offset ) const { #ifndef RELEASE PushCallStack("[MD,* ]::AlignedWithDiagonal([MR,MC])"); this->AssertSameGrid( A ); #endif const elem::Grid& g = this->Grid(); const Int r = g.Height(); const Int c = g.Width(); const Int colAlignment = A.ColAlignment(); const Int rowAlignment = A.RowAlignment(); const Int firstDiagRow = 0; const Int firstDiagCol = this->diagPath_; const Int diagRow = (firstDiagRow+this->ColAlignment()) % r; const Int diagCol = (firstDiagCol+this->ColAlignment()) % c; bool aligned; if( offset >= 0 ) { const Int ownerCol = colAlignment; const Int ownerRow = (rowAlignment + offset) % r; aligned = ( ownerRow==diagRow && ownerCol==diagCol ); } else { const Int ownerCol = (colAlignment-offset) % c; const Int ownerRow = rowAlignment; aligned = ( ownerRow==diagRow && ownerCol==diagCol ); } #ifndef RELEASE PopCallStack(); #endif return aligned; }
inline void internal::CholeskyUVar3Square( DistMatrix<F,MC,MR>& A ) { #ifndef RELEASE PushCallStack("internal::CholeskyUVar3Square"); if( A.Height() != A.Width() ) throw std::logic_error ("Can only compute Cholesky factor of square matrices."); if( A.Grid().Height() != A.Grid().Width() ) throw std::logic_error ("CholeskyUVar3Square assumes a square process grid."); #endif const Grid& g = A.Grid(); // Find the process holding our transposed data const int r = g.Height(); int transposeRank; { const int colAlignment = A.ColAlignment(); const int rowAlignment = A.RowAlignment(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int transposeRow = (colAlignment+rowShift) % r; const int transposeCol = (rowAlignment+colShift) % r; transposeRank = transposeRow + r*transposeCol; } const bool onDiagonal = ( transposeRank == g.VCRank() ); // Matrix views DistMatrix<F,MC,MR> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); // Temporary matrix distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,STAR,VR > A12_STAR_VR(g); DistMatrix<F,STAR,MC > A12_STAR_MC(g); DistMatrix<F,STAR,MR > A12_STAR_MR(g); // Start the algorithm PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); while( ABR.Height() > 0 ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); A12_STAR_MC.AlignWith( A22 ); A12_STAR_MR.AlignWith( A22 ); A12_STAR_VR.AlignWith( A22 ); //--------------------------------------------------------------------// A11_STAR_STAR = A11; internal::LocalCholesky( UPPER, A11_STAR_STAR ); A11 = A11_STAR_STAR; A12_STAR_VR = A12; internal::LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, (F)1, A11_STAR_STAR, A12_STAR_VR ); A12_STAR_MR = A12_STAR_VR; // SendRecv to form A12[* ,MC] from A12[* ,MR] A12_STAR_MC.ResizeTo( A12.Height(), A12.Width() ); { if( onDiagonal ) { const int size = A11.Height()*A22.LocalWidth(); MemCopy ( A12_STAR_MC.LocalBuffer(), A12_STAR_MR.LocalBuffer(), size ); } else { const int sendSize = A11.Height()*A22.LocalWidth(); const int recvSize = A11.Width()*A22.LocalHeight(); // We know that the ldim is the height since we have manually // created both temporary matrices. mpi::SendRecv ( A12_STAR_MR.LocalBuffer(), sendSize, transposeRank, 0, A12_STAR_MC.LocalBuffer(), recvSize, transposeRank, 0, g.VCComm() ); } } internal::LocalTrrk ( UPPER, ADJOINT, (F)-1, A12_STAR_MC, A12_STAR_MR, (F)1, A22 ); A12 = A12_STAR_MR; //--------------------------------------------------------------------// A12_STAR_MC.FreeAlignments(); A12_STAR_MR.FreeAlignments(); A12_STAR_VR.FreeAlignments(); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void LocalSymvColAccumulateU ( T alpha, const DistMatrix<T>& A, const DistMatrix<T,MC,STAR>& x_MC_STAR, const DistMatrix<T,MR,STAR>& x_MR_STAR, DistMatrix<T,MC,STAR>& z_MC_STAR, DistMatrix<T,MR,STAR>& z_MR_STAR, bool conjugate=false ) { #ifndef RELEASE CallStackEntry entry("internal::LocalSymvColAccumulateU"); if( A.Grid() != x_MC_STAR.Grid() || x_MC_STAR.Grid() != x_MR_STAR.Grid() || x_MR_STAR.Grid() != z_MC_STAR.Grid() || z_MC_STAR.Grid() != z_MR_STAR.Grid() ) LogicError ("{A,x,z} must be distributed over the same grid"); if( x_MC_STAR.Width() != 1 || x_MR_STAR.Width() != 1 || z_MC_STAR.Width() != 1 || z_MR_STAR.Width() != 1 ) LogicError("Expected x and z to be column vectors"); if( A.Height() != A.Width() || A.Height() != x_MC_STAR.Height() || A.Height() != x_MR_STAR.Height() || A.Height() != z_MC_STAR.Height() || A.Height() != z_MR_STAR.Height() ) { std::ostringstream msg; msg << "Nonconformal LocalSymvColAccumulateU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " x[MC,* ] ~ " << x_MC_STAR.Height() << " x " << x_MC_STAR.Width() << "\n" << " x[MR,* ] ~ " << x_MR_STAR.Height() << " x " << x_MR_STAR.Width() << "\n" << " z[MC,* ] ~ " << z_MC_STAR.Height() << " x " << z_MC_STAR.Width() << "\n" << " z[MR,* ] ~ " << z_MR_STAR.Height() << " x " << z_MR_STAR.Width() << "\n"; LogicError( msg.str() ); } if( x_MC_STAR.ColAlignment() != A.ColAlignment() || x_MR_STAR.ColAlignment() != A.RowAlignment() || z_MC_STAR.ColAlignment() != A.ColAlignment() || z_MR_STAR.ColAlignment() != A.RowAlignment() ) LogicError("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); const Orientation orientation = ( conjugate ? ADJOINT : TRANSPOSE ); // Matrix views DistMatrix<T> A11(g), A12(g); DistMatrix<T> D11(g); DistMatrix<T,MC,STAR> x1_MC_STAR(g); DistMatrix<T,MR,STAR> xT_MR_STAR(g), x0_MR_STAR(g), xB_MR_STAR(g), x1_MR_STAR(g), x2_MR_STAR(g); DistMatrix<T,MC,STAR> z1_MC_STAR(g); DistMatrix<T,MR,STAR> z1_MR_STAR(g), z2_MR_STAR(g); // We want our local gemvs to be of width blocksize, so we will // temporarily change to max(r,c) times the current blocksize const Int ratio = Max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*LocalSymvBlocksize<T>() ); LockedPartitionDown ( x_MR_STAR, xT_MR_STAR, xB_MR_STAR, 0 ); while( xT_MR_STAR.Height() < x_MR_STAR.Height() ) { LockedRepartitionDown ( xT_MR_STAR, x0_MR_STAR, /**********/ /**********/ x1_MR_STAR, xB_MR_STAR, x2_MR_STAR ); const Int n0 = x0_MR_STAR.Height(); const Int n1 = x1_MR_STAR.Height(); const Int n2 = x2_MR_STAR.Height(); LockedView( A11, A, n0, n0, n1, n1 ); LockedView( A12, A, n0, n0+n1, n1, n2 ); LockedView( x1_MC_STAR, x_MC_STAR, n0, 0, n1, 1 ); View( z1_MC_STAR, z_MC_STAR, n0, 0, n1, 1 ); View( z1_MR_STAR, z_MR_STAR, n0, 0, n1, 1 ); View( z2_MR_STAR, z_MR_STAR, n0+n1, 0, n2, 1 ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// // TODO: These diagonal block updates can be greatly improved D11 = A11; MakeTriangular( UPPER, D11 ); LocalGemv( NORMAL, alpha, D11, x1_MR_STAR, T(1), z1_MC_STAR ); SetDiagonal( D11, T(0) ); LocalGemv( orientation, alpha, D11, x1_MC_STAR, T(1), z1_MR_STAR ); LocalGemv( NORMAL, alpha, A12, x2_MR_STAR, T(1), z1_MC_STAR ); LocalGemv( orientation, alpha, A12, x1_MC_STAR, T(1), z2_MR_STAR ); //--------------------------------------------------------------------// SlideLockedPartitionDown ( xT_MR_STAR, x0_MR_STAR, x1_MR_STAR, /**********/ /**********/ xB_MR_STAR, x2_MR_STAR ); } PopBlocksizeStack(); }
inline void LocalTrmmAccumulateRUN ( Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T,MC, MR >& U, const DistMatrix<T,STAR,MC >& X_STAR_MC, DistMatrix<T,MR, STAR>& ZTrans_MR_STAR ) { #ifndef RELEASE CallStackEntry entry("internal::LocalTrmmAccumulateRUN"); if( U.Grid() != X_STAR_MC.Grid() || X_STAR_MC.Grid() != ZTrans_MR_STAR.Grid() ) throw std::logic_error ("{U,X,Z} must be distributed over the same grid"); if( U.Height() != U.Width() || U.Height() != X_STAR_MC.Width() || U.Height() != ZTrans_MR_STAR.Height() ) { std::ostringstream msg; msg << "Nonconformal LocalTrmmAccumulateRUN: \n" << " U ~ " << U.Height() << " x " << U.Width() << "\n" << " X[* ,MC] ~ " << X_STAR_MC.Height() << " x " << X_STAR_MC.Width() << "\n" << " Z^H/T[MR,* ] ~ " << ZTrans_MR_STAR.Height() << " x " << ZTrans_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( X_STAR_MC.RowAlignment() != U.ColAlignment() || ZTrans_MR_STAR.ColAlignment() != U.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<T> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<T> D11(g); DistMatrix<T,STAR,MC> XL_STAR_MC(g), XR_STAR_MC(g), X0_STAR_MC(g), X1_STAR_MC(g), X2_STAR_MC(g); DistMatrix<T,MR,STAR> ZTTrans_MR_STAR(g), Z0Trans_MR_STAR(g), ZBTrans_MR_STAR(g), Z1Trans_MR_STAR(g), Z2Trans_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); LockedPartitionRight( X_STAR_MC, XL_STAR_MC, XR_STAR_MC, 0 ); PartitionDown ( ZTrans_MR_STAR, ZTTrans_MR_STAR, ZBTrans_MR_STAR, 0 ); while( UTL.Height() < U.Height() ) { LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); LockedRepartitionRight ( XL_STAR_MC, /**/ XR_STAR_MC, X0_STAR_MC, /**/ X1_STAR_MC, X2_STAR_MC ); RepartitionDown ( ZTTrans_MR_STAR, Z0Trans_MR_STAR, /***************/ /***************/ Z1Trans_MR_STAR, ZBTrans_MR_STAR, Z2Trans_MR_STAR ); D11.AlignWith( U11 ); //--------------------------------------------------------------------// D11 = U11; MakeTriangular( UPPER, D11 ); if( diag == UNIT ) SetDiagonal( D11, T(1) ); LocalGemm ( orientation, orientation, alpha, D11, X1_STAR_MC, T(1), Z1Trans_MR_STAR ); LocalGemm ( orientation, orientation, alpha, U01, X0_STAR_MC, T(1), Z1Trans_MR_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); SlideLockedPartitionRight ( XL_STAR_MC, /**/ XR_STAR_MC, X0_STAR_MC, X1_STAR_MC, /**/ X2_STAR_MC ); SlidePartitionDown ( ZTTrans_MR_STAR, Z0Trans_MR_STAR, Z1Trans_MR_STAR, /***************/ /***************/ ZBTrans_MR_STAR, Z2Trans_MR_STAR ); } PopBlocksizeStack(); }
inline void Cannon_NN ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE CallStackEntry entry("gemm::Cannon_NN"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) LogicError("{A,B,C} must have the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal matrices: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; LogicError( msg.str() ); } #endif const Grid& g = A.Grid(); if( g.Height() != g.Width() ) LogicError("Process grid must be square for Cannon's"); if( C.ColAlignment() != A.ColAlignment() || C.RowAlignment() != B.RowAlignment() ) LogicError("C is not properly aligned"); const Int row = g.Row(); const Int col = g.Col(); const Int pSqrt = g.Height(); mpi::Comm rowComm = g.RowComm(); mpi::Comm colComm = g.ColComm(); if( A.Width() % pSqrt != 0 ) LogicError("For now, width(A) must be integer multiple of sqrt(p)"); // Begin by scaling our local portion of C Scale( beta, C ); // Load the initial A and B packages (may want to transpose B...) const Int localHeightA = A.LocalHeight(); const Int localHeightB = B.LocalHeight(); const Int localWidthA = A.LocalWidth(); const Int localWidthB = B.LocalWidth(); Matrix<T> pkgA(localHeightA,localWidthA,localHeightA), pkgB(localHeightB,localWidthB,localHeightB); for( Int jLoc=0; jLoc<localWidthA; ++jLoc ) MemCopy ( pkgA.Buffer(0,jLoc), A.LockedBuffer(0,jLoc), localHeightA ); for( Int jLoc=0; jLoc<localWidthB; ++jLoc ) MemCopy ( pkgB.Buffer(0,jLoc), B.LockedBuffer(0,jLoc), localHeightB ); // Perform the initial circular shifts so that our A and B packages align const Int rowShiftA = A.RowShift(); const Int colShiftB = B.ColShift(); const Int leftInitA = (col+pSqrt-colShiftB) % pSqrt; const Int rightInitA = (col+colShiftB) % pSqrt; const Int aboveInitB = (row+pSqrt-rowShiftA) % pSqrt; const Int belowInitB = (row+rowShiftA) % pSqrt; const Int pkgSizeA = localHeightA*localWidthA; const Int pkgSizeB = localHeightB*localWidthB; mpi::SendRecv( pkgA.Buffer(), pkgSizeA, leftInitA, rightInitA, rowComm ); mpi::SendRecv( pkgB.Buffer(), pkgSizeB, aboveInitB, belowInitB, colComm ); // Now begin the data flow const Int aboveRow = (row+pSqrt-1) % pSqrt; const Int belowRow = (row+1) % pSqrt; const Int leftCol = (col+pSqrt-1) % pSqrt; const Int rightCol = (col+1) % pSqrt; for( Int q=0; q<pSqrt; ++q ) { Gemm( NORMAL, NORMAL, alpha, pkgA, pkgB, T(1), C.Matrix() ); if( q != pSqrt-1 ) { mpi::SendRecv ( pkgA.Buffer(), pkgSizeA, leftCol, rightCol, rowComm ); mpi::SendRecv ( pkgB.Buffer(), pkgSizeB, aboveRow, belowRow, colComm ); } } }
void TestCorrectness ( bool print, UpperOrLower uplo, const DistMatrix<Complex<double> >& A, const DistMatrix<double,VR,STAR>& w, const DistMatrix<Complex<double> >& Z, const DistMatrix<Complex<double> >& AOrig ) { const Grid& g = A.Grid(); const int n = Z.Height(); const int k = Z.Width(); if( g.Rank() == 0 ) { cout << " Gathering computed eigenvalues..."; cout.flush(); } DistMatrix<double,MR,STAR> w_MR_STAR(true,Z.RowAlignment(),g); w_MR_STAR = w; if( g.Rank() == 0 ) cout << "DONE" << endl; if( g.Rank() == 0 ) cout << " Testing orthogonality of eigenvectors..." << endl; DistMatrix<Complex<double> > X( g ); Identity( X, k, k ); Herk ( uplo, ADJOINT, Complex<double>(-1), Z, Complex<double>(1), X ); double oneNormOfError = OneNorm( X ); double infNormOfError = InfinityNorm( X ); double frobNormOfError = FrobeniusNorm( X ); if( g.Rank() == 0 ) { cout << " ||Z^H Z - I||_1 = " << oneNormOfError << "\n" << " ||Z^H Z - I||_oo = " << infNormOfError << "\n" << " ||Z^H Z - I||_F = " << frobNormOfError << "\n\n" << " Testing for deviation of AZ from ZW..." << endl; } // X := AZ X.AlignWith( Z ); Zeros( X, n, k ); Hemm ( LEFT, uplo, Complex<double>(1), AOrig, Z, Complex<double>(0), X ); // Find the residual ||X-ZW||_oo = ||AZ-ZW||_oo for( int jLocal=0; jLocal<X.LocalWidth(); ++jLocal ) { const double omega = w_MR_STAR.GetLocal(jLocal,0); for( int iLocal=0; iLocal<X.LocalHeight(); ++iLocal ) { const Complex<double> chi = X.GetLocal(iLocal,jLocal); const Complex<double> zeta = Z.GetLocal(iLocal,jLocal); X.SetLocal(iLocal,jLocal,chi-omega*zeta); } } // Find the infinity norms of A, Z, and AZ-ZW double infNormOfA = HermitianInfinityNorm( uplo, AOrig ); double frobNormOfA = HermitianFrobeniusNorm( uplo, AOrig ); double oneNormOfZ = OneNorm( Z ); double infNormOfZ = InfinityNorm( Z ); double frobNormOfZ = FrobeniusNorm( Z ); oneNormOfError = OneNorm( X ); infNormOfError = InfinityNorm( X ); frobNormOfError = FrobeniusNorm( X ); if( g.Rank() == 0 ) { cout << " ||A||_1 = ||A||_oo = " << infNormOfA << "\n" << " ||A||_F = " << frobNormOfA << "\n" << " ||Z||_1 = " << oneNormOfZ << "\n" << " ||Z||_oo = " << infNormOfZ << "\n" << " ||Z||_F = " << frobNormOfZ << "\n" << " ||A Z - Z W||_1 = " << oneNormOfError << "\n" << " ||A Z - Z W||_oo = " << infNormOfError << "\n" << " ||A Z - Z W||_F = " << frobNormOfError << endl; } }
void TestCorrectness ( bool print, HermitianGenDefiniteEigType eigType, UpperOrLower uplo, const DistMatrix<Complex<double> >& A, const DistMatrix<Complex<double> >& B, const DistMatrix<double,VR,STAR>& w, const DistMatrix<Complex<double> >& X, const DistMatrix<Complex<double> >& AOrig, const DistMatrix<Complex<double> >& BOrig ) { const Grid& g = A.Grid(); const int n = X.Height(); const int k = X.Width(); if( g.Rank() == 0 ) { cout << " Gathering computed eigenvalues..."; cout.flush(); } DistMatrix<double,MR,STAR> w_MR_STAR(true,X.RowAlignment(),g); w_MR_STAR = w; if( g.Rank() == 0 ) cout << "DONE" << endl; if( eigType == AXBX ) { if( g.Rank() == 0 ) cout << " Testing for deviation of AX from BXW..." << endl; // Set Y := BXW, where W is the diagonal eigenvalue matrix DistMatrix<Complex<double> > Y( g ); Y.AlignWith( X ); Zeros( n, k, Y ); Hemm ( LEFT, uplo, Complex<double>(1), BOrig, X, Complex<double>(0), Y ); for( int jLocal=0; jLocal<Y.LocalWidth(); ++jLocal ) { const double omega = w_MR_STAR.GetLocal(jLocal,0); blas::Scal ( 2*Y.LocalHeight(), omega, (double*)Y.Buffer(0,jLocal), 1 ); } // Y := Y - AX = BXW - AX Hemm ( LEFT, uplo, Complex<double>(-1), AOrig, X, Complex<double>(1), Y ); // Find the infinity norms of A, B, X, and AX-BXW double infNormOfA = HermitianInfinityNorm( uplo, AOrig ); double frobNormOfA = HermitianFrobeniusNorm( uplo, AOrig ); double infNormOfB = HermitianInfinityNorm( uplo, BOrig ); double frobNormOfB = HermitianFrobeniusNorm( uplo, BOrig ); double oneNormOfX = OneNorm( X ); double infNormOfX = InfinityNorm( X ); double frobNormOfX = FrobeniusNorm( X ); double oneNormOfError = OneNorm( Y ); double infNormOfError = InfinityNorm( Y ); double frobNormOfError = FrobeniusNorm( Y ); if( g.Rank() == 0 ) { cout << " ||A||_1 = ||A||_oo = " << infNormOfA << "\n" << " ||A||_F = " << frobNormOfA << "\n" << " ||B||_1 = ||B||_oo = " << infNormOfB << "\n" << " ||B||_F = " << frobNormOfB << "\n" << " ||X||_1 = " << oneNormOfX << "\n" << " ||X||_oo = " << infNormOfX << "\n" << " ||X||_F = " << frobNormOfX << "\n" << " ||A X - B X W||_1 = " << oneNormOfError << "\n" << " ||A X - B X W||_oo = " << infNormOfError << "\n" << " ||A X - B X W||_F = " << frobNormOfError << "\n\n" << " Testing orthonormality of eigenvectors w.r.t. B..." << endl; } DistMatrix<Complex<double> > Z(g); Z = X; if( uplo == LOWER ) Trmm( LEFT, LOWER, ADJOINT, NON_UNIT, Complex<double>(1), B, Z ); else Trmm( LEFT, UPPER, NORMAL, NON_UNIT, Complex<double>(1), B, Z ); Identity( k, k, Y ); Herk ( uplo, ADJOINT, Complex<double>(-1), Z, Complex<double>(1), Y ); oneNormOfError = OneNorm( Y ); infNormOfError = InfinityNorm( Y ); frobNormOfError = FrobeniusNorm( Y ); if( g.Rank() == 0 ) cout << " ||X^H B X - I||_1 = " << oneNormOfError << "\n" << " ||X^H B X - I||_oo = " << infNormOfError << "\n" << " ||X^H B X - I||_F = " << frobNormOfError << endl; } else if( eigType == ABX ) { if( g.Rank() == 0 ) cout << " Testing for deviation of ABX from XW..." << endl; // Set Y := BX DistMatrix<Complex<double> > Y( g ); Y.AlignWith( X ); Zeros( n, k, Y ); Hemm ( LEFT, uplo, Complex<double>(1), BOrig, X, Complex<double>(0), Y ); // Set Z := AY = ABX DistMatrix<Complex<double> > Z( n, k, g ); Hemm ( LEFT, uplo, Complex<double>(1), AOrig, Y, Complex<double>(0), Z ); // Set Z := Z - XW = ABX - XW for( int jLocal=0; jLocal<Z.LocalWidth(); ++jLocal ) { const double omega = w_MR_STAR.GetLocal(jLocal,0); for( int iLocal=0; iLocal<Z.LocalHeight(); ++iLocal ) { const Complex<double> chi = X.GetLocal(iLocal,jLocal); const Complex<double> zeta = Z.GetLocal(iLocal,jLocal); Z.SetLocal(iLocal,jLocal,zeta-omega*chi); } } // Find the infinity norms of A, B, X, and ABX-XW double infNormOfA = HermitianInfinityNorm( uplo, AOrig ); double frobNormOfA = HermitianFrobeniusNorm( uplo, AOrig ); double infNormOfB = HermitianInfinityNorm( uplo, BOrig ); double frobNormOfB = HermitianFrobeniusNorm( uplo, BOrig ); double oneNormOfX = OneNorm( X ); double infNormOfX = InfinityNorm( X ); double frobNormOfX = FrobeniusNorm( X ); double oneNormOfError = OneNorm( Z ); double infNormOfError = InfinityNorm( Z ); double frobNormOfError = FrobeniusNorm( Z ); if( g.Rank() == 0 ) { cout << " ||A||_1 = ||A||_oo = " << infNormOfA << "\n" << " ||A||_F = " << frobNormOfA << "\n" << " ||B||_1 = ||B||_oo = " << infNormOfB << "\n" << " ||B||_F = " << frobNormOfB << "\n" << " ||X||_1 = " << oneNormOfX << "\n" << " ||X||_oo = " << infNormOfX << "\n" << " ||X||_F = " << frobNormOfX << "\n" << " ||A B X - X W||_1 = " << oneNormOfError << "\n" << " ||A B X - X W||_oo = " << infNormOfError << "\n" << " ||A B X - X W||_F = " << frobNormOfError << "\n\n" << " Testing orthonormality of eigenvectors w.r.t. B..." << endl; } Z = X; if( uplo == LOWER ) Trmm( LEFT, LOWER, ADJOINT, NON_UNIT, Complex<double>(1), B, Z ); else Trmm( LEFT, UPPER, NORMAL, NON_UNIT, Complex<double>(1), B, Z ); Identity( k, k, Y ); Herk ( uplo, ADJOINT, Complex<double>(-1), Z, Complex<double>(1), Y ); oneNormOfError = OneNorm( Y ); infNormOfError = InfinityNorm( Y ); frobNormOfError = FrobeniusNorm( Y ); if( g.Rank() == 0 ) cout << " ||X^H B X - I||_1 = " << oneNormOfError << "\n" << " ||X^H B X - I||_oo = " << infNormOfError << "\n" << " ||X^H B X - I||_F = " << frobNormOfError << endl; } else /* eigType == BAX */ { if( g.Rank() == 0 ) cout << " Testing for deviation of BAX from XW..." << endl; // Set Y := AX DistMatrix<Complex<double> > Y( g ); Y.AlignWith( X ); Zeros( n, k, Y ); Hemm ( LEFT, uplo, Complex<double>(1), AOrig, X, Complex<double>(0), Y ); // Set Z := BY = BAX DistMatrix<Complex<double> > Z( n, k, g ); Hemm ( LEFT, uplo, Complex<double>(1), BOrig, Y, Complex<double>(0), Z ); // Set Z := Z - XW = BAX-XW for( int jLocal=0; jLocal<Z.LocalWidth(); ++jLocal ) { const double omega = w_MR_STAR.GetLocal(jLocal,0); for( int iLocal=0; iLocal<Z.LocalHeight(); ++iLocal ) { const Complex<double> chi = X.GetLocal(iLocal,jLocal); const Complex<double> zeta = Z.GetLocal(iLocal,jLocal); Z.SetLocal(iLocal,jLocal,zeta-omega*chi); } } // Find the infinity norms of A, B, X, and BAX-XW double infNormOfA = HermitianInfinityNorm( uplo, AOrig ); double frobNormOfA = HermitianFrobeniusNorm( uplo, AOrig ); double infNormOfB = HermitianInfinityNorm( uplo, BOrig ); double frobNormOfB = HermitianFrobeniusNorm( uplo, BOrig ); double oneNormOfX = OneNorm( X ); double infNormOfX = InfinityNorm( X ); double frobNormOfX = FrobeniusNorm( X ); double oneNormOfError = OneNorm( Z ); double infNormOfError = InfinityNorm( Z ); double frobNormOfError = FrobeniusNorm( Z ); if( g.Rank() == 0 ) { cout << " ||A||_1 = ||A||_oo = " << infNormOfA << "\n" << " ||A||_F = " << frobNormOfA << "\n" << " ||B||_1 = ||B||_oo = " << infNormOfB << "\n" << " ||B||_F = " << frobNormOfB << "\n" << " ||X||_1 = " << oneNormOfX << "\n" << " ||X||_oo = " << infNormOfX << "\n" << " ||X||_F = " << frobNormOfX << "\n" << " ||B A X - X W||_1 = " << oneNormOfError << "\n" << " ||B A X - X W||_oo = " << infNormOfError << "\n" << " ||B A X - X W||_F = " << frobNormOfError << "\n\n" << " Testing orthonormality of eigenvectors w.r.t. B^-1..." << endl; } Z = X; if( uplo == LOWER ) Trsm( LEFT, LOWER, NORMAL, NON_UNIT, Complex<double>(1), B, Z ); else Trsm( LEFT, UPPER, ADJOINT, NON_UNIT, Complex<double>(1), B, Z ); Identity( k, k, Y ); Herk ( uplo, ADJOINT, Complex<double>(-1), Z, Complex<double>(1), Y ); oneNormOfError = OneNorm( Y ); infNormOfError = InfinityNorm( Y ); frobNormOfError = FrobeniusNorm( Y ); if( g.Rank() == 0 ) cout << " ||X^H B^-1 X - I||_1 = " << oneNormOfError << "\n" << " ||X^H B^-1 X - I||_oo = " << infNormOfError << "\n" << " ||X^H B^-1 X - I||_F = " << frobNormOfError << endl; } }
inline T DotuHelper( const DistMatrix<T,U,V>& x, const DistMatrix<T,MC,MR>& y ) { #ifndef RELEASE PushCallStack("internal::DotuHelper"); if( x.Grid() != y.Grid() ) throw std::logic_error("{x,y} must be distributed over the same grid"); if( (x.Height() != 1 && x.Width() != 1) || (y.Height() != 1 && y.Width() != 1) ) throw std::logic_error("Dotu requires x and y to be vectors"); int xLength = ( x.Width() == 1 ? x.Height() : x.Width() ); int yLength = ( y.Width() == 1 ? y.Height() : y.Width() ); if( xLength != yLength ) throw std::logic_error("Dotu requires x and y to be the same length"); #endif const Grid& g = x.Grid(); T globalDotu; if( x.Width() == 1 && y.Width() == 1 ) { DistMatrix<T,MC,MR> xRedist(g); xRedist.AlignWith( y ); xRedist = x; int ownerCol = y.RowAlignment(); if( g.Col() == ownerCol ) { T localDotu = Dotu( xRedist.LockedLocalMatrix(), y.LockedLocalMatrix() ); mpi::AllReduce( &localDotu, &globalDotu, 1, mpi::SUM, g.ColComm() ); } mpi::Broadcast( &globalDotu, 1, ownerCol, g.RowComm() ); } else if( x.Width() == 1 ) { DistMatrix<T,MR,MC> xRedist(g); xRedist.AlignWith( y ); xRedist = x; int ownerRow = y.ColAlignment(); if( g.Row() == ownerRow ) { T localDotu = Dotu( xRedist.LockedLocalMatrix(), y.LockedLocalMatrix() ); mpi::AllReduce( &localDotu, &globalDotu, 1, mpi::SUM, g.RowComm() ); } mpi::Broadcast( &globalDotu, 1, ownerRow, g.ColComm() ); } else if( y.Width() == 1 ) { DistMatrix<T,MR,MC> xRedist(g); xRedist.AlignWith( y ); xRedist = x; int ownerCol = y.RowAlignment(); if( g.Col() == ownerCol ) { T localDotu = Dotu( xRedist.LockedLocalMatrix(), y.LockedLocalMatrix() ); mpi::AllReduce( &localDotu, &globalDotu, 1, mpi::SUM, g.ColComm() ); } mpi::Broadcast( &globalDotu, 1, ownerCol, g.RowComm() ); } else { DistMatrix<T,MC,MR> xRedist(g); xRedist.AlignWith( y ); xRedist = x; int ownerRow = y.ColAlignment(); if( g.Row() == ownerRow ) { T localDotu = Dotu( xRedist.LockedLocalMatrix(), y.LockedLocalMatrix() ); mpi::AllReduce( &localDotu, &globalDotu, 1, mpi::SUM, g.RowComm() ); } mpi::Broadcast( &globalDotu, 1, ownerRow, g.ColComm() ); } #ifndef RELEASE PopCallStack(); #endif return globalDotu; }
inline void LocalSymmetricAccumulateLU ( Orientation orientation, T alpha, const DistMatrix<T>& A, const DistMatrix<T,MC, STAR>& B_MC_STAR, const DistMatrix<T,STAR,MR >& BAdjOrTrans_STAR_MR, DistMatrix<T,MC, STAR>& Z_MC_STAR, DistMatrix<T,MR, STAR>& Z_MR_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalSymmetricAccumulateLU"); if( A.Grid() != B_MC_STAR.Grid() || B_MC_STAR.Grid() != BAdjOrTrans_STAR_MR.Grid() || BAdjOrTrans_STAR_MR.Grid() != Z_MC_STAR.Grid() || Z_MC_STAR.Grid() != Z_MR_STAR.Grid() ) throw std::logic_error ("{A,B,Z} must be distributed over the same grid"); if( A.Height() != A.Width() || A.Height() != B_MC_STAR.Height() || A.Height() != BAdjOrTrans_STAR_MR.Width() || A.Height() != Z_MC_STAR.Height() || A.Height() != Z_MR_STAR.Height() || B_MC_STAR.Width() != BAdjOrTrans_STAR_MR.Height() || BAdjOrTrans_STAR_MR.Height() != Z_MC_STAR.Width() || Z_MC_STAR.Width() != Z_MR_STAR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalSymmetricAccumulateLU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B[MC,* ] ~ " << B_MC_STAR.Height() << " x " << B_MC_STAR.Width() << "\n" << " B^H/T[* ,MR] ~ " << BAdjOrTrans_STAR_MR.Height() << " x " << BAdjOrTrans_STAR_MR.Width() << "\n" << " Z[MC,* ] ~ " << Z_MC_STAR.Height() << " x " << Z_MC_STAR.Width() << "\n" << " Z[MR,* ] ` " << Z_MR_STAR.Height() << " x " << Z_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( B_MC_STAR.ColAlignment() != A.ColAlignment() || BAdjOrTrans_STAR_MR.RowAlignment() != A.RowAlignment() || Z_MC_STAR.ColAlignment() != A.ColAlignment() || Z_MR_STAR.ColAlignment() != A.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); DistMatrix<T> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<T> D11(g); DistMatrix<T,MC,STAR> BT_MC_STAR(g), B0_MC_STAR(g), BB_MC_STAR(g), B1_MC_STAR(g), B2_MC_STAR(g); DistMatrix<T,STAR,MR> BLAdjOrTrans_STAR_MR(g), BRAdjOrTrans_STAR_MR(g), B0AdjOrTrans_STAR_MR(g), B1AdjOrTrans_STAR_MR(g), B2AdjOrTrans_STAR_MR(g); DistMatrix<T,MC,STAR> ZT_MC_STAR(g), Z0_MC_STAR(g), ZB_MC_STAR(g), Z1_MC_STAR(g), Z2_MC_STAR(g); DistMatrix<T,MR,STAR> ZT_MR_STAR(g), Z0_MR_STAR(g), ZB_MR_STAR(g), Z1_MR_STAR(g), Z2_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDown ( B_MC_STAR, BT_MC_STAR, BB_MC_STAR, 0 ); LockedPartitionRight ( BAdjOrTrans_STAR_MR, BLAdjOrTrans_STAR_MR, BRAdjOrTrans_STAR_MR, 0 ); PartitionDown ( Z_MC_STAR, ZT_MC_STAR, ZB_MC_STAR, 0 ); PartitionDown ( Z_MR_STAR, ZT_MR_STAR, ZB_MR_STAR, 0 ); while( ATL.Height() < A.Height() ) { LockedRepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDown ( BT_MC_STAR, B0_MC_STAR, /**********/ /**********/ B1_MC_STAR, BB_MC_STAR, B2_MC_STAR ); LockedRepartitionRight ( BLAdjOrTrans_STAR_MR, /**/ BRAdjOrTrans_STAR_MR, B0AdjOrTrans_STAR_MR, /**/ B1AdjOrTrans_STAR_MR, B2AdjOrTrans_STAR_MR ); RepartitionDown ( ZT_MC_STAR, Z0_MC_STAR, /**********/ /**********/ Z1_MC_STAR, ZB_MC_STAR, Z2_MC_STAR ); RepartitionDown ( ZT_MR_STAR, Z0_MR_STAR, /**********/ /**********/ Z1_MR_STAR, ZB_MR_STAR, Z2_MR_STAR ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// D11 = A11; MakeTrapezoidal( LEFT, UPPER, 0, D11 ); LocalGemm ( NORMAL, orientation, alpha, D11, B1AdjOrTrans_STAR_MR, T(1), Z1_MC_STAR ); MakeTrapezoidal( LEFT, UPPER, 1, D11 ); LocalGemm ( orientation, NORMAL, alpha, D11, B1_MC_STAR, T(1), Z1_MR_STAR ); LocalGemm ( NORMAL, orientation, alpha, A12, B2AdjOrTrans_STAR_MR, T(1), Z1_MC_STAR ); LocalGemm ( orientation, NORMAL, alpha, A12, B1_MC_STAR, T(1), Z2_MR_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDown ( BT_MC_STAR, B0_MC_STAR, B1_MC_STAR, /**********/ /**********/ BB_MC_STAR, B2_MC_STAR ); SlideLockedPartitionRight ( BLAdjOrTrans_STAR_MR, /**/ BRAdjOrTrans_STAR_MR, B0AdjOrTrans_STAR_MR, B1AdjOrTrans_STAR_MR, /**/ B2AdjOrTrans_STAR_MR ); SlidePartitionDown ( ZT_MC_STAR, Z0_MC_STAR, Z1_MC_STAR, /**********/ /**********/ ZB_MC_STAR, Z2_MC_STAR ); SlidePartitionDown ( ZT_MR_STAR, Z0_MR_STAR, Z1_MR_STAR, /**********/ /**********/ ZB_MR_STAR, Z2_MR_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
const DistMatrix<T,STAR,STAR>& DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,MR,MC>& A ) { #ifndef RELEASE CallStackEntry entry("[* ,* ] = [MR,MC]"); this->AssertNotLocked(); this->AssertSameGrid( A.Grid() ); #endif const elem::Grid& g = this->Grid(); this->ResizeTo( A.Height(), A.Width() ); if( !this->Participating() ) return *this; const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); const Int height = this->Height(); const Int width = this->Width(); const Int localHeightOfA = A.LocalHeight(); const Int localWidthOfA = A.LocalWidth(); const Int maxLocalHeight = MaxLength(height,c); const Int maxLocalWidth = MaxLength(width,r); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); T* buffer = this->auxMemory_.Require( (p+1)*portionSize ); T* sendBuf = &buffer[0]; T* recvBuf = &buffer[portionSize]; // Pack const Int ALDim = A.LDim(); const T* ABuf = A.LockedBuffer(); PARALLEL_FOR for( Int jLoc=0; jLoc<localWidthOfA; ++jLoc ) MemCopy ( &sendBuf[jLoc*localHeightOfA], &ABuf[jLoc*ALDim], localHeightOfA ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, g.VRComm() ); // Unpack T* thisBuf = this->Buffer(); const Int thisLDim = this->LDim(); const Int colAlignmentOfA = A.ColAlignment(); const Int rowAlignmentOfA = A.RowAlignment(); OUTER_PARALLEL_FOR for( Int l=0; l<r; ++l ) { const Int rowShift = Shift_( l, rowAlignmentOfA, r ); const Int localWidth = Length_( width, rowShift, r ); for( Int k=0; k<c; ++k ) { const T* data = &recvBuf[(k+l*c)*portionSize]; const Int colShift = Shift_( k, colAlignmentOfA, c ); const Int localHeight = Length_( height, colShift, c ); INNER_PARALLEL_FOR for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { T* destCol = &thisBuf[colShift+(rowShift+jLoc*r)*thisLDim]; const T* sourceCol = &data[jLoc*localHeight]; for( Int iLoc=0; iLoc<localHeight; ++iLoc ) destCol[iLoc*c] = sourceCol[iLoc]; } } } this->auxMemory_.Release(); return *this; }
inline void ApplyColumnPivots ( DistMatrix<F>& A, const std::vector<int>& image, const std::vector<int>& preimage ) { const int b = image.size(); #ifndef RELEASE PushCallStack("ApplyColumnPivots"); if( A.Width() < b || b != preimage.size() ) throw std::logic_error ("image and preimage must be vectors of equal length that are not " "wider than A."); #endif const int localHeight = A.LocalHeight(); if( A.Height() == 0 || A.Width() == 0 ) { #ifndef RELEASE PopCallStack(); #endif return; } // Extract the relevant process grid information const Grid& g = A.Grid(); const int c = g.Width(); const int rowAlignment = A.RowAlignment(); const int rowShift = A.RowShift(); const int myCol = g.Col(); // Extract the send and recv counts from the image and preimage. // This process's sends may be logically partitioned into two sets: // (a) sends from rows [0,...,b-1] // (b) sends from rows [b,...] // The latter is analyzed with image, the former deduced with preimage. std::vector<int> sendCounts(c,0), recvCounts(c,0); for( int j=rowShift; j<b; j+=c ) { const int sendCol = preimage[j]; const int sendTo = (rowAlignment+sendCol) % c; sendCounts[sendTo] += localHeight; const int recvCol = image[j]; const int recvFrom = (rowAlignment+recvCol) % c; recvCounts[recvFrom] += localHeight; } for( int j=0; j<b; ++j ) { const int sendCol = preimage[j]; if( sendCol >= b ) { const int sendTo = (rowAlignment+sendCol) % c; if( sendTo == myCol ) { const int sendFrom = (rowAlignment+j) % c; recvCounts[sendFrom] += localHeight; } } const int recvCol = image[j]; if( recvCol >= b ) { const int recvFrom = (rowAlignment+recvCol) % c; if( recvFrom == myCol ) { const int recvTo = (rowAlignment+j) % c; sendCounts[recvTo] += localHeight; } } } // Construct the send and recv displacements from the counts std::vector<int> sendDispls(c), recvDispls(c); int totalSend=0, totalRecv=0; for( int i=0; i<c; ++i ) { sendDispls[i] = totalSend; recvDispls[i] = totalRecv; totalSend += sendCounts[i]; totalRecv += recvCounts[i]; } #ifndef RELEASE if( totalSend != totalRecv ) { std::ostringstream msg; msg << "Send and recv counts do not match: (send,recv)=" << totalSend << "," << totalRecv; throw std::logic_error( msg.str().c_str() ); } #endif // Fill vectors with the send data std::vector<F> sendData(std::max(1,totalSend)); std::vector<int> offsets(c,0); const int localWidth = LocalLength( b, rowShift, c ); for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int sendCol = preimage[rowShift+jLocal*c]; const int sendTo = (rowAlignment+sendCol) % c; const int offset = sendDispls[sendTo]+offsets[sendTo]; MemCopy( &sendData[offset], A.LocalBuffer(0,jLocal), localHeight ); offsets[sendTo] += localHeight; } for( int j=0; j<b; ++j ) { const int recvCol = image[j]; if( recvCol >= b ) { const int recvFrom = (rowAlignment+recvCol) % c; if( recvFrom == myCol ) { const int recvTo = (rowAlignment+j) % c; const int jLocal = (recvCol-rowShift) / c; const int offset = sendDispls[recvTo]+offsets[recvTo]; MemCopy ( &sendData[offset], A.LocalBuffer(0,jLocal), localHeight ); offsets[recvTo] += localHeight; } } } // Communicate all pivot rows std::vector<F> recvData(std::max(1,totalRecv)); mpi::AllToAll ( &sendData[0], &sendCounts[0], &sendDispls[0], &recvData[0], &recvCounts[0], &recvDispls[0], g.RowComm() ); // Unpack the recv data for( int k=0; k<c; ++k ) { offsets[k] = 0; int thisRowShift = Shift( k, rowAlignment, c ); for( int j=thisRowShift; j<b; j+=c ) { const int sendCol = preimage[j]; const int sendTo = (rowAlignment+sendCol) % c; if( sendTo == myCol ) { const int offset = recvDispls[k]+offsets[k]; const int jLocal = (sendCol-rowShift) / c; MemCopy ( A.LocalBuffer(0,jLocal), &recvData[offset], localHeight ); offsets[k] += localHeight; } } } for( int j=0; j<b; ++j ) { const int recvCol = image[j]; if( recvCol >= b ) { const int recvTo = (rowAlignment+j) % c; if( recvTo == myCol ) { const int recvFrom = (rowAlignment+recvCol) % c; const int jLocal = (j-rowShift) / c; const int offset = recvDispls[recvFrom]+offsets[recvFrom]; MemCopy ( A.LocalBuffer(0,jLocal), &recvData[offset], localHeight ); offsets[recvFrom] += localHeight; } } } #ifndef RELEASE PopCallStack(); #endif }
inline void internal::LocalTrmmAccumulateLUN ( Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T,MC, MR >& U, const DistMatrix<T,STAR,MR >& XAdjOrTrans_STAR_MR, DistMatrix<T,MC, STAR>& Z_MC_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalTrmmAccumulateLUN"); if( U.Grid() != XAdjOrTrans_STAR_MR.Grid() || XAdjOrTrans_STAR_MR.Grid() != Z_MC_STAR.Grid() ) throw std::logic_error ("{U,X,Z} must be distributed over the same grid"); if( U.Height() != U.Width() || U.Height() != XAdjOrTrans_STAR_MR.Width() || U.Height() != Z_MC_STAR.Height() || XAdjOrTrans_STAR_MR.Height() != Z_MC_STAR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalTrmmAccumulateLUN: \n" << " U ~ " << U.Height() << " x " << U.Width() << "\n" << " X^H/T[* ,MR] ~ " << XAdjOrTrans_STAR_MR.Height() << " x " << XAdjOrTrans_STAR_MR.Width() << "\n" << " Z[MC,* ] ~ " << Z_MC_STAR.Height() << " x " << Z_MC_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( XAdjOrTrans_STAR_MR.RowAlignment() != U.RowAlignment() || Z_MC_STAR.ColAlignment() != U.ColAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<T,MC,MR> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<T,MC,MR> D11(g); DistMatrix<T,STAR,MR> XLAdjOrTrans_STAR_MR(g), XRAdjOrTrans_STAR_MR(g), X0AdjOrTrans_STAR_MR(g), X1AdjOrTrans_STAR_MR(g), X2AdjOrTrans_STAR_MR(g); DistMatrix<T,MC,STAR> ZT_MC_STAR(g), Z0_MC_STAR(g), ZB_MC_STAR(g), Z1_MC_STAR(g), Z2_MC_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); LockedPartitionRight ( XAdjOrTrans_STAR_MR, XLAdjOrTrans_STAR_MR, XRAdjOrTrans_STAR_MR, 0 ); PartitionDown ( Z_MC_STAR, ZT_MC_STAR, ZB_MC_STAR, 0 ); while( UTL.Height() < U.Height() ) { LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); LockedRepartitionRight ( XLAdjOrTrans_STAR_MR, /**/ XRAdjOrTrans_STAR_MR, X0AdjOrTrans_STAR_MR, /**/ X1AdjOrTrans_STAR_MR, X2AdjOrTrans_STAR_MR ); RepartitionDown ( ZT_MC_STAR, Z0_MC_STAR, /**********/ /**********/ Z1_MC_STAR, ZB_MC_STAR, Z2_MC_STAR ); D11.AlignWith( U11 ); //--------------------------------------------------------------------// D11 = U11; MakeTrapezoidal( LEFT, UPPER, 0, D11 ); if( diag == UNIT ) SetDiagonalToOne( D11 ); internal::LocalGemm ( NORMAL, orientation, alpha, D11, X1AdjOrTrans_STAR_MR, (T)1, Z1_MC_STAR ); internal::LocalGemm ( NORMAL, orientation, alpha, U01, X1AdjOrTrans_STAR_MR, (T)1, Z0_MC_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); SlideLockedPartitionRight ( XLAdjOrTrans_STAR_MR, /**/ XRAdjOrTrans_STAR_MR, X0AdjOrTrans_STAR_MR, X1AdjOrTrans_STAR_MR, /**/ X2AdjOrTrans_STAR_MR ); SlidePartitionDown ( ZT_MC_STAR, Z0_MC_STAR, Z1_MC_STAR, /**********/ /**********/ ZB_MC_STAR, Z2_MC_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void LocalSymvRowAccumulateU ( T alpha, const DistMatrix<T>& A, const DistMatrix<T,STAR,MC>& x_STAR_MC, const DistMatrix<T,STAR,MR>& x_STAR_MR, DistMatrix<T,STAR,MC>& z_STAR_MC, DistMatrix<T,STAR,MR>& z_STAR_MR ) { #ifndef RELEASE PushCallStack("internal::LocalSymvRowAccumulateU"); if( A.Grid() != x_STAR_MC.Grid() || x_STAR_MC.Grid() != x_STAR_MR.Grid() || x_STAR_MR.Grid() != z_STAR_MC.Grid() || z_STAR_MC.Grid() != z_STAR_MR.Grid() ) throw std::logic_error ("{A,x,z} must be distributed over the same grid"); if( x_STAR_MC.Height() != 1 || x_STAR_MR.Height() != 1 || z_STAR_MC.Height() != 1 || z_STAR_MR.Height() != 1 ) throw std::logic_error("Expected x and z to be row vectors"); if( A.Height() != A.Width() || A.Height() != x_STAR_MC.Width() || A.Height() != x_STAR_MR.Width() || A.Height() != z_STAR_MC.Width() || A.Height() != z_STAR_MR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalSymvRowAccumulateU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " x[* ,MC] ~ " << x_STAR_MC.Height() << " x " << x_STAR_MC.Width() << "\n" << " x[* ,MR] ~ " << x_STAR_MR.Height() << " x " << x_STAR_MR.Width() << "\n" << " z[* ,MC] ~ " << z_STAR_MC.Height() << " x " << z_STAR_MC.Width() << "\n" << " z[* ,MR] ~ " << z_STAR_MR.Height() << " x " << z_STAR_MR.Width() << "\n"; throw std::logic_error( msg.str() ); } if( x_STAR_MC.RowAlignment() != A.ColAlignment() || x_STAR_MR.RowAlignment() != A.RowAlignment() || z_STAR_MC.RowAlignment() != A.ColAlignment() || z_STAR_MR.RowAlignment() != A.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> A11(g), A12(g); DistMatrix<T> D11(g); DistMatrix<T,STAR,MC> x1_STAR_MC(g); DistMatrix<T,STAR,MR> xL_STAR_MR(g), xR_STAR_MR(g), x0_STAR_MR(g), x1_STAR_MR(g), x2_STAR_MR(g); DistMatrix<T,STAR,MC> z1_STAR_MC(g); DistMatrix<T,STAR,MR> z1_STAR_MR(g), z2_STAR_MR(g); // We want our local gemvs to be of width blocksize, so we will // temporarily change to max(r,c) times the current blocksize const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*LocalSymvBlocksize<T>() ); LockedPartitionRight( x_STAR_MR, xL_STAR_MR, xR_STAR_MR, 0 ); while( xL_STAR_MR.Width() < x_STAR_MR.Width() ) { LockedRepartitionRight ( xL_STAR_MR, /**/ xR_STAR_MR, x0_STAR_MR, /**/ x1_STAR_MR, x2_STAR_MR ); const int n0 = x0_STAR_MR.Width(); const int n1 = x1_STAR_MR.Width(); const int n2 = x2_STAR_MR.Width(); LockedView( A11, A, n0, n0, n1, n1 ); LockedView( A12, A, n0, n0+n1, n1, n2 ); LockedView( x1_STAR_MC, x_STAR_MC, 0, n0, 1, n1 ); View( z1_STAR_MC, z_STAR_MC, 0, n0, 1, n1 ); View( z1_STAR_MR, z_STAR_MR, 0, n0, 1, n1 ); View( z2_STAR_MR, z_STAR_MR, 0, n0+n1, 1, n2 ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// // TODO: These diagonal block updates can be greatly improved D11 = A11; MakeTrapezoidal( LEFT, UPPER, 0, D11 ); Gemv ( NORMAL, alpha, D11.LockedLocalMatrix(), x1_STAR_MR.LockedLocalMatrix(), T(1), z1_STAR_MC.LocalMatrix() ); MakeTrapezoidal( LEFT, UPPER, 1, D11 ); Gemv ( TRANSPOSE, alpha, D11.LockedLocalMatrix(), x1_STAR_MC.LockedLocalMatrix(), T(1), z1_STAR_MR.LocalMatrix() ); Gemv ( NORMAL, alpha, A12.LockedLocalMatrix(), x2_STAR_MR.LockedLocalMatrix(), T(1), z1_STAR_MC.LocalMatrix() ); Gemv ( TRANSPOSE, alpha, A12.LockedLocalMatrix(), x1_STAR_MC.LockedLocalMatrix(), T(1), z2_STAR_MR.LocalMatrix() ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionRight ( xL_STAR_MR, /**/ xR_STAR_MR, x0_STAR_MR, x1_STAR_MR, /**/ x2_STAR_MR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmLLTSmall ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F,STAR,VR>& L, DistMatrix<F,VR,STAR>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLLTSmall"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( orientation == NORMAL ) throw std::logic_error("TrsmLLT expects a (Conjugate)Transpose option"); if( L.Height() != L.Width() || L.Height() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrsmLLT: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( L.RowAlignment() != X.ColAlignment() ) throw std::logic_error("L and X must be aligned"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<F,STAR,VR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F,VR,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,STAR> X1_STAR_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; // L11[* ,* ] <- L11[* ,VR] X1_STAR_STAR = X1; // X1[* ,* ] <- X1[VR,* ] // X1[* ,* ] := L11^-[T/H][* ,* ] X1[* ,* ] LocalTrsm ( LEFT, LOWER, orientation, diag, F(1), L11_STAR_STAR, X1_STAR_STAR, checkIfSingular ); X1 = X1_STAR_STAR; // X0[VR,* ] -= L10[* ,VR]^(T/H) X1[* ,* ] LocalGemm( orientation, NORMAL, F(-1), L10, X1_STAR_STAR, F(1), X0 ); //--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline R Row( DistMatrix<R>& chi, DistMatrix<R>& x ) { #ifndef RELEASE PushCallStack("reflector::Row"); if( chi.Grid() != x.Grid() ) throw std::logic_error ("chi and x must be distributed over the same grid"); if( chi.Height() != 1 || chi.Width() != 1 ) throw std::logic_error("chi must be a scalar"); if( x.Height() != 1 ) throw std::logic_error("x must be a row vector"); if( chi.Grid().Row() != chi.ColAlignment() ) throw std::logic_error("Reflecting with incorrect row of processes"); if( x.Grid().Row() != x.ColAlignment() ) throw std::logic_error("Reflecting with incorrect row of processes"); #endif const Grid& grid = x.Grid(); mpi::Comm rowComm = grid.RowComm(); const int gridCol = grid.Col(); const int gridWidth = grid.Width(); const int rowAlignment = chi.RowAlignment(); std::vector<R> localNorms(gridWidth); R localNorm = Nrm2( x.LockedMatrix() ); mpi::AllGather( &localNorm, 1, &localNorms[0], 1, rowComm ); R norm = blas::Nrm2( gridWidth, &localNorms[0], 1 ); if( norm == 0 ) { if( gridCol == rowAlignment ) chi.SetLocal(0,0,-chi.GetLocal(0,0)); #ifndef RELEASE PopCallStack(); #endif return R(2); } R alpha; if( gridCol == rowAlignment ) alpha = chi.GetLocal(0,0); mpi::Broadcast( &alpha, 1, rowAlignment, rowComm ); R beta; if( alpha <= 0 ) beta = lapack::SafeNorm( alpha, norm ); else beta = -lapack::SafeNorm( alpha, norm ); const R one = 1; const R safeMin = lapack::MachineSafeMin<R>(); const R epsilon = lapack::MachineEpsilon<R>(); const R safeInv = safeMin/epsilon; int count = 0; if( Abs(beta) < safeInv ) { R invOfSafeInv = one/safeInv; do { ++count; Scale( invOfSafeInv, x ); alpha *= invOfSafeInv; beta *= invOfSafeInv; } while( Abs(beta) < safeInv ); localNorm = Nrm2( x.LockedMatrix() ); mpi::AllGather( &localNorm, 1, &localNorms[0], 1, rowComm ); norm = blas::Nrm2( gridWidth, &localNorms[0], 1 ); if( alpha <= 0 ) beta = lapack::SafeNorm( alpha, norm ); else beta = -lapack::SafeNorm( alpha, norm ); } R tau = (beta-alpha)/beta; Scale( one/(alpha-beta), x ); for( int j=0; j<count; ++j ) beta *= safeInv; if( gridCol == rowAlignment ) chi.SetLocal(0,0,beta); #ifndef RELEASE PopCallStack(); #endif return tau; }
inline void LocalTrmmAccumulateLLT ( Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T>& L, const DistMatrix<T,MC,STAR>& X_MC_STAR, DistMatrix<T,MR,STAR>& Z_MR_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalTrmmAccumulateLLT"); if( L.Grid() != X_MC_STAR.Grid() || X_MC_STAR.Grid() != Z_MR_STAR.Grid() ) throw std::logic_error ("{L,X,Z} must be distributed over the same grid"); if( L.Height() != L.Width() || L.Height() != X_MC_STAR.Height() || L.Height() != Z_MR_STAR.Height() ) { std::ostringstream msg; msg << "Nonconformal LocalTrmmAccumulateLLT: " << "\n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X[MC,* ] ~ " << X_MC_STAR.Height() << " x " << X_MC_STAR.Width() << "\n" << " Z[MR,* ] ` " << Z_MR_STAR.Height() << " x " << Z_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( X_MC_STAR.ColAlignment() != L.ColAlignment() || Z_MR_STAR.ColAlignment() != L.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<T> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<T> D11(g); DistMatrix<T,MC,STAR> XT_MC_STAR(g), X0_MC_STAR(g), XB_MC_STAR(g), X1_MC_STAR(g), X2_MC_STAR(g); DistMatrix<T,MR,STAR> ZT_MR_STAR(g), Z0_MR_STAR(g), ZB_MR_STAR(g), Z1_MR_STAR(g), Z2_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); LockedPartitionDown ( X_MC_STAR, XT_MC_STAR, XB_MC_STAR, 0 ); PartitionDown ( Z_MR_STAR, ZT_MR_STAR, ZB_MR_STAR, 0 ); while( LTL.Height() < L.Height() ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); LockedRepartitionDown ( XT_MC_STAR, X0_MC_STAR, /**********/ /**********/ X1_MC_STAR, XB_MC_STAR, X2_MC_STAR ); RepartitionDown ( ZT_MR_STAR, Z0_MR_STAR, /**********/ /**********/ Z1_MR_STAR, ZB_MR_STAR, Z2_MR_STAR ); D11.AlignWith( L11 ); //--------------------------------------------------------------------// D11 = L11; MakeTrapezoidal( LEFT, LOWER, 0, D11 ); if( diag == UNIT ) SetDiagonalToOne( D11 ); LocalGemm ( orientation, NORMAL, alpha, D11, X1_MC_STAR, T(1), Z1_MR_STAR ); LocalGemm ( orientation, NORMAL, alpha, L21, X2_MC_STAR, T(1), Z1_MR_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlideLockedPartitionDown ( XT_MC_STAR, X0_MC_STAR, X1_MC_STAR, /**********/ /**********/ XB_MC_STAR, X2_MC_STAR ); SlidePartitionDown ( ZT_MR_STAR, Z0_MR_STAR, Z1_MR_STAR, /**********/ /**********/ ZB_MR_STAR, Z2_MR_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }
inline void LocalSymmetricAccumulateRU ( Orientation orientation, T alpha, const DistMatrix<T,MC, MR >& A, const DistMatrix<T,STAR,MC >& B_STAR_MC, const DistMatrix<T,MR, STAR>& BTrans_MR_STAR, DistMatrix<T,MC, STAR>& ZTrans_MC_STAR, DistMatrix<T,MR, STAR>& ZTrans_MR_STAR ) { #ifndef RELEASE PushCallStack("internal::LocalSymmetricAccumulateRU"); if( A.Grid() != B_STAR_MC.Grid() || B_STAR_MC.Grid() != BTrans_MR_STAR.Grid() || BTrans_MR_STAR.Grid() != ZTrans_MC_STAR.Grid() || ZTrans_MC_STAR.Grid() != ZTrans_MR_STAR.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Height() != A.Width() || A.Height() != B_STAR_MC.Width() || A.Height() != BTrans_MR_STAR.Height() || A.Height() != ZTrans_MC_STAR.Height() || A.Height() != ZTrans_MR_STAR.Height() || B_STAR_MC.Height() != BTrans_MR_STAR.Width() || BTrans_MR_STAR.Width() != ZTrans_MC_STAR.Width() || ZTrans_MC_STAR.Width() != ZTrans_MR_STAR.Width() ) { std::ostringstream msg; msg << "Nonconformal LocalSymmetricAccumulateRU: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B[* ,MC] ~ " << B_STAR_MC.Height() << " x " << B_STAR_MC.Width() << "\n" << " B^H/T[MR,* ] ~ " << BTrans_MR_STAR.Height() << " x " << BTrans_MR_STAR.Width() << "\n" << " Z^H/T[MC,* ] ~ " << ZTrans_MC_STAR.Height() << " x " << ZTrans_MC_STAR.Width() << "\n" << " Z^H/T[MR,* ] ~ " << ZTrans_MR_STAR.Height() << " x " << ZTrans_MR_STAR.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( B_STAR_MC.RowAlignment() != A.ColAlignment() || BTrans_MR_STAR.ColAlignment() != A.RowAlignment() || ZTrans_MC_STAR.ColAlignment() != A.ColAlignment() || ZTrans_MR_STAR.ColAlignment() != A.RowAlignment() ) throw std::logic_error("Partial matrix distributions are misaligned"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<T> D11(g); DistMatrix<T,STAR,MC> BL_STAR_MC(g), BR_STAR_MC(g), B0_STAR_MC(g), B1_STAR_MC(g), B2_STAR_MC(g); DistMatrix<T,MR,STAR> BTTrans_MR_STAR(g), B0Trans_MR_STAR(g), BBTrans_MR_STAR(g), B1Trans_MR_STAR(g), B2Trans_MR_STAR(g); DistMatrix<T,MC,STAR> ZTTrans_MC_STAR(g), Z0Trans_MC_STAR(g), ZBTrans_MC_STAR(g), Z1Trans_MC_STAR(g), Z2Trans_MC_STAR(g); DistMatrix<T,MR,STAR> ZBTrans_MR_STAR(g), Z0Trans_MR_STAR(g), ZTTrans_MR_STAR(g), Z1Trans_MR_STAR(g), Z2Trans_MR_STAR(g); const int ratio = std::max( g.Height(), g.Width() ); PushBlocksizeStack( ratio*Blocksize() ); LockedPartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionRight( B_STAR_MC, BL_STAR_MC, BR_STAR_MC, 0 ); LockedPartitionDown ( BTrans_MR_STAR, BTTrans_MR_STAR, BBTrans_MR_STAR, 0 ); PartitionDown ( ZTrans_MC_STAR, ZTTrans_MC_STAR, ZBTrans_MC_STAR, 0 ); PartitionDown ( ZTrans_MR_STAR, ZTTrans_MR_STAR, ZBTrans_MR_STAR, 0 ); while( ATL.Height() < A.Height() ) { LockedRepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionRight ( BL_STAR_MC, /**/ BR_STAR_MC, B0_STAR_MC, /**/ B1_STAR_MC, B2_STAR_MC ); LockedRepartitionDown ( BTTrans_MR_STAR, B0Trans_MR_STAR, /***************/ /***************/ B1Trans_MR_STAR, BBTrans_MR_STAR, B2Trans_MR_STAR ); RepartitionDown ( ZTTrans_MC_STAR, Z0Trans_MC_STAR, /***************/ /***************/ Z1Trans_MC_STAR, ZBTrans_MC_STAR, Z2Trans_MC_STAR ); RepartitionDown ( ZTTrans_MR_STAR, Z0Trans_MR_STAR, /***************/ /***************/ Z1Trans_MR_STAR, ZBTrans_MR_STAR, Z2Trans_MR_STAR ); D11.AlignWith( A11 ); //--------------------------------------------------------------------// D11 = A11; MakeTriangular( UPPER, D11 ); LocalGemm ( orientation, orientation, alpha, D11, B1_STAR_MC, T(1), Z1Trans_MR_STAR ); SetDiagonal( D11, T(0) ); LocalGemm ( NORMAL, NORMAL, alpha, D11, B1Trans_MR_STAR, T(1), Z1Trans_MC_STAR ); LocalGemm ( orientation, orientation, alpha, A12, B1_STAR_MC, T(1), Z2Trans_MR_STAR ); LocalGemm ( NORMAL, NORMAL, alpha, A12, B2Trans_MR_STAR, T(1), Z1Trans_MC_STAR ); //--------------------------------------------------------------------// D11.FreeAlignments(); SlideLockedPartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionRight ( BL_STAR_MC, /**/ BR_STAR_MC, B0_STAR_MC, B1_STAR_MC, /**/ B2_STAR_MC ); SlideLockedPartitionDown ( BTTrans_MR_STAR, B0Trans_MR_STAR, B1Trans_MR_STAR, /***************/ /***************/ BBTrans_MR_STAR, B2Trans_MR_STAR ); SlidePartitionDown ( ZTTrans_MC_STAR, Z0Trans_MC_STAR, Z1Trans_MC_STAR, /***************/ /***************/ ZBTrans_MC_STAR, Z2Trans_MC_STAR ); SlidePartitionDown ( ZTTrans_MR_STAR, Z0Trans_MR_STAR, Z1Trans_MR_STAR, /***************/ /***************/ ZBTrans_MR_STAR, Z2Trans_MR_STAR ); } PopBlocksizeStack(); #ifndef RELEASE PopCallStack(); #endif }