// Resizes J to n x n and delegates the fill to MakeForsythe, forwarding
// alpha and lambda unchanged.
inline void
Forsythe( DistMatrix<T,U,V>& J, Int n, T alpha, T lambda )
{
    DEBUG_ONLY(CallStackEntry cse("Forsythe"))
    // The storage must have its final shape before the entries are written.
    J.Resize( n, n );
    MakeForsythe( J, alpha, lambda );
}
// Resizes I to m x n and lets MakeIdentity overwrite its contents.
inline void
Identity( DistMatrix<T,U,V>& I, Int m, Int n )
{
    DEBUG_ONLY(CallStackEntry cse("Identity"))
    // Shape first, contents second.
    I.Resize( m, n );
    MakeIdentity( I );
}
// Reads a header-less ("flat") binary file of height*width entries of type T
// into the [CIRC,CIRC] matrix A.
//
// The file size is validated against height*width*sizeof(T) before A is
// resized; only the process whose cross rank equals A.Root() reads the data.
// Columns are read one at a time whenever the local leading dimension differs
// from the height, so padding between columns is never overwritten.
//
// Raises a RuntimeError if the file cannot be opened or has an unexpected
// size.
inline void
BinaryFlat
( DistMatrix<T,CIRC,CIRC>& A, Int height, Int width,
  const std::string filename )
{
    // The label now matches the routine name, consistently with the other
    // BinaryFlat overloads (it previously read "read::Binary").
    DEBUG_ONLY(CallStackEntry cse("read::BinaryFlat"))
    std::ifstream file( filename.c_str(), std::ios::binary );
    if( !file.is_open() )
        RuntimeError("Could not open ",filename);

    const Int numBytes = FileSize( file );
    const Int numBytesExp = height*width*sizeof(T);
    if( numBytes != numBytesExp )
        RuntimeError
        ("Expected file to be ",numBytesExp," bytes but found ",numBytes);

    A.Resize( height, width );
    if( A.CrossRank() == A.Root() )
    {
        if( A.Height() == A.LDim() )
        {
            // Contiguous local storage: a single bulk read suffices.
            file.read( (char*)A.Buffer(), height*width*sizeof(T) );
        }
        else
        {
            // Padded columns: read each column into its own slot.
            for( Int j=0; j<width; ++j )
                file.read( (char*)A.Buffer(0,j), height*sizeof(T) );
        }
    }
}
void Scatter ( const DistMatrix<T,CIRC,CIRC>& A, DistMatrix<T,STAR,STAR>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.Resize( height, width ); if( B.Participating() ) { const Int pkgSize = mpi::Pad( height*width ); vector<T> buffer; FastResize( buffer, pkgSize ); // Pack if( A.Participating() ) util::InterleaveMatrix ( height, width, A.LockedBuffer(), 1, A.LDim(), buffer.data(), 1, height ); // Broadcast from the process that packed mpi::Broadcast( buffer.data(), pkgSize, A.Root(), A.CrossComm() ); // Unpack util::InterleaveMatrix ( height, width, buffer.data(), 1, height, B.Buffer(), 1, B.LDim() ); } }
inline void BinaryFlat ( DistMatrix<T,STAR,V>& A, Int height, Int width, const std::string filename ) { DEBUG_ONLY(CallStackEntry cse("read::BinaryFlat")) std::ifstream file( filename.c_str(), std::ios::binary ); if( !file.is_open() ) RuntimeError("Could not open ",filename); const Int numBytes = FileSize( file ); const Int numBytesExp = height*width*sizeof(T); if( numBytes != numBytesExp ) RuntimeError ("Expected file to be ",numBytesExp," bytes but found ",numBytes); A.Resize( height, width ); const Int localWidth = A.LocalWidth(); const Int rowShift = A.RowShift(); const Int rowStride = A.RowStride(); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*rowStride; const Int localIndex = j*height; const std::streamoff pos = localIndex*sizeof(T); file.seekg( pos ); file.read( (char*)A.Buffer(0,jLoc), height*sizeof(T) ); } }
// Reads a binary matrix file into the [STAR,STAR] matrix A.
//
// The expected layout is a header of two raw `Int` values (height, then
// width) followed by height*width entries of type T in column-major order;
// this is what the `2*sizeof(Int)` metadata size below describes.
//
// The header is read with unformatted `read` calls. Formatted extraction
// (`file >> height`) parses ASCII digits and therefore cannot decode a raw
// binary header; it also leaves the stream position somewhere other than
// byte `2*sizeof(Int)`, which the subsequent payload reads rely upon.
//
// Raises a RuntimeError if the file cannot be opened, the header cannot be
// read, or the file size does not match the header.
inline void
Binary( DistMatrix<T,STAR,STAR>& A, const std::string filename )
{
    DEBUG_ONLY(CallStackEntry cse("read::Binary"))
    std::ifstream file( filename.c_str(), std::ios::binary );
    if( !file.is_open() )
        RuntimeError("Could not open ",filename);

    Int height, width;
    file.read( (char*)&height, sizeof(Int) );
    file.read( (char*)&width,  sizeof(Int) );
    if( !file )
        RuntimeError("Could not read the matrix dimensions from ",filename);

    const Int numBytes = FileSize( file );
    const Int metaBytes = 2*sizeof(Int);
    const Int dataBytes = height*width*sizeof(T);
    const Int numBytesExp = metaBytes + dataBytes;
    if( numBytes != numBytesExp )
        RuntimeError
        ("Expected file to be ",numBytesExp," bytes but found ",numBytes);

    A.Resize( height, width );
    if( A.Height() == A.LDim() )
    {
        // Contiguous local storage: a single bulk read suffices.
        file.read( (char*)A.Buffer(), height*width*sizeof(T) );
    }
    else
    {
        // Padded columns: read each column into its own slot.
        for( Int j=0; j<width; ++j )
            file.read( (char*)A.Buffer(0,j), height*sizeof(T) );
    }
}
void ConsistentlyComputeDecomposition ( DistMatrix<Field,MC,MR,BLOCK>& H, DistMatrix<Complex<Base<Field>>,STAR,STAR>& w, Matrix<Field>& Z, const HessenbergSchurCtrl& ctrl=HessenbergSchurCtrl() ) { EL_DEBUG_CSE // Because double-precision floating-point computation is often // non-deterministic due to extra-precision computation being frequent but // not guaranteed, we must be careful to not allow this non-determinism to // be amplified by the forward instability of Francis sweeps. const Grid& grid = H.Grid(); const int owner = H.Owner(0,0); DistMatrix<Field,CIRC,CIRC> H_CIRC_CIRC( grid, owner ); H_CIRC_CIRC = H; w.Resize( H.Height(), 1 ); if( H_CIRC_CIRC.CrossRank() == H_CIRC_CIRC.Root() ) HessenbergSchur( H_CIRC_CIRC.Matrix(), w.Matrix(), Z, ctrl ); else Z.Resize( H.Height(), H.Height() ); H = H_CIRC_CIRC; El::Broadcast( w.Matrix(), H_CIRC_CIRC.CrossComm(), H_CIRC_CIRC.Root() ); El::Broadcast( Z, H_CIRC_CIRC.CrossComm(), H_CIRC_CIRC.Root() ); }
// Resizes J to n x n and delegates the fill to MakeJordan, forwarding lambda.
inline void
Jordan( DistMatrix<T,U,V>& J, Int n, T lambda )
{
    DEBUG_ONLY(CallStackEntry cse("Jordan"))
    // Shape first, contents second.
    J.Resize( n, n );
    MakeJordan( J, lambda );
}
void RowMaxNorms ( const DistMatrix<F,U,V>& A, DistMatrix<Base<F>,U,STAR>& norms ) { DEBUG_CSE norms.AlignWith( A ); norms.Resize( A.Height(), 1 ); RowMaxNorms( A.LockedMatrix(), norms.Matrix() ); AllReduce( norms, A.RowComm(), mpi::MAX ); }
inline void BinaryFlat ( DistMatrix<T,U,V>& A, Int height, Int width, const std::string filename ) { DEBUG_ONLY(CallStackEntry cse("read::BinaryFlat")) std::ifstream file( filename.c_str(), std::ios::binary ); if( !file.is_open() ) RuntimeError("Could not open ",filename); const Int numBytes = FileSize( file ); const Int numBytesExp = height*width*sizeof(T); if( numBytes != numBytesExp ) RuntimeError ("Expected file to be ",numBytesExp," bytes but found ",numBytes); A.Resize( height, width ); if( U == A.UGath && V == A.VGath ) { if( A.CrossRank() == A.Root() ) { if( A.Height() == A.LDim() ) file.read( (char*)A.Buffer(), height*width*sizeof(T) ); else for( Int j=0; j<width; ++j ) file.read( (char*)A.Buffer(0,j), height*sizeof(T) ); } } else if( U == A.UGath ) { const Int localWidth = A.LocalWidth(); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = A.GlobalCol(jLoc); const Int localIndex = j*height; const std::streamoff pos = localIndex*sizeof(T); file.seekg( pos ); file.read( (char*)A.Buffer(0,jLoc), height*sizeof(T) ); } } else { const Int localHeight = A.LocalHeight(); const Int localWidth = A.LocalWidth(); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = A.GlobalCol(jLoc); for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = A.GlobalRow(iLoc); const Int localIndex = i+j*height; const std::streamoff pos = localIndex*sizeof(T); file.seekg( pos ); file.read( (char*)A.Buffer(iLoc,jLoc), sizeof(T) ); } } } }
void ColumnMinAbs ( const DistMatrix<F,U,V>& A, DistMatrix<Base<F>,V,STAR>& mins ) { EL_DEBUG_CSE const Int n = A.Width(); mins.AlignWith( A ); mins.Resize( n, 1 ); ColumnMinAbs( A.LockedMatrix(), mins.Matrix() ); AllReduce( mins.Matrix(), A.ColComm(), mpi::MIN ); }
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { EL_DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { if( A.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf; FastResize( buf, (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) El::Broadcast( B, A.CrossComm(), A.Root() ); }
void StackedGeometricColumnScaling ( const DistMatrix<Field, U,V >& A, const DistMatrix<Field, U,V >& B, DistMatrix<Base<Field>,V,STAR>& geomScaling ) { EL_DEBUG_CSE // NOTE: Assuming A.ColComm() == B.ColComm() and that the row alignments // are equal typedef Base<Field> Real; DistMatrix<Real,V,STAR> maxScalingA(A.Grid()), maxScalingB(A.Grid()); ColumnMaxNorms( A, maxScalingA ); ColumnMaxNorms( B, maxScalingB ); const Int mLocalA = A.LocalHeight(); const Int mLocalB = B.LocalHeight(); const Int nLocal = A.LocalWidth(); geomScaling.AlignWith( maxScalingA ); geomScaling.Resize( A.Width(), 1 ); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.LockedMatrix(); auto& geomScalingLoc = geomScaling.Matrix(); auto& maxScalingALoc = maxScalingA.Matrix(); auto& maxScalingBLoc = maxScalingB.Matrix(); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { Real minAbs = Max(maxScalingALoc(jLoc),maxScalingBLoc(jLoc)); for( Int iLoc=0; iLoc<mLocalA; ++iLoc ) { const Real absVal = Abs(ALoc(iLoc,jLoc)); if( absVal > 0 && absVal < minAbs ) minAbs = Min(minAbs,absVal); } for( Int iLoc=0; iLoc<mLocalB; ++iLoc ) { const Real absVal = Abs(BLoc(iLoc,jLoc)); if( absVal > 0 && absVal < minAbs ) minAbs = Min(minAbs,absVal); } geomScalingLoc(jLoc) = minAbs; } mpi::AllReduce( geomScaling.Buffer(), nLocal, mpi::MIN, A.ColComm() ); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { const Real maxAbsA = maxScalingALoc(jLoc); const Real maxAbsB = maxScalingBLoc(jLoc); const Real maxAbs = Max(maxAbsA,maxAbsB); const Real minAbs = geomScalingLoc(jLoc); geomScalingLoc(jLoc) = Sqrt(minAbs*maxAbs); } }
// Fills the column vector `norms` (one entry per row of A) with row-wise two
// norms of A.
//
// `norms` is aligned with A and resized to A.Height() x 1. A matrix with no
// columns yields all zeros; otherwise the work is delegated to
// RowTwoNormsHelper on the local matrices together with A.RowComm().
void
RowTwoNorms
( const DistMatrix<F,U,V>& A, DistMatrix<Base<F>,U,STAR>& norms )
{
    DEBUG_CSE
    norms.AlignWith( A );
    norms.Resize( A.Height(), 1 );

    const bool hasColumns = ( A.Width() != 0 );
    if( hasColumns )
    {
        RowTwoNormsHelper( A.LockedMatrix(), norms.Matrix(), A.RowComm() );
    }
    else
    {
        // Every row of a zero-width matrix is empty.
        Zero( norms );
    }
}
// Builds an (n+1) x n matrix in A: the first row is filled by MakeOnes and
// the remaining n x n block is set by Diagonal from n copies of mu.
inline void
Lauchli( DistMatrix<T,U,V>& A, Int n, T mu )
{
    DEBUG_ONLY(CallStackEntry cse("Lauchli"))
    A.Resize( n+1, n );

    // Top 1 x n row
    auto block = View( A, 0, 0, 1, n );
    MakeOnes( block );

    // Bottom n x n block; the same view object is deliberately reused, as in
    // the original formulation.
    std::vector<T> muDiag(n,mu);
    block = View( A, 1, 0, n, n );
    Diagonal( block, muDiag );
}
void HermitianUniformSpectrum ( DistMatrix<F,U,V>& A, Int n, Base<F> lower, Base<F> upper ) { DEBUG_ONLY(CallStackEntry cse("HermitianUniformSpectrum")) A.Resize( n, n ); const Grid& grid = A.Grid(); typedef Base<F> Real; const bool isComplex = IsComplex<F>::val; const bool standardDist = ( U == MC && V == MR ); // Form d and D std::vector<F> d( n ); if( grid.Rank() == 0 ) for( Int j=0; j<n; ++j ) d[j] = SampleUniform<Real>( lower, upper ); mpi::Broadcast( d.data(), n, 0, grid.Comm() ); DistMatrix<F> ABackup( grid ); ABackup.AlignWith( A ); Diagonal( ABackup, d ); // Apply a Haar matrix from both sides DistMatrix<F> Q(grid); DistMatrix<F,MD,STAR> t(grid); DistMatrix<Real,MD,STAR> s(grid); ImplicitHaar( Q, t, s, n ); // Copy the result into the correct distribution qr::ApplyQ( LEFT, NORMAL, Q, t, s, ABackup ); qr::ApplyQ( RIGHT, ADJOINT, Q, t, s, ABackup ); A = ABackup; // Force the diagonal to be real-valued if( isComplex ) { const Int localHeight = A.LocalHeight(); const Int localWidth = A.LocalWidth(); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = A.GlobalCol(jLoc); for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = A.GlobalRow(iLoc); if( i == j ) A.SetLocalImagPart( iLoc, jLoc, Real(0) ); } } } }
void ScaLAPACKHelper ( DistMatrix<F,MC,MR,BLOCK>& A, DistMatrix<F,MR,STAR,BLOCK>& householderScalars ) { EL_DEBUG_CSE AssertScaLAPACKSupport(); #ifdef EL_HAVE_SCALAPACK const Int m = A.Height(); const Int n = A.Width(); const Int minDim = Min(m,n); householderScalars.AlignWith( A ); householderScalars.Resize( minDim, 1 ); auto descA = FillDesc( A ); scalapack::QR ( m, n, A.Buffer(), descA.data(), householderScalars.Buffer() ); #endif }
void Filter ( const DistMatrix<T,Collect<U>(),Collect<V>()>& A, DistMatrix<T, U, V >& B ) { DEBUG_CSE AssertSameGrids( A, B ); B.Resize( A.Height(), A.Width() ); if( !B.Participating() ) return; const Int colShift = B.ColShift(); const Int rowShift = B.RowShift(); util::InterleaveMatrix ( B.LocalHeight(), B.LocalWidth(), A.LockedBuffer(colShift,rowShift), B.ColStride(), B.RowStride()*A.LDim(), B.Buffer(), 1, B.LDim() ); }
// Resizes L to n x n and sets entry (i,j) to (i+1)/(j+1) when i < j and to
// (j+1)/(i+1) otherwise. Each process only writes its locally owned entries.
inline void
Lehmer( DistMatrix<F,U,V>& L, Int n )
{
    DEBUG_ONLY(CallStackEntry cse("Lehmer"))
    L.Resize( n, n );

    const Int mLoc = L.LocalHeight();
    const Int nLoc = L.LocalWidth();
    for( Int jLoc=0; jLoc<nLoc; ++jLoc )
    {
        const Int j = L.GlobalCol(jLoc);
        for( Int iLoc=0; iLoc<mLoc; ++iLoc )
        {
            const Int i = L.GlobalRow(iLoc);
            // Ratio of the smaller to the larger one-based index
            const F entry = ( i < j ? F(i+1)/F(j+1) : F(j+1)/F(i+1) );
            L.SetLocal( iLoc, jLoc, entry );
        }
    }
}
// Reads a binary matrix file into the generally distributed matrix A.
//
// The expected layout is a header of two raw `Int` values (height, then
// width) followed by height*width entries of type T in column-major order;
// this is what the `2*sizeof(Int)` metadata size below describes. Each
// process seeks to and reads only the entries it owns.
//
// The header is read with unformatted `read` calls. Formatted extraction
// (`file >> height`) parses ASCII digits and therefore cannot decode a raw
// binary header.
//
// Raises a RuntimeError if the file cannot be opened, the header cannot be
// read, or the file size does not match the header.
inline void
Binary( DistMatrix<T,U,V>& A, const std::string filename )
{
    DEBUG_ONLY(CallStackEntry cse("read::Binary"))
    std::ifstream file( filename.c_str(), std::ios::binary );
    if( !file.is_open() )
        RuntimeError("Could not open ",filename);

    Int height, width;
    file.read( (char*)&height, sizeof(Int) );
    file.read( (char*)&width,  sizeof(Int) );
    if( !file )
        RuntimeError("Could not read the matrix dimensions from ",filename);

    const Int numBytes = FileSize( file );
    const Int metaBytes = 2*sizeof(Int);
    const Int dataBytes = height*width*sizeof(T);
    const Int numBytesExp = metaBytes + dataBytes;
    if( numBytes != numBytesExp )
        RuntimeError
        ("Expected file to be ",numBytesExp," bytes but found ",numBytes);

    A.Resize( height, width );

    const Int localHeight = A.LocalHeight();
    const Int localWidth = A.LocalWidth();
    const Int colShift = A.ColShift();
    const Int rowShift = A.RowShift();
    const Int colStride = A.ColStride();
    const Int rowStride = A.RowStride();
    for( Int jLoc=0; jLoc<localWidth; ++jLoc )
    {
        // Global column index of this local column
        const Int j = rowShift + jLoc*rowStride;
        for( Int iLoc=0; iLoc<localHeight; ++iLoc )
        {
            // Global row index of this local row
            const Int i = colShift + iLoc*colStride;
            const Int localIndex = i+j*height;
            // Skip the header, then index into the column-major payload
            const std::streamoff pos = metaBytes + localIndex*sizeof(T);
            file.seekg( pos );
            file.read( (char*)A.Buffer(iLoc,jLoc), sizeof(T) );
        }
    }
}
void IndexDependentMap ( const DistMatrix<S,U,V,wrap>& A, DistMatrix<T,U,V,wrap>& B, function<T(Int,Int,const S&)> func ) { EL_DEBUG_CSE const Int mLoc = A.LocalHeight(); const Int nLoc = A.LocalWidth(); B.AlignWith( A.DistData() ); B.Resize( A.Height(), A.Width() ); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.Matrix(); for( Int jLoc=0; jLoc<nLoc; ++jLoc ) { const Int j = A.GlobalCol(jLoc); for( Int iLoc=0; iLoc<mLoc; ++iLoc ) { const Int i = A.GlobalRow(iLoc); BLoc(iLoc,jLoc) = func(i,j,ALoc(iLoc,jLoc)); } } }
void QR ( DistMatrix<F,MC,MR,BLOCK>& A, DistMatrix<F,MR,STAR,BLOCK>& phase ) { DEBUG_CSE AssertScaLAPACKSupport(); #ifdef EL_HAVE_SCALAPACK const Int m = A.Height(); const Int n = A.Width(); const Int minDim = Min(m,n); phase.AlignWith( A ); phase.Resize( minDim, 1 ); const int bHandle = blacs::Handle( A ); const int context = blacs::GridInit( bHandle, A ); auto descA = FillDesc( A, context ); scalapack::QR( m, n, A.Buffer(), descA.data(), phase.Buffer() ); blacs::FreeGrid( context ); blacs::FreeHandle( bHandle ); #endif }
// Resizes P to n x n, sets every locally owned entry to one and then adds
// alpha to the locally owned diagonal entries.
inline void
Pei( DistMatrix<T,U,V>& P, Int n, T alpha )
{
    DEBUG_ONLY(CallStackEntry cse("Pei"))
    P.Resize( n, n );

    const Int mLoc = P.LocalHeight();
    const Int nLoc = P.LocalWidth();
    const Int firstRow = P.ColShift();
    const Int firstCol = P.RowShift();
    const Int rowGap = P.ColStride();
    const Int colGap = P.RowStride();

    for( Int jLoc=0; jLoc<nLoc; ++jLoc )
    {
        // Global column index of this local column
        const Int j = firstCol + jLoc*colGap;
        for( Int iLoc=0; iLoc<mLoc; ++iLoc )
        {
            // Global row index of this local row
            const Int i = firstRow + iLoc*rowGap;
            const bool onDiagonal = ( i == j );

            P.SetLocal( iLoc, jLoc, T(1) );
            if( onDiagonal )
                P.UpdateLocal( iLoc, jLoc, alpha );
        }
    }
}
// Resizes R to n x n and sets entry (i,j) to one when j == 0 or when (i+1)
// divides (j+1), and to zero otherwise. Each process only writes its locally
// owned entries.
inline void
Redheffer( DistMatrix<T,U,V>& R, Int n )
{
    DEBUG_ONLY(CallStackEntry cse("Redheffer"))
    R.Resize( n, n );

    const Int mLoc = R.LocalHeight();
    const Int nLoc = R.LocalWidth();
    const Int firstRow = R.ColShift();
    const Int firstCol = R.RowShift();
    const Int rowGap = R.ColStride();
    const Int colGap = R.RowStride();

    for( Int jLoc=0; jLoc<nLoc; ++jLoc )
    {
        // Global column index of this local column
        const Int j = firstCol + jLoc*colGap;
        for( Int iLoc=0; iLoc<mLoc; ++iLoc )
        {
            // Global row index of this local row
            const Int i = firstRow + iLoc*rowGap;
            const bool isOne = ( j==0 || ((j+1)%(i+1))==0 );
            R.SetLocal( iLoc, jLoc, ( isOne ? T(1) : T(0) ) );
        }
    }
}
void FormDiagonalBlocks ( const DistMatrix<F,VC,STAR>& L, DistMatrix<F,STAR,STAR>& D, bool conjugate ) { const Grid& g = L.Grid(); const Int height = L.Width(); const Int blocksize = Blocksize(); const int commRank = g.VCRank(); const int commSize = g.Size(); const Int localHeight = Length(height,commRank,commSize); const Int maxLocalHeight = MaxLength(height,commSize); const Int portionSize = maxLocalHeight*blocksize; std::vector<F> sendBuffer( portionSize ); const Int colShift = L.ColShift(); const Int LLDim = L.LDim(); const F* LBuffer = L.LockedBuffer(); if( conjugate ) { for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = colShift + iLoc*commSize; const Int block = i / blocksize; const Int jStart = block*blocksize; const Int b = std::min(height-jStart,blocksize); for( Int jOff=0; jOff<b; ++jOff ) sendBuffer[iLoc*blocksize+jOff] = Conj(LBuffer[iLoc+(jStart+jOff)*LLDim]); } } else { for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = colShift + iLoc*commSize; const Int block = i / blocksize; const Int jStart = block*blocksize; const Int b = std::min(height-jStart,blocksize); for( Int jOff=0; jOff<b; ++jOff ) sendBuffer[iLoc*blocksize+jOff] = LBuffer[iLoc+(jStart+jOff)*LLDim]; } } std::vector<F> recvBuffer( portionSize*commSize ); mpi::AllGather ( &sendBuffer[0], portionSize, &recvBuffer[0], portionSize, g.VCComm() ); SwapClear( sendBuffer ); D.Resize( blocksize, height ); F* DBuffer = D.Buffer(); const Int DLDim = D.LDim(); for( Int proc=0; proc<commSize; ++proc ) { const F* procRecv = &recvBuffer[proc*portionSize]; const Int procLocalHeight = Length(height,proc,commSize); for( Int iLoc=0; iLoc<procLocalHeight; ++iLoc ) { const Int i = proc + iLoc*commSize; for( Int jOff=0; jOff<blocksize; ++jOff ) DBuffer[jOff+i*DLDim] = procRecv[jOff+iLoc*blocksize]; } } }
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { DEBUG_ONLY(CSE cse("copy::AllGather")) AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf( (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) { // Pack from the root const Int BLocalHeight = B.LocalHeight(); const Int BLocalWidth = B.LocalWidth(); vector<T> buf(BLocalHeight*BLocalWidth); if( A.CrossRank() == A.Root() ) util::InterleaveMatrix ( BLocalHeight, BLocalWidth, B.LockedBuffer(), 1, B.LDim(), buf.data(), 1, BLocalHeight ); // Broadcast from the root mpi::Broadcast ( buf.data(), BLocalHeight*BLocalWidth, A.Root(), A.CrossComm() ); // Unpack if not the root if( A.CrossRank() != A.Root() ) util::InterleaveMatrix ( BLocalHeight, BLocalWidth, buf.data(), 1, BLocalHeight, B.Buffer(), 1, B.LDim() ); } }
void Gather ( const BlockMatrix<T>& A, DistMatrix<T,CIRC,CIRC,BLOCK>& B ) { DEBUG_ONLY(CSE cse("copy::Gather")) AssertSameGrids( A, B ); if( A.DistSize() == 1 && A.CrossSize() == 1 ) { B.Resize( A.Height(), A.Width() ); if( B.CrossRank() == B.Root() ) Copy( A.LockedMatrix(), B.Matrix() ); return; } const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); // Gather the colShifts and rowShifts // ================================== Int myShifts[2]; myShifts[0] = A.ColShift(); myShifts[1] = A.RowShift(); vector<Int> shifts; const Int crossSize = B.CrossSize(); if( B.CrossRank() == B.Root() ) shifts.resize( 2*crossSize ); mpi::Gather( myShifts, 2, shifts.data(), 2, B.Root(), B.CrossComm() ); // Gather the payload data // ======================= const bool irrelevant = ( A.RedundantRank()!=0 || A.CrossRank()!=A.Root() ); int totalSend = ( irrelevant ? 0 : A.LocalHeight()*A.LocalWidth() ); vector<int> recvCounts, recvOffsets; if( B.CrossRank() == B.Root() ) recvCounts.resize( crossSize ); mpi::Gather( &totalSend, 1, recvCounts.data(), 1, B.Root(), B.CrossComm() ); int totalRecv = Scan( recvCounts, recvOffsets ); //vector<T> sendBuf(totalSend), recvBuf(totalRecv); vector<T> sendBuf, recvBuf; sendBuf.reserve( totalSend ); recvBuf.reserve( totalRecv ); if( !irrelevant ) copy::util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf.data(), 1, A.LocalHeight() ); mpi::Gather ( sendBuf.data(), totalSend, recvBuf.data(), recvCounts.data(), recvOffsets.data(), B.Root(), B.CrossComm() ); // Unpack // ====== const Int mb = A.BlockHeight(); const Int nb = A.BlockWidth(); const Int colCut = A.ColCut(); const Int rowCut = A.RowCut(); if( B.Root() == B.CrossRank() ) { for( Int q=0; q<crossSize; ++q ) { if( recvCounts[q] == 0 ) continue; const Int colShift = shifts[2*q+0]; const Int rowShift = shifts[2*q+1]; const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int 
localHeight = BlockedLength( height, colShift, mb, colCut, colStride ); const Int localWidth = BlockedLength( width, rowShift, nb, rowCut, rowStride ); const T* data = &recvBuf[recvOffsets[q]]; for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int jBefore = rowShift*nb - rowCut; const Int jLocAdj = ( rowShift==0 ? jLoc+rowCut : jLoc ); const Int numFilledLocalBlocks = jLocAdj / nb; const Int jMid = numFilledLocalBlocks*nb*rowStride; const Int jPost = jLocAdj-numFilledLocalBlocks*nb; const Int j = jBefore + jMid + jPost; const T* sourceCol = &data[jLoc*localHeight]; for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int iBefore = colShift*mb - colCut; const Int iLocAdj = (colShift==0 ? iLoc+colCut : iLoc); const Int numFilledLocalBlocks = iLocAdj / mb; const Int iMid = numFilledLocalBlocks*mb*colStride; const Int iPost = iLocAdj-numFilledLocalBlocks*mb; const Int i = iBefore + iMid + iPost; B.SetLocal(i,j,sourceCol[iLoc]); } } } } }
void Gather ( const ElementalMatrix<T>& A, DistMatrix<T,CIRC,CIRC>& B ) { DEBUG_ONLY(CSE cse("copy::Gather")) AssertSameGrids( A, B ); if( A.DistSize() == 1 && A.CrossSize() == 1 ) { B.Resize( A.Height(), A.Width() ); if( B.CrossRank() == B.Root() ) Copy( A.LockedMatrix(), B.Matrix() ); return; } const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); // Gather the colShifts and rowShifts // ================================== Int myShifts[2]; myShifts[0] = A.ColShift(); myShifts[1] = A.RowShift(); vector<Int> shifts; const Int crossSize = B.CrossSize(); if( B.CrossRank() == B.Root() ) shifts.resize( 2*crossSize ); mpi::Gather( myShifts, 2, shifts.data(), 2, B.Root(), B.CrossComm() ); // Gather the payload data // ======================= const bool irrelevant = ( A.RedundantRank()!=0 || A.CrossRank()!=A.Root() ); int totalSend = ( irrelevant ? 0 : A.LocalHeight()*A.LocalWidth() ); vector<int> recvCounts, recvOffsets; if( B.CrossRank() == B.Root() ) recvCounts.resize( crossSize ); mpi::Gather( &totalSend, 1, recvCounts.data(), 1, B.Root(), B.CrossComm() ); int totalRecv = Scan( recvCounts, recvOffsets ); //vector<T> sendBuf(totalSend), recvBuf(totalRecv); vector<T> sendBuf, recvBuf; sendBuf.reserve( totalSend ); recvBuf.reserve( totalRecv ); if( !irrelevant ) copy::util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf.data(), 1, A.LocalHeight() ); mpi::Gather ( sendBuf.data(), totalSend, recvBuf.data(), recvCounts.data(), recvOffsets.data(), B.Root(), B.CrossComm() ); // Unpack // ====== if( B.Root() == B.CrossRank() ) { for( Int q=0; q<crossSize; ++q ) { if( recvCounts[q] == 0 ) continue; const Int colShift = shifts[2*q+0]; const Int rowShift = shifts[2*q+1]; const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int localHeight = Length( height, colShift, colStride ); const Int localWidth = Length( width, rowShift, rowStride ); 
copy::util::InterleaveMatrix ( localHeight, localWidth, &recvBuf[recvOffsets[q]], 1, localHeight, B.Buffer(colShift,rowShift), colStride, rowStride*B.LDim() ); } } }
void TranslateBetweenGrids ( const DistMatrix<T,MC,MR>& A, DistMatrix<T,MC,MR>& B ) { DEBUG_ONLY(CSE cse("copy::TranslateBetweenGrids [MC,MR]")) B.Resize( A.Height(), A.Width() ); // Just need to ensure that each viewing comm contains the other team's // owning comm. Congruence is too strong. // Compute the number of process rows and columns that each process // needs to send to. const Int colStride = B.ColStride(); const Int rowStride = B.RowStride(); const Int colRank = B.ColRank(); const Int rowRank = B.RowRank(); const Int colStrideA = A.ColStride(); const Int rowStrideA = A.RowStride(); const Int colGCD = GCD( colStride, colStrideA ); const Int rowGCD = GCD( rowStride, rowStrideA ); const Int colLCM = colStride*colStrideA / colGCD; const Int rowLCM = rowStride*rowStrideA / rowGCD; const Int numColSends = colStride / colGCD; const Int numRowSends = rowStride / rowGCD; const Int colAlign = B.ColAlign(); const Int rowAlign = B.RowAlign(); const Int colAlignA = A.ColAlign(); const Int rowAlignA = A.RowAlign(); const bool inBGrid = B.Participating(); const bool inAGrid = A.Participating(); if( !inBGrid && !inAGrid ) return; const Int maxSendSize = (A.Height()/(colStrideA*numColSends)+1) * (A.Width()/(rowStrideA*numRowSends)+1); // Translate the ranks from A's VC communicator to B's viewing so that // we can match send/recv communicators. Since A's VC communicator is not // necessarily defined on every process, we instead work with A's owning // group and account for row-major ordering if necessary. const int sizeA = A.Grid().Size(); vector<int> rankMap(sizeA), ranks(sizeA); if( A.Grid().Order() == COLUMN_MAJOR ) { for( int j=0; j<sizeA; ++j ) ranks[j] = j; } else { // The (i,j) = i + j*colStrideA rank in the column-major ordering is // equal to the j + i*rowStrideA rank in a row-major ordering. 
// Since we desire rankMap[i+j*colStrideA] to correspond to process // (i,j) in A's grid's rank in this viewing group, ranks[i+j*colStrideA] // should correspond to process (i,j) in A's owning group. Since the // owning group is ordered row-major in this case, its rank is // j+i*rowStrideA. Note that setting // ranks[j+i*rowStrideA] = i+j*colStrideA is *NOT* valid. for( int i=0; i<colStrideA; ++i ) for( int j=0; j<rowStrideA; ++j ) ranks[i+j*colStrideA] = j+i*rowStrideA; } mpi::Translate ( A.Grid().OwningGroup(), sizeA, &ranks[0], B.Grid().ViewingComm(), &rankMap[0] ); // Have each member of A's grid individually send to all numRow x numCol // processes in order, while the members of this grid receive from all // necessary processes at each step. Int requiredMemory = 0; if( inAGrid ) requiredMemory += maxSendSize; if( inBGrid ) requiredMemory += maxSendSize; vector<T> auxBuf( requiredMemory ); Int offset = 0; T* sendBuf = &auxBuf[offset]; if( inAGrid ) offset += maxSendSize; T* recvBuf = &auxBuf[offset]; Int recvRow = 0; // avoid compiler warnings... if( inAGrid ) recvRow = Mod(Mod(A.ColRank()-colAlignA,colStrideA)+colAlign,colStride); for( Int colSend=0; colSend<numColSends; ++colSend ) { Int recvCol = 0; // avoid compiler warnings... 
if( inAGrid ) recvCol=Mod(Mod(A.RowRank()-rowAlignA,rowStrideA)+rowAlign, rowStride); for( Int rowSend=0; rowSend<numRowSends; ++rowSend ) { mpi::Request sendRequest; // Fire off this round of non-blocking sends if( inAGrid ) { // Pack the data Int sendHeight = Length(A.LocalHeight(),colSend,numColSends); Int sendWidth = Length(A.LocalWidth(),rowSend,numRowSends); copy::util::InterleaveMatrix ( sendHeight, sendWidth, A.LockedBuffer(colSend,rowSend), numColSends, numRowSends*A.LDim(), sendBuf, 1, sendHeight ); // Send data const Int recvVCRank = recvRow + recvCol*colStride; const Int recvViewingRank = B.Grid().VCToViewing( recvVCRank ); mpi::ISend ( sendBuf, sendHeight*sendWidth, recvViewingRank, B.Grid().ViewingComm(), sendRequest ); } // Perform this round of recv's if( inBGrid ) { const Int sendColOffset = colAlignA; const Int recvColOffset = (colSend*colStrideA+colAlign) % colStride; const Int sendRowOffset = rowAlignA; const Int recvRowOffset = (rowSend*rowStrideA+rowAlign) % rowStride; const Int firstSendRow = Mod( Mod(colRank-recvColOffset,colStride)+sendColOffset, colStrideA ); const Int firstSendCol = Mod( Mod(rowRank-recvRowOffset,rowStride)+sendRowOffset, rowStrideA ); const Int colShift = Mod( colRank-recvColOffset, colStride ); const Int rowShift = Mod( rowRank-recvRowOffset, rowStride ); const Int numColRecvs = Length( colStrideA, colShift, colStride ); const Int numRowRecvs = Length( rowStrideA, rowShift, rowStride ); // Recv data // For now, simply receive sequentially. 
Until we switch to // nonblocking recv's, we won't be using much of the // recvBuf Int sendRow = firstSendRow; for( Int colRecv=0; colRecv<numColRecvs; ++colRecv ) { const Int sendColShift = Shift( sendRow, colAlignA, colStrideA ) + colSend*colStrideA; const Int sendHeight = Length( A.Height(), sendColShift, colLCM ); const Int localColOffset = (sendColShift-B.ColShift()) / colStride; Int sendCol = firstSendCol; for( Int rowRecv=0; rowRecv<numRowRecvs; ++rowRecv ) { const Int sendRowShift = Shift( sendCol, rowAlignA, rowStrideA ) + rowSend*rowStrideA; const Int sendWidth = Length( A.Width(), sendRowShift, rowLCM ); const Int localRowOffset = (sendRowShift-B.RowShift()) / rowStride; const Int sendVCRank = sendRow+sendCol*colStrideA; mpi::Recv ( recvBuf, sendHeight*sendWidth, rankMap[sendVCRank], B.Grid().ViewingComm() ); // Unpack the data copy::util::InterleaveMatrix ( sendHeight, sendWidth, recvBuf, 1, sendHeight, B.Buffer(localColOffset,localRowOffset), colLCM/colStride, (rowLCM/rowStride)*B.LDim() ); // Set up the next send col sendCol = (sendCol + rowStride) % rowStrideA; } // Set up the next send row sendRow = (sendRow + colStride) % colStrideA; } } // Ensure that this round of non-blocking sends completes if( inAGrid ) { mpi::Wait( sendRequest ); recvCol = (recvCol + rowStrideA) % rowStride; } } if( inAGrid ) recvRow = (recvRow + colStrideA) % colStride; } }
// Distributed multi-bulge QR iteration on the Hessenberg matrix H.
//
// The active window [winBeg,winEnd) is taken from ctrl (END maps to n). If
// the window is smaller than the multi-bulge threshold the whole problem is
// handed to multibulge::RedundantlyHandleWindow. Otherwise, each pass of the
// main loop
//   1. gathers the tridiagonal part of the window and locates a small
//      subdiagonal entry, giving an irreducible window [iterBeg,winEnd);
//   2. deflates directly when that window is 1x1 or 2x2, or handles it
//      redundantly when it is below the threshold;
//   3. otherwise computes shifts and performs one small-bulge sweep.
//
// Eigenvalue estimates are written to w (resized to n x 1); Z is forwarded to
// the helper routines. The returned info holds the iteration count and the
// size of the window still unconverged on exit.
//
// Raises a RuntimeError when the iteration limit is reached and
// ctrl.demandConverged is set; otherwise the loop is simply left.
//
// Change from the previous revision: the unconditional debugging output
// inside the main loop (window bounds plus full dumps of H and of the three
// gathered diagonals on every iteration) was removed. It ignored
// ctrl.progress and printed the entire distributed matrix on every pass.
HessenbergSchurInfo
MultiBulge
( DistMatrix<F,MC,MR,BLOCK>& H,
  DistMatrix<Complex<Base<F>>,STAR,STAR>& w,
  DistMatrix<F,MC,MR,BLOCK>& Z,
  const HessenbergSchurCtrl& ctrl )
{
    DEBUG_CSE
    typedef Base<F> Real;
    const Real zero(0);
    const Grid& grid = H.Grid();

    const Int n = H.Height();
    Int winBeg = ( ctrl.winBeg==END ? n : ctrl.winBeg );
    Int winEnd = ( ctrl.winEnd==END ? n : ctrl.winEnd );
    const Int winSize = winEnd - winBeg;
    const Int blockSize = H.BlockHeight();

    // TODO(poulson): Implement a more reasonable/configurable means of deciding
    // when to call the sequential implementation
    Int minMultiBulgeSize = Max( ctrl.minMultiBulgeSize, 2*blockSize );
    // This maximum is meant to account for parallel overheads and needs to be
    // more principled (and perhaps based upon the number of workers and the
    // cluster characteristics)
    // TODO(poulson): Re-enable this
    //minMultiBulgeSize = Max( minMultiBulgeSize, 500 );

    HessenbergSchurInfo info;

    w.Resize( n, 1 );
    if( winSize < minMultiBulgeSize )
    {
        return multibulge::RedundantlyHandleWindow( H, w, Z, ctrl );
    }

    // Control structure used when computing shifts
    auto ctrlShifts( ctrl );
    ctrlShifts.winBeg = 0;
    ctrlShifts.winEnd = END;
    ctrlShifts.fullTriangle = false;

    Int numIterSinceDeflation = 0;
    const Int numStaleIterBeforeExceptional = 5;
    // Cf. LAPACK's DLAQR0 for this choice
    const Int maxIter =
      Max(30,2*numStaleIterBeforeExceptional) * Max(10,winSize);

    Int iterBegLast=-1, winEndLast=-1;
    DistMatrix<F,STAR,STAR> hMainWin(grid), hSuperWin(grid);
    DistMatrix<Real,STAR,STAR> hSubWin(grid);
    while( winBeg < winEnd )
    {
        if( info.numIterations >= maxIter )
        {
            if( ctrl.demandConverged )
                RuntimeError("MultiBulge QR iteration did not converge");
            else
                break;
        }
        auto winInd = IR(winBeg,winEnd);

        // Detect an irreducible Hessenberg window, [iterBeg,winEnd)
        // ---------------------------------------------------------
        // TODO(poulson): Have the interblock chase from the previous sweep
        // collect the main and sub diagonal of H along the diagonal workers
        // and then broadcast across the "cross" communicator.
        util::GatherTridiagonal( H, winInd, hMainWin, hSubWin, hSuperWin );

        const Int iterOffset =
          DetectSmallSubdiagonal
          ( hMainWin.Matrix(), hSubWin.Matrix(), hSuperWin.Matrix() );
        const Int iterBeg = winBeg + iterOffset;
        const Int iterWinSize = winEnd-iterBeg;
        if( iterOffset > 0 )
        {
            // Explicitly zero the detected subdiagonal entry, both in H and
            // in the gathered copy.
            H.Set( iterBeg, iterBeg-1, zero );
            hSubWin.Set( iterOffset-1, 0, zero );
        }
        if( iterWinSize == 1 )
        {
            if( ctrl.progress )
                Output("One-by-one window at ",iterBeg);
            w.Set( iterBeg, 0, hMainWin.GetLocal(iterOffset,0) );

            winEnd = iterBeg;
            numIterSinceDeflation = 0;
            continue;
        }
        else if( iterWinSize == 2 )
        {
            if( ctrl.progress )
                Output("Two-by-two window at ",iterBeg);
            const F eta00 = hMainWin.GetLocal(iterOffset,0);
            const F eta01 = hSuperWin.GetLocal(iterOffset,0);
            const Real eta10 = hSubWin.GetLocal(iterOffset,0);
            const F eta11 = hMainWin.GetLocal(iterOffset+1,0);
            multibulge::TwoByTwo
            ( H, eta00, eta01, eta10, eta11, w, Z, iterBeg, ctrl );

            winEnd = iterBeg;
            numIterSinceDeflation = 0;
            continue;
        }
        else if( iterWinSize < minMultiBulgeSize )
        {
            // The window is small enough to switch to the simple scheme
            if( ctrl.progress )
                Output("Redundantly handling window [",iterBeg,",",winEnd,"]");
            auto ctrlIter( ctrl );
            ctrlIter.winBeg = iterBeg;
            ctrlIter.winEnd = winEnd;
            auto iterInfo =
              multibulge::RedundantlyHandleWindow( H, w, Z, ctrlIter );
            info.numIterations += iterInfo.numIterations;

            winEnd = iterBeg;
            numIterSinceDeflation = 0;
            continue;
        }

        const Int numShiftsRec = ctrl.numShifts( n, iterWinSize );
        if( ctrl.progress )
        {
            Output("Iter. ",info.numIterations,": ");
            Output(" window is [",iterBeg,",",winEnd,")");
            Output(" recommending ",numShiftsRec," shifts");
        }

        // NOTE(poulson): In the case where exceptional shifts are used, the
        // main and subdiagonals of H in the window are currently redundantly
        // gathered. It could be worthwhile to pass in hMainWin and hSubWin.
        const Int shiftBeg = multibulge::ComputeShifts
        ( H, w, iterBeg, winBeg, winEnd, numShiftsRec, numIterSinceDeflation,
          numStaleIterBeforeExceptional, ctrlShifts );
        auto shiftInd = IR(shiftBeg,winEnd);
        auto wShifts = w(shiftInd,ALL);

        // Perform a small-bulge sweep
        auto ctrlSweep( ctrl );
        ctrlSweep.winBeg = iterBeg;
        ctrlSweep.winEnd = winEnd;
        multibulge::Sweep( H, wShifts, Z, ctrlSweep );

        ++info.numIterations;
        // Track how long the same window has persisted without deflation
        if( iterBeg == iterBegLast && winEnd == winEndLast )
            ++numIterSinceDeflation;
        iterBegLast = iterBeg;
        winEndLast = winEnd;
    }
    info.numUnconverged = winEnd-winBeg;
    return info;
}