void Scatter ( const DistMatrix<T,CIRC,CIRC>& A, DistMatrix<T,STAR,STAR>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.Resize( height, width ); if( B.Participating() ) { const Int pkgSize = mpi::Pad( height*width ); vector<T> buffer; FastResize( buffer, pkgSize ); // Pack if( A.Participating() ) util::InterleaveMatrix ( height, width, A.LockedBuffer(), 1, A.LDim(), buffer.data(), 1, height ); // Broadcast from the process that packed mpi::Broadcast( buffer.data(), pkgSize, A.Root(), A.CrossComm() ); // Unpack util::InterleaveMatrix ( height, width, buffer.data(), 1, height, B.Buffer(), 1, B.LDim() ); } }
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { EL_DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { if( A.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf; FastResize( buf, (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) El::Broadcast( B, A.CrossComm(), A.Root() ); }
// Broadcast the local portion of the distributed matrix A over `comm` from
// the process whose rank within `comm` is `rank`.
//
// Nothing happens when the communicator has a single member or when the
// calling process does not participate in A. When the local height equals the
// leading dimension the local buffer is broadcast in place; otherwise the root
// packs into a temporary buffer, the buffer is broadcast, and every non-root
// process unpacks it into its local buffer.
void Broadcast( AbstractDistMatrix<T>& A, mpi::Comm comm, int rank )
{
    DEBUG_CSE
    const int commSize = mpi::Size( comm );
    const int commRank = mpi::Rank( comm );
    if( commSize == 1 || !A.Participating() )
        return;

    const Int mLoc = A.LocalHeight();
    const Int nLoc = A.LocalWidth();
    const Int numEntries = mLoc*nLoc;

    const bool storedContiguously = ( mLoc == A.LDim() );
    if( storedContiguously )
    {
        mpi::Broadcast( A.Buffer(), numEntries, rank, comm );
        return;
    }

    const bool isRoot = ( commRank == rank );
    vector<T> packed;
    FastResize( packed, numEntries );

    // Pack
    if( isRoot )
    {
        copy::util::InterleaveMatrix
        ( mLoc, nLoc,
          A.LockedBuffer(), 1, A.LDim(),
          packed.data(),    1, mLoc );
    }

    mpi::Broadcast( packed.data(), numEntries, rank, comm );

    // Unpack
    if( !isRoot )
    {
        copy::util::InterleaveMatrix
        ( mLoc, nLoc,
          packed.data(), 1, mLoc,
          A.Buffer(),    1, A.LDim() );
    }
}
// Broadcast the sequential matrix A over `comm` from the process whose rank
// within `comm` is `rank`.
//
// Nothing happens when the communicator has a single member. When the height
// equals the leading dimension the buffer of A is broadcast in place;
// otherwise the root packs into a temporary buffer, the buffer is broadcast,
// and every non-root process unpacks it into A.
void Broadcast( Matrix<T>& A, mpi::Comm comm, int rank )
{
    DEBUG_CSE
    const int commSize = mpi::Size( comm );
    const int commRank = mpi::Rank( comm );
    if( commSize == 1 )
        return;

    const Int m = A.Height();
    const Int n = A.Width();
    const Int numEntries = m*n;

    const bool storedContiguously = ( m == A.LDim() );
    if( storedContiguously )
    {
        mpi::Broadcast( A.Buffer(), numEntries, rank, comm );
        return;
    }

    const bool isRoot = ( commRank == rank );
    vector<T> packed;
    FastResize( packed, numEntries );

    // Pack
    if( isRoot )
    {
        copy::util::InterleaveMatrix
        ( m, n,
          A.LockedBuffer(), 1, A.LDim(),
          packed.data(),    1, m );
    }

    mpi::Broadcast( packed.data(), numEntries, rank, comm );

    // Unpack
    if( !isRoot )
    {
        copy::util::InterleaveMatrix
        ( m, n,
          packed.data(), 1, m,
          A.Buffer(),    1, A.LDim() );
    }
}
void Send( const Matrix<T>& A, mpi::Comm comm, int destination ) { DEBUG_CSE const Int height = A.Height(); const Int width = A.Width(); const Int size = height*width; if( height == A.LDim() ) { mpi::Send( A.LockedBuffer(), size, destination, comm ); } else { vector<T> buf; FastResize( buf, size ); // Pack copy::util::InterleaveMatrix ( height, width, A.LockedBuffer(), 1, A.LDim(), buf.data(), 1, height ); mpi::Send( buf.data(), size, destination, comm ); } }
void ColAllToAllPromote ( const DistMatrix<T, U, V >& A, DistMatrix<T,Partial<U>(),PartialUnionRow<U,V>()>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.AlignColsAndResize ( Mod(A.ColAlign(),B.ColStride()), height, width, false, false ); if( !B.Participating() ) return; const Int colStride = A.ColStride(); const Int colStridePart = A.PartialColStride(); const Int colStrideUnion = A.PartialUnionColStride(); const Int colRankPart = A.PartialColRank(); const Int colDiff = B.ColAlign() - Mod(A.ColAlign(),colStridePart); const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,colStrideUnion); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); if( colDiff == 0 ) { if( A.PartialUnionColStride() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), colStrideUnion, A.LockedBuffer(), A.LDim(), firstBuf, portionSize ); // Simultaneously Gather in columns and Scatter in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, colRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } } else { #ifdef EL_UNALIGNED_WARNINGS if( A.Grid().Rank() == 0 ) cerr << "Unaligned PartialColAllToAllPromote" << endl; #endif const Int sendColRankPart = Mod( colRankPart+colDiff, colStridePart ); const Int recvColRankPart = Mod( colRankPart-colDiff, colStridePart ); vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), 
colStrideUnion, A.LockedBuffer(), A.LDim(), secondBuf, portionSize ); // Realign the input mpi::SendRecv ( secondBuf, colStrideUnion*portionSize, sendColRankPart, firstBuf, colStrideUnion*portionSize, recvColRankPart, A.PartialColComm() ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, recvColRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } }
void Helper ( const AbstractDistMatrix<S>& A, AbstractDistMatrix<T>& B ) { EL_DEBUG_CSE // TODO: Decide whether S or T should be used as the transmission type // based upon which is smaller. Transmit S by default. const Int height = A.Height(); const Int width = A.Width(); const Grid& g = B.Grid(); B.Resize( height, width ); Zero( B ); const bool BPartic = B.Participating(); const int BRoot = B.Root(); const bool includeViewers = (A.Grid() != B.Grid()); const Int localHeight = A.LocalHeight(); const Int localWidth = A.LocalWidth(); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.Matrix(); // TODO: Break into smaller pieces to avoid excessive memory usage? vector<Entry<S>> remoteEntries; vector<int> distOwners; if( A.RedundantRank() == 0 ) { const bool noRedundant = B.RedundantSize() == 1; const int colStride = B.ColStride(); const int rowRank = B.RowRank(); const int colRank = B.ColRank(); vector<Int> globalRows(localHeight), localRows(localHeight); vector<int> ownerRows(localHeight); for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = A.GlobalRow(iLoc); const int ownerRow = B.RowOwner(i); globalRows[iLoc] = i; ownerRows[iLoc] = ownerRow; localRows[iLoc] = B.LocalRow(i,ownerRow); } remoteEntries.reserve( localHeight*localWidth ); distOwners.reserve( localHeight*localWidth ); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = A.GlobalCol(jLoc); const int ownerCol = B.ColOwner(j); const Int localCol = B.LocalCol(j,ownerCol); const bool isLocalCol = ( BPartic && ownerCol == rowRank ); for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const int ownerRow = ownerRows[iLoc]; const Int localRow = localRows[iLoc]; const bool isLocalRow = ( BPartic && ownerRow == colRank ); const S& alpha = ALoc(iLoc,jLoc); if( noRedundant && isLocalRow && isLocalCol ) { BLoc(localRow,localCol) = Caster<S,T>::Cast(alpha); } else { remoteEntries.push_back ( Entry<S>{localRow,localCol,alpha} ); distOwners.push_back( ownerRow + colStride*ownerCol ); } } } } // We will first push to 
redundant rank 0 of B const int redundantRootB = 0; // Compute the metadata // ==================== const Int totalSend = remoteEntries.size(); mpi::Comm comm; vector<int> sendCounts, owners(totalSend); if( includeViewers ) { comm = g.ViewingComm(); const int viewingSize = mpi::Size( g.ViewingComm() ); const int distBSize = mpi::Size( B.DistComm() ); vector<int> distBToViewing(distBSize); for( int distBRank=0; distBRank<distBSize; ++distBRank ) { const int vcOwner = g.CoordsToVC (B.ColDist(),B.RowDist(),distBRank,BRoot,redundantRootB); distBToViewing[distBRank] = g.VCToViewing(vcOwner); } sendCounts.resize(viewingSize,0); for( Int k=0; k<totalSend; ++k ) { owners[k] = distBToViewing[distOwners[k]]; ++sendCounts[owners[k]]; } } else { if( !g.InGrid() ) return; comm = g.VCComm(); const int distBSize = mpi::Size( B.DistComm() ); vector<int> distBToVC(distBSize); for( int distBRank=0; distBRank<distBSize; ++distBRank ) { distBToVC[distBRank] = g.CoordsToVC (B.ColDist(),B.RowDist(),distBRank,BRoot,redundantRootB); } const int vcSize = mpi::Size( g.VCComm() ); sendCounts.resize(vcSize,0); for( Int k=0; k<totalSend; ++k ) { owners[k] = distBToVC[distOwners[k]]; ++sendCounts[owners[k]]; } } SwapClear( distOwners ); // Pack the data // ============= vector<int> sendOffs; Scan( sendCounts, sendOffs ); vector<Entry<S>> sendBuf; FastResize( sendBuf, totalSend ); auto offs = sendOffs; for( Int k=0; k<totalSend; ++k ) sendBuf[offs[owners[k]]++] = remoteEntries[k]; SwapClear( remoteEntries ); SwapClear( owners ); // Exchange and unpack the data // ============================ auto recvBuf = mpi::AllToAll( sendBuf, sendCounts, sendOffs, comm ); if( BPartic ) { if( B.RedundantRank() == redundantRootB ) { Int recvBufSize = recvBuf.size(); for( Int k=0; k<recvBufSize; ++k ) { const auto& entry = recvBuf[k]; BLoc(entry.i,entry.j) = Caster<S,T>::Cast(entry.value); } } El::Broadcast( B, B.RedundantComm(), redundantRootB ); } }
// Copy A[U,V] into B[V,U], i.e., into the distribution with the column and
// row distributions exchanged.
//
// B is resized to A's dimensions and processes that do not participate in B
// return immediately. The visible portion handles two cases:
//   * both distribution communicators have a single member: local copy;
//   * A has a single column: Scatter within A's row communicator, SendRecv
//     within B's distribution communicator, then Gather within B's row
//     communicator.
//
// NOTE(review): the definition is truncated in this view -- the function's
// closing brace (and any branch following the `else if`) is not visible.
void TransposeDist( const DistMatrix<T,U,V>& A, DistMatrix<T,V,U>& B )
{
    DEBUG_ONLY(CSE cse("copy::TransposeDist"))
    AssertSameGrids( A, B );

    // NOTE(review): `g` is not referenced in the visible part of the function
    const Grid& g = B.Grid();
    B.Resize( A.Height(), A.Width() );
    if( !B.Participating() )
        return;

    const Int colStrideA = A.ColStride();
    const Int rowStrideA = A.RowStride();
    const Int distSize = A.DistSize();

    if( A.DistSize() == 1 && B.DistSize() == 1 )
    {
        Copy( A.LockedMatrix(), B.Matrix() );
    }
    else if( A.Width() == 1 )
    {
        const Int height = A.Height();
        const Int maxLocalHeight = MaxLength(height,distSize);
        const Int portionSize = mpi::Pad( maxLocalHeight );

        // Difference between the shifts of this process within A's and B's
        // distributions; it determines the SendRecv partners below
        const Int colDiff = Shift(A.DistRank(),A.ColAlign(),distSize) -
                            Shift(B.DistRank(),B.ColAlign(),distSize);
        const Int sendRankB = Mod( B.DistRank()+colDiff, distSize );
        const Int recvRankA = Mod( A.DistRank()-colDiff, distSize );
        // Re-express the receive partner's rank with the roles of the two
        // strides exchanged
        const Int recvRankB =
          (recvRankA/colStrideA)+rowStrideA*(recvRankA%colStrideA);

        // The first colStrideA portions are addressed through sendBuf and
        // the remaining rowStrideA portions through recvBuf
        vector<T> buffer;
        FastResize( buffer, (colStrideA+rowStrideA)*portionSize );
        T* sendBuf = &buffer[0];
        T* recvBuf = &buffer[colStrideA*portionSize];

        if( A.RowRank() == A.RowAlign() )
        {
            // Pack
            // TODO: Use kernel from copy::util
            const Int AColShift = A.ColShift();
            const T* ABuf = A.LockedBuffer();
            EL_PARALLEL_FOR
            for( Int k=0; k<rowStrideA; ++k )
            {
                // Portion k is filled with every rowStrideA-th local entry,
                // starting at `offset`
                T* data = &recvBuf[k*portionSize];

                const Int shift =
                  Shift_(A.ColRank()+colStrideA*k,A.ColAlign(),distSize);
                const Int offset = (shift-AColShift) / colStrideA;
                const Int thisLocalHeight = Length_(height,shift,distSize);

                for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc )
                    data[iLoc] = ABuf[offset+iLoc*rowStrideA];
            }
        }

        // (e.g., A[VC,STAR] <- A[MC,MR])
        mpi::Scatter
        ( recvBuf, portionSize,
          sendBuf, portionSize, A.RowAlign(), A.RowComm() );

        // (e.g., A[VR,STAR] <- A[VC,STAR])
        mpi::SendRecv
        ( sendBuf, portionSize, sendRankB,
          recvBuf, portionSize, recvRankB, B.DistComm() );

        // (e.g., A[MR,MC] <- A[VR,STAR])
        mpi::Gather
        ( recvBuf, portionSize,
          sendBuf, portionSize, B.RowAlign(), B.RowComm() );

        if( B.RowRank() == B.RowAlign() )
        {
            // Unpack
            // TODO: Use kernel from copy::util
            T* bufB = B.Buffer();
            EL_PARALLEL_FOR
            for( Int k=0; k<colStrideA; ++k )
            {
                // Portion k is written to every colStrideA-th local entry of
                // B, starting at `offset`
                const T* data = &sendBuf[k*portionSize];

                const Int shift =
                  Shift_(B.ColRank()+rowStrideA*k,B.ColAlign(),distSize);
                const Int offset = (shift-B.ColShift()) / rowStrideA;
                const Int thisLocalHeight = Length_(height,shift,distSize);

                for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc )
                    bufB[offset+iLoc*colStrideA] = data[iLoc];
            }
        }
    }
void Scatter ( const DistMatrix<T,CIRC,CIRC>& A, ElementalMatrix<T>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int m = A.Height(); const Int n = A.Width(); const Int colStride = B.ColStride(); const Int rowStride = B.RowStride(); B.Resize( m, n ); if( B.CrossSize() != 1 || B.RedundantSize() != 1 ) { // TODO: // Broadcast over the redundant communicator and use mpi::Translate // rank to determine whether a process is the root of the broadcast. GeneralPurpose( A, B ); return; } const Int pkgSize = mpi::Pad(MaxLength(m,colStride)*MaxLength(n,rowStride)); const Int recvSize = pkgSize; const Int sendSize = B.DistSize()*pkgSize; // Translate the root of A into the DistComm of B (if possible) const Int root = A.Root(); const Int target = mpi::Translate( A.CrossComm(), root, B.DistComm() ); if( target == mpi::UNDEFINED ) return; if( B.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); return; } vector<T> buffer; T* recvBuf=0; // some compilers (falsely) warn otherwise if( A.CrossRank() == root ) { FastResize( buffer, sendSize+recvSize ); T* sendBuf = &buffer[0]; recvBuf = &buffer[sendSize]; // Pack the send buffer copy::util::StridedPack ( m, n, B.ColAlign(), colStride, B.RowAlign(), rowStride, A.LockedBuffer(), A.LDim(), sendBuf, pkgSize ); // Scatter from the root mpi::Scatter ( sendBuf, pkgSize, recvBuf, pkgSize, target, B.DistComm() ); } else { FastResize( buffer, recvSize ); recvBuf = &buffer[0]; // Perform the receiving portion of the scatter from the non-root mpi::Scatter ( static_cast<T*>(0), pkgSize, recvBuf, pkgSize, target, B.DistComm() ); } // Unpack copy::util::InterleaveMatrix ( B.LocalHeight(), B.LocalWidth(), recvBuf, 1, B.LocalHeight(), B.Buffer(), 1, B.LDim() ); }