void RowScatter ( T alpha, const ElementalMatrix<T>& A, ElementalMatrix<T>& B ) { DEBUG_ONLY(CSE cse("axpy_contract::RowScatter")) AssertSameGrids( A, B ); if( A.Height() != B.Height() || A.Width() != B.Width() ) LogicError("Matrix sizes did not match"); if( !B.Participating() ) return; const Int width = B.Width(); const Int colDiff = B.ColAlign()-A.ColAlign(); if( colDiff == 0 ) { if( width == 1 ) { const Int localHeight = B.LocalHeight(); const Int portionSize = mpi::Pad( localHeight ); //vector<T> buffer( portionSize ); vector<T> buffer; buffer.reserve( portionSize ); // Reduce to rowAlign const Int rowAlign = B.RowAlign(); mpi::Reduce ( A.LockedBuffer(), buffer.data(), portionSize, rowAlign, B.RowComm() ); if( B.RowRank() == rowAlign ) { axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, 1, buffer.data(), 1, localHeight, B.Buffer(), 1, B.LDim() ); } } else { const Int rowStride = B.RowStride(); const Int rowAlign = B.RowAlign(); const Int localHeight = B.LocalHeight(); const Int localWidth = B.LocalWidth(); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( localHeight*maxLocalWidth ); const Int sendSize = rowStride*portionSize; // Pack //vector<T> buffer( sendSize ); vector<T> buffer; buffer.reserve( sendSize ); copy::util::RowStridedPack ( localHeight, width, rowAlign, rowStride, A.LockedBuffer(), A.LDim(), buffer.data(), portionSize ); // Communicate mpi::ReduceScatter( buffer.data(), portionSize, B.RowComm() ); // Update with our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, localWidth, buffer.data(), 1, localHeight, B.Buffer(), 1, B.LDim() ); } } else { #ifdef EL_UNALIGNED_WARNINGS if( B.Grid().Rank() == 0 ) cerr << "Unaligned RowScatter" << endl; #endif const Int colRank = B.ColRank(); const Int colStride = B.ColStride(); const Int sendRow = Mod( colRank+colDiff, colStride ); const Int recvRow = Mod( colRank-colDiff, colStride ); const Int localHeight = B.LocalHeight(); const Int localHeightA = A.LocalHeight(); if( width == 1 ) { //vector<T> buffer( localHeight+localHeightA ); vector<T> buffer; buffer.reserve( localHeight+localHeightA ); T* sendBuf = &buffer[0]; T* recvBuf = &buffer[localHeightA]; // Reduce to rowAlign const Int rowAlign = B.RowAlign(); mpi::Reduce ( A.LockedBuffer(), sendBuf, localHeightA, rowAlign, B.RowComm() ); if( B.RowRank() == rowAlign ) { // Perform the realignment mpi::SendRecv ( sendBuf, localHeightA, sendRow, recvBuf, localHeight, recvRow, B.ColComm() ); axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, 1, recvBuf, 1, localHeight, B.Buffer(), 1, B.LDim() ); } } else { const Int rowStride = B.RowStride(); const Int rowAlign = B.RowAlign(); const Int localWidth = B.LocalWidth(); const Int maxLocalWidth = MaxLength(width,rowStride); const Int recvSize_RS = mpi::Pad( localHeightA*maxLocalWidth ); const Int sendSize_RS = rowStride * recvSize_RS; const Int recvSize_SR = localHeight * localWidth; //vector<T> buffer( recvSize_RS + Max(sendSize_RS,recvSize_SR) ); vector<T> buffer; buffer.reserve( recvSize_RS + Max(sendSize_RS,recvSize_SR) ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[recvSize_RS]; // Pack copy::util::RowStridedPack ( localHeightA, width, rowAlign, rowStride, A.LockedBuffer(), A.LDim(), secondBuf, recvSize_RS ); // Reduce-scatter over each process row mpi::ReduceScatter( secondBuf, firstBuf, recvSize_RS, B.RowComm() ); // Trade reduced data with the appropriate process row mpi::SendRecv ( firstBuf, localHeightA*localWidth, sendRow, secondBuf, localHeight*localWidth, recvRow, B.ColComm() ); // Update with our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, localWidth, secondBuf, 1, localHeight, B.Buffer(), 1, B.LDim() ); } } }
void ColScatter ( T alpha, const ElementalMatrix<T>& A, ElementalMatrix<T>& B ) { DEBUG_ONLY(CSE cse("axpy_contract::ColScatter")) AssertSameGrids( A, B ); if( A.Height() != B.Height() || A.Width() != B.Width() ) LogicError("A and B must be the same size"); #ifdef EL_VECTOR_WARNINGS if( A.Width() == 1 && B.Grid().Rank() == 0 ) { cerr << "The vector version of ColScatter does not" " yet have a vector version implemented, but it would only " "require a modification of the vector version of RowScatter" << endl; } #endif #ifdef EL_CACHE_WARNINGS if( A.Width() != 1 && B.Grid().Rank() == 0 ) { cerr << "axpy_contract::ColScatter potentially causes a large " "amount of cache-thrashing. If possible, avoid it by forming the " "(conjugate-)transpose of the [* ,V] matrix instead." << endl; } #endif if( !B.Participating() ) return; const Int height = B.Height(); const Int localHeight = B.LocalHeight(); const Int localWidth = B.LocalWidth(); const Int colAlign = B.ColAlign(); const Int colStride = B.ColStride(); const Int rowDiff = B.RowAlign()-A.RowAlign(); // TODO: Allow for modular equivalence if possible if( rowDiff == 0 ) { const Int maxLocalHeight = MaxLength(height,colStride); const Int recvSize = mpi::Pad( maxLocalHeight*localWidth ); const Int sendSize = colStride*recvSize; //vector<T> buffer( sendSize ); vector<T> buffer; buffer.reserve( sendSize ); // Pack copy::util::ColStridedPack ( height, localWidth, colAlign, colStride, A.LockedBuffer(), A.LDim(), buffer.data(), recvSize ); // Communicate mpi::ReduceScatter( buffer.data(), recvSize, B.ColComm() ); // Update with our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, localWidth, buffer.data(), 1, localHeight, B.Buffer(), 1, B.LDim() ); } else { #ifdef EL_UNALIGNED_WARNINGS if( B.Grid().Rank() == 0 ) cerr << "Unaligned ColScatter" << endl; #endif const Int localWidthA = A.LocalWidth(); const Int maxLocalHeight = MaxLength(height,colStride); const Int recvSize_RS = mpi::Pad( maxLocalHeight*localWidthA ); const Int sendSize_RS = colStride*recvSize_RS; const Int recvSize_SR = localHeight*localWidth; //vector<T> buffer( recvSize_RS + Max(sendSize_RS,recvSize_SR) ); vector<T> buffer; buffer.reserve( recvSize_RS + Max(sendSize_RS,recvSize_SR) ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[recvSize_RS]; // Pack copy::util::ColStridedPack ( height, localWidth, colAlign, colStride, A.LockedBuffer(), A.LDim(), secondBuf, recvSize_RS ); // Reduce-scatter over each col mpi::ReduceScatter( secondBuf, firstBuf, recvSize_RS, B.ColComm() ); // Trade reduced data with the appropriate col const Int sendCol = Mod( B.RowRank()+rowDiff, B.RowStride() ); const Int recvCol = Mod( B.RowRank()-rowDiff, B.RowStride() ); mpi::SendRecv ( firstBuf, localHeight*localWidthA, sendCol, secondBuf, localHeight*localWidth, recvCol, B.RowComm() ); // Update with our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, localWidth, secondBuf, 1, localHeight, B.Buffer(), 1, B.LDim() ); } }