// herm_tridiag::LPan (name taken from the call-stack entry below).
// Takes the matrix A, the panel W, the vector t, and four [MC,STAR]/[MR,STAR]
// workspace matrices.
// Visible portion: reads n = A.Height() and nW = W.Width(), then -- inside
// DEBUG_ONLY -- raises LogicError unless
//   * A, W and t share a grid (AssertSameGrids),
//   * A is square and W has the same height as A,
//   * n > nW ("W must be a column panel"),
//   * W has the same column and row alignments as A,
//   * t is an nW x 1 column vector,
//   * A.DiagonalAlignedWith(t,-1) holds (t aligned with A's subdiagonal).
// NOTE(review): the definition is cut off after the debug checks in this view;
// the code that actually uses the workspaces is not visible here.
void LPan ( DistMatrix<F>& A, DistMatrix<F>& W, DistMatrix<F,MD,STAR>& t, DistMatrix<F,MC,STAR>& APan_MC_STAR, DistMatrix<F,MR,STAR>& APan_MR_STAR, DistMatrix<F,MC,STAR>& W_MC_STAR, DistMatrix<F,MR,STAR>& W_MR_STAR ) { const Int n = A.Height(); const Int nW = W.Width(); DEBUG_ONLY( CallStackEntry cse("herm_tridiag::LPan"); AssertSameGrids( A, W, t ); if( n != A.Width() ) LogicError("A must be square"); if( n != W.Height() ) LogicError("A and W must be the same height"); if( n <= nW ) LogicError("W must be a column panel"); if( W.ColAlign() != A.ColAlign() || W.RowAlign() != A.RowAlign() ) LogicError("W and A must be aligned"); if( t.Height() != nW || t.Width() != 1 ) LogicError ("t must be a column vector of the same length as W's width"); if( !A.DiagonalAlignedWith(t,-1) ) LogicError("t is not aligned with A's subdiagonal."); )
// LowerPanel: takes the matrix A, the panel W, the vector t, four
// [MC,STAR]/[MR,STAR] workspace matrices, and a SymvCtrl control structure.
// Visible portion: reads n = A.Height() and nW = W.Width(), then -- inside
// DEBUG_ONLY -- raises LogicError unless
//   * A, W and t share a grid (AssertSameGrids),
//   * A is square and W has the same height as A,
//   * n > nW ("W must be a column panel"),
//   * W has the same column and row alignments as A,
//   * t is an nW x 1 column vector,
//   * A.DiagonalAlignedWith(t,-1) holds (t aligned with A's subdiagonal).
// NOTE(review): the definition is cut off after the debug checks in this view;
// how the workspaces and ctrl are used is not visible here.
void LowerPanel ( DistMatrix<F>& A, DistMatrix<F>& W, DistMatrix<F,MD,STAR>& t, DistMatrix<F,MC,STAR>& B_MC_STAR, DistMatrix<F,MR,STAR>& B_MR_STAR, DistMatrix<F,MC,STAR>& W_MC_STAR, DistMatrix<F,MR,STAR>& W_MR_STAR, const SymvCtrl<F>& ctrl ) { DEBUG_CSE const Int n = A.Height(); const Int nW = W.Width(); DEBUG_ONLY( AssertSameGrids( A, W, t ); if( n != A.Width() ) LogicError("A must be square"); if( n != W.Height() ) LogicError("A and W must be the same height"); if( n <= nW ) LogicError("W must be a column panel"); if( W.ColAlign() != A.ColAlign() || W.RowAlign() != A.RowAlign() ) LogicError("W and A must be aligned"); if( t.Height() != nW || t.Width() != 1 ) LogicError ("t must be a column vector of the same length as W's width"); if( !A.DiagonalAlignedWith(t,-1) ) LogicError("t is not aligned with A's subdiagonal."); )
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { EL_DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { if( A.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf; FastResize( buf, (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) El::Broadcast( B, A.CrossComm(), A.Root() ); }
// Raises a LogicError (message: name followed by " not conformal with C")
// unless A has the same width and the same row alignment as C.
void EnsureConformal
( const DistMatrix<T,STAR,MR>& A, const DistMatrix<T>& C, string name )
{
    const bool conformal =
      A.Width() == C.Width() && A.RowAlign() == C.RowAlign();
    if( !conformal )
        LogicError(name," not conformal with C");
}
void ColAllToAllPromote ( const DistMatrix<T, U, V >& A, DistMatrix<T,Partial<U>(),PartialUnionRow<U,V>()>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.AlignColsAndResize ( Mod(A.ColAlign(),B.ColStride()), height, width, false, false ); if( !B.Participating() ) return; const Int colStride = A.ColStride(); const Int colStridePart = A.PartialColStride(); const Int colStrideUnion = A.PartialUnionColStride(); const Int colRankPart = A.PartialColRank(); const Int colDiff = B.ColAlign() - Mod(A.ColAlign(),colStridePart); const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,colStrideUnion); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); if( colDiff == 0 ) { if( A.PartialUnionColStride() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), colStrideUnion, A.LockedBuffer(), A.LDim(), firstBuf, portionSize ); // Simultaneously Gather in columns and Scatter in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, colRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } } else { #ifdef EL_UNALIGNED_WARNINGS if( A.Grid().Rank() == 0 ) cerr << "Unaligned PartialColAllToAllPromote" << endl; #endif const Int sendColRankPart = Mod( colRankPart+colDiff, colStridePart ); const Int recvColRankPart = Mod( colRankPart-colDiff, colStridePart ); vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), 
colStrideUnion, A.LockedBuffer(), A.LDim(), secondBuf, portionSize ); // Realign the input mpi::SendRecv ( secondBuf, colStrideUnion*portionSize, sendColRankPart, firstBuf, colStrideUnion*portionSize, recvColRankPart, A.PartialColComm() ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, recvColRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } }
void TranslateBetweenGrids ( const DistMatrix<T,MC,MR>& A, DistMatrix<T,MC,MR>& B ) { DEBUG_ONLY(CSE cse("copy::TranslateBetweenGrids [MC,MR]")) B.Resize( A.Height(), A.Width() ); // Just need to ensure that each viewing comm contains the other team's // owning comm. Congruence is too strong. // Compute the number of process rows and columns that each process // needs to send to. const Int colStride = B.ColStride(); const Int rowStride = B.RowStride(); const Int colRank = B.ColRank(); const Int rowRank = B.RowRank(); const Int colStrideA = A.ColStride(); const Int rowStrideA = A.RowStride(); const Int colGCD = GCD( colStride, colStrideA ); const Int rowGCD = GCD( rowStride, rowStrideA ); const Int colLCM = colStride*colStrideA / colGCD; const Int rowLCM = rowStride*rowStrideA / rowGCD; const Int numColSends = colStride / colGCD; const Int numRowSends = rowStride / rowGCD; const Int colAlign = B.ColAlign(); const Int rowAlign = B.RowAlign(); const Int colAlignA = A.ColAlign(); const Int rowAlignA = A.RowAlign(); const bool inBGrid = B.Participating(); const bool inAGrid = A.Participating(); if( !inBGrid && !inAGrid ) return; const Int maxSendSize = (A.Height()/(colStrideA*numColSends)+1) * (A.Width()/(rowStrideA*numRowSends)+1); // Translate the ranks from A's VC communicator to B's viewing so that // we can match send/recv communicators. Since A's VC communicator is not // necessarily defined on every process, we instead work with A's owning // group and account for row-major ordering if necessary. const int sizeA = A.Grid().Size(); vector<int> rankMap(sizeA), ranks(sizeA); if( A.Grid().Order() == COLUMN_MAJOR ) { for( int j=0; j<sizeA; ++j ) ranks[j] = j; } else { // The (i,j) = i + j*colStrideA rank in the column-major ordering is // equal to the j + i*rowStrideA rank in a row-major ordering. 
// Since we desire rankMap[i+j*colStrideA] to correspond to process // (i,j) in A's grid's rank in this viewing group, ranks[i+j*colStrideA] // should correspond to process (i,j) in A's owning group. Since the // owning group is ordered row-major in this case, its rank is // j+i*rowStrideA. Note that setting // ranks[j+i*rowStrideA] = i+j*colStrideA is *NOT* valid. for( int i=0; i<colStrideA; ++i ) for( int j=0; j<rowStrideA; ++j ) ranks[i+j*colStrideA] = j+i*rowStrideA; } mpi::Translate ( A.Grid().OwningGroup(), sizeA, &ranks[0], B.Grid().ViewingComm(), &rankMap[0] ); // Have each member of A's grid individually send to all numRow x numCol // processes in order, while the members of this grid receive from all // necessary processes at each step. Int requiredMemory = 0; if( inAGrid ) requiredMemory += maxSendSize; if( inBGrid ) requiredMemory += maxSendSize; vector<T> auxBuf( requiredMemory ); Int offset = 0; T* sendBuf = &auxBuf[offset]; if( inAGrid ) offset += maxSendSize; T* recvBuf = &auxBuf[offset]; Int recvRow = 0; // avoid compiler warnings... if( inAGrid ) recvRow = Mod(Mod(A.ColRank()-colAlignA,colStrideA)+colAlign,colStride); for( Int colSend=0; colSend<numColSends; ++colSend ) { Int recvCol = 0; // avoid compiler warnings... 
if( inAGrid ) recvCol=Mod(Mod(A.RowRank()-rowAlignA,rowStrideA)+rowAlign, rowStride); for( Int rowSend=0; rowSend<numRowSends; ++rowSend ) { mpi::Request sendRequest; // Fire off this round of non-blocking sends if( inAGrid ) { // Pack the data Int sendHeight = Length(A.LocalHeight(),colSend,numColSends); Int sendWidth = Length(A.LocalWidth(),rowSend,numRowSends); copy::util::InterleaveMatrix ( sendHeight, sendWidth, A.LockedBuffer(colSend,rowSend), numColSends, numRowSends*A.LDim(), sendBuf, 1, sendHeight ); // Send data const Int recvVCRank = recvRow + recvCol*colStride; const Int recvViewingRank = B.Grid().VCToViewing( recvVCRank ); mpi::ISend ( sendBuf, sendHeight*sendWidth, recvViewingRank, B.Grid().ViewingComm(), sendRequest ); } // Perform this round of recv's if( inBGrid ) { const Int sendColOffset = colAlignA; const Int recvColOffset = (colSend*colStrideA+colAlign) % colStride; const Int sendRowOffset = rowAlignA; const Int recvRowOffset = (rowSend*rowStrideA+rowAlign) % rowStride; const Int firstSendRow = Mod( Mod(colRank-recvColOffset,colStride)+sendColOffset, colStrideA ); const Int firstSendCol = Mod( Mod(rowRank-recvRowOffset,rowStride)+sendRowOffset, rowStrideA ); const Int colShift = Mod( colRank-recvColOffset, colStride ); const Int rowShift = Mod( rowRank-recvRowOffset, rowStride ); const Int numColRecvs = Length( colStrideA, colShift, colStride ); const Int numRowRecvs = Length( rowStrideA, rowShift, rowStride ); // Recv data // For now, simply receive sequentially. 
Until we switch to // nonblocking recv's, we won't be using much of the // recvBuf Int sendRow = firstSendRow; for( Int colRecv=0; colRecv<numColRecvs; ++colRecv ) { const Int sendColShift = Shift( sendRow, colAlignA, colStrideA ) + colSend*colStrideA; const Int sendHeight = Length( A.Height(), sendColShift, colLCM ); const Int localColOffset = (sendColShift-B.ColShift()) / colStride; Int sendCol = firstSendCol; for( Int rowRecv=0; rowRecv<numRowRecvs; ++rowRecv ) { const Int sendRowShift = Shift( sendCol, rowAlignA, rowStrideA ) + rowSend*rowStrideA; const Int sendWidth = Length( A.Width(), sendRowShift, rowLCM ); const Int localRowOffset = (sendRowShift-B.RowShift()) / rowStride; const Int sendVCRank = sendRow+sendCol*colStrideA; mpi::Recv ( recvBuf, sendHeight*sendWidth, rankMap[sendVCRank], B.Grid().ViewingComm() ); // Unpack the data copy::util::InterleaveMatrix ( sendHeight, sendWidth, recvBuf, 1, sendHeight, B.Buffer(localColOffset,localRowOffset), colLCM/colStride, (rowLCM/rowStride)*B.LDim() ); // Set up the next send col sendCol = (sendCol + rowStride) % rowStrideA; } // Set up the next send row sendRow = (sendRow + colStride) % colStrideA; } } // Ensure that this round of non-blocking sends completes if( inAGrid ) { mpi::Wait( sendRequest ); recvCol = (recvCol + rowStrideA) % rowStride; } } if( inAGrid ) recvRow = (recvRow + colStrideA) % colStride; } }
void TestCorrectness ( bool print, UpperOrLower uplo, const DistMatrix<F>& A, const DistMatrix<Base<F>,VR,STAR>& w, const DistMatrix<F>& Z, const DistMatrix<F>& AOrig ) { typedef Base<F> Real; const Grid& g = A.Grid(); const Int n = Z.Height(); const Int k = Z.Width(); if( g.Rank() == 0 ) { cout << " Gathering computed eigenvalues..."; cout.flush(); } DistMatrix<Real,MR,STAR> w_MR_STAR(true,Z.RowAlign(),g); w_MR_STAR = w; if( g.Rank() == 0 ) cout << "DONE" << endl; if( g.Rank() == 0 ) cout << " Testing orthogonality of eigenvectors..." << endl; DistMatrix<F> X(g); Identity( X, k, k ); Herk( uplo, ADJOINT, F(-1), Z, F(1), X ); Real oneNormOfError = OneNorm( X ); Real infNormOfError = InfinityNorm( X ); Real frobNormOfError = FrobeniusNorm( X ); if( g.Rank() == 0 ) { cout << " ||Z^H Z - I||_1 = " << oneNormOfError << "\n" << " ||Z^H Z - I||_oo = " << infNormOfError << "\n" << " ||Z^H Z - I||_F = " << frobNormOfError << "\n\n" << " Testing for deviation of AZ from ZW..." << endl; } // X := AZ X.AlignWith( Z ); Zeros( X, n, k ); Hemm( LEFT, uplo, F(1), AOrig, Z, F(0), X ); // Find the residual ||X-ZW||_oo = ||AZ-ZW||_oo for( Int jLoc=0; jLoc<X.LocalWidth(); ++jLoc ) { const Real omega = w_MR_STAR.GetLocal(jLoc,0); for( Int iLoc=0; iLoc<X.LocalHeight(); ++iLoc ) { const F chi = X.GetLocal(iLoc,jLoc); const F zeta = Z.GetLocal(iLoc,jLoc); X.SetLocal(iLoc,jLoc,chi-omega*zeta); } } // Find the infinity norms of A, Z, and AZ-ZW Real infNormOfA = HermitianInfinityNorm( uplo, AOrig ); Real frobNormOfA = HermitianFrobeniusNorm( uplo, AOrig ); Real oneNormOfZ = OneNorm( Z ); Real infNormOfZ = InfinityNorm( Z ); Real frobNormOfZ = FrobeniusNorm( Z ); oneNormOfError = OneNorm( X ); infNormOfError = InfinityNorm( X ); frobNormOfError = FrobeniusNorm( X ); if( g.Rank() == 0 ) { cout << " ||A||_1 = ||A||_oo = " << infNormOfA << "\n" << " ||A||_F = " << frobNormOfA << "\n" << " ||Z||_1 = " << oneNormOfZ << "\n" << " ||Z||_oo = " << infNormOfZ << "\n" << " ||Z||_F = " << frobNormOfZ 
<< "\n" << " ||A Z - Z W||_1 = " << oneNormOfError << "\n" << " ||A Z - Z W||_oo = " << infNormOfError << "\n" << " ||A Z - Z W||_F = " << frobNormOfError << endl; } }
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { DEBUG_ONLY(CSE cse("copy::AllGather")) AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf( (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) { // Pack from the root const Int BLocalHeight = B.LocalHeight(); const Int BLocalWidth = B.LocalWidth(); vector<T> buf(BLocalHeight*BLocalWidth); if( A.CrossRank() == A.Root() ) util::InterleaveMatrix ( BLocalHeight, BLocalWidth, B.LockedBuffer(), 1, B.LDim(), buf.data(), 1, BLocalHeight ); // Broadcast from the root mpi::Broadcast ( buf.data(), BLocalHeight*BLocalWidth, A.Root(), A.CrossComm() ); // Unpack if not the root if( A.CrossRank() != A.Root() ) util::InterleaveMatrix ( BLocalHeight, BLocalWidth, buf.data(), 1, BLocalHeight, B.Buffer(), 1, B.LDim() ); } }
// copy::TransposeDist: copies A[U,V] into B[V,U]; both must be on the same
// grid (AssertSameGrids).  B is resized to A's dimensions and processes not
// participating in B return immediately.
// Visible cases:
//   * A.DistSize()==1 and B.DistSize()==1: plain local Copy.
//   * A.Width()==1: the processes with A.RowRank()==A.RowAlign() pack, the
//     data is Scattered over A.RowComm(), exchanged with SendRecv over
//     B.DistComm(), Gathered over B.RowComm(), and unpacked on the processes
//     with B.RowRank()==B.RowAlign().
// NOTE(review): the definition is incomplete in this view -- the function
// body is still open after the A.Width()==1 branch, so any remaining branch
// is not visible here.
void TransposeDist( const DistMatrix<T,U,V>& A, DistMatrix<T,V,U>& B ) { DEBUG_ONLY(CSE cse("copy::TransposeDist")) AssertSameGrids( A, B ); const Grid& g = B.Grid(); B.Resize( A.Height(), A.Width() ); if( !B.Participating() ) return; const Int colStrideA = A.ColStride(); const Int rowStrideA = A.RowStride(); const Int distSize = A.DistSize(); if( A.DistSize() == 1 && B.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else if( A.Width() == 1 ) { const Int height = A.Height(); const Int maxLocalHeight = MaxLength(height,distSize); const Int portionSize = mpi::Pad( maxLocalHeight ); const Int colDiff = Shift(A.DistRank(),A.ColAlign(),distSize) - Shift(B.DistRank(),B.ColAlign(),distSize); const Int sendRankB = Mod( B.DistRank()+colDiff, distSize ); const Int recvRankA = Mod( A.DistRank()-colDiff, distSize ); const Int recvRankB = (recvRankA/colStrideA)+rowStrideA*(recvRankA%colStrideA); vector<T> buffer; FastResize( buffer, (colStrideA+rowStrideA)*portionSize ); T* sendBuf = &buffer[0]; T* recvBuf = &buffer[colStrideA*portionSize]; if( A.RowRank() == A.RowAlign() ) { // Pack // TODO: Use kernel from copy::util const Int AColShift = A.ColShift(); const T* ABuf = A.LockedBuffer(); EL_PARALLEL_FOR for( Int k=0; k<rowStrideA; ++k ) { T* data = &recvBuf[k*portionSize]; const Int shift = Shift_(A.ColRank()+colStrideA*k,A.ColAlign(),distSize); const Int offset = (shift-AColShift) / colStrideA; const Int thisLocalHeight = Length_(height,shift,distSize); for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc ) data[iLoc] = ABuf[offset+iLoc*rowStrideA]; } } // (e.g., A[VC,STAR] <- A[MC,MR]) mpi::Scatter ( recvBuf, portionSize, sendBuf, portionSize, A.RowAlign(), A.RowComm() ); // (e.g., A[VR,STAR] <- A[VC,STAR]) mpi::SendRecv ( sendBuf, portionSize, sendRankB, recvBuf, portionSize, recvRankB, B.DistComm() ); // (e.g., A[MR,MC] <- A[VR,STAR]) mpi::Gather ( recvBuf, portionSize, sendBuf, portionSize, B.RowAlign(), B.RowComm() ); if( B.RowRank() == B.RowAlign() ) { // Unpack // 
TODO: Use kernel from copy::util T* bufB = B.Buffer(); EL_PARALLEL_FOR for( Int k=0; k<colStrideA; ++k ) { const T* data = &sendBuf[k*portionSize]; const Int shift = Shift_(B.ColRank()+rowStrideA*k,B.ColAlign(),distSize); const Int offset = (shift-B.ColShift()) / rowStrideA; const Int thisLocalHeight = Length_(height,shift,distSize); for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc ) bufB[offset+iLoc*colStrideA] = data[iLoc]; } } }
void ColAllToAllDemote ( const DistMatrix<T,Partial<U>(),PartialUnionRow<U,V>()>& A, DistMatrix<T, U, V >& B ) { DEBUG_ONLY(CallStackEntry cse("copy::ColAllToAllDemote")) AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.AlignColsAndResize( A.ColAlign(), height, width, false, false ); if( !B.Participating() ) return; const Int colAlign = B.ColAlign(); const Int rowAlignA = A.RowAlign(); const Int colStride = B.ColStride(); const Int colStridePart = B.PartialColStride(); const Int colStrideUnion = B.PartialUnionColStride(); const Int colRankPart = B.PartialColRank(); const Int colDiff = (colAlign%colStridePart) - A.ColAlign(); const Int colShiftA = A.ColShift(); const Int localHeightB = B.LocalHeight(); const Int localWidthA = A.LocalWidth(); const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,colStrideUnion); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); std::vector<T> buffer( 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; if( colDiff == 0 ) { // Pack util::PartialColStridedPack ( height, localWidthA, colAlign, colStride, colStrideUnion, colStridePart, colRankPart, colShiftA, A.LockedBuffer(), A.LDim(), firstBuf, portionSize ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, B.PartialUnionColComm() ); // Unpack util::RowStridedUnpack ( localHeightB, width, rowAlignA, colStrideUnion, secondBuf, portionSize, B.Buffer(), B.LDim() ); } else { #ifdef EL_UNALIGNED_WARNINGS if( B.Grid().Rank() == 0 ) std::cerr << "Unaligned ColAllToAllDemote" << std::endl; #endif const Int sendColRankPart = Mod( colRankPart+colDiff, colStridePart ); const Int recvColRankPart = Mod( colRankPart-colDiff, colStridePart ); // Pack util::PartialColStridedPack ( height, localWidthA, colAlign, colStride, colStrideUnion, colStridePart, sendColRankPart, colShiftA, 
A.LockedBuffer(), A.LDim(), secondBuf, portionSize ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( secondBuf, portionSize, firstBuf, portionSize, B.PartialUnionColComm() ); // Realign the result mpi::SendRecv ( firstBuf, colStrideUnion*portionSize, sendColRankPart, secondBuf, colStrideUnion*portionSize, recvColRankPart, B.PartialColComm() ); // Unpack util::RowStridedUnpack ( localHeightB, width, rowAlignA, colStrideUnion, secondBuf, portionSize, B.Buffer(), B.LDim() ); } }