// Broadcast a matrix from the root grid to the others void DepthBroadcast ( const mpi::Comm& depthComm, const DistMatrix<double,MC,MR>& A, DistMatrix<double,MC,MR>& B ) { const int rank = mpi::CommRank(mpi::COMM_WORLD); const Grid& meshGrid = A.Grid(); const int meshSize = meshGrid.Size(); const int depthRank = rank / meshSize; const int localSize = A.LocalHeight()*A.LocalWidth(); if( A.LocalHeight() != A.LocalLDim() ) throw std::logic_error("Leading dimension did not match local height"); B.Empty(); B.AlignWith( A ); B.ResizeTo( A.Height(), A.Width() ); // Have the root pack the broadcast data if( depthRank == 0 ) MemCopy( B.LocalBuffer(), A.LockedLocalBuffer(), localSize ); // Broadcast from the root mpi::Broadcast( B.LocalBuffer(), localSize, 0, depthComm ); }
/*
 * Distributes A in such a way that
 *   Layer 0   <- A(:, 0:(n/h - 1))
 *   Layer 1   <- A(:, (n/h):(2n/h - 1))
 *     .
 *     .
 *     .
 *   Layer h-1 <- A(:, ((h-1)n/h):n)
 *
 * The split is carried out on each process's *local* columns: with
 * h = CommSize(depthComm), every layer receives A.LocalWidth()/h whole local
 * columns from rank 0 of depthComm and stores them in B at the same local
 * column positions they occupy in A. B is emptied, aligned with A and zeroed
 * at A's full size first, so every other entry of B is zero.
 *
 * If the local width is not a multiple of h, the trailing
 * A.LocalWidth() % h local columns are not scattered and stay zero in B.
 *
 * Throws std::logic_error when A's local leading dimension differs from its
 * local height (the send buffer is treated as contiguous columns).
 */
void DistributeCols
( const mpi::Comm& depthComm,
  const DistMatrix<double,MC,MR>& A,
        DistMatrix<double,MC,MR>& B )
{
    const int depthSize = mpi::CommSize( depthComm );
    const int depthRank = mpi::CommRank( depthComm );

    // Count whole columns per layer. Dividing the total entry count by
    // depthSize instead would hand each layer a run that starts and ends in
    // the middle of a column whenever LocalWidth() is not divisible by
    // depthSize, which no longer lines up with localColOffset below.
    const int localColsPerLayer = A.LocalWidth() / depthSize;
    const int recvCount = A.LocalHeight()*localColsPerLayer;

    // For now, we will make B as large as A...
    // TODO: NOT DO THIS
    if( A.LocalHeight() != A.LocalLDim() )
        throw std::logic_error("Local height did not match local ldim");
    B.Empty();
    B.AlignWith( A );
    Zeros( A.Height(), A.Width(), B );

    // Scatter
    // NOTE(review): the receive side writes recvCount contiguous entries into
    // B, which assumes B's local leading dimension equals its local height
    // after Zeros() -- confirm.
    const int localColOffset = localColsPerLayer*depthRank;
    mpi::Scatter
    ( A.LockedLocalBuffer(), recvCount,
      B.LocalBuffer(0,localColOffset), recvCount, 0, depthComm );
}
// Fill an [MC,* ] matrix with samples center + radius*SampleUnitBall<T>().
//
// Processes outside the grid do nothing. Within each process row, the
// process in grid column 0 draws the localHeight x n block (column by
// column), broadcasts it over grid.RowComm(), and every process then copies
// it into its local buffer, honouring the local leading dimension.
static void Func
( DistMatrix<T,MC,STAR>& A, T center, typename Base<T>::type radius )
{
    const Grid& grid = A.Grid();
    if( !grid.InGrid() )
        return;

    const int n = A.Width();
    const int localHeight = A.LocalHeight();
    const int bufSize = localHeight*n;

    // Nothing is stored locally: leave before taking &buffer[0] of an empty
    // vector, which is undefined. The broadcast below already requires all
    // members of the communicator to pass the same count, so they all take
    // this exit together.
    if( bufSize == 0 )
        return;

    std::vector<T> buffer( bufSize );

    // Create random matrix on process column 0, then broadcast
    if( grid.Col() == 0 )
    {
        for( int j=0; j<n; ++j )
            for( int iLocal=0; iLocal<localHeight; ++iLocal )
                buffer[iLocal+j*localHeight] =
                    center + radius*SampleUnitBall<T>();
    }
    mpi::Broadcast( &buffer[0], bufSize, 0, grid.RowComm() );

    // Unpack (the packed buffer has stride localHeight, A has stride ldim)
    T* localBuffer = A.LocalBuffer();
    const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
    #pragma omp parallel for
#endif
    for( int j=0; j<n; ++j )
    {
        const T* bufferCol = &buffer[j*localHeight];
        T* col = &localBuffer[j*ldim];
        MemCopy( col, bufferCol, localHeight );
    }
}
// Fill a [* ,* ] matrix with samples center + radius*SampleUnitBall<T>().
//
// Processes outside the grid do nothing. The process with grid rank 0 draws
// the full m x n matrix (column by column), broadcasts it over grid.Comm(),
// and every process copies it into its local buffer, honouring the local
// leading dimension.
static void Func
( DistMatrix<T,STAR,STAR>& A, T center, typename Base<T>::type radius )
{
    const Grid& grid = A.Grid();
    const int m = A.Height();
    const int n = A.Width();
    const int bufSize = m*n;
    if( !grid.InGrid() )
        return;

    // Empty matrix: leave before taking &buffer[0] of an empty vector, which
    // is undefined. bufSize is built from the global dimensions only, so
    // every process in the grid takes this exit together.
    if( bufSize == 0 )
        return;

    std::vector<T> buffer( bufSize );
    if( grid.Rank() == 0 )
    {
        for( int j=0; j<n; ++j )
            for( int i=0; i<m; ++i )
                buffer[i+j*m] = center+radius*SampleUnitBall<T>();
    }
    mpi::Broadcast( &buffer[0], bufSize, 0, grid.Comm() );

    // Unpack (the packed buffer has stride m, A has stride ldim)
    T* localBuffer = A.LocalBuffer();
    const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
    #pragma omp parallel for
#endif
    for( int j=0; j<n; ++j )
    {
        const T* bufferCol = &buffer[j*m];
        T* col = &localBuffer[j*ldim];
        MemCopy( col, bufferCol, m );
    }
}
// Fill a [* ,MR] matrix with samples center + radius*SampleUnitBall<T>().
//
// Within each process column, the process in grid row 0 draws the
// m x localWidth block (column by column), broadcasts it over
// grid.ColComm(), and every process copies it into its local buffer,
// honouring the local leading dimension.
static void Func
( DistMatrix<T,STAR,MR>& A, T center, typename Base<T>::type radius )
{
    const Grid& grid = A.Grid();

    // Skip processes that are not part of the grid, as the [MC,* ] and
    // [* ,* ] overloads of Func do.
    // NOTE(review): confirm Grid::InGrid() is the intended participation
    // test for this distribution as well.
    if( !grid.InGrid() )
        return;

    const int m = A.Height();
    const int localWidth = A.LocalWidth();
    const int bufSize = m*localWidth;

    // Nothing is stored locally: leave before taking &buffer[0] of an empty
    // vector, which is undefined. The broadcast below already requires all
    // members of the communicator to pass the same count, so they all take
    // this exit together.
    if( bufSize == 0 )
        return;

    std::vector<T> buffer( bufSize );

    // Create random matrix on process row 0, then broadcast
    if( grid.Row() == 0 )
    {
        for( int j=0; j<localWidth; ++j )
            for( int i=0; i<m; ++i )
                buffer[i+j*m] = center+radius*SampleUnitBall<T>();
    }
    mpi::Broadcast( &buffer[0], bufSize, 0, grid.ColComm() );

    // Unpack (the packed buffer has stride m, A has stride ldim)
    T* localBuffer = A.LocalBuffer();
    const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
    #pragma omp parallel for
#endif
    for( int jLocal=0; jLocal<localWidth; ++jLocal )
    {
        const T* bufferCol = &buffer[jLocal*m];
        T* col = &localBuffer[jLocal*ldim];
        MemCopy( col, bufferCol, m );
    }
}
// Fill an [MR,* ] matrix with samples center + radius*SampleUnitBall<T>().
//
// Within each process column, the process in grid row 0 draws the
// localHeight x n block (column by column), broadcasts it over
// grid.ColComm(), and every process copies it entry by entry into its local
// buffer, honouring the local leading dimension.
static void Func
( DistMatrix<T,MR,STAR>& A, T center, typename Base<T>::type radius )
{
    const Grid& grid = A.Grid();

    // Skip processes that are not part of the grid, as the [MC,* ] and
    // [* ,* ] overloads of Func do.
    // NOTE(review): confirm Grid::InGrid() is the intended participation
    // test for this distribution as well.
    if( !grid.InGrid() )
        return;

    const int n = A.Width();
    const int localHeight = A.LocalHeight();
    const int bufSize = localHeight*n;

    // Nothing is stored locally: leave before taking &buffer[0] of an empty
    // vector, which is undefined. The broadcast below already requires all
    // members of the communicator to pass the same count, so they all take
    // this exit together.
    if( bufSize == 0 )
        return;

    std::vector<T> buffer( bufSize );

    // Create random matrix on process row 0, then broadcast
    if( grid.Row() == 0 )
    {
        for( int j=0; j<n; ++j )
            for( int i=0; i<localHeight; ++i )
                buffer[i+j*localHeight] = center+radius*SampleUnitBall<T>();
    }
    mpi::Broadcast( &buffer[0], bufSize, 0, grid.ColComm() );

    // Unpack (the packed buffer has stride localHeight, A has stride ldim)
    T* localBuffer = A.LocalBuffer();
    const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
    #pragma omp parallel for COLLAPSE(2)
#endif
    for( int j=0; j<n; ++j )
        for( int iLocal=0; iLocal<localHeight; ++iLocal )
            localBuffer[iLocal+j*ldim] = buffer[iLocal+j*localHeight];
}
inline void MakeTriangular( UpperOrLower uplo, DistMatrix<T,U,V>& A ) { #ifndef RELEASE PushCallStack("MakeTriangular"); #endif const int height = A.Height(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int colStride = A.ColStride(); const int rowStride = A.RowStride(); T* localBuffer = A.LocalBuffer(); const int ldim = A.LocalLDim(); if( uplo == LOWER ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int lastZeroRow = j-1; if( lastZeroRow >= 0 ) { const int boundary = std::min( lastZeroRow+1, height ); const int numZeroRows = RawLocalLength( boundary, colShift, colStride ); MemZero( &localBuffer[jLocal*ldim], numZeroRows ); } } } else { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int firstZeroRow = j+1; const int numNonzeroRows = RawLocalLength(firstZeroRow,colShift,colStride); if( numNonzeroRows < localHeight ) { T* col = &localBuffer[numNonzeroRows+jLocal*ldim]; MemZero( col, localHeight-numNonzeroRows ); } } } #ifndef RELEASE PopCallStack(); #endif }
inline void HermitianSVD ( UpperOrLower uplo, DistMatrix<F>& A, DistMatrix<typename Base<F>::type,VR,STAR>& s, DistMatrix<F>& U, DistMatrix<F>& V ) { #ifndef RELEASE PushCallStack("HermitianSVD"); #endif typedef typename Base<F>::type R; // Grab an eigenvalue decomposition of A HermitianEig( uplo, A, s, V ); // Redistribute the singular values into an [MR,* ] distribution const Grid& grid = A.Grid(); DistMatrix<R,MR,STAR> s_MR_STAR( grid ); s_MR_STAR.AlignWith( V ); s_MR_STAR = s; // Set the singular values to the absolute value of the eigenvalues const int numLocalVals = s.LocalHeight(); for( int iLocal=0; iLocal<numLocalVals; ++iLocal ) { const R sigma = s.GetLocal(iLocal,0); s.SetLocal(iLocal,0,Abs(sigma)); } // Copy V into U (flipping the sign as necessary) U.AlignWith( V ); U.ResizeTo( V.Height(), V.Width() ); const int localHeight = V.LocalHeight(); const int localWidth = V.LocalWidth(); for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const R sigma = s_MR_STAR.GetLocal( jLocal, 0 ); F* UCol = U.LocalBuffer( 0, jLocal ); const F* VCol = V.LockedLocalBuffer( 0, jLocal ); if( sigma >= 0 ) for( int iLocal=0; iLocal<localHeight; ++iLocal ) UCol[iLocal] = VCol[iLocal]; else for( int iLocal=0; iLocal<localHeight; ++iLocal ) UCol[iLocal] = -VCol[iLocal]; } #ifndef RELEASE PopCallStack(); #endif }
// Have the top layer initialize the distributed matrix, B void InitB( DistMatrix<double,MC,MR>& B ) { const int rank = mpi::CommRank(mpi::COMM_WORLD); const Grid& g = B.Grid(); const int meshSize = g.Size(); const int depthRank = rank / meshSize; if( depthRank == 0 ) { if( B.LocalHeight() != B.LocalLDim() ) throw std::logic_error("Local ldim of B was too large"); double* localBuffer = B.LocalBuffer(); const int localSize = B.LocalHeight()*B.LocalWidth(); for( int iLocal=0; iLocal<localSize; ++iLocal ) localBuffer[iLocal] = iLocal*meshSize + rank; B.Print("B"); } }
inline void AddInLocalData ( const DistMatrix<F,VC,STAR>& X1, DistMatrix<F,STAR,STAR>& Z ) { #ifndef RELEASE PushCallStack("internal::AddInLocalData"); #endif const int width = X1.Width(); const int localHeight = X1.LocalHeight(); const int stride = X1.Grid().Size(); const int offset = X1.ColShift(); for( int j=0; j<width; ++j ) { F* ZColBuffer = Z.LocalBuffer(0,j); const F* X1ColBuffer = X1.LockedLocalBuffer(0,j); for( int iLocal=0; iLocal<localHeight; ++iLocal ) ZColBuffer[offset+stride*iLocal] += X1ColBuffer[iLocal]; } #ifndef RELEASE PopCallStack(); #endif }
// Reduce across depth to get end result C void SumContributions ( mpi::Comm& depthComm, const DistMatrix<double,MC,MR>& APartial, DistMatrix<double,MC,MR>& A ) { const int rank = mpi::CommRank( mpi::COMM_WORLD ); const Grid& meshGrid = APartial.Grid(); A.Empty(); A.AlignWith( APartial ); A.ResizeTo( APartial.Height(), APartial.Width() ); if( APartial.LocalHeight() != APartial.LocalLDim() ) throw std::logic_error ("APartial did not have matching local height/ldim"); if( A.LocalHeight() != A.LocalLDim() ) throw std::logic_error("A did not have matching local height/ldim"); const int dataSize = APartial.LocalHeight()*APartial.LocalWidth(); mpi::AllReduce ( APartial.LockedLocalBuffer(), A.LocalBuffer(), dataSize, mpi::SUM, depthComm ); }
inline void Her ( UpperOrLower uplo, T alpha, const DistMatrix<T>& x, DistMatrix<T>& A ) { #ifndef RELEASE PushCallStack("Her"); if( A.Grid() != x.Grid() ) throw std::logic_error("{A,x} must be distributed over the same grid"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); const int xLength = ( x.Width()==1 ? x.Height() : x.Width() ); if( A.Height() != xLength ) { std::ostringstream msg; msg << "A must conform with x: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " x ~ " << x.Height() << " x " << x.Width() << "\n"; throw std::logic_error( msg.str() ); } #endif const Grid& g = A.Grid(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int r = g.Height(); const int c = g.Width(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); if( x.Width() == 1 ) { DistMatrix<T,MC,STAR> x_MC_STAR(g); DistMatrix<T,MR,STAR> x_MR_STAR(g); x_MC_STAR.AlignWith( A ); x_MR_STAR.AlignWith( A ); //--------------------------------------------------------------------// x_MC_STAR = x; x_MR_STAR = x_MC_STAR; const T* xLocal = x_MC_STAR.LockedLocalBuffer(); if( uplo == LOWER ) { for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*c; const int heightAboveDiag = LocalLength(j,colShift,r); const T gamma = alpha*Conj(x_MR_STAR.GetLocal(jLocal,0)); T* ALocalCol = A.LocalBuffer(0,jLocal); for( int iLocal=heightAboveDiag; iLocal<localHeight; ++iLocal ) ALocalCol[iLocal] += gamma*xLocal[iLocal]; } } else { for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*c; const int heightToDiag = LocalLength(j+1,colShift,r); const T gamma = alpha*Conj(x_MR_STAR.GetLocal(jLocal,0)); T* ALocalCol = A.LocalBuffer(0,jLocal); for( int iLocal=0; iLocal<heightToDiag; ++iLocal ) ALocalCol[iLocal] += gamma*xLocal[iLocal]; } } //--------------------------------------------------------------------// x_MC_STAR.FreeAlignments(); x_MR_STAR.FreeAlignments(); } else { 
DistMatrix<T,STAR,MC> x_STAR_MC(g); DistMatrix<T,STAR,MR> x_STAR_MR(g); x_STAR_MC.AlignWith( A ); x_STAR_MR.AlignWith( A ); //--------------------------------------------------------------------// x_STAR_MR = x; x_STAR_MC = x_STAR_MR; const T* xLocal = x_STAR_MC.LockedLocalBuffer(); const int incx = x_STAR_MC.LocalLDim(); if( uplo == LOWER ) { for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*c; const int heightAboveDiag = LocalLength(j,colShift,r); const T gamma = alpha*Conj(x_STAR_MR.GetLocal(0,jLocal)); T* ALocalCol = A.LocalBuffer(0,jLocal); for( int iLocal=heightAboveDiag; iLocal<localHeight; ++iLocal ) ALocalCol[iLocal] += gamma*xLocal[iLocal*incx]; } } else { for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*c; const int heightToDiag = LocalLength(j+1,colShift,r); const T gamma = alpha*Conj(x_STAR_MR.GetLocal(0,jLocal)); T* ALocalCol = A.LocalBuffer(0,jLocal); for( int iLocal=0; iLocal<heightToDiag; ++iLocal ) ALocalCol[iLocal] += gamma*xLocal[iLocal*incx]; } } //--------------------------------------------------------------------// x_STAR_MC.FreeAlignments(); x_STAR_MR.FreeAlignments(); } #ifndef RELEASE PopCallStack(); #endif }
// Unblocked LU factorization with row pivoting of the stacked panel [A; B].
//
//   A           - [* ,* ] top part of the panel; every process holds all of
//                 it and updates it in place.
//   B           - [MC,* ] bottom part of the panel; each process updates the
//                 rows it stores locally.
//   p           - [* ,* ] column vector; entry k receives the pivot row
//                 chosen at step k plus pivotOffset.
//   pivotOffset - added to every pivot index stored in p.
//
// Row indices below A.Height() refer to rows of A; an index
// A.Height()+i refers to global row i of B.
//
// Each step picks a local pivot candidate from the current column of A and
// the locally stored part of the current column of B, combines the
// candidates of the process column with an all-reduce using PivotOp<F>
// (defined elsewhere; presumably keeps the candidate of largest magnitude --
// confirm), swaps the winning row with the current row, and applies the
// rank-one update to the trailing part of the panel.
//
// Throws SingularMatrixException when the pivot is exactly zero. In
// non-RELEASE builds std::logic_error is thrown for mismatched grids or
// non-conforming dimensions.
// NOTE(review): when SingularMatrixException is thrown the function exits
// without reaching PopBlocksizeStack()/PopCallStack() -- confirm callers
// tolerate that.
inline void
PanelLU
( DistMatrix<F,  STAR,STAR>& A,
  DistMatrix<F,  MC,  STAR>& B,
  DistMatrix<int,STAR,STAR>& p,
  int pivotOffset )
{
#ifndef RELEASE
    PushCallStack("internal::PanelLU");
    if( A.Grid() != p.Grid() || p.Grid() != B.Grid() )
        throw std::logic_error
        ("Matrices must be distributed over the same grid");
    if( A.Width() != B.Width() )
        throw std::logic_error("A and B must be the same width");
    if( A.Height() != p.Height() || p.Width() != 1 )
        throw std::logic_error("p must be a vector that conforms with A");
#endif
    const Grid& g = A.Grid();
    const int r = g.Height();
    const int colShift = B.ColShift();
    const int colAlignment = B.ColAlignment();

    // Matrix views
    DistMatrix<F,STAR,STAR>
        ATL(g), ATR(g),  A00(g), a01(g),     A02(g),
        ABL(g), ABR(g),  a10(g), alpha11(g), a12(g),
                         A20(g), a21(g),     A22(g);

    DistMatrix<F,MC,STAR>
        BL(g), BR(g),
        B0(g), b1(g), B2(g);

    // One message holds (width+1) values of type F followed by a single int:
    // [ pivotValue | pivot row data | pivotRow ]
    const int width = A.Width();
    const int numBytes = (width+1)*sizeof(F)+sizeof(int);
    std::vector<byte> sendData(numBytes);
    std::vector<byte> recvData(numBytes);

    // Extract pointers to send and recv data
    // TODO: Think of how to make this safer with respect to alignment issues
    F* sendBufFloat = (F*)&sendData[0];
    F* recvBufFloat = (F*)&recvData[0];
    int* sendBufInt = (int*)&sendData[(width+1)*sizeof(F)];
    int* recvBufInt = (int*)&recvData[(width+1)*sizeof(F)];

    // Start the algorithm
    // (a blocksize of 1 is pushed so the repartitions below expose a single
    //  row/column -- alpha11, a21, b1 -- per iteration)
    PushBlocksizeStack( 1 );
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    PartitionRight( B, BL, BR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ a01,     A02,
         /*************/ /**********************/
               /**/       a10, /**/ alpha11, a12,
          ABL, /**/ ABR,  A20, /**/ a21,     A22 );

        RepartitionRight
        ( BL, /**/ BR,
          B0, /**/ b1, B2 );

        //--------------------------------------------------------------------//
        // a01 is the part of the current column above the diagonal, so its
        // height is the index of the current row/column
        const int currentRow = a01.Height();

        // Store the index/value of the pivot candidate in A
        F pivot = alpha11.GetLocal(0,0);
        int pivotRow = currentRow;
        for( int i=0; i<a21.Height(); ++i )
        {
            F value = a21.GetLocal(i,0);
            if( FastAbs(value) > FastAbs(pivot) )
            {
                pivot = value;
                pivotRow = currentRow + i + 1;
            }
        }

        // Update the pivot candidate to include local data from B
        // (local row i of B is recorded as A.Height() + colShift + i*r)
        for( int i=0; i<B.LocalHeight(); ++i )
        {
            F value = b1.GetLocal(i,0);
            if( FastAbs(value) > FastAbs(pivot) )
            {
                pivot = value;
                pivotRow = A.Height() + colShift + i*r;
            }
        }

        // Fill the send buffer with:
        //    [ pivotValue | pivot row data | pivotRow ]
        if( pivotRow < A.Height() )
        {
            // Candidate row lives in A; a10.Width() is the current column
            sendBufFloat[0] = A.GetLocal(pivotRow,a10.Width());

            const int ALDim = A.LocalLDim();
            const F* ABuffer = A.LocalBuffer(pivotRow,0);
            for( int j=0; j<width; ++j )
                sendBufFloat[j+1] = ABuffer[j*ALDim];
        }
        else
        {
            // Candidate row lives in B; map it back to a local row index
            const int localRow = ((pivotRow-A.Height())-colShift)/r;
            sendBufFloat[0] = b1.GetLocal(localRow,0);

            const int BLDim = B.LocalLDim();
            const F* BBuffer = B.LocalBuffer(localRow,0);
            for( int j=0; j<width; ++j )
                sendBufFloat[j+1] = BBuffer[j*BLDim];
        }
        *sendBufInt = pivotRow;

        // Communicate to establish the pivot information
        mpi::AllReduce
        ( &sendData[0], &recvData[0], numBytes, PivotOp<F>(), g.ColComm() );

        // Update the pivot vector
        pivotRow = *recvBufInt;
        p.SetLocal(currentRow,0,pivotRow+pivotOffset);

        // Copy the current row into the pivot row
        if( pivotRow < A.Height() )
        {
            const int ALDim = A.LocalLDim();
            F* ASetBuffer = A.LocalBuffer(pivotRow,0);
            const F* AGetBuffer = A.LocalBuffer(currentRow,0);
            for( int j=0; j<width; ++j )
                ASetBuffer[j*ALDim] = AGetBuffer[j*ALDim];
        }
        else
        {
            // Only the process row that stores the pivot row of B writes it
            const int ownerRank = (colAlignment+(pivotRow-A.Height())) % r;
            if( g.Row() == ownerRank )
            {
                const int localRow = ((pivotRow-A.Height())-colShift) / r;

                const int ALDim = A.LocalLDim();
                const int BLDim = B.LocalLDim();
                F* BBuffer = B.LocalBuffer(localRow,0);
                const F* ABuffer = A.LocalBuffer(currentRow,0);
                for( int j=0; j<width; ++j )
                    BBuffer[j*BLDim] = ABuffer[j*ALDim];
            }
        }

        // Copy the pivot row into the current row
        // (the row data received from the all-reduce starts at index 1)
        {
            F* ABuffer = A.LocalBuffer(currentRow,0);
            const int ALDim = A.LocalLDim();
            for( int j=0; j<width; ++j )
                ABuffer[j*ALDim] = recvBufFloat[j+1];
        }

        // Now we can perform the update of the current panel
        // (alpha11 is read again here, i.e. after the current row of A has
        //  been overwritten with the pivot row above)
        const F alpha = alpha11.GetLocal(0,0);
        if( alpha == F(0) )
            throw SingularMatrixException();
        const F alpha11Inv = F(1) / alpha;
        Scale( alpha11Inv, a21.LocalMatrix() );
        Scale( alpha11Inv, b1.LocalMatrix()  );
        Geru( F(-1), a21.LocalMatrix(), a12.LocalMatrix(), A22.LocalMatrix() );
        Geru( F(-1), b1.LocalMatrix(), a12.LocalMatrix(), B2.LocalMatrix() );
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, a01,     /**/ A02,
               /**/       a10, alpha11, /**/ a12,
         /*************/ /**********************/
          ABL, /**/ ABR,  A20, a21,     /**/ A22 );

        SlidePartitionRight
        ( BL,     /**/ BR,
          B0, b1, /**/ B2 );
    }
    PopBlocksizeStack();

#ifndef RELEASE
    PopCallStack();
#endif
}
inline void MakeTrapezoidal ( LeftOrRight side, UpperOrLower uplo, int offset, DistMatrix<T,U,V>& A ) { #ifndef RELEASE PushCallStack("MakeTrapezoidal"); #endif const int height = A.Height(); const int width = A.Width(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int colStride = A.ColStride(); const int rowStride = A.RowStride(); T* localBuffer = A.LocalBuffer(); const int ldim = A.LocalLDim(); if( uplo == LOWER ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int lastZeroRow = ( side==LEFT ? j-offset-1 : j-offset+height-width-1 ); if( lastZeroRow >= 0 ) { const int boundary = std::min( lastZeroRow+1, height ); const int numZeroRows = RawLocalLength( boundary, colShift, colStride ); MemZero( &localBuffer[jLocal*ldim], numZeroRows ); } } } else { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int firstZeroRow = ( side==LEFT ? std::max(j-offset+1,0) : std::max(j-offset+height-width+1,0) ); const int numNonzeroRows = RawLocalLength(firstZeroRow,colShift,colStride); if( numNonzeroRows < localHeight ) { T* col = &localBuffer[numNonzeroRows+jLocal*ldim]; MemZero( col, localHeight-numNonzeroRows ); } } } #ifndef RELEASE PopCallStack(); #endif }
inline void ApplyColumnPivots ( DistMatrix<F>& A, const std::vector<int>& image, const std::vector<int>& preimage ) { const int b = image.size(); #ifndef RELEASE PushCallStack("ApplyColumnPivots"); if( A.Width() < b || b != preimage.size() ) throw std::logic_error ("image and preimage must be vectors of equal length that are not " "wider than A."); #endif const int localHeight = A.LocalHeight(); if( A.Height() == 0 || A.Width() == 0 ) { #ifndef RELEASE PopCallStack(); #endif return; } // Extract the relevant process grid information const Grid& g = A.Grid(); const int c = g.Width(); const int rowAlignment = A.RowAlignment(); const int rowShift = A.RowShift(); const int myCol = g.Col(); // Extract the send and recv counts from the image and preimage. // This process's sends may be logically partitioned into two sets: // (a) sends from rows [0,...,b-1] // (b) sends from rows [b,...] // The latter is analyzed with image, the former deduced with preimage. std::vector<int> sendCounts(c,0), recvCounts(c,0); for( int j=rowShift; j<b; j+=c ) { const int sendCol = preimage[j]; const int sendTo = (rowAlignment+sendCol) % c; sendCounts[sendTo] += localHeight; const int recvCol = image[j]; const int recvFrom = (rowAlignment+recvCol) % c; recvCounts[recvFrom] += localHeight; } for( int j=0; j<b; ++j ) { const int sendCol = preimage[j]; if( sendCol >= b ) { const int sendTo = (rowAlignment+sendCol) % c; if( sendTo == myCol ) { const int sendFrom = (rowAlignment+j) % c; recvCounts[sendFrom] += localHeight; } } const int recvCol = image[j]; if( recvCol >= b ) { const int recvFrom = (rowAlignment+recvCol) % c; if( recvFrom == myCol ) { const int recvTo = (rowAlignment+j) % c; sendCounts[recvTo] += localHeight; } } } // Construct the send and recv displacements from the counts std::vector<int> sendDispls(c), recvDispls(c); int totalSend=0, totalRecv=0; for( int i=0; i<c; ++i ) { sendDispls[i] = totalSend; recvDispls[i] = totalRecv; totalSend += sendCounts[i]; totalRecv += 
recvCounts[i]; } #ifndef RELEASE if( totalSend != totalRecv ) { std::ostringstream msg; msg << "Send and recv counts do not match: (send,recv)=" << totalSend << "," << totalRecv; throw std::logic_error( msg.str().c_str() ); } #endif // Fill vectors with the send data std::vector<F> sendData(std::max(1,totalSend)); std::vector<int> offsets(c,0); const int localWidth = LocalLength( b, rowShift, c ); for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int sendCol = preimage[rowShift+jLocal*c]; const int sendTo = (rowAlignment+sendCol) % c; const int offset = sendDispls[sendTo]+offsets[sendTo]; MemCopy( &sendData[offset], A.LocalBuffer(0,jLocal), localHeight ); offsets[sendTo] += localHeight; } for( int j=0; j<b; ++j ) { const int recvCol = image[j]; if( recvCol >= b ) { const int recvFrom = (rowAlignment+recvCol) % c; if( recvFrom == myCol ) { const int recvTo = (rowAlignment+j) % c; const int jLocal = (recvCol-rowShift) / c; const int offset = sendDispls[recvTo]+offsets[recvTo]; MemCopy ( &sendData[offset], A.LocalBuffer(0,jLocal), localHeight ); offsets[recvTo] += localHeight; } } } // Communicate all pivot rows std::vector<F> recvData(std::max(1,totalRecv)); mpi::AllToAll ( &sendData[0], &sendCounts[0], &sendDispls[0], &recvData[0], &recvCounts[0], &recvDispls[0], g.RowComm() ); // Unpack the recv data for( int k=0; k<c; ++k ) { offsets[k] = 0; int thisRowShift = Shift( k, rowAlignment, c ); for( int j=thisRowShift; j<b; j+=c ) { const int sendCol = preimage[j]; const int sendTo = (rowAlignment+sendCol) % c; if( sendTo == myCol ) { const int offset = recvDispls[k]+offsets[k]; const int jLocal = (sendCol-rowShift) / c; MemCopy ( A.LocalBuffer(0,jLocal), &recvData[offset], localHeight ); offsets[k] += localHeight; } } } for( int j=0; j<b; ++j ) { const int recvCol = image[j]; if( recvCol >= b ) { const int recvTo = (rowAlignment+j) % c; if( recvTo == myCol ) { const int recvFrom = (rowAlignment+recvCol) % c; const int jLocal = (j-rowShift) / c; const int offset 
= recvDispls[recvFrom]+offsets[recvFrom]; MemCopy ( A.LocalBuffer(0,jLocal), &recvData[offset], localHeight ); offsets[recvFrom] += localHeight; } } } #ifndef RELEASE PopCallStack(); #endif }
inline void ApplyRowPivots ( DistMatrix<F>& A, const std::vector<int>& image, const std::vector<int>& preimage ) { const int b = image.size(); #ifndef RELEASE PushCallStack("ApplyRowPivots"); if( A.Height() < b || b != (int)preimage.size() ) throw std::logic_error ("image and preimage must be vectors of equal length that are not " "taller than A."); #endif const int localWidth = A.LocalWidth(); if( A.Height() == 0 || A.Width() == 0 ) { #ifndef RELEASE PopCallStack(); #endif return; } // Extract the relevant process grid information const Grid& g = A.Grid(); const int r = g.Height(); const int colAlignment = A.ColAlignment(); const int colShift = A.ColShift(); const int myRow = g.Row(); // Extract the send and recv counts from the image and preimage. // This process's sends may be logically partitioned into two sets: // (a) sends from rows [0,...,b-1] // (b) sends from rows [b,...] // The latter is analyzed with image, the former deduced with preimage. std::vector<int> sendCounts(r,0), recvCounts(r,0); for( int i=colShift; i<b; i+=r ) { const int sendRow = preimage[i]; const int sendTo = (colAlignment+sendRow) % r; sendCounts[sendTo] += localWidth; const int recvRow = image[i]; const int recvFrom = (colAlignment+recvRow) % r; recvCounts[recvFrom] += localWidth; } for( int i=0; i<b; ++i ) { const int sendRow = preimage[i]; if( sendRow >= b ) { const int sendTo = (colAlignment+sendRow) % r; if( sendTo == myRow ) { const int sendFrom = (colAlignment+i) % r; recvCounts[sendFrom] += localWidth; } } const int recvRow = image[i]; if( recvRow >= b ) { const int recvFrom = (colAlignment+recvRow) % r; if( recvFrom == myRow ) { const int recvTo = (colAlignment+i) % r; sendCounts[recvTo] += localWidth; } } } // Construct the send and recv displacements from the counts std::vector<int> sendDispls(r), recvDispls(r); int totalSend=0, totalRecv=0; for( int i=0; i<r; ++i ) { sendDispls[i] = totalSend; recvDispls[i] = totalRecv; totalSend += sendCounts[i]; totalRecv += recvCounts[i]; 
} #ifndef RELEASE if( totalSend != totalRecv ) { std::ostringstream msg; msg << "Send and recv counts do not match: (send,recv)=" << totalSend << "," << totalRecv; throw std::logic_error( msg.str().c_str() ); } #endif // Fill vectors with the send data const int ALDim = A.LocalLDim(); std::vector<F> sendData(std::max(1,totalSend)); std::vector<int> offsets(r,0); const int localHeight = LocalLength( b, colShift, r ); for( int iLocal=0; iLocal<localHeight; ++iLocal ) { const int sendRow = preimage[colShift+iLocal*r]; const int sendTo = (colAlignment+sendRow) % r; const int offset = sendDispls[sendTo]+offsets[sendTo]; const F* ABuffer = A.LocalBuffer(iLocal,0); for( int jLocal=0; jLocal<localWidth; ++jLocal ) sendData[offset+jLocal] = ABuffer[jLocal*ALDim]; offsets[sendTo] += localWidth; } for( int i=0; i<b; ++i ) { const int recvRow = image[i]; if( recvRow >= b ) { const int recvFrom = (colAlignment+recvRow) % r; if( recvFrom == myRow ) { const int recvTo = (colAlignment+i) % r; const int iLocal = (recvRow-colShift) / r; const int offset = sendDispls[recvTo]+offsets[recvTo]; const F* ABuffer = A.LocalBuffer(iLocal,0); for( int jLocal=0; jLocal<localWidth; ++jLocal ) sendData[offset+jLocal] = ABuffer[jLocal*ALDim]; offsets[recvTo] += localWidth; } } } // Communicate all pivot rows std::vector<F> recvData(std::max(1,totalRecv)); mpi::AllToAll ( &sendData[0], &sendCounts[0], &sendDispls[0], &recvData[0], &recvCounts[0], &recvDispls[0], g.ColComm() ); // Unpack the recv data for( int k=0; k<r; ++k ) { offsets[k] = 0; int thisColShift = Shift( k, colAlignment, r ); for( int i=thisColShift; i<b; i+=r ) { const int sendRow = preimage[i]; const int sendTo = (colAlignment+sendRow) % r; if( sendTo == myRow ) { const int offset = recvDispls[k]+offsets[k]; const int iLocal = (sendRow-colShift) / r; F* ABuffer = A.LocalBuffer(iLocal,0); for( int jLocal=0; jLocal<localWidth; ++jLocal ) ABuffer[jLocal*ALDim] = recvData[offset+jLocal]; offsets[k] += localWidth; } } } for( int i=0; 
i<b; ++i ) { const int recvRow = image[i]; if( recvRow >= b ) { const int recvTo = (colAlignment+i) % r; if( recvTo == myRow ) { const int recvFrom = (colAlignment+recvRow) % r; const int iLocal = (i-colShift) / r; const int offset = recvDispls[recvFrom]+offsets[recvFrom]; F* ABuffer = A.LocalBuffer(iLocal,0); for( int jLocal=0; jLocal<localWidth; ++jLocal ) ABuffer[jLocal*ALDim] = recvData[offset+jLocal]; offsets[recvFrom] += localWidth; } } } #ifndef RELEASE PopCallStack(); #endif }