/*
 * Distributes A in such a way that
 *   Layer 0   <- A(:, 0:(n/h - 1))
 *   Layer 1   <- A(:, (n/h):(2n/h - 1))
 *     .
 *     .
 *     .
 *   Layer h-1 <- A(:, ((h-1)n/h):n)
 * where h is the size of depthComm.
 *
 * depthComm: communicator over which the local data of A is scattered
 * A:         source matrix; its local buffer is passed as the send buffer
 * B:         destination; emptied, aligned with A, and zero-filled with the
 *            dimensions of A before the received block is written into it
 *
 * Each process receives (local entries of A)/h values and stores them in its
 * local buffer of B starting at local column (A.LocalWidth()/h)*depthRank.
 *
 * Throws std::logic_error if the local leading dimension of A differs from
 * its local height, since the local buffer is then not contiguous.
 *
 * NOTE(review): the counts use integer division, so this appears to assume
 * that A.LocalWidth() is a multiple of h -- confirm with the callers.
 * NOTE(review): the literal 0 passed to mpi::Scatter is presumably the root
 * rank within depthComm -- confirm against the wrapper's signature.
 */
void DistributeCols
( const mpi::Comm& depthComm,
  const DistMatrix<double,MC,MR>& A,
        DistMatrix<double,MC,MR>& B )
{
    const int depthSize = mpi::CommSize( depthComm );
    const int depthRank = mpi::CommRank( depthComm );

    const int sendCount = A.LocalHeight()*A.LocalWidth();
    const int recvCount = sendCount / depthSize;

    // For now, we will make B as large as A...
    // TODO: NOT DO THIS
    if( A.LocalHeight() != A.LocalLDim() )
        throw std::logic_error("Local height did not match local ldim");
    B.Empty();
    B.AlignWith( A );
    Zeros( A.Height(), A.Width(), B );

    // Scatter: every layer writes its share at its own column offset of B
    const int localColOffset = (A.LocalWidth()/depthSize)*depthRank;
    mpi::Scatter
    ( A.LockedLocalBuffer(), recvCount,
      B.LocalBuffer(0,localColOffset), recvCount, 0, depthComm );
}
// Fills the local portion of a [MC,* ] matrix with values of the form
// center + radius*SampleUnitBall<T>().
//
// The samples are drawn only on process column 0 and then broadcast over
// grid.RowComm(), so every process that takes part in that broadcast ends up
// unpacking the same buffer. Processes for which grid.InGrid() is false do
// nothing.
static void Func
( DistMatrix<T,MC,STAR>& A, T center, typename Base<T>::type radius )
{
    const Grid& grid = A.Grid();
    if( !grid.InGrid() )
        return;

    const int width = A.Width();
    const int numLocalRows = A.LocalHeight();
    const int numEntries = numLocalRows*width;
    std::vector<T> packed( numEntries );

    // Draw the samples on process column 0 in packed column-major order
    if( grid.Col() == 0 )
    {
        for( int k=0; k<numEntries; ++k )
            packed[k] = center + radius*SampleUnitBall<T>();
    }
    mpi::Broadcast( &packed[0], numEntries, 0, grid.RowComm() );

    // Copy each packed column into the (possibly padded) local storage
    T* dest = A.LocalBuffer();
    const int destLDim = A.LocalLDim();
#ifdef HAVE_OPENMP
    #pragma omp parallel for
#endif
    for( int j=0; j<width; ++j )
    {
        const T* packedCol = &packed[j*numLocalRows];
        T* destCol = &dest[j*destLDim];
        MemCopy( destCol, packedCol, numLocalRows );
    }
}
// Broadcast a matrix from the root grid to the others void DepthBroadcast ( const mpi::Comm& depthComm, const DistMatrix<double,MC,MR>& A, DistMatrix<double,MC,MR>& B ) { const int rank = mpi::CommRank(mpi::COMM_WORLD); const Grid& meshGrid = A.Grid(); const int meshSize = meshGrid.Size(); const int depthRank = rank / meshSize; const int localSize = A.LocalHeight()*A.LocalWidth(); if( A.LocalHeight() != A.LocalLDim() ) throw std::logic_error("Leading dimension did not match local height"); B.Empty(); B.AlignWith( A ); B.ResizeTo( A.Height(), A.Width() ); // Have the root pack the broadcast data if( depthRank == 0 ) MemCopy( B.LocalBuffer(), A.LockedLocalBuffer(), localSize ); // Broadcast from the root mpi::Broadcast( B.LocalBuffer(), localSize, 0, depthComm ); }
// Fills a [* ,* ] matrix with values of the form
// center + radius*SampleUnitBall<T>().
//
// The full m x n set of samples is drawn on the process with grid rank 0 and
// broadcast over grid.Comm(); every process then copies the packed columns
// into its local buffer. Processes for which grid.InGrid() is false do
// nothing.
static void Func
( DistMatrix<T,STAR,STAR>& A, T center, typename Base<T>::type radius )
{
    const Grid& grid = A.Grid();
    const int height = A.Height();
    const int width = A.Width();
    const int numEntries = height*width;
    if( !grid.InGrid() )
        return;

    std::vector<T> packed( numEntries );

    // Draw the samples on grid rank 0 in packed column-major order
    if( grid.Rank() == 0 )
    {
        for( int k=0; k<numEntries; ++k )
            packed[k] = center+radius*SampleUnitBall<T>();
    }
    mpi::Broadcast( &packed[0], numEntries, 0, grid.Comm() );

    // Copy each packed column into the (possibly padded) local storage
    T* dest = A.LocalBuffer();
    const int destLDim = A.LocalLDim();
#ifdef HAVE_OPENMP
    #pragma omp parallel for
#endif
    for( int j=0; j<width; ++j )
    {
        const T* packedCol = &packed[j*height];
        T* destCol = &dest[j*destLDim];
        MemCopy( destCol, packedCol, height );
    }
}
// Fills the local portion of a [* ,MR] matrix with values of the form
// center + radius*SampleUnitBall<T>().
//
// The samples are drawn only on process row 0 and then broadcast over
// grid.ColComm(), so every process that takes part in that broadcast ends up
// unpacking the same buffer.
//
// Processes for which grid.InGrid() is false skip all work; in particular
// they neither sample nor take part in the broadcast.
static void Func
( DistMatrix<T,STAR,MR>& A, T center, typename Base<T>::type radius )
{
    const Grid& grid = A.Grid();
    if( grid.InGrid() )
    {
        const int m = A.Height();
        const int localWidth = A.LocalWidth();
        const int bufSize = m*localWidth;
        std::vector<T> buffer( bufSize );

        // Create random matrix on process row 0, then broadcast
        if( grid.Row() == 0 )
        {
            for( int j=0; j<localWidth; ++j )
                for( int i=0; i<m; ++i )
                    buffer[i+j*m] = center+radius*SampleUnitBall<T>();
        }
        mpi::Broadcast( &buffer[0], bufSize, 0, grid.ColComm() );

        // Unpack: the packed buffer has stride m, the local storage has
        // stride ldim
        T* localBuffer = A.LocalBuffer();
        const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
        #pragma omp parallel for
#endif
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
        {
            const T* bufferCol = &buffer[jLocal*m];
            T* col = &localBuffer[jLocal*ldim];
            MemCopy( col, bufferCol, m );
        }
    }
}
// Fills the local portion of a [MR,* ] matrix with values of the form
// center + radius*SampleUnitBall<T>().
//
// The samples are drawn only on process row 0 and then broadcast over
// grid.ColComm(), so every process that takes part in that broadcast ends up
// unpacking the same buffer.
//
// Processes for which grid.InGrid() is false skip all work; in particular
// they neither sample nor take part in the broadcast.
static void Func
( DistMatrix<T,MR,STAR>& A, T center, typename Base<T>::type radius )
{
    const Grid& grid = A.Grid();
    if( grid.InGrid() )
    {
        const int n = A.Width();
        const int localHeight = A.LocalHeight();
        const int bufSize = localHeight*n;
        std::vector<T> buffer( bufSize );

        // Create random matrix on process row 0, then broadcast
        if( grid.Row() == 0 )
        {
            for( int j=0; j<n; ++j )
                for( int i=0; i<localHeight; ++i )
                    buffer[i+j*localHeight] =
                        center+radius*SampleUnitBall<T>();
        }
        mpi::Broadcast( &buffer[0], bufSize, 0, grid.ColComm() );

        // Unpack: the packed buffer has stride localHeight, the local
        // storage has stride ldim
        T* localBuffer = A.LocalBuffer();
        const int ldim = A.LocalLDim();
#ifdef HAVE_OPENMP
        #pragma omp parallel for COLLAPSE(2)
#endif
        for( int j=0; j<n; ++j )
            for( int iLocal=0; iLocal<localHeight; ++iLocal )
                localBuffer[iLocal+j*ldim] = buffer[iLocal+j*localHeight];
    }
}
inline void MakeTriangular( UpperOrLower uplo, DistMatrix<T,U,V>& A ) { #ifndef RELEASE PushCallStack("MakeTriangular"); #endif const int height = A.Height(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int colStride = A.ColStride(); const int rowStride = A.RowStride(); T* localBuffer = A.LocalBuffer(); const int ldim = A.LocalLDim(); if( uplo == LOWER ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int lastZeroRow = j-1; if( lastZeroRow >= 0 ) { const int boundary = std::min( lastZeroRow+1, height ); const int numZeroRows = RawLocalLength( boundary, colShift, colStride ); MemZero( &localBuffer[jLocal*ldim], numZeroRows ); } } } else { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int firstZeroRow = j+1; const int numNonzeroRows = RawLocalLength(firstZeroRow,colShift,colStride); if( numNonzeroRows < localHeight ) { T* col = &localBuffer[numNonzeroRows+jLocal*ldim]; MemZero( col, localHeight-numNonzeroRows ); } } } #ifndef RELEASE PopCallStack(); #endif }
// Reduce across depth to get end result C void SumContributions ( mpi::Comm& depthComm, const DistMatrix<double,MC,MR>& APartial, DistMatrix<double,MC,MR>& A ) { const int rank = mpi::CommRank( mpi::COMM_WORLD ); const Grid& meshGrid = APartial.Grid(); A.Empty(); A.AlignWith( APartial ); A.ResizeTo( APartial.Height(), APartial.Width() ); if( APartial.LocalHeight() != APartial.LocalLDim() ) throw std::logic_error ("APartial did not have matching local height/ldim"); if( A.LocalHeight() != A.LocalLDim() ) throw std::logic_error("A did not have matching local height/ldim"); const int dataSize = APartial.LocalHeight()*APartial.LocalWidth(); mpi::AllReduce ( APartial.LockedLocalBuffer(), A.LocalBuffer(), dataSize, mpi::SUM, depthComm ); }
// Have the top layer initialize the distributed matrix, B void InitB( DistMatrix<double,MC,MR>& B ) { const int rank = mpi::CommRank(mpi::COMM_WORLD); const Grid& g = B.Grid(); const int meshSize = g.Size(); const int depthRank = rank / meshSize; if( depthRank == 0 ) { if( B.LocalHeight() != B.LocalLDim() ) throw std::logic_error("Local ldim of B was too large"); double* localBuffer = B.LocalBuffer(); const int localSize = B.LocalHeight()*B.LocalWidth(); for( int iLocal=0; iLocal<localSize; ++iLocal ) localBuffer[iLocal] = iLocal*meshSize + rank; B.Print("B"); } }
inline const DistMatrix<T,MD,STAR,Int>& DistMatrix<T,MD,STAR,Int>::operator=( const DistMatrix<T,STAR,STAR,Int>& A ) { #ifndef RELEASE PushCallStack("[MD,* ] = [* ,* ]"); this->AssertNotLockedView(); this->AssertSameGrid( A ); if( this->Viewing() ) this->AssertSameSize( A ); #endif if( !this->Viewing() ) this->ResizeTo( A.Height(), A.Width() ); if( this->Participating() ) { const Int lcm = this->grid_->LCM(); const Int colShift = this->ColShift(); const Int width = this->Width(); const Int localHeight = this->LocalHeight(); const T* ALocalBuffer = A.LockedLocalBuffer(); const Int ALDim = A.LocalLDim(); T* thisLocalBuffer = this->LocalBuffer(); const Int thisLDim = this->LocalLDim(); #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( Int j=0; j<width; ++j ) { T* destCol = &thisLocalBuffer[j*thisLDim]; const T* sourceCol = &ALocalBuffer[colShift+j*ALDim]; for( Int iLocal=0; iLocal<localHeight; ++iLocal ) destCol[iLocal] = sourceCol[iLocal*lcm]; } } #ifndef RELEASE PopCallStack(); #endif return *this; }
// Unblocked (blocksize 1) LU factorization with row pivoting of the panel
// formed by stacking A on top of B.
//
// A:           [* ,* ] matrix holding the top of the panel; updated in place
// B:           [MC,* ] matrix holding the rest of the panel; updated in place
// p:           [* ,* ] column vector; entry k receives the pivot row chosen
//              at step k, shifted by pivotOffset
// pivotOffset: value added to every pivot index stored in p
//
// Pivot rows are numbered over the stacked panel: indices below A.Height()
// refer to rows of A, and index A.Height()+i refers to global row i of B.
//
// Throws SingularMatrixException if a pivot is exactly zero. In non-RELEASE
// builds, throws std::logic_error if the grids differ, if A and B have
// different widths, or if p is not a column vector as tall as A.
//
// NOTE(review): the singular-matrix throw happens between
// PushBlocksizeStack( 1 ) and PopBlocksizeStack(), so the blocksize stack is
// not restored on that path.
inline void
PanelLU
( DistMatrix<F,  STAR,STAR>& A,
  DistMatrix<F,  MC,  STAR>& B,
  DistMatrix<int,STAR,STAR>& p,
  int pivotOffset )
{
#ifndef RELEASE
    PushCallStack("internal::PanelLU");
    if( A.Grid() != p.Grid() || p.Grid() != B.Grid() )
        throw std::logic_error
        ("Matrices must be distributed over the same grid");
    if( A.Width() != B.Width() )
        throw std::logic_error("A and B must be the same width");
    if( A.Height() != p.Height() || p.Width() != 1 )
        throw std::logic_error("p must be a vector that conforms with A");
#endif
    const Grid& g = A.Grid();
    const int r = g.Height();
    const int colShift = B.ColShift();
    const int colAlignment = B.ColAlignment();

    // Matrix views
    DistMatrix<F,STAR,STAR>
        ATL(g), ATR(g),  A00(g), a01(g),     A02(g),
        ABL(g), ABR(g),  a10(g), alpha11(g), a12(g),
                         A20(g), a21(g),     A22(g);

    DistMatrix<F,MC,STAR>
        BL(g), BR(g),
        B0(g), b1(g), B2(g);

    // One message holds: the pivot value (one F), a full panel row (width
    // F's), and the pivot row index (one int)
    const int width = A.Width();
    const int numBytes = (width+1)*sizeof(F)+sizeof(int);
    std::vector<byte> sendData(numBytes);
    std::vector<byte> recvData(numBytes);

    // Extract pointers to send and recv data
    // TODO: Think of how to make this safer with respect to alignment issues
    F* sendBufFloat = (F*)&sendData[0];
    F* recvBufFloat = (F*)&recvData[0];
    int* sendBufInt = (int*)&sendData[(width+1)*sizeof(F)];
    int* recvBufInt = (int*)&recvData[(width+1)*sizeof(F)];

    // Start the algorithm
    PushBlocksizeStack( 1 );
    PartitionDownDiagonal
    ( A, ATL, ATR,
         ABL, ABR, 0 );
    PartitionRight( B, BL, BR, 0 );
    while( ATL.Height() < A.Height() )
    {
        RepartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, /**/ a01,     A02,
         /*************/ /**********************/
               /**/       a10, /**/ alpha11, a12,
          ABL, /**/ ABR,  A20, /**/ a21,     A22 );

        RepartitionRight
        ( BL, /**/ BR,
          B0, /**/ b1, B2 );

        //--------------------------------------------------------------------//
        // Index of the row (and column) being eliminated in this iteration
        const int currentRow = a01.Height();

        // Store the index/value of the pivot candidate in A
        F pivot = alpha11.GetLocal(0,0);
        int pivotRow = currentRow;
        for( int i=0; i<a21.Height(); ++i )
        {
            F value = a21.GetLocal(i,0);
            if( FastAbs(value) > FastAbs(pivot) )
            {
                pivot = value;
                pivotRow = currentRow + i + 1;
            }
        }

        // Update the pivot candidate to include local data from B
        for( int i=0; i<B.LocalHeight(); ++i )
        {
            F value = b1.GetLocal(i,0);
            if( FastAbs(value) > FastAbs(pivot) )
            {
                pivot = value;
                // Local row i of B is global row colShift+i*r of B, which is
                // offset by A.Height() in the stacked numbering
                pivotRow = A.Height() + colShift + i*r;
            }
        }

        // Fill the send buffer with:
        //    [ pivotValue | pivot row data | pivotRow ]
        if( pivotRow < A.Height() )
        {
            // The local candidate lives in A
            sendBufFloat[0] = A.GetLocal(pivotRow,a10.Width());

            const int ALDim = A.LocalLDim();
            const F* ABuffer = A.LocalBuffer(pivotRow,0);
            for( int j=0; j<width; ++j )
                sendBufFloat[j+1] = ABuffer[j*ALDim];
        }
        else
        {
            // The local candidate lives in B; convert back to a local row
            const int localRow = ((pivotRow-A.Height())-colShift)/r;
            sendBufFloat[0] = b1.GetLocal(localRow,0);

            const int BLDim = B.LocalLDim();
            const F* BBuffer = B.LocalBuffer(localRow,0);
            for( int j=0; j<width; ++j )
                sendBufFloat[j+1] = BBuffer[j*BLDim];
        }
        *sendBufInt = pivotRow;

        // Communicate to establish the pivot information
        // NOTE(review): PivotOp<F> is presumably a reduction that keeps the
        // candidate of largest magnitude -- confirm against its definition
        mpi::AllReduce
        ( &sendData[0], &recvData[0], numBytes, PivotOp<F>(), g.ColComm() );

        // Update the pivot vector
        pivotRow = *recvBufInt;
        p.SetLocal(currentRow,0,pivotRow+pivotOffset);

        // Copy the current row into the pivot row
        if( pivotRow < A.Height() )
        {
            const int ALDim = A.LocalLDim();
            F* ASetBuffer = A.LocalBuffer(pivotRow,0);
            const F* AGetBuffer = A.LocalBuffer(currentRow,0);
            for( int j=0; j<width; ++j )
                ASetBuffer[j*ALDim] = AGetBuffer[j*ALDim];
        }
        else
        {
            // Only the process row that owns the pivot row of B stores it
            const int ownerRank = (colAlignment+(pivotRow-A.Height())) % r;
            if( g.Row() == ownerRank )
            {
                const int localRow = ((pivotRow-A.Height())-colShift) / r;

                const int ALDim = A.LocalLDim();
                const int BLDim = B.LocalLDim();
                F* BBuffer = B.LocalBuffer(localRow,0);
                const F* ABuffer = A.LocalBuffer(currentRow,0);
                for( int j=0; j<width; ++j )
                    BBuffer[j*BLDim] = ABuffer[j*ALDim];
            }
        }

        // Copy the pivot row into the current row
        // (this must follow the copy above, which still reads the old
        // current row of A)
        {
            F* ABuffer = A.LocalBuffer(currentRow,0);
            const int ALDim = A.LocalLDim();
            for( int j=0; j<width; ++j )
                ABuffer[j*ALDim] = recvBufFloat[j+1];
        }

        // Now we can perform the update of the current panel
        const F alpha = alpha11.GetLocal(0,0);
        if( alpha == F(0) )
            throw SingularMatrixException();
        const F alpha11Inv = F(1) / alpha;
        Scale( alpha11Inv, a21.LocalMatrix() );
        Scale( alpha11Inv, b1.LocalMatrix()  );
        Geru( F(-1), a21.LocalMatrix(), a12.LocalMatrix(), A22.LocalMatrix() );
        Geru( F(-1), b1.LocalMatrix(), a12.LocalMatrix(), B2.LocalMatrix() );
        //--------------------------------------------------------------------//

        SlidePartitionDownDiagonal
        ( ATL, /**/ ATR,  A00, a01,     /**/ A02,
               /**/       a10, alpha11, /**/ a12,
         /*************/ /**********************/
          ABL, /**/ ABR,  A20, a21,     /**/ A22 );

        SlidePartitionRight
        ( BL,     /**/ BR,
          B0, b1, /**/ B2 );
    }
    PopBlocksizeStack();
#ifndef RELEASE
    PopCallStack();
#endif
}
inline void MakeTrapezoidal ( LeftOrRight side, UpperOrLower uplo, int offset, DistMatrix<T,U,V>& A ) { #ifndef RELEASE PushCallStack("MakeTrapezoidal"); #endif const int height = A.Height(); const int width = A.Width(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int colStride = A.ColStride(); const int rowStride = A.RowStride(); T* localBuffer = A.LocalBuffer(); const int ldim = A.LocalLDim(); if( uplo == LOWER ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int lastZeroRow = ( side==LEFT ? j-offset-1 : j-offset+height-width-1 ); if( lastZeroRow >= 0 ) { const int boundary = std::min( lastZeroRow+1, height ); const int numZeroRows = RawLocalLength( boundary, colShift, colStride ); MemZero( &localBuffer[jLocal*ldim], numZeroRows ); } } } else { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int firstZeroRow = ( side==LEFT ? std::max(j-offset+1,0) : std::max(j-offset+height-width+1,0) ); const int numNonzeroRows = RawLocalLength(firstZeroRow,colShift,colStride); if( numNonzeroRows < localHeight ) { T* col = &localBuffer[numNonzeroRows+jLocal*ldim]; MemZero( col, localHeight-numNonzeroRows ); } } } #ifndef RELEASE PopCallStack(); #endif }
// Moves rows of A between the processes of each process column with a single
// AllToAll, as described by image and preimage (both of length b).
//
// As implemented below:
//   - for every i in [0,b), the data of row i is written to row preimage[i];
//   - for every i in [0,b) with image[i] >= b, the data of row image[i] is
//     written to row i.
//
// A:        matrix whose local rows are overwritten in place
// image:    length-b index vector (see above)
// preimage: length-b index vector (see above)
//
// Returns immediately if A has zero height or width. In non-RELEASE builds,
// throws std::logic_error if b exceeds the height of A, if the two vectors
// differ in length, or if the total send and recv counts disagree.
inline void
ApplyRowPivots
( DistMatrix<F>& A,
  const std::vector<int>& image,
  const std::vector<int>& preimage )
{
    const int b = image.size();
#ifndef RELEASE
    PushCallStack("ApplyRowPivots");
    if( A.Height() < b || b != (int)preimage.size() )
        throw std::logic_error
        ("image and preimage must be vectors of equal length that are not "
         "taller than A.");
#endif
    const int localWidth = A.LocalWidth();
    if( A.Height() == 0 || A.Width() == 0 )
    {
#ifndef RELEASE
        PopCallStack();
#endif
        return;
    }

    // Extract the relevant process grid information
    const Grid& g = A.Grid();
    const int r = g.Height();
    const int colAlignment = A.ColAlignment();
    const int colShift = A.ColShift();
    const int myRow = g.Row();

    // Extract the send and recv counts from the image and preimage.
    // This process's sends may be logically partitioned into two sets:
    //   (a) sends from rows [0,...,b-1]
    //   (b) sends from rows [b,...]
    // The latter is analyzed with image, the former deduced with preimage.
    std::vector<int> sendCounts(r,0), recvCounts(r,0);
    // Rows in [0,b) owned by this process: colShift, colShift+r, ...
    for( int i=colShift; i<b; i+=r )
    {
        const int sendRow = preimage[i];
        const int sendTo = (colAlignment+sendRow) % r;
        sendCounts[sendTo] += localWidth;

        const int recvRow = image[i];
        const int recvFrom = (colAlignment+recvRow) % r;
        recvCounts[recvFrom] += localWidth;
    }
    // Rows at index b or beyond that this process owns and that take part in
    // the exchange
    for( int i=0; i<b; ++i )
    {
        const int sendRow = preimage[i];
        if( sendRow >= b )
        {
            const int sendTo = (colAlignment+sendRow) % r;
            if( sendTo == myRow )
            {
                const int sendFrom = (colAlignment+i) % r;
                recvCounts[sendFrom] += localWidth;
            }
        }

        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvFrom = (colAlignment+recvRow) % r;
            if( recvFrom == myRow )
            {
                const int recvTo = (colAlignment+i) % r;
                sendCounts[recvTo] += localWidth;
            }
        }
    }

    // Construct the send and recv displacements from the counts
    std::vector<int> sendDispls(r), recvDispls(r);
    int totalSend=0, totalRecv=0;
    for( int i=0; i<r; ++i )
    {
        sendDispls[i] = totalSend;
        recvDispls[i] = totalRecv;
        totalSend += sendCounts[i];
        totalRecv += recvCounts[i];
    }
#ifndef RELEASE
    if( totalSend != totalRecv )
    {
        std::ostringstream msg;
        msg << "Send and recv counts do not match: (send,recv)="
             << totalSend << "," << totalRecv;
        throw std::logic_error( msg.str().c_str() );
    }
#endif

    // Fill vectors with the send data
    // (the buffers hold at least one entry so that &sendData[0] is valid
    // even when nothing is sent)
    const int ALDim = A.LocalLDim();
    std::vector<F> sendData(std::max(1,totalSend));
    std::vector<int> offsets(r,0);
    const int localHeight = LocalLength( b, colShift, r );
    // Pack set (a): locally owned rows in [0,b), in increasing row order
    for( int iLocal=0; iLocal<localHeight; ++iLocal )
    {
        const int sendRow = preimage[colShift+iLocal*r];
        const int sendTo = (colAlignment+sendRow) % r;
        const int offset = sendDispls[sendTo]+offsets[sendTo];
        const F* ABuffer = A.LocalBuffer(iLocal,0);
        for( int jLocal=0; jLocal<localWidth; ++jLocal )
            sendData[offset+jLocal] = ABuffer[jLocal*ALDim];
        offsets[sendTo] += localWidth;
    }
    // Pack set (b): locally owned rows at index b or beyond, appended after
    // the set (a) rows bound for the same destination
    for( int i=0; i<b; ++i )
    {
        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvFrom = (colAlignment+recvRow) % r;
            if( recvFrom == myRow )
            {
                const int recvTo = (colAlignment+i) % r;
                const int iLocal = (recvRow-colShift) / r;
                const int offset = sendDispls[recvTo]+offsets[recvTo];
                const F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    sendData[offset+jLocal] = ABuffer[jLocal*ALDim];
                offsets[recvTo] += localWidth;
            }
        }
    }

    // Communicate all pivot rows
    std::vector<F> recvData(std::max(1,totalRecv));
    mpi::AllToAll
    ( &sendData[0], &sendCounts[0], &sendDispls[0],
      &recvData[0], &recvCounts[0], &recvDispls[0], g.ColComm() );

    // Unpack the recv data
    // The two loops below walk the rows in the same order in which the
    // senders packed them (set (a) first, then set (b)), reusing offsets as
    // the running position within each source's segment.
    for( int k=0; k<r; ++k )
    {
        offsets[k] = 0;
        // First row owned by process row k
        int thisColShift = Shift( k, colAlignment, r );
        for( int i=thisColShift; i<b; i+=r )
        {
            const int sendRow = preimage[i];
            const int sendTo = (colAlignment+sendRow) % r;
            if( sendTo == myRow )
            {
                const int offset = recvDispls[k]+offsets[k];
                const int iLocal = (sendRow-colShift) / r;
                F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    ABuffer[jLocal*ALDim] = recvData[offset+jLocal];
                offsets[k] += localWidth;
            }
        }
    }
    for( int i=0; i<b; ++i )
    {
        const int recvRow = image[i];
        if( recvRow >= b )
        {
            const int recvTo = (colAlignment+i) % r;
            if( recvTo == myRow )
            {
                const int recvFrom = (colAlignment+recvRow) % r;
                const int iLocal = (i-colShift) / r;
                const int offset = recvDispls[recvFrom]+offsets[recvFrom];
                F* ABuffer = A.LocalBuffer(iLocal,0);
                for( int jLocal=0; jLocal<localWidth; ++jLocal )
                    ABuffer[jLocal*ALDim] = recvData[offset+jLocal];
                offsets[recvFrom] += localWidth;
            }
        }
    }
#ifndef RELEASE
    PopCallStack();
#endif
}