// Load a matrix from a binary file (header followed by payload) into a
// [STAR,STAR] distributed matrix.
//
// Layout enforced by the size check below: two raw Int values (height, then
// width) followed by height*width entries of type T, consumed one column
// after another.
//
// Parameters:
//   A        - destination matrix; resized to the dimensions in the header.
//   filename - path of the file to read.
//
// Errors: RuntimeError is invoked if the file cannot be opened, if the header
// cannot be read, or if the file size disagrees with the header.
inline void Binary( DistMatrix<T,STAR,STAR>& A, const std::string filename )
{
    DEBUG_ONLY(CallStackEntry cse("read::Binary"))
    std::ifstream file( filename.c_str(), std::ios::binary );
    if( !file.is_open() )
        RuntimeError("Could not open ",filename);

    // The header is accounted for as 2*sizeof(Int) raw bytes, so it has to be
    // pulled in with unformatted reads; operator>> would instead try to parse
    // whitespace-delimited decimal text.
    Int height=0, width=0;
    file.read( reinterpret_cast<char*>(&height), sizeof(Int) );
    file.read( reinterpret_cast<char*>(&width), sizeof(Int) );
    if( !file )
        RuntimeError("Could not read the matrix dimensions from ",filename);

    const Int numBytes = FileSize( file );
    const Int metaBytes = 2*sizeof(Int);
    const Int dataBytes = height*width*sizeof(T);
    const Int numBytesExp = metaBytes + dataBytes;
    if( numBytes != numBytesExp )
        RuntimeError
        ("Expected file to be ",numBytesExp," bytes but found ",numBytes);

    A.Resize( height, width );

    // FileSize is defined elsewhere; position explicitly at the start of the
    // payload so the reads below do not depend on where it leaves the get
    // pointer.
    const std::streamoff payloadStart = metaBytes;
    file.seekg( payloadStart );

    if( A.Height() == A.LDim() )
    {
        // Local storage has no padding between columns: one bulk read.
        file.read
        ( reinterpret_cast<char*>(A.Buffer()), height*width*sizeof(T) );
    }
    else
    {
        // Padded leading dimension: read each column into place.
        for( Int j=0; j<width; ++j )
            file.read
            ( reinterpret_cast<char*>(A.Buffer(0,j)), height*sizeof(T) );
    }
}
// Load a headerless ("flat") binary file of height*width entries of type T
// into a [CIRC,CIRC] distributed matrix.
//
// Every calling process opens the file and validates its size, but only the
// process whose cross rank equals A.Root() reads the payload.
//
// Parameters:
//   A        - destination matrix; resized to height x width.
//   height   - number of rows stored in the file.
//   width    - number of columns stored in the file.
//   filename - path of the file to read.
//
// Errors: RuntimeError is invoked if the file cannot be opened or if its size
// is not exactly height*width*sizeof(T) bytes.
inline void BinaryFlat
( DistMatrix<T,CIRC,CIRC>& A, Int height, Int width,
  const std::string filename )
{
    // The label previously read "read::Binary", which mislabelled this frame
    // relative to the other BinaryFlat overloads.
    DEBUG_ONLY(CallStackEntry cse("read::BinaryFlat"))
    std::ifstream file( filename.c_str(), std::ios::binary );
    if( !file.is_open() )
        RuntimeError("Could not open ",filename);

    const Int numBytes = FileSize( file );
    const Int numBytesExp = height*width*sizeof(T);
    if( numBytes != numBytesExp )
        RuntimeError
        ("Expected file to be ",numBytesExp," bytes but found ",numBytes);

    A.Resize( height, width );
    if( A.CrossRank() == A.Root() )
    {
        if( A.Height() == A.LDim() )
        {
            // Local storage has no padding between columns: one bulk read.
            file.read
            ( reinterpret_cast<char*>(A.Buffer()), height*width*sizeof(T) );
        }
        else
        {
            // Padded leading dimension: read each column into place.
            for( Int j=0; j<width; ++j )
                file.read
                ( reinterpret_cast<char*>(A.Buffer(0,j)), height*sizeof(T) );
        }
    }
}
inline void BinaryFlat ( DistMatrix<T,U,V>& A, Int height, Int width, const std::string filename ) { DEBUG_ONLY(CallStackEntry cse("read::BinaryFlat")) std::ifstream file( filename.c_str(), std::ios::binary ); if( !file.is_open() ) RuntimeError("Could not open ",filename); const Int numBytes = FileSize( file ); const Int numBytesExp = height*width*sizeof(T); if( numBytes != numBytesExp ) RuntimeError ("Expected file to be ",numBytesExp," bytes but found ",numBytes); A.Resize( height, width ); if( U == A.UGath && V == A.VGath ) { if( A.CrossRank() == A.Root() ) { if( A.Height() == A.LDim() ) file.read( (char*)A.Buffer(), height*width*sizeof(T) ); else for( Int j=0; j<width; ++j ) file.read( (char*)A.Buffer(0,j), height*sizeof(T) ); } } else if( U == A.UGath ) { const Int localWidth = A.LocalWidth(); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = A.GlobalCol(jLoc); const Int localIndex = j*height; const std::streamoff pos = localIndex*sizeof(T); file.seekg( pos ); file.read( (char*)A.Buffer(0,jLoc), height*sizeof(T) ); } } else { const Int localHeight = A.LocalHeight(); const Int localWidth = A.LocalWidth(); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = A.GlobalCol(jLoc); for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = A.GlobalRow(iLoc); const Int localIndex = i+j*height; const std::streamoff pos = localIndex*sizeof(T); file.seekg( pos ); file.read( (char*)A.Buffer(iLoc,jLoc), sizeof(T) ); } } } }
inline void BinaryFlat ( DistMatrix<T,STAR,V>& A, Int height, Int width, const std::string filename ) { DEBUG_ONLY(CallStackEntry cse("read::BinaryFlat")) std::ifstream file( filename.c_str(), std::ios::binary ); if( !file.is_open() ) RuntimeError("Could not open ",filename); const Int numBytes = FileSize( file ); const Int numBytesExp = height*width*sizeof(T); if( numBytes != numBytesExp ) RuntimeError ("Expected file to be ",numBytesExp," bytes but found ",numBytes); A.Resize( height, width ); const Int localWidth = A.LocalWidth(); const Int rowShift = A.RowShift(); const Int rowStride = A.RowStride(); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*rowStride; const Int localIndex = j*height; const std::streamoff pos = localIndex*sizeof(T); file.seekg( pos ); file.read( (char*)A.Buffer(0,jLoc), height*sizeof(T) ); } }
void Scatter ( const DistMatrix<T,CIRC,CIRC>& A, DistMatrix<T,STAR,STAR>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.Resize( height, width ); if( B.Participating() ) { const Int pkgSize = mpi::Pad( height*width ); vector<T> buffer; FastResize( buffer, pkgSize ); // Pack if( A.Participating() ) util::InterleaveMatrix ( height, width, A.LockedBuffer(), 1, A.LDim(), buffer.data(), 1, height ); // Broadcast from the process that packed mpi::Broadcast( buffer.data(), pkgSize, A.Root(), A.CrossComm() ); // Unpack util::InterleaveMatrix ( height, width, buffer.data(), 1, height, B.Buffer(), 1, B.LDim() ); } }
inline void MakeTriangular( UpperOrLower uplo, DistMatrix<T,U,V>& A ) { #ifndef RELEASE PushCallStack("MakeTriangular"); #endif const int height = A.Height(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int colStride = A.ColStride(); const int rowStride = A.RowStride(); T* buffer = A.Buffer(); const int ldim = A.LDim(); if( uplo == LOWER ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int lastZeroRow = j-1; if( lastZeroRow >= 0 ) { const int boundary = std::min( lastZeroRow+1, height ); const int numZeroRows = Length_( boundary, colShift, colStride ); MemZero( &buffer[jLocal*ldim], numZeroRows ); } } } else { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int firstZeroRow = j+1; const int numNonzeroRows = Length_(firstZeroRow,colShift,colStride); if( numNonzeroRows < localHeight ) { T* col = &buffer[numNonzeroRows+jLocal*ldim]; MemZero( col, localHeight-numNonzeroRows ); } } } #ifndef RELEASE PopCallStack(); #endif }
void ScaLAPACKHelper ( DistMatrix<F,MC,MR,BLOCK>& A, DistMatrix<F,MR,STAR,BLOCK>& householderScalars ) { EL_DEBUG_CSE AssertScaLAPACKSupport(); #ifdef EL_HAVE_SCALAPACK const Int m = A.Height(); const Int n = A.Width(); const Int minDim = Min(m,n); householderScalars.AlignWith( A ); householderScalars.Resize( minDim, 1 ); auto descA = FillDesc( A ); scalapack::QR ( m, n, A.Buffer(), descA.data(), householderScalars.Buffer() ); #endif }
// Apply an alpha-scaled update of B using A's local entries.
//
// A's local block (unit row stride, column stride A.LDim()) is passed to
// axpy::util::InterleaveMatrixUpdate with alpha. The target positions in B
// start at (A.ColShift(), A.RowShift()) and step by A.ColStride() rows and
// A.RowStride() columns.
void UpdateWithLocalData
( T alpha, const AbstractDistMatrix<T>& A, DistMatrix<T,STAR,STAR>& B )
{
    DEBUG_ONLY(CSE cse("axpy::util::UpdateWithLocalData"))
    const Int srcHeight = A.LocalHeight();
    const Int srcWidth = A.LocalWidth();
    const Int srcLDim = A.LDim();
    const Int dstRowInc = A.ColStride();
    const Int dstColInc = A.RowStride()*B.LDim();
    axpy::util::InterleaveMatrixUpdate
    ( alpha, srcHeight, srcWidth,
      A.LockedBuffer(),                   1,         srcLDim,
      B.Buffer(A.ColShift(),A.RowShift()), dstRowInc, dstColInc );
}
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { EL_DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { if( A.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf; FastResize( buf, (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) El::Broadcast( B, A.CrossComm(), A.Root() ); }
void StackedGeometricColumnScaling ( const DistMatrix<Field, U,V >& A, const DistMatrix<Field, U,V >& B, DistMatrix<Base<Field>,V,STAR>& geomScaling ) { EL_DEBUG_CSE // NOTE: Assuming A.ColComm() == B.ColComm() and that the row alignments // are equal typedef Base<Field> Real; DistMatrix<Real,V,STAR> maxScalingA(A.Grid()), maxScalingB(A.Grid()); ColumnMaxNorms( A, maxScalingA ); ColumnMaxNorms( B, maxScalingB ); const Int mLocalA = A.LocalHeight(); const Int mLocalB = B.LocalHeight(); const Int nLocal = A.LocalWidth(); geomScaling.AlignWith( maxScalingA ); geomScaling.Resize( A.Width(), 1 ); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.LockedMatrix(); auto& geomScalingLoc = geomScaling.Matrix(); auto& maxScalingALoc = maxScalingA.Matrix(); auto& maxScalingBLoc = maxScalingB.Matrix(); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { Real minAbs = Max(maxScalingALoc(jLoc),maxScalingBLoc(jLoc)); for( Int iLoc=0; iLoc<mLocalA; ++iLoc ) { const Real absVal = Abs(ALoc(iLoc,jLoc)); if( absVal > 0 && absVal < minAbs ) minAbs = Min(minAbs,absVal); } for( Int iLoc=0; iLoc<mLocalB; ++iLoc ) { const Real absVal = Abs(BLoc(iLoc,jLoc)); if( absVal > 0 && absVal < minAbs ) minAbs = Min(minAbs,absVal); } geomScalingLoc(jLoc) = minAbs; } mpi::AllReduce( geomScaling.Buffer(), nLocal, mpi::MIN, A.ColComm() ); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { const Real maxAbsA = maxScalingALoc(jLoc); const Real maxAbsB = maxScalingBLoc(jLoc); const Real maxAbs = Max(maxAbsA,maxAbsB); const Real minAbs = geomScalingLoc(jLoc); geomScalingLoc(jLoc) = Sqrt(minAbs*maxAbs); } }
inline void HermitianSVD ( UpperOrLower uplo, DistMatrix<F>& A, DistMatrix<BASE(F),VR,STAR>& s, DistMatrix<F>& U, DistMatrix<F>& V ) { #ifndef RELEASE CallStackEntry entry("HermitianSVD"); #endif #ifdef HAVE_PMRRR typedef BASE(F) R; // Grab an eigenvalue decomposition of A HermitianEig( uplo, A, s, V ); // Redistribute the singular values into an [MR,* ] distribution const Grid& grid = A.Grid(); DistMatrix<R,MR,STAR> s_MR_STAR( grid ); s_MR_STAR.AlignWith( V.DistData() ); s_MR_STAR = s; // Set the singular values to the absolute value of the eigenvalues const Int numLocalVals = s.LocalHeight(); for( Int iLoc=0; iLoc<numLocalVals; ++iLoc ) { const R sigma = s.GetLocal(iLoc,0); s.SetLocal(iLoc,0,Abs(sigma)); } // Copy V into U (flipping the sign as necessary) U.AlignWith( V ); U.ResizeTo( V.Height(), V.Width() ); const Int localHeight = V.LocalHeight(); const Int localWidth = V.LocalWidth(); for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const R sigma = s_MR_STAR.GetLocal( jLoc, 0 ); F* UCol = U.Buffer( 0, jLoc ); const F* VCol = V.LockedBuffer( 0, jLoc ); if( sigma >= 0 ) for( Int iLoc=0; iLoc<localHeight; ++iLoc ) UCol[iLoc] = VCol[iLoc]; else for( Int iLoc=0; iLoc<localHeight; ++iLoc ) UCol[iLoc] = -VCol[iLoc]; } #else U = A; MakeHermitian( uplo, U ); SVD( U, s, V ); #endif // ifdef HAVE_PMRRR }
void QR ( DistMatrix<F,MC,MR,BLOCK>& A, DistMatrix<F,MR,STAR,BLOCK>& phase ) { DEBUG_CSE AssertScaLAPACKSupport(); #ifdef EL_HAVE_SCALAPACK const Int m = A.Height(); const Int n = A.Width(); const Int minDim = Min(m,n); phase.AlignWith( A ); phase.Resize( minDim, 1 ); const int bHandle = blacs::Handle( A ); const int context = blacs::GridInit( bHandle, A ); auto descA = FillDesc( A, context ); scalapack::QR( m, n, A.Buffer(), descA.data(), phase.Buffer() ); blacs::FreeGrid( context ); blacs::FreeHandle( bHandle ); #endif }
void Filter ( const DistMatrix<T,Collect<U>(),Collect<V>()>& A, DistMatrix<T, U, V >& B ) { DEBUG_CSE AssertSameGrids( A, B ); B.Resize( A.Height(), A.Width() ); if( !B.Participating() ) return; const Int colShift = B.ColShift(); const Int rowShift = B.RowShift(); util::InterleaveMatrix ( B.LocalHeight(), B.LocalWidth(), A.LockedBuffer(colShift,rowShift), B.ColStride(), B.RowStride()*A.LDim(), B.Buffer(), 1, B.LDim() ); }
inline void AddInLocalData ( const DistMatrix<F,VC,STAR>& X1, DistMatrix<F,STAR,STAR>& Z ) { #ifndef RELEASE PushCallStack("internal::AddInLocalData"); #endif const int width = X1.Width(); const int localHeight = X1.LocalHeight(); const int stride = X1.Grid().Size(); const int offset = X1.ColShift(); for( int j=0; j<width; ++j ) { F* ZColBuffer = Z.Buffer(0,j); const F* X1ColBuffer = X1.LockedBuffer(0,j); for( int iLocal=0; iLocal<localHeight; ++iLocal ) ZColBuffer[offset+stride*iLocal] += X1ColBuffer[iLocal]; } #ifndef RELEASE PopCallStack(); #endif }
void AccumulateRHS( const DistMatrix<F,VC,STAR>& X, DistMatrix<F,STAR,STAR>& Z ) { const Int height = X.Height(); const Int width = X.Width(); Z.Empty(); Zeros( Z, height, width ); const Int localHeight = X.LocalHeight(); const Int colShift = X.ColShift(); const int commSize = X.Grid().Size(); const F* XBuffer = X.LockedBuffer(); F* ZBuffer = Z.Buffer(); const Int XLDim = X.LDim(); const Int ZLDim = Z.LDim(); for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = colShift + iLoc*commSize; for( Int j=0; j<width; ++j ) ZBuffer[i+j*ZLDim] = XBuffer[iLoc+j*XLDim]; } mpi::AllReduce( ZBuffer, ZLDim*width, mpi::SUM, X.Grid().VCComm() ); }
// Load a matrix from a binary file (header followed by payload) into a
// generically distributed matrix, each process reading only its own entries.
//
// Layout enforced by the size check below: two raw Int values (height, then
// width) followed by height*width entries of type T; entry (i,j) is read from
// byte offset 2*sizeof(Int) + (i+j*height)*sizeof(T).
//
// Parameters:
//   A        - destination matrix; resized to the dimensions in the header.
//   filename - path of the file to read.
//
// Errors: RuntimeError is invoked if the file cannot be opened, if the header
// cannot be read, or if the file size disagrees with the header.
inline void Binary( DistMatrix<T,U,V>& A, const std::string filename )
{
    DEBUG_ONLY(CallStackEntry cse("read::Binary"))
    std::ifstream file( filename.c_str(), std::ios::binary );
    if( !file.is_open() )
        RuntimeError("Could not open ",filename);

    // The header is accounted for as 2*sizeof(Int) raw bytes, so it has to be
    // pulled in with unformatted reads; operator>> would instead try to parse
    // whitespace-delimited decimal text.
    Int height=0, width=0;
    file.read( reinterpret_cast<char*>(&height), sizeof(Int) );
    file.read( reinterpret_cast<char*>(&width), sizeof(Int) );
    if( !file )
        RuntimeError("Could not read the matrix dimensions from ",filename);

    const Int numBytes = FileSize( file );
    const Int metaBytes = 2*sizeof(Int);
    const Int dataBytes = height*width*sizeof(T);
    const Int numBytesExp = metaBytes + dataBytes;
    if( numBytes != numBytesExp )
        RuntimeError
        ("Expected file to be ",numBytesExp," bytes but found ",numBytes);

    A.Resize( height, width );
    const Int localHeight = A.LocalHeight();
    const Int localWidth = A.LocalWidth();
    const Int colShift = A.ColShift();
    const Int rowShift = A.RowShift();
    const Int colStride = A.ColStride();
    const Int rowStride = A.RowStride();

    // Every local entry is fetched with its own absolute seek, so the reads
    // do not depend on the stream position left by the code above.
    for( Int jLoc=0; jLoc<localWidth; ++jLoc )
    {
        const Int j = rowShift + jLoc*rowStride;
        for( Int iLoc=0; iLoc<localHeight; ++iLoc )
        {
            const Int i = colShift + iLoc*colStride;
            const Int localIndex = i+j*height;
            const std::streamoff pos = metaBytes + localIndex*sizeof(T);
            file.seekg( pos );
            file.read
            ( reinterpret_cast<char*>(A.Buffer(iLoc,jLoc)), sizeof(T) );
        }
    }
}
void InPlaceRedist ( DistMatrix<F>& paddedZ, Int rowAlign, const Base<F>* readBuffer ) { typedef Base<F> Real; const Grid& g = paddedZ.Grid(); const Int height = paddedZ.Height(); const Int width = paddedZ.Width(); const Int r = g.Height(); const Int c = g.Width(); const Int p = r * c; const Int row = g.Row(); const Int col = g.Col(); const Int rowShift = paddedZ.RowShift(); const Int colAlign = paddedZ.ColAlign(); const Int localWidth = Length(width,g.VRRank(),rowAlign,p); const Int maxHeight = MaxLength(height,r); const Int maxWidth = MaxLength(width,p); const Int portionSize = mpi::Pad( maxHeight*maxWidth ); // Allocate our send/recv buffers std::vector<Real> buffer(2*r*portionSize); Real* sendBuffer = &buffer[0]; Real* recvBuffer = &buffer[r*portionSize]; // Pack OUTER_PARALLEL_FOR for( Int k=0; k<r; ++k ) { Real* data = &sendBuffer[k*portionSize]; const Int thisColShift = Shift(k,colAlign,r); const Int thisLocalHeight = Length(height,thisColShift,r); INNER_PARALLEL_FOR_COLLAPSE2 for( Int j=0; j<localWidth; ++j ) for( Int i=0; i<thisLocalHeight; ++i ) data[i+j*thisLocalHeight] = readBuffer[thisColShift+i*r+j*height]; } // Communicate mpi::AllToAll ( sendBuffer, portionSize, recvBuffer, portionSize, g.ColComm() ); // Unpack const Int localHeight = Length(height,row,colAlign,r); OUTER_PARALLEL_FOR for( Int k=0; k<r; ++k ) { const Real* data = &recvBuffer[k*portionSize]; const Int thisRank = col+k*c; const Int thisRowShift = Shift(thisRank,rowAlign,p); const Int thisRowOffset = (thisRowShift-rowShift) / c; const Int thisLocalWidth = Length(width,thisRowShift,p); INNER_PARALLEL_FOR for( Int j=0; j<thisLocalWidth; ++j ) { const Real* dataCol = &(data[j*localHeight]); Real* thisCol = (Real*)paddedZ.Buffer(0,thisRowOffset+j*r); if( IsComplex<F>::val ) { for( Int i=0; i<localHeight; ++i ) { thisCol[2*i] = dataCol[i]; thisCol[2*i+1] = 0; } } else { MemCopy( thisCol, dataCol, localHeight ); } } } }
void Gather ( const ElementalMatrix<T>& A, DistMatrix<T,CIRC,CIRC>& B ) { DEBUG_ONLY(CSE cse("copy::Gather")) AssertSameGrids( A, B ); if( A.DistSize() == 1 && A.CrossSize() == 1 ) { B.Resize( A.Height(), A.Width() ); if( B.CrossRank() == B.Root() ) Copy( A.LockedMatrix(), B.Matrix() ); return; } const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); // Gather the colShifts and rowShifts // ================================== Int myShifts[2]; myShifts[0] = A.ColShift(); myShifts[1] = A.RowShift(); vector<Int> shifts; const Int crossSize = B.CrossSize(); if( B.CrossRank() == B.Root() ) shifts.resize( 2*crossSize ); mpi::Gather( myShifts, 2, shifts.data(), 2, B.Root(), B.CrossComm() ); // Gather the payload data // ======================= const bool irrelevant = ( A.RedundantRank()!=0 || A.CrossRank()!=A.Root() ); int totalSend = ( irrelevant ? 0 : A.LocalHeight()*A.LocalWidth() ); vector<int> recvCounts, recvOffsets; if( B.CrossRank() == B.Root() ) recvCounts.resize( crossSize ); mpi::Gather( &totalSend, 1, recvCounts.data(), 1, B.Root(), B.CrossComm() ); int totalRecv = Scan( recvCounts, recvOffsets ); //vector<T> sendBuf(totalSend), recvBuf(totalRecv); vector<T> sendBuf, recvBuf; sendBuf.reserve( totalSend ); recvBuf.reserve( totalRecv ); if( !irrelevant ) copy::util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf.data(), 1, A.LocalHeight() ); mpi::Gather ( sendBuf.data(), totalSend, recvBuf.data(), recvCounts.data(), recvOffsets.data(), B.Root(), B.CrossComm() ); // Unpack // ====== if( B.Root() == B.CrossRank() ) { for( Int q=0; q<crossSize; ++q ) { if( recvCounts[q] == 0 ) continue; const Int colShift = shifts[2*q+0]; const Int rowShift = shifts[2*q+1]; const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int localHeight = Length( height, colShift, colStride ); const Int localWidth = Length( width, rowShift, rowStride ); 
copy::util::InterleaveMatrix ( localHeight, localWidth, &recvBuf[recvOffsets[q]], 1, localHeight, B.Buffer(colShift,rowShift), colStride, rowStride*B.LDim() ); } } }
void TranslateBetweenGrids ( const DistMatrix<T,MC,MR>& A, DistMatrix<T,MC,MR>& B ) { DEBUG_ONLY(CSE cse("copy::TranslateBetweenGrids [MC,MR]")) B.Resize( A.Height(), A.Width() ); // Just need to ensure that each viewing comm contains the other team's // owning comm. Congruence is too strong. // Compute the number of process rows and columns that each process // needs to send to. const Int colStride = B.ColStride(); const Int rowStride = B.RowStride(); const Int colRank = B.ColRank(); const Int rowRank = B.RowRank(); const Int colStrideA = A.ColStride(); const Int rowStrideA = A.RowStride(); const Int colGCD = GCD( colStride, colStrideA ); const Int rowGCD = GCD( rowStride, rowStrideA ); const Int colLCM = colStride*colStrideA / colGCD; const Int rowLCM = rowStride*rowStrideA / rowGCD; const Int numColSends = colStride / colGCD; const Int numRowSends = rowStride / rowGCD; const Int colAlign = B.ColAlign(); const Int rowAlign = B.RowAlign(); const Int colAlignA = A.ColAlign(); const Int rowAlignA = A.RowAlign(); const bool inBGrid = B.Participating(); const bool inAGrid = A.Participating(); if( !inBGrid && !inAGrid ) return; const Int maxSendSize = (A.Height()/(colStrideA*numColSends)+1) * (A.Width()/(rowStrideA*numRowSends)+1); // Translate the ranks from A's VC communicator to B's viewing so that // we can match send/recv communicators. Since A's VC communicator is not // necessarily defined on every process, we instead work with A's owning // group and account for row-major ordering if necessary. const int sizeA = A.Grid().Size(); vector<int> rankMap(sizeA), ranks(sizeA); if( A.Grid().Order() == COLUMN_MAJOR ) { for( int j=0; j<sizeA; ++j ) ranks[j] = j; } else { // The (i,j) = i + j*colStrideA rank in the column-major ordering is // equal to the j + i*rowStrideA rank in a row-major ordering. 
// Since we desire rankMap[i+j*colStrideA] to correspond to process // (i,j) in A's grid's rank in this viewing group, ranks[i+j*colStrideA] // should correspond to process (i,j) in A's owning group. Since the // owning group is ordered row-major in this case, its rank is // j+i*rowStrideA. Note that setting // ranks[j+i*rowStrideA] = i+j*colStrideA is *NOT* valid. for( int i=0; i<colStrideA; ++i ) for( int j=0; j<rowStrideA; ++j ) ranks[i+j*colStrideA] = j+i*rowStrideA; } mpi::Translate ( A.Grid().OwningGroup(), sizeA, &ranks[0], B.Grid().ViewingComm(), &rankMap[0] ); // Have each member of A's grid individually send to all numRow x numCol // processes in order, while the members of this grid receive from all // necessary processes at each step. Int requiredMemory = 0; if( inAGrid ) requiredMemory += maxSendSize; if( inBGrid ) requiredMemory += maxSendSize; vector<T> auxBuf( requiredMemory ); Int offset = 0; T* sendBuf = &auxBuf[offset]; if( inAGrid ) offset += maxSendSize; T* recvBuf = &auxBuf[offset]; Int recvRow = 0; // avoid compiler warnings... if( inAGrid ) recvRow = Mod(Mod(A.ColRank()-colAlignA,colStrideA)+colAlign,colStride); for( Int colSend=0; colSend<numColSends; ++colSend ) { Int recvCol = 0; // avoid compiler warnings... 
if( inAGrid ) recvCol=Mod(Mod(A.RowRank()-rowAlignA,rowStrideA)+rowAlign, rowStride); for( Int rowSend=0; rowSend<numRowSends; ++rowSend ) { mpi::Request sendRequest; // Fire off this round of non-blocking sends if( inAGrid ) { // Pack the data Int sendHeight = Length(A.LocalHeight(),colSend,numColSends); Int sendWidth = Length(A.LocalWidth(),rowSend,numRowSends); copy::util::InterleaveMatrix ( sendHeight, sendWidth, A.LockedBuffer(colSend,rowSend), numColSends, numRowSends*A.LDim(), sendBuf, 1, sendHeight ); // Send data const Int recvVCRank = recvRow + recvCol*colStride; const Int recvViewingRank = B.Grid().VCToViewing( recvVCRank ); mpi::ISend ( sendBuf, sendHeight*sendWidth, recvViewingRank, B.Grid().ViewingComm(), sendRequest ); } // Perform this round of recv's if( inBGrid ) { const Int sendColOffset = colAlignA; const Int recvColOffset = (colSend*colStrideA+colAlign) % colStride; const Int sendRowOffset = rowAlignA; const Int recvRowOffset = (rowSend*rowStrideA+rowAlign) % rowStride; const Int firstSendRow = Mod( Mod(colRank-recvColOffset,colStride)+sendColOffset, colStrideA ); const Int firstSendCol = Mod( Mod(rowRank-recvRowOffset,rowStride)+sendRowOffset, rowStrideA ); const Int colShift = Mod( colRank-recvColOffset, colStride ); const Int rowShift = Mod( rowRank-recvRowOffset, rowStride ); const Int numColRecvs = Length( colStrideA, colShift, colStride ); const Int numRowRecvs = Length( rowStrideA, rowShift, rowStride ); // Recv data // For now, simply receive sequentially. 
Until we switch to // nonblocking recv's, we won't be using much of the // recvBuf Int sendRow = firstSendRow; for( Int colRecv=0; colRecv<numColRecvs; ++colRecv ) { const Int sendColShift = Shift( sendRow, colAlignA, colStrideA ) + colSend*colStrideA; const Int sendHeight = Length( A.Height(), sendColShift, colLCM ); const Int localColOffset = (sendColShift-B.ColShift()) / colStride; Int sendCol = firstSendCol; for( Int rowRecv=0; rowRecv<numRowRecvs; ++rowRecv ) { const Int sendRowShift = Shift( sendCol, rowAlignA, rowStrideA ) + rowSend*rowStrideA; const Int sendWidth = Length( A.Width(), sendRowShift, rowLCM ); const Int localRowOffset = (sendRowShift-B.RowShift()) / rowStride; const Int sendVCRank = sendRow+sendCol*colStrideA; mpi::Recv ( recvBuf, sendHeight*sendWidth, rankMap[sendVCRank], B.Grid().ViewingComm() ); // Unpack the data copy::util::InterleaveMatrix ( sendHeight, sendWidth, recvBuf, 1, sendHeight, B.Buffer(localColOffset,localRowOffset), colLCM/colStride, (rowLCM/rowStride)*B.LDim() ); // Set up the next send col sendCol = (sendCol + rowStride) % rowStrideA; } // Set up the next send row sendRow = (sendRow + colStride) % colStrideA; } } // Ensure that this round of non-blocking sends completes if( inAGrid ) { mpi::Wait( sendRequest ); recvCol = (recvCol + rowStrideA) % rowStride; } } if( inAGrid ) recvRow = (recvRow + colStrideA) % colStride; } }
inline void MakeTrapezoidal ( LeftOrRight side, UpperOrLower uplo, int offset, DistMatrix<T,U,V>& A ) { #ifndef RELEASE PushCallStack("MakeTrapezoidal"); #endif const int height = A.Height(); const int width = A.Width(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); const int colStride = A.ColStride(); const int rowStride = A.RowStride(); T* buffer = A.Buffer(); const int ldim = A.LDim(); if( uplo == LOWER ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int lastZeroRow = ( side==LEFT ? j-offset-1 : j-offset+height-width-1 ); if( lastZeroRow >= 0 ) { const int boundary = std::min( lastZeroRow+1, height ); const int numZeroRows = Length_( boundary, colShift, colStride ); MemZero( &buffer[jLocal*ldim], numZeroRows ); } } } else { #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*rowStride; const int firstZeroRow = ( side==LEFT ? std::max(j-offset+1,0) : std::max(j-offset+height-width+1,0) ); const int numNonzeroRows = Length_(firstZeroRow,colShift,colStride); if( numNonzeroRows < localHeight ) { T* col = &buffer[numNonzeroRows+jLocal*ldim]; MemZero( col, localHeight-numNonzeroRows ); } } } #ifndef RELEASE PopCallStack(); #endif }
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { DEBUG_ONLY(CSE cse("copy::AllGather")) AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf( (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) { // Pack from the root const Int BLocalHeight = B.LocalHeight(); const Int BLocalWidth = B.LocalWidth(); vector<T> buf(BLocalHeight*BLocalWidth); if( A.CrossRank() == A.Root() ) util::InterleaveMatrix ( BLocalHeight, BLocalWidth, B.LockedBuffer(), 1, B.LDim(), buf.data(), 1, BLocalHeight ); // Broadcast from the root mpi::Broadcast ( buf.data(), BLocalHeight*BLocalWidth, A.Root(), A.CrossComm() ); // Unpack if not the root if( A.CrossRank() != A.Root() ) util::InterleaveMatrix ( BLocalHeight, BLocalWidth, buf.data(), 1, BLocalHeight, B.Buffer(), 1, B.LDim() ); } }
void ColAllToAllDemote ( const DistMatrix<T,Partial<U>(),PartialUnionRow<U,V>()>& A, DistMatrix<T, U, V >& B ) { DEBUG_ONLY(CallStackEntry cse("copy::ColAllToAllDemote")) AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.AlignColsAndResize( A.ColAlign(), height, width, false, false ); if( !B.Participating() ) return; const Int colAlign = B.ColAlign(); const Int rowAlignA = A.RowAlign(); const Int colStride = B.ColStride(); const Int colStridePart = B.PartialColStride(); const Int colStrideUnion = B.PartialUnionColStride(); const Int colRankPart = B.PartialColRank(); const Int colDiff = (colAlign%colStridePart) - A.ColAlign(); const Int colShiftA = A.ColShift(); const Int localHeightB = B.LocalHeight(); const Int localWidthA = A.LocalWidth(); const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,colStrideUnion); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); std::vector<T> buffer( 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; if( colDiff == 0 ) { // Pack util::PartialColStridedPack ( height, localWidthA, colAlign, colStride, colStrideUnion, colStridePart, colRankPart, colShiftA, A.LockedBuffer(), A.LDim(), firstBuf, portionSize ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, B.PartialUnionColComm() ); // Unpack util::RowStridedUnpack ( localHeightB, width, rowAlignA, colStrideUnion, secondBuf, portionSize, B.Buffer(), B.LDim() ); } else { #ifdef EL_UNALIGNED_WARNINGS if( B.Grid().Rank() == 0 ) std::cerr << "Unaligned ColAllToAllDemote" << std::endl; #endif const Int sendColRankPart = Mod( colRankPart+colDiff, colStridePart ); const Int recvColRankPart = Mod( colRankPart-colDiff, colStridePart ); // Pack util::PartialColStridedPack ( height, localWidthA, colAlign, colStride, colStrideUnion, colStridePart, sendColRankPart, colShiftA, 
A.LockedBuffer(), A.LDim(), secondBuf, portionSize ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( secondBuf, portionSize, firstBuf, portionSize, B.PartialUnionColComm() ); // Realign the result mpi::SendRecv ( firstBuf, colStrideUnion*portionSize, sendColRankPart, secondBuf, colStrideUnion*portionSize, recvColRankPart, B.PartialColComm() ); // Unpack util::RowStridedUnpack ( localHeightB, width, rowAlignA, colStrideUnion, secondBuf, portionSize, B.Buffer(), B.LDim() ); } }
void TransposeDist( const DistMatrix<T,U,V>& A, DistMatrix<T,V,U>& B ) { DEBUG_ONLY(CSE cse("copy::TransposeDist")) AssertSameGrids( A, B ); const Grid& g = B.Grid(); B.Resize( A.Height(), A.Width() ); if( !B.Participating() ) return; const Int colStrideA = A.ColStride(); const Int rowStrideA = A.RowStride(); const Int distSize = A.DistSize(); if( A.DistSize() == 1 && B.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else if( A.Width() == 1 ) { const Int height = A.Height(); const Int maxLocalHeight = MaxLength(height,distSize); const Int portionSize = mpi::Pad( maxLocalHeight ); const Int colDiff = Shift(A.DistRank(),A.ColAlign(),distSize) - Shift(B.DistRank(),B.ColAlign(),distSize); const Int sendRankB = Mod( B.DistRank()+colDiff, distSize ); const Int recvRankA = Mod( A.DistRank()-colDiff, distSize ); const Int recvRankB = (recvRankA/colStrideA)+rowStrideA*(recvRankA%colStrideA); vector<T> buffer; FastResize( buffer, (colStrideA+rowStrideA)*portionSize ); T* sendBuf = &buffer[0]; T* recvBuf = &buffer[colStrideA*portionSize]; if( A.RowRank() == A.RowAlign() ) { // Pack // TODO: Use kernel from copy::util const Int AColShift = A.ColShift(); const T* ABuf = A.LockedBuffer(); EL_PARALLEL_FOR for( Int k=0; k<rowStrideA; ++k ) { T* data = &recvBuf[k*portionSize]; const Int shift = Shift_(A.ColRank()+colStrideA*k,A.ColAlign(),distSize); const Int offset = (shift-AColShift) / colStrideA; const Int thisLocalHeight = Length_(height,shift,distSize); for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc ) data[iLoc] = ABuf[offset+iLoc*rowStrideA]; } } // (e.g., A[VC,STAR] <- A[MC,MR]) mpi::Scatter ( recvBuf, portionSize, sendBuf, portionSize, A.RowAlign(), A.RowComm() ); // (e.g., A[VR,STAR] <- A[VC,STAR]) mpi::SendRecv ( sendBuf, portionSize, sendRankB, recvBuf, portionSize, recvRankB, B.DistComm() ); // (e.g., A[MR,MC] <- A[VR,STAR]) mpi::Gather ( recvBuf, portionSize, sendBuf, portionSize, B.RowAlign(), B.RowComm() ); if( B.RowRank() == B.RowAlign() ) { // Unpack // 
TODO: Use kernel from copy::util T* bufB = B.Buffer(); EL_PARALLEL_FOR for( Int k=0; k<colStrideA; ++k ) { const T* data = &sendBuf[k*portionSize]; const Int shift = Shift_(B.ColRank()+rowStrideA*k,B.ColAlign(),distSize); const Int offset = (shift-B.ColShift()) / rowStrideA; const Int thisLocalHeight = Length_(height,shift,distSize); for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc ) bufB[offset+iLoc*colStrideA] = data[iLoc]; } } }
void ColAllToAllPromote ( const DistMatrix<T, U, V >& A, DistMatrix<T,Partial<U>(),PartialUnionRow<U,V>()>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.AlignColsAndResize ( Mod(A.ColAlign(),B.ColStride()), height, width, false, false ); if( !B.Participating() ) return; const Int colStride = A.ColStride(); const Int colStridePart = A.PartialColStride(); const Int colStrideUnion = A.PartialUnionColStride(); const Int colRankPart = A.PartialColRank(); const Int colDiff = B.ColAlign() - Mod(A.ColAlign(),colStridePart); const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,colStrideUnion); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); if( colDiff == 0 ) { if( A.PartialUnionColStride() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), colStrideUnion, A.LockedBuffer(), A.LDim(), firstBuf, portionSize ); // Simultaneously Gather in columns and Scatter in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, colRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } } else { #ifdef EL_UNALIGNED_WARNINGS if( A.Grid().Rank() == 0 ) cerr << "Unaligned PartialColAllToAllPromote" << endl; #endif const Int sendColRankPart = Mod( colRankPart+colDiff, colStridePart ); const Int recvColRankPart = Mod( colRankPart-colDiff, colStridePart ); vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), 
colStrideUnion, A.LockedBuffer(), A.LDim(), secondBuf, portionSize ); // Realign the input mpi::SendRecv ( secondBuf, colStrideUnion*portionSize, sendColRankPart, firstBuf, colStrideUnion*portionSize, recvColRankPart, A.PartialColComm() ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, recvColRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } }
void FormDiagonalBlocks ( const DistMatrix<F,VC,STAR>& L, DistMatrix<F,STAR,STAR>& D, bool conjugate ) { const Grid& g = L.Grid(); const Int height = L.Width(); const Int blocksize = Blocksize(); const int commRank = g.VCRank(); const int commSize = g.Size(); const Int localHeight = Length(height,commRank,commSize); const Int maxLocalHeight = MaxLength(height,commSize); const Int portionSize = maxLocalHeight*blocksize; std::vector<F> sendBuffer( portionSize ); const Int colShift = L.ColShift(); const Int LLDim = L.LDim(); const F* LBuffer = L.LockedBuffer(); if( conjugate ) { for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = colShift + iLoc*commSize; const Int block = i / blocksize; const Int jStart = block*blocksize; const Int b = std::min(height-jStart,blocksize); for( Int jOff=0; jOff<b; ++jOff ) sendBuffer[iLoc*blocksize+jOff] = Conj(LBuffer[iLoc+(jStart+jOff)*LLDim]); } } else { for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int i = colShift + iLoc*commSize; const Int block = i / blocksize; const Int jStart = block*blocksize; const Int b = std::min(height-jStart,blocksize); for( Int jOff=0; jOff<b; ++jOff ) sendBuffer[iLoc*blocksize+jOff] = LBuffer[iLoc+(jStart+jOff)*LLDim]; } } std::vector<F> recvBuffer( portionSize*commSize ); mpi::AllGather ( &sendBuffer[0], portionSize, &recvBuffer[0], portionSize, g.VCComm() ); SwapClear( sendBuffer ); D.Resize( blocksize, height ); F* DBuffer = D.Buffer(); const Int DLDim = D.LDim(); for( Int proc=0; proc<commSize; ++proc ) { const F* procRecv = &recvBuffer[proc*portionSize]; const Int procLocalHeight = Length(height,proc,commSize); for( Int iLoc=0; iLoc<procLocalHeight; ++iLoc ) { const Int i = proc + iLoc*commSize; for( Int jOff=0; jOff<blocksize; ++jOff ) DBuffer[jOff+i*DLDim] = procRecv[jOff+iLoc*blocksize]; } } }
void Read_MPI(DistMatrix<DataType> &M, std::string filename, FileFormat format = BINARY, bool sequential = false) { // TODO: error out if format != BINARY // TODO: use TypeMap<>() and templating to figure this out MPI_Datatype type = DataTypeMPI; // define our file name const char* path = filename.c_str(); // get MPI communicator MPI_Comm comm = M.Grid().Comm().comm; // get our rank int rank = M.Grid().Rank(); // open the file MPI_File fh; MPI_Status status; char datarep[] = "native"; int amode = MPI_MODE_RDONLY; int rc = MPI_File_open(comm, path, amode, MPI_INFO_NULL, &fh); if (rc != MPI_SUCCESS) { if (rank == 0) { cout << "Failed to open file `" << path << "'" << endl; } return; } // set displacement to beginning of file MPI_Offset disp = 0; // set our view to read header (height and width as unsigned 32-bit ints) uint32_t dimensions[2]; MPI_File_set_view(fh, disp, MPI_UINT32_T, MPI_UINT32_T, datarep, MPI_INFO_NULL); if (rank == 0) { MPI_File_read_at(fh, 0, dimensions, 2, MPI_UINT32_T, &status); } disp += 2 * sizeof(uint32_t); // broadcast dimensions from rank 0 MPI_Bcast(dimensions, 2, MPI_UINT32_T, 0, comm); // resize matrix to hold data Int global_height = dimensions[0]; Int global_width = dimensions[1]; M.Resize(global_height, global_width); // now define datatypes to describe local buffer and view into file MPI_Datatype mattype, viewtype; create_types(M, &mattype, &viewtype); // set view to write data MPI_File_set_view(fh, disp, type, viewtype, datarep, MPI_INFO_NULL); // write our portion of the matrix, since we set our view using create_darray, // all procs write at offset 0, the file view will take care of interleaving appropriately char* buf = (char*) M.Buffer(); MPI_File_read_at_all(fh, 0, buf, 1, mattype, &status); // close file MPI_File_close(&fh); // free our datatypes MPI_Type_free(&mattype); MPI_Type_free(&viewtype); return; }
inline void Syr2 ( UpperOrLower uplo, T alpha, const DistMatrix<T>& x, const DistMatrix<T>& y, DistMatrix<T>& A, bool conjugate=false ) { #ifndef RELEASE CallStackEntry entry("Syr2"); if( A.Grid() != x.Grid() || x.Grid() != y.Grid() ) LogicError ("{A,x,y} must be distributed over the same grid"); if( A.Height() != A.Width() ) LogicError("A must be square"); const Int xLength = ( x.Width()==1 ? x.Height() : x.Width() ); const Int yLength = ( y.Width()==1 ? y.Height() : y.Width() ); if( A.Height() != xLength || A.Height() != yLength ) { std::ostringstream msg; msg << "A must conform with x: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " x ~ " << x.Height() << " x " << x.Width() << "\n" << " y ~ " << y.Height() << " x " << y.Width() << "\n"; LogicError( msg.str() ); } #endif const Grid& g = A.Grid(); const Int localHeight = A.LocalHeight(); const Int localWidth = A.LocalWidth(); const Int r = g.Height(); const Int c = g.Width(); const Int colShift = A.ColShift(); const Int rowShift = A.RowShift(); if( x.Width() == 1 && y.Width() == 1 ) { DistMatrix<T,MC,STAR> x_MC_STAR(g), y_MC_STAR(g); DistMatrix<T,MR,STAR> x_MR_STAR(g), y_MR_STAR(g); x_MC_STAR.AlignWith( A ); x_MR_STAR.AlignWith( A ); y_MC_STAR.AlignWith( A ); y_MR_STAR.AlignWith( A ); //--------------------------------------------------------------------// x_MC_STAR = x; x_MR_STAR = x_MC_STAR; y_MC_STAR = y; y_MR_STAR = y_MC_STAR; const T* xBuffer = x_MC_STAR.LockedBuffer(); const T* yBuffer = y_MC_STAR.LockedBuffer(); if( uplo == LOWER ) { for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*c; const Int heightAboveDiag = Length(j,colShift,r); const T beta = y_MR_STAR.GetLocal(jLoc,0); const T kappa = x_MR_STAR.GetLocal(jLoc,0); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); const T delta = ( conjugate ? 
alpha*Conj(kappa) : alpha*kappa ); T* ACol = A.Buffer(0,jLoc); for( Int iLoc=heightAboveDiag; iLoc<localHeight; ++iLoc ) ACol[iLoc] += gamma*xBuffer[iLoc] + delta*yBuffer[iLoc]; } } else { for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*c; const Int heightToDiag = Length(j+1,colShift,r); const T beta = y_MR_STAR.GetLocal(jLoc,0); const T kappa = x_MR_STAR.GetLocal(jLoc,0); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); const T delta = ( conjugate ? alpha*Conj(kappa) : alpha*kappa ); T* ACol = A.Buffer(0,jLoc); for( Int iLoc=0; iLoc<heightToDiag; ++iLoc ) ACol[iLoc] += gamma*xBuffer[iLoc] + delta*yBuffer[iLoc]; } } //--------------------------------------------------------------------// } else if( x.Width() == 1 ) { DistMatrix<T,MC,STAR> x_MC_STAR(g); DistMatrix<T,MR,STAR> x_MR_STAR(g); DistMatrix<T,STAR,MC> y_STAR_MC(g); DistMatrix<T,STAR,MR> y_STAR_MR(g); x_MC_STAR.AlignWith( A ); x_MR_STAR.AlignWith( A ); y_STAR_MC.AlignWith( A ); y_STAR_MR.AlignWith( A ); //--------------------------------------------------------------------// x_MC_STAR = x; x_MR_STAR = x_MC_STAR; y_STAR_MR = y; y_STAR_MC = y_STAR_MR; const T* xBuffer = x_MC_STAR.LockedBuffer(); const T* yBuffer = y_STAR_MC.LockedBuffer(); const Int incy = y_STAR_MC.LDim(); if( uplo == LOWER ) { for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*c; const Int heightAboveDiag = Length(j,colShift,r); const T beta = y_STAR_MR.GetLocal(0,jLoc); const T kappa = x_MR_STAR.GetLocal(jLoc,0); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); const T delta = ( conjugate ? 
alpha*Conj(kappa) : alpha*kappa ); T* ACol = A.Buffer(0,jLoc); for( Int iLoc=heightAboveDiag; iLoc<localHeight; ++iLoc ) ACol[iLoc] += gamma*xBuffer[iLoc] + delta*yBuffer[iLoc*incy]; } } else { for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*c; const Int heightToDiag = Length(j+1,colShift,r); const T beta = y_STAR_MR.GetLocal(0,jLoc); const T kappa = x_MR_STAR.GetLocal(jLoc,0); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); const T delta = ( conjugate ? alpha*Conj(kappa) : alpha*kappa ); T* ACol = A.Buffer(0,jLoc); for( Int iLoc=0; iLoc<heightToDiag; ++iLoc ) ACol[iLoc] += gamma*xBuffer[iLoc] + delta*yBuffer[iLoc*incy]; } } //--------------------------------------------------------------------// } else if( y.Width() == 1 ) { DistMatrix<T,STAR,MC> x_STAR_MC(g); DistMatrix<T,STAR,MR> x_STAR_MR(g); DistMatrix<T,MC,STAR> y_MC_STAR(g); DistMatrix<T,MR,STAR> y_MR_STAR(g); x_STAR_MC.AlignWith( A ); x_STAR_MR.AlignWith( A ); y_MC_STAR.AlignWith( A ); y_MR_STAR.AlignWith( A ); //--------------------------------------------------------------------// x_STAR_MR = x; x_STAR_MC = x_STAR_MR; y_MC_STAR = y; y_MR_STAR = y_MC_STAR; const T* xBuffer = x_STAR_MC.LockedBuffer(); const T* yBuffer = y_MC_STAR.LockedBuffer(); const Int incx = x_STAR_MC.LDim(); if( uplo == LOWER ) { for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*c; const Int heightAboveDiag = Length(j,colShift,r); const T beta = x_STAR_MR.GetLocal(0,jLoc); const T kappa = y_MR_STAR.GetLocal(jLoc,0); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); const T delta = ( conjugate ? 
alpha*Conj(kappa) : alpha*kappa ); T* ACol = A.Buffer(0,jLoc); for( Int iLoc=heightAboveDiag; iLoc<localHeight; ++iLoc ) ACol[iLoc] += gamma*xBuffer[iLoc*incx] + delta*yBuffer[iLoc]; } } else { for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*c; const Int heightToDiag = Length(j+1,colShift,r); const T beta = x_STAR_MR.GetLocal(0,jLoc); const T kappa = y_MR_STAR.GetLocal(jLoc,0); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); const T delta = ( conjugate ? alpha*Conj(kappa) : alpha*kappa ); T* ACol = A.Buffer(0,jLoc); for( Int iLoc=0; iLoc<heightToDiag; ++iLoc ) ACol[iLoc] += gamma*xBuffer[iLoc*incx] + delta*yBuffer[iLoc]; } } //--------------------------------------------------------------------// } else { DistMatrix<T,STAR,MC> x_STAR_MC(g), y_STAR_MC(g); DistMatrix<T,STAR,MR> x_STAR_MR(g), y_STAR_MR(g); x_STAR_MC.AlignWith( A ); x_STAR_MR.AlignWith( A ); y_STAR_MC.AlignWith( A ); y_STAR_MR.AlignWith( A ); //--------------------------------------------------------------------// x_STAR_MR = x; x_STAR_MC = x_STAR_MR; y_STAR_MR = y; y_STAR_MC = y_STAR_MR; const T* xBuffer = x_STAR_MC.LockedBuffer(); const T* yBuffer = y_STAR_MC.LockedBuffer(); const Int incx = x_STAR_MC.LDim(); const Int incy = y_STAR_MC.LDim(); if( uplo == LOWER ) { for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*c; const Int heightAboveDiag = Length(j,colShift,r); const T beta = y_STAR_MR.GetLocal(0,jLoc); const T kappa = x_STAR_MR.GetLocal(0,jLoc); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); const T delta = ( conjugate ? 
alpha*Conj(kappa) : alpha*kappa ); T* ACol = A.Buffer(0,jLoc); for( Int iLoc=heightAboveDiag; iLoc<localHeight; ++iLoc ) ACol[iLoc] += gamma*xBuffer[iLoc*incx] + delta*yBuffer[iLoc*incy]; } } else { for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int j = rowShift + jLoc*c; const Int heightToDiag = Length(j+1,colShift,r); const T beta = y_STAR_MR.GetLocal(0,jLoc); const T kappa = x_STAR_MR.GetLocal(0,jLoc); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); const T delta = ( conjugate ? alpha*Conj(kappa) : alpha*kappa ); T* ACol = A.Buffer(0,jLoc); for( Int iLoc=0; iLoc<heightToDiag; ++iLoc ) ACol[iLoc] += gamma*xBuffer[iLoc*incx] + delta*yBuffer[iLoc*incy]; } } //--------------------------------------------------------------------// } }
inline void Syr ( UpperOrLower uplo, T alpha, const DistMatrix<T>& x, DistMatrix<T>& A, bool conjugate=false ) { #ifndef RELEASE PushCallStack("Syr"); if( A.Grid() != x.Grid() ) throw std::logic_error ("A and x must be distributed over the same grid"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); const int xLength = ( x.Width()==1 ? x.Height() : x.Width() ); if( A.Height() != xLength ) { std::ostringstream msg; msg << "A must conform with x: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " x ~ " << x.Height() << " x " << x.Width() << "\n"; throw std::logic_error( msg.str() ); } #endif const Grid& g = A.Grid(); const int localHeight = A.LocalHeight(); const int localWidth = A.LocalWidth(); const int r = g.Height(); const int c = g.Width(); const int colShift = A.ColShift(); const int rowShift = A.RowShift(); if( x.Width() == 1 ) { DistMatrix<T,MC,STAR> x_MC_STAR(g); DistMatrix<T,MR,STAR> x_MR_STAR(g); x_MC_STAR.AlignWith( A ); x_MR_STAR.AlignWith( A ); //--------------------------------------------------------------------// x_MC_STAR = x; x_MR_STAR = x_MC_STAR; const T* xBuffer = x_MC_STAR.LockedBuffer(); if( uplo == LOWER ) { for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*c; const int heightAboveDiag = Length(j,colShift,r); const T beta = x_MR_STAR.GetLocal(jLocal,0); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); T* ACol = A.Buffer(0,jLocal); for( int iLocal=heightAboveDiag; iLocal<localHeight; ++iLocal ) ACol[iLocal] += gamma*xBuffer[iLocal]; } } else { for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*c; const int heightToDiag = Length(j+1,colShift,r); const T beta = x_MR_STAR.GetLocal(jLocal,0); const T gamma = ( conjugate ? 
alpha*Conj(beta) : alpha*beta ); T* ACol = A.Buffer(0,jLocal); for( int iLocal=0; iLocal<heightToDiag; ++iLocal ) ACol[iLocal] += gamma*xBuffer[iLocal]; } } //--------------------------------------------------------------------// x_MC_STAR.FreeAlignments(); x_MR_STAR.FreeAlignments(); } else { DistMatrix<T,STAR,MC> x_STAR_MC(g); DistMatrix<T,STAR,MR> x_STAR_MR(g); x_STAR_MC.AlignWith( A ); x_STAR_MR.AlignWith( A ); //--------------------------------------------------------------------// x_STAR_MR = x; x_STAR_MC = x_STAR_MR; const T* xBuffer = x_STAR_MC.LockedBuffer(); const int incx = x_STAR_MC.LDim(); if( uplo == LOWER ) { for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*c; const int heightAboveDiag = Length(j,colShift,r); const T beta = x_STAR_MR.GetLocal(0,jLocal); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); T* ACol = A.Buffer(0,jLocal); for( int iLocal=heightAboveDiag; iLocal<localHeight; ++iLocal ) ACol[iLocal] += gamma*xBuffer[iLocal*incx]; } } else { for( int jLocal=0; jLocal<localWidth; ++jLocal ) { const int j = rowShift + jLocal*c; const int heightToDiag = Length(j+1,colShift,r); const T beta = x_STAR_MR.GetLocal(0,jLocal); const T gamma = ( conjugate ? alpha*Conj(beta) : alpha*beta ); T* ACol = A.Buffer(0,jLocal); for( int iLocal=0; iLocal<heightToDiag; ++iLocal ) ACol[iLocal] += gamma*xBuffer[iLocal*incx]; } } //--------------------------------------------------------------------// x_STAR_MC.FreeAlignments(); x_STAR_MR.FreeAlignments(); } #ifndef RELEASE PopCallStack(); #endif }