// Local-only Ger: passes alpha together with the locally stored matrices of
// x, y, and A to the Ger overload that operates on local matrices.
// No conformality checks are performed on the distributions (see TODO).
inline void LocalGer
( T alpha,
  const DistMatrix<T,xColDist,xRowDist>& x,
  const DistMatrix<T,yColDist,yRowDist>& y,
        DistMatrix<T,AColDist,ARowDist>& A )
{
    DEBUG_ONLY(CallStackEntry cse("LocalGer"))
    // TODO: Add error checking here
    const auto& xLocal = x.LockedMatrix();
    const auto& yLocal = y.LockedMatrix();
    auto& ALocal = A.Matrix();
    Ger( alpha, xLocal, yLocal, ALocal );
}
// Local-only Ger: hands alpha and the locally stored matrices of x, y, and A
// to the Ger overload for local matrices. The distributions of the three
// operands are not validated here (see TODO).
inline void LocalGer
( T alpha,
  const DistMatrix<T,xColDist,xRowDist>& x,
  const DistMatrix<T,yColDist,yRowDist>& y,
        DistMatrix<T,AColDist,ARowDist>& A )
{
#ifndef RELEASE
    CallStackEntry entry("LocalGer");
    // TODO: Add error checking here
#endif
    const Matrix<T>& xLocal = x.LockedMatrix();
    const Matrix<T>& yLocal = y.LockedMatrix();
    Matrix<T>& ALocal = A.Matrix();
    Ger( alpha, xLocal, yLocal, ALocal );
}
void StackedGeometricColumnScaling ( const DistMatrix<Field, U,V >& A, const DistMatrix<Field, U,V >& B, DistMatrix<Base<Field>,V,STAR>& geomScaling ) { EL_DEBUG_CSE // NOTE: Assuming A.ColComm() == B.ColComm() and that the row alignments // are equal typedef Base<Field> Real; DistMatrix<Real,V,STAR> maxScalingA(A.Grid()), maxScalingB(A.Grid()); ColumnMaxNorms( A, maxScalingA ); ColumnMaxNorms( B, maxScalingB ); const Int mLocalA = A.LocalHeight(); const Int mLocalB = B.LocalHeight(); const Int nLocal = A.LocalWidth(); geomScaling.AlignWith( maxScalingA ); geomScaling.Resize( A.Width(), 1 ); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.LockedMatrix(); auto& geomScalingLoc = geomScaling.Matrix(); auto& maxScalingALoc = maxScalingA.Matrix(); auto& maxScalingBLoc = maxScalingB.Matrix(); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { Real minAbs = Max(maxScalingALoc(jLoc),maxScalingBLoc(jLoc)); for( Int iLoc=0; iLoc<mLocalA; ++iLoc ) { const Real absVal = Abs(ALoc(iLoc,jLoc)); if( absVal > 0 && absVal < minAbs ) minAbs = Min(minAbs,absVal); } for( Int iLoc=0; iLoc<mLocalB; ++iLoc ) { const Real absVal = Abs(BLoc(iLoc,jLoc)); if( absVal > 0 && absVal < minAbs ) minAbs = Min(minAbs,absVal); } geomScalingLoc(jLoc) = minAbs; } mpi::AllReduce( geomScaling.Buffer(), nLocal, mpi::MIN, A.ColComm() ); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { const Real maxAbsA = maxScalingALoc(jLoc); const Real maxAbsB = maxScalingBLoc(jLoc); const Real maxAbs = Max(maxAbsA,maxAbsB); const Real minAbs = geomScalingLoc(jLoc); geomScalingLoc(jLoc) = Sqrt(minAbs*maxAbs); } }
// [MD,* ] <- [MD,* ] assignment.
// If this matrix is neither a view nor constrained in its column alignment,
// it first adopts A's diagonal path and column alignment (and, when
// participating, A's column shift). After resizing, the local matrix is
// copied when the diagonal path and column alignment agree; otherwise a
// LogicError is raised because the unaligned case is not implemented.
const DistMatrix<T,MD,STAR>&
DistMatrix<T,MD,STAR>::operator=( const DistMatrix<T,MD,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry entry("[MD,* ] = [MD,* ]");
    this->AssertNotLocked();
    this->AssertSameGrid( A.Grid() );
#endif
    const bool mayAdoptAlignment =
        !this->Viewing() && !this->ConstrainedColAlignment();
    if( mayAdoptAlignment )
    {
        this->diagPath_ = A.diagPath_;
        this->colAlignment_ = A.colAlignment_;
        if( this->Participating() )
            this->colShift_ = A.ColShift();
    }
    this->ResizeTo( A.Height(), A.Width() );

    const bool aligned =
        this->diagPath_ == A.diagPath_ &&
        this->colAlignment_ == A.colAlignment_;
    if( !aligned )
    {
#ifdef UNALIGNED_WARNINGS
        if( this->Grid().Rank() == 0 )
            std::cerr << "Unaligned [MD,* ] <- [MD,* ]." << std::endl;
#endif
        LogicError("Unaligned [MD,* ] = [MD,* ] not yet implemented");
    }
    else
    {
        this->matrix_ = A.LockedMatrix();
    }
    return *this;
}
// Computes the max norm of each row of A into the A.Height() x 1 vector
// `norms`: `norms` is aligned with A, the local-matrix overload of
// RowMaxNorms is applied, and the results are combined with a MAX
// all-reduce over A.RowComm().
void RowMaxNorms
( const DistMatrix<F,U,V>& A,
        DistMatrix<Base<F>,U,STAR>& norms )
{
    DEBUG_CSE
    norms.AlignWith( A );
    norms.Resize( A.Height(), 1 );

    const auto& ALocal = A.LockedMatrix();
    auto& normsLocal = norms.Matrix();
    RowMaxNorms( ALocal, normsLocal );

    AllReduce( norms, A.RowComm(), mpi::MAX );
}
void ColumnMinAbs ( const DistMatrix<F,U,V>& A, DistMatrix<Base<F>,V,STAR>& mins ) { EL_DEBUG_CSE const Int n = A.Width(); mins.AlignWith( A ); mins.Resize( n, 1 ); ColumnMinAbs( A.LockedMatrix(), mins.Matrix() ); AllReduce( mins.Matrix(), A.ColComm(), mpi::MIN ); }
inline void DiagonalScale ( LeftOrRight side, Orientation orientation, const DistMatrix<typename Base<T>::type,U,V>& d, DistMatrix<T,W,Z>& X ) { #ifndef RELEASE PushCallStack("DiagonalScale"); #endif typedef typename Base<T>::type R; if( side == LEFT ) { if( U == W && V == STAR && d.ColAlignment() == X.ColAlignment() ) { DiagonalScale( LEFT, orientation, d.LockedMatrix(), X.Matrix() ); } else { DistMatrix<R,W,STAR> d_W_STAR( X.Grid() ); d_W_STAR = d; DiagonalScale ( LEFT, orientation, d_W_STAR.LockedMatrix(), X.Matrix() ); } } else { if( U == Z && V == STAR && d.ColAlignment() == X.RowAlignment() ) { DiagonalScale( RIGHT, orientation, d.LockedMatrix(), X.Matrix() ); } else { DistMatrix<R,Z,STAR> d_Z_STAR( X.Grid() ); d_Z_STAR = d; DiagonalScale ( RIGHT, orientation, d_Z_STAR.LockedMatrix(), X.Matrix() ); } } #ifndef RELEASE PopCallStack(); #endif }
inline void Print ( const DistMatrix<T,CIRC,CIRC>& A, std::string title="", std::ostream& os=std::cout ) { #ifndef RELEASE CallStackEntry entry("Print"); #endif if( A.Grid().VCRank() == A.Root() ) Print( A.LockedMatrix(), title, os ); }
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { EL_DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { if( A.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf; FastResize( buf, (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) El::Broadcast( B, A.CrossComm(), A.Root() ); }
// Computes the two-norm of each row of A into the A.Height() x 1 vector
// `norms`, which is aligned with A. A matrix with zero columns yields an
// all-zero result; otherwise the work is delegated to RowTwoNormsHelper
// with A's row communicator.
void RowTwoNorms
( const DistMatrix<F,U,V>& A,
        DistMatrix<Base<F>,U,STAR>& norms )
{
    DEBUG_CSE
    norms.AlignWith( A );
    norms.Resize( A.Height(), 1 );

    if( A.Width() != 0 )
        RowTwoNormsHelper( A.LockedMatrix(), norms.Matrix(), A.RowComm() );
    else
        Zero( norms );
}
inline void LocalTrmm ( LeftOrRight side, UpperOrLower uplo, Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T,STAR,STAR>& A, DistMatrix<T,BColDist,BRowDist>& B ) { #ifndef RELEASE CallStackEntry entry("LocalTrmm"); if( (side == LEFT && BColDist != STAR) || (side == RIGHT && BRowDist != STAR) ) LogicError ("Distribution of RHS must conform with that of triangle"); #endif Trmm ( side, uplo, orientation, diag, alpha, A.LockedMatrix(), B.Matrix() ); }
void GatherSubdiagonal ( const DistMatrix<F,MC,MR,BLOCK>& H, const IR& winInd, DistMatrix<Base<F>,STAR,STAR>& hSubWin ) { DEBUG_CSE const Int winSize = winInd.end - winInd.beg; const Int blockSize = H.BlockHeight(); const Grid& grid = H.Grid(); const auto& HLoc = H.LockedMatrix(); DEBUG_ONLY( if( H.BlockHeight() != H.BlockWidth() ) LogicError("Assumed square distribution blocks"); if( H.ColCut() != H.RowCut() ) LogicError("Assumed symmetric cuts"); if( blockSize < 2 ) LogicError("Assumed blocks of size at least two"); )
void IndexDependentMap ( const DistMatrix<S,U,V,wrap>& A, DistMatrix<T,U,V,wrap>& B, function<T(Int,Int,const S&)> func ) { EL_DEBUG_CSE const Int mLoc = A.LocalHeight(); const Int nLoc = A.LocalWidth(); B.AlignWith( A.DistData() ); B.Resize( A.Height(), A.Width() ); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.Matrix(); for( Int jLoc=0; jLoc<nLoc; ++jLoc ) { const Int j = A.GlobalCol(jLoc); for( Int iLoc=0; iLoc<mLoc; ++iLoc ) { const Int i = A.GlobalRow(iLoc); BLoc(iLoc,jLoc) = func(i,j,ALoc(iLoc,jLoc)); } } }
void TransformRows ( const Matrix<F>& Z, DistMatrix<F,MC,MR,BLOCK>& H ) { DEBUG_CSE const Int height = H.Height(); const Grid& grid = H.Grid(); const Int blockHeight = H.BlockHeight(); const Int firstBlockHeight = blockHeight - H.ColCut(); if( height <= firstBlockHeight || grid.Height() == 1 ) { if( grid.Row() == H.RowOwner(0) ) { // This process row can locally update its portion of H Matrix<F> HLocCopy( H.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), Z, HLocCopy, H.Matrix() ); } } else if( height <= firstBlockHeight + blockHeight ) { const bool firstRow = H.RowOwner( 0 ); const bool secondRow = H.RowOwner( firstBlockHeight ); if( grid.Row() == firstRow ) { // // Replace H with // // | ZLeft, ZRight |' | HTop |, // | HBottom | // // where HTop is owned by this process row and HBottom by the next. // auto ZLeft = Z( ALL, IR(0,firstBlockHeight) ); // Partition space for the combined matrix Matrix<F> HCombine( height, H.LocalWidth() ); auto HTop = HCombine( IR(0,firstBlockHeight), ALL ); auto HBottom = HCombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix HTop = H.LockedMatrix(); // Exchange the data El::SendRecv( HTop, HBottom, H.ColComm(), secondRow, secondRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), ZLeft, HCombine, H.Matrix() ); } else if( grid.Row() == secondRow ) { // // Replace H with // // | ZLeft, ZRight |' | HTop |, // | HBottom | // // where HTop is owned by the previous process row and HBottom by // this one. 
// auto ZRight = Z( ALL, IR(firstBlockHeight,END) ); // Partition space for the combined matrix Matrix<F> HCombine( height, H.LocalWidth() ); auto HTop = HCombine( IR(0,firstBlockHeight), ALL ); auto HBottom = HCombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix HBottom = H.LockedMatrix(); // Exchange the data El::SendRecv( HBottom, HTop, H.ColComm(), firstRow, firstRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), ZRight, HCombine, H.Matrix() ); } } else { // Fall back to the entire process column interacting. // TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,STAR,MR,BLOCK> H_STAR_MR( H ); Matrix<F> HLocCopy( H_STAR_MR.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), Z, HLocCopy, H_STAR_MR.Matrix() ); H = H_STAR_MR; } }
void TransformColumns ( const Matrix<F>& Z, DistMatrix<F,MC,MR,BLOCK>& H ) { DEBUG_CSE const Int width = H.Width(); const Grid& grid = H.Grid(); const Int blockWidth = H.BlockWidth(); const Int firstBlockWidth = blockWidth - H.RowCut(); if( width <= firstBlockWidth || grid.Width() == 1 ) { if( grid.Col() == H.ColOwner(0) ) { // This process row can locally update its portion of H Matrix<F> HLocCopy( H.Matrix() ); Gemm( NORMAL, NORMAL, F(1), HLocCopy, Z, H.Matrix() ); } } else if( width <= firstBlockWidth + blockWidth ) { const bool firstCol = H.ColOwner( 0 ); const bool secondCol = H.ColOwner( firstBlockWidth ); if( grid.Col() == firstCol ) { // // Replace H with // // | HLeft, HRight | | ZLeft, ZRight |, // // where HLeft is owned by this process column and HRight by the // next. // auto ZLeft = Z( ALL, IR(0,firstBlockWidth) ); // Partition space for the combined matrix Matrix<F> HCombine( H.LocalHeight(), width ); auto HLeft = HCombine( ALL, IR(0,firstBlockWidth) ); auto HRight = HCombine( ALL, IR(firstBlockWidth,END) ); // Copy our portion into the combined matrix HLeft = H.LockedMatrix(); // Exchange the data El::SendRecv( HLeft, HRight, H.RowComm(), secondCol, secondCol ); // Form our portion of the result Gemm( NORMAL, NORMAL, F(1), HCombine, ZLeft, H.Matrix() ); } else if( grid.Col() == secondCol ) { // // Replace H with // // | HLeft, HRight | | ZLeft, ZRight |, // // where HLeft is owned by the previous process column and HRight // by this one. 
// auto ZRight = Z( ALL, IR(firstBlockWidth,END) ); // Partition space for the combined matrix Matrix<F> HCombine( H.LocalHeight(), width ); auto HLeft = HCombine( ALL, IR(0,firstBlockWidth) ); auto HRight = HCombine( ALL, IR(firstBlockWidth,END) ); // Copy our portion into the combined matrix HRight = H.LockedMatrix(); // Exchange the data El::SendRecv( HRight, HLeft, H.RowComm(), firstCol, firstCol ); // Form our portion of the result Gemm( NORMAL, NORMAL, F(1), HCombine, ZRight, H.Matrix() ); } } else { // Fall back to the entire process column interacting. // TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,MC,STAR,BLOCK> H_MC_STAR( H ); Matrix<F> HLocCopy( H_MC_STAR.Matrix() ); Gemm( NORMAL, NORMAL, F(1), HLocCopy, Z, H_MC_STAR.Matrix() ); H = H_MC_STAR; } }
// [* ,* ] <- [* ,* ] assignment.
//
// When both matrices live on the same grid, the local matrix is simply
// copied. Otherwise (and only if the grids' viewing communicators are
// congruent) the data travels in three steps:
//   1. VC rank 0 of A's grid packs A column-major and posts a nonblocking
//      send, over the viewing communicator, to VC rank 0 of this grid.
//   2. VC rank 0 of this grid receives the data and broadcasts it over this
//      grid's VC communicator.
//   3. Every participating process unpacks the buffer into its local matrix.
const DistMatrix<T,STAR,STAR>&
DistMatrix<T,STAR,STAR>::operator=( const DistMatrix<T,STAR,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry entry("[* ,* ] = [* ,* ]");
    this->AssertNotLocked();
#endif
    this->ResizeTo( A.Height(), A.Width() );

    if( this->Grid() == A.Grid() )
    {
        this->matrix_ = A.LockedMatrix();
        return *this;
    }

    // TODO: Remember why I wrote this...
    if( !mpi::CongruentComms
         ( A.Grid().ViewingComm(), this->Grid().ViewingComm() ) )
        LogicError
        ("Redistributing between nonmatching grids currently requires"
         " the viewing communicators to match.");

    const Int height = A.Height();
    const Int width = A.Width();
    const Int numEntries = height*width;
    const bool isSourceRoot = ( A.Grid().VCRank() == 0 );
    const bool participating = this->Participating();

    // Compute and allocate the amount of required memory: one copy for the
    // sender's packing buffer and one for the receive/broadcast buffer
    const Int sendEntries = ( isSourceRoot ? numEntries : 0 );
    const Int bcastEntries = ( participating ? numEntries : 0 );
    T* workspace = this->auxMemory_.Require( sendEntries+bcastEntries );
    T* sendBuf = workspace;
    T* bcastBuffer = workspace + sendEntries;

    // Send from the root of A to the root of this matrix's grid
    mpi::Request sendRequest;
    if( isSourceRoot )
    {
        for( Int j=0; j<width; ++j )
            for( Int i=0; i<height; ++i )
                sendBuf[i+j*height] = A.GetLocal(i,j);
        const Int recvViewingRank = this->Grid().VCToViewingMap(0);
        mpi::ISend
        ( sendBuf, numEntries, recvViewingRank,
          this->Grid().ViewingComm(), sendRequest );
    }

    // Receive on the root of this matrix's grid and then broadcast
    // over this matrix's owning communicator
    if( participating )
    {
        if( this->Grid().VCRank() == 0 )
        {
            const Int sendViewingRank = A.Grid().VCToViewingMap(0);
            mpi::Recv
            ( bcastBuffer, numEntries, sendViewingRank,
              this->Grid().ViewingComm() );
        }

        mpi::Broadcast( bcastBuffer, numEntries, 0, this->Grid().VCComm() );

        for( Int j=0; j<width; ++j )
            for( Int i=0; i<height; ++i )
                this->SetLocal( i, j, bcastBuffer[i+j*height] );
    }

    // The packing buffer must stay alive until the send has completed
    if( isSourceRoot )
        mpi::Wait( sendRequest );
    this->auxMemory_.Release();

    return *this;
}
inline void ComposePivots ( const DistMatrix<Int,STAR,STAR>& p, std::vector<Int>& image, std::vector<Int>& preimage ) { ComposePivots( p.LockedMatrix(), image, preimage ); }
void TransformRows ( const Matrix<F>& V, DistMatrix<F,MC,MR,BLOCK>& A ) { DEBUG_CSE const Int height = A.Height(); const Grid& grid = A.Grid(); const Int blockHeight = A.BlockHeight(); const Int firstBlockHeight = blockHeight - A.ColCut(); if( height <= firstBlockHeight || grid.Height() == 1 ) { if( grid.Row() == A.RowOwner(0) ) { // This process row can locally update its portion of A TransformRows( V, A.Matrix() ); } } else if( height <= firstBlockHeight + blockHeight ) { const int firstRow = A.RowOwner( 0 ); const int secondRow = A.RowOwner( firstBlockHeight ); if( grid.Row() == firstRow ) { // // Replace A with // // | VLeft, VRight |' | ATop |, // | ABottom | // // where ATop is owned by this process row and ABottom by the next. // auto VLeft = V( ALL, IR(0,firstBlockHeight) ); // Partition space for the combined matrix Matrix<F> ACombine( height, A.LocalWidth() ); auto ATop = ACombine( IR(0,firstBlockHeight), ALL ); auto ABottom = ACombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix ATop = A.LockedMatrix(); // Exchange the data El::SendRecv( ATop, ABottom, A.ColComm(), secondRow, secondRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), VLeft, ACombine, A.Matrix() ); } else if( grid.Row() == secondRow ) { // // Replace A with // // | VLeft, VRight |' | ATop |, // | ABottom | // // where ATop is owned by the previous process row and ABottom by // this one. 
// auto VRight = V( ALL, IR(firstBlockHeight,END) ); // Partition space for the combined matrix Matrix<F> ACombine( height, A.LocalWidth() ); auto ATop = ACombine( IR(0,firstBlockHeight), ALL ); auto ABottom = ACombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix ABottom = A.LockedMatrix(); // Exchange the data El::SendRecv( ABottom, ATop, A.ColComm(), firstRow, firstRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), VRight, ACombine, A.Matrix() ); } } else { // Fall back to the entire process column interacting. // TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,STAR,MR,BLOCK> A_STAR_MR( A ); Matrix<F> ALocCopy( A_STAR_MR.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), V, ALocCopy, A_STAR_MR.Matrix() ); A = A_STAR_MR; } }
void TransformColumns ( const Matrix<F>& V, DistMatrix<F,MC,MR,BLOCK>& A ) { DEBUG_CSE const Int width = A.Width(); const Grid& grid = A.Grid(); const Int blockWidth = A.BlockWidth(); const Int firstBlockWidth = blockWidth - A.RowCut(); if( width <= firstBlockWidth || grid.Width() == 1 ) { if( grid.Col() == A.ColOwner(0) ) { // This process row can locally update its portion of A TransformColumns( V, A.Matrix() ); } } else if( width <= firstBlockWidth + blockWidth ) { const int firstCol = A.ColOwner( 0 ); const int secondCol = A.ColOwner( firstBlockWidth ); if( grid.Col() == firstCol ) { // // Replace A with // // | ALeft, ARight | | VLeft, VRight |, // // where ALeft is owned by this process column and ARight by the // next. // // Partition space for the combined matrix Matrix<F> ACombine( A.LocalHeight(), width ); auto ALeft = ACombine( ALL, IR(0,firstBlockWidth) ); auto ARight = ACombine( ALL, IR(firstBlockWidth,END) ); // Copy our portion into the combined matrix ALeft = A.LockedMatrix(); // Exchange the data El::SendRecv( ALeft, ARight, A.RowComm(), secondCol, secondCol ); // Form our portion of the result auto VLeft = V( ALL, IR(0,firstBlockWidth) ); Gemm( NORMAL, NORMAL, F(1), ACombine, VLeft, A.Matrix() ); } else if( grid.Col() == secondCol ) { // // Replace A with // // | ALeft, ARight | | VLeft, VRight |, // // where ALeft is owned by the previous process column and ARight // by this one. // // Partition space for the combined matrix Matrix<F> ACombine( A.LocalHeight(), width ); auto ALeft = ACombine( ALL, IR(0,firstBlockWidth) ); auto ARight = ACombine( ALL, IR(firstBlockWidth,END) ); // Copy our portion into the combined matrix ARight = A.LockedMatrix(); // Exchange the data El::SendRecv( ARight, ALeft, A.RowComm(), firstCol, firstCol ); // Form our portion of the result auto VRight = V( ALL, IR(firstBlockWidth,END) ); Gemm( NORMAL, NORMAL, F(1), ACombine, VRight, A.Matrix() ); } } else { // Fall back to the entire process column interacting. 
// TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,MC,STAR,BLOCK> A_MC_STAR( A ); Matrix<F> ALocCopy( A_MC_STAR.Matrix() ); Gemm( NORMAL, NORMAL, F(1), ALocCopy, V, A_MC_STAR.Matrix() ); A = A_MC_STAR; } }
void Scatter ( const DistMatrix<T,CIRC,CIRC>& A, ElementalMatrix<T>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int m = A.Height(); const Int n = A.Width(); const Int colStride = B.ColStride(); const Int rowStride = B.RowStride(); B.Resize( m, n ); if( B.CrossSize() != 1 || B.RedundantSize() != 1 ) { // TODO: // Broadcast over the redundant communicator and use mpi::Translate // rank to determine whether a process is the root of the broadcast. GeneralPurpose( A, B ); return; } const Int pkgSize = mpi::Pad(MaxLength(m,colStride)*MaxLength(n,rowStride)); const Int recvSize = pkgSize; const Int sendSize = B.DistSize()*pkgSize; // Translate the root of A into the DistComm of B (if possible) const Int root = A.Root(); const Int target = mpi::Translate( A.CrossComm(), root, B.DistComm() ); if( target == mpi::UNDEFINED ) return; if( B.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); return; } vector<T> buffer; T* recvBuf=0; // some compilers (falsely) warn otherwise if( A.CrossRank() == root ) { FastResize( buffer, sendSize+recvSize ); T* sendBuf = &buffer[0]; recvBuf = &buffer[sendSize]; // Pack the send buffer copy::util::StridedPack ( m, n, B.ColAlign(), colStride, B.RowAlign(), rowStride, A.LockedBuffer(), A.LDim(), sendBuf, pkgSize ); // Scatter from the root mpi::Scatter ( sendBuf, pkgSize, recvBuf, pkgSize, target, B.DistComm() ); } else { FastResize( buffer, recvSize ); recvBuf = &buffer[0]; // Perform the receiving portion of the scatter from the non-root mpi::Scatter ( static_cast<T*>(0), pkgSize, recvBuf, pkgSize, target, B.DistComm() ); } // Unpack copy::util::InterleaveMatrix ( B.LocalHeight(), B.LocalWidth(), recvBuf, 1, B.LocalHeight(), B.Buffer(), 1, B.LDim() ); }
// ColAllToAllPromote: redistributes A ([U,V]) into B
// ([Partial<U>(),PartialUnionRow<U,V>()]).
//
// Steps performed by the code below:
//   * Asserts that A and B share a grid, then aligns B's columns to
//     Mod(A.ColAlign(),B.ColStride()) and resizes it to A's dimensions.
//     Processes that do not participate in B return immediately.
//   * colDiff measures the mismatch between B's column alignment and A's
//     column alignment reduced modulo the partial column stride.
//   * colDiff == 0: if the partial-union column stride is one, the local
//     matrix is copied directly. Otherwise the local data is packed with
//     RowStridedPack, exchanged with an AllToAll over A.PartialUnionColComm(),
//     and unpacked with PartialColStridedUnpack using this process's own
//     partial column rank.
//   * colDiff != 0: the packed data is first realigned with a SendRecv over
//     A.PartialColComm(), then the same AllToAll is performed, and the unpack
//     uses the partial column rank of the process the data was received from.
//   * Both paths use one buffer of 2*colStrideUnion*portionSize entries, split
//     into two halves (firstBuf/secondBuf).
//
// NOTE(review): the two "Simultaneously ..." comments describe the same
// AllToAll with opposite gather/scatter wording -- confirm which is intended.
// NOTE(review): the EL_UNALIGNED_WARNINGS message names
// "PartialColAllToAllPromote" rather than this function -- confirm.
void ColAllToAllPromote ( const DistMatrix<T, U, V >& A, DistMatrix<T,Partial<U>(),PartialUnionRow<U,V>()>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.AlignColsAndResize ( Mod(A.ColAlign(),B.ColStride()), height, width, false, false ); if( !B.Participating() ) return; const Int colStride = A.ColStride(); const Int colStridePart = A.PartialColStride(); const Int colStrideUnion = A.PartialUnionColStride(); const Int colRankPart = A.PartialColRank(); const Int colDiff = B.ColAlign() - Mod(A.ColAlign(),colStridePart); const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,colStrideUnion); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); if( colDiff == 0 ) { if( A.PartialUnionColStride() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), colStrideUnion, A.LockedBuffer(), A.LDim(), firstBuf, portionSize ); // Simultaneously Gather in columns and Scatter in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, colRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } } else { #ifdef EL_UNALIGNED_WARNINGS if( A.Grid().Rank() == 0 ) cerr << "Unaligned PartialColAllToAllPromote" << endl; #endif const Int sendColRankPart = Mod( colRankPart+colDiff, colStridePart ); const Int recvColRankPart = Mod( colRankPart-colDiff, colStridePart ); vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), 
colStrideUnion, A.LockedBuffer(), A.LDim(), secondBuf, portionSize ); // Realign the input mpi::SendRecv ( secondBuf, colStrideUnion*portionSize, sendColRankPart, firstBuf, colStrideUnion*portionSize, recvColRankPart, A.PartialColComm() ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, recvColRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } }
void TransposeDist( const DistMatrix<T,U,V>& A, DistMatrix<T,V,U>& B ) { DEBUG_ONLY(CSE cse("copy::TransposeDist")) AssertSameGrids( A, B ); const Grid& g = B.Grid(); B.Resize( A.Height(), A.Width() ); if( !B.Participating() ) return; const Int colStrideA = A.ColStride(); const Int rowStrideA = A.RowStride(); const Int distSize = A.DistSize(); if( A.DistSize() == 1 && B.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else if( A.Width() == 1 ) { const Int height = A.Height(); const Int maxLocalHeight = MaxLength(height,distSize); const Int portionSize = mpi::Pad( maxLocalHeight ); const Int colDiff = Shift(A.DistRank(),A.ColAlign(),distSize) - Shift(B.DistRank(),B.ColAlign(),distSize); const Int sendRankB = Mod( B.DistRank()+colDiff, distSize ); const Int recvRankA = Mod( A.DistRank()-colDiff, distSize ); const Int recvRankB = (recvRankA/colStrideA)+rowStrideA*(recvRankA%colStrideA); vector<T> buffer; FastResize( buffer, (colStrideA+rowStrideA)*portionSize ); T* sendBuf = &buffer[0]; T* recvBuf = &buffer[colStrideA*portionSize]; if( A.RowRank() == A.RowAlign() ) { // Pack // TODO: Use kernel from copy::util const Int AColShift = A.ColShift(); const T* ABuf = A.LockedBuffer(); EL_PARALLEL_FOR for( Int k=0; k<rowStrideA; ++k ) { T* data = &recvBuf[k*portionSize]; const Int shift = Shift_(A.ColRank()+colStrideA*k,A.ColAlign(),distSize); const Int offset = (shift-AColShift) / colStrideA; const Int thisLocalHeight = Length_(height,shift,distSize); for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc ) data[iLoc] = ABuf[offset+iLoc*rowStrideA]; } } // (e.g., A[VC,STAR] <- A[MC,MR]) mpi::Scatter ( recvBuf, portionSize, sendBuf, portionSize, A.RowAlign(), A.RowComm() ); // (e.g., A[VR,STAR] <- A[VC,STAR]) mpi::SendRecv ( sendBuf, portionSize, sendRankB, recvBuf, portionSize, recvRankB, B.DistComm() ); // (e.g., A[MR,MC] <- A[VR,STAR]) mpi::Gather ( recvBuf, portionSize, sendBuf, portionSize, B.RowAlign(), B.RowComm() ); if( B.RowRank() == B.RowAlign() ) { // Unpack // 
TODO: Use kernel from copy::util T* bufB = B.Buffer(); EL_PARALLEL_FOR for( Int k=0; k<colStrideA; ++k ) { const T* data = &sendBuf[k*portionSize]; const Int shift = Shift_(B.ColRank()+rowStrideA*k,B.ColAlign(),distSize); const Int offset = (shift-B.ColShift()) / rowStrideA; const Int thisLocalHeight = Length_(height,shift,distSize); for( Int iLoc=0; iLoc<thisLocalHeight; ++iLoc ) bufB[offset+iLoc*colStrideA] = data[iLoc]; } } }
// ForwardMany: blocked forward solve that overwrites X ([VC,* ]) using the
// lower-triangular factor stored in L ([VC,* ]).
//
// Steps performed by the code below:
//   * On a single-process grid, the local matrices are passed straight to
//     FrontLowerForwardSolve and the routine returns.
//   * Otherwise L is walked down its diagonal and X from top to bottom, with
//     X's block height taken from L11.Height(). The loop runs until the
//     top-left partition of L is as wide as L.
//   * In each iteration:
//       - L11 and X1 are redistributed to [* ,* ] temporaries;
//       - X1[* ,* ] := inv(L11[* ,* ]) X1[* ,* ] via LocalTrsm
//         (LEFT, LOWER, NORMAL, NON_UNIT);
//       - the solved block is written back into X1;
//       - X2 is updated as X2 -= L21 X1[* ,* ] via LocalGemm.
//   * The Slide* calls then advance both partitions for the next iteration.
//
// NOTE(review): the meaning of the trailing `true` argument to LocalTrsm is
// not visible here -- confirm against its declaration.
inline void ForwardMany ( const DistMatrix<F,VC,STAR>& L, DistMatrix<F,VC,STAR>& X ) { const Grid& g = L.Grid(); if( g.Size() == 1 ) { FrontLowerForwardSolve( L.LockedMatrix(), X.Matrix() ); return; } // Matrix views DistMatrix<F,VC,STAR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F,VC,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,STAR> X1_STAR_STAR(g); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionDown ( X, XT, XB, 0 ); while( LTL.Width() < L.Width() ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); RepartitionDown ( XT, X0, /**/ /**/ X1, XB, X2, L11.Height() ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; // L11[* ,* ] <- L11[VC,* ] X1_STAR_STAR = X1; // X1[* ,* ] <- X1[VC,* ] // X1[* ,* ] := (L11[* ,* ])^-1 X1[* ,* ] LocalTrsm ( LEFT, LOWER, NORMAL, NON_UNIT, F(1), L11_STAR_STAR, X1_STAR_STAR, true ); X1 = X1_STAR_STAR; // X2[VC,* ] -= L21[VC,* ] X1[* ,* ] LocalGemm( NORMAL, NORMAL, F(-1), L21, X1_STAR_STAR, F(1), X2 ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlidePartitionDown ( XT, X0, X1, /**/ /**/ XB, X2 ); } }
inline R Row( DistMatrix<R>& chi, DistMatrix<R>& x ) { #ifndef RELEASE PushCallStack("reflector::Row"); if( chi.Grid() != x.Grid() ) throw std::logic_error ("chi and x must be distributed over the same grid"); if( chi.Height() != 1 || chi.Width() != 1 ) throw std::logic_error("chi must be a scalar"); if( x.Height() != 1 ) throw std::logic_error("x must be a row vector"); if( chi.Grid().Row() != chi.ColAlignment() ) throw std::logic_error("Reflecting with incorrect row of processes"); if( x.Grid().Row() != x.ColAlignment() ) throw std::logic_error("Reflecting with incorrect row of processes"); #endif const Grid& grid = x.Grid(); mpi::Comm rowComm = grid.RowComm(); const int gridCol = grid.Col(); const int gridWidth = grid.Width(); const int rowAlignment = chi.RowAlignment(); std::vector<R> localNorms(gridWidth); R localNorm = Nrm2( x.LockedMatrix() ); mpi::AllGather( &localNorm, 1, &localNorms[0], 1, rowComm ); R norm = blas::Nrm2( gridWidth, &localNorms[0], 1 ); if( norm == 0 ) { if( gridCol == rowAlignment ) chi.SetLocal(0,0,-chi.GetLocal(0,0)); #ifndef RELEASE PopCallStack(); #endif return R(2); } R alpha; if( gridCol == rowAlignment ) alpha = chi.GetLocal(0,0); mpi::Broadcast( &alpha, 1, rowAlignment, rowComm ); R beta; if( alpha <= 0 ) beta = lapack::SafeNorm( alpha, norm ); else beta = -lapack::SafeNorm( alpha, norm ); const R one = 1; const R safeMin = lapack::MachineSafeMin<R>(); const R epsilon = lapack::MachineEpsilon<R>(); const R safeInv = safeMin/epsilon; int count = 0; if( Abs(beta) < safeInv ) { R invOfSafeInv = one/safeInv; do { ++count; Scale( invOfSafeInv, x ); alpha *= invOfSafeInv; beta *= invOfSafeInv; } while( Abs(beta) < safeInv ); localNorm = Nrm2( x.LockedMatrix() ); mpi::AllGather( &localNorm, 1, &localNorms[0], 1, rowComm ); norm = blas::Nrm2( gridWidth, &localNorms[0], 1 ); if( alpha <= 0 ) beta = lapack::SafeNorm( alpha, norm ); else beta = -lapack::SafeNorm( alpha, norm ); } R tau = (beta-alpha)/beta; Scale( one/(alpha-beta), x ); for( 
int j=0; j<count; ++j ) beta *= safeInv; if( gridCol == rowAlignment ) chi.SetLocal(0,0,beta); #ifndef RELEASE PopCallStack(); #endif return tau; }