void Contract
( const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("Contract"))
    AssertSameGrids( A, B );
    const Dist U = B.ColDist();
    const Dist V = B.RowDist();
    // TODO: Shorten this implementation?
    if( A.ColDist() == U && A.RowDist() == V )
    {
        Copy( A, B );
    }
    else if( A.ColDist() == U && A.RowDist() == Partial(V) )
    {
        B.AlignAndResize
        ( A.ColAlign(), A.RowAlign(), A.Height(), A.Width(), false, false );
        Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() );
        AxpyContract( T(1), A, B );
    }
    else if( A.ColDist() == Partial(U) && A.RowDist() == V )
    {
        B.AlignAndResize
        ( A.ColAlign(), A.RowAlign(), A.Height(), A.Width(), false, false );
        Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() );
        AxpyContract( T(1), A, B );
    }
    else if( A.ColDist() == U && A.RowDist() == Collect(V) )
    {
        B.AlignColsAndResize
        ( A.ColAlign(), A.Height(), A.Width(), false, false );
        Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() );
        AxpyContract( T(1), A, B );
    }
    else if( A.ColDist() == Collect(U) && A.RowDist() == V )
    {
        B.AlignRowsAndResize
        ( A.RowAlign(), A.Height(), A.Width(), false, false );
        Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() );
        AxpyContract( T(1), A, B );
    }
    else if( A.ColDist() == Collect(U) && A.RowDist() == Collect(V) )
    {
        Zeros( B, A.Height(), A.Width() );
        AxpyContract( T(1), A, B );
    }
    else
        LogicError("Incompatible distributions");
}
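// Example (not part of the library): a minimal usage sketch of Contract,
// assuming <El.hpp> has been included and El::Environment is initialized.
// A [MC,STAR] matrix stores a full copy of its local rows on every process
// column; Contract reduces those copies (over the row communicator) into a
// result distributed as [MC,MR]. Sizes and values are purely illustrative.
inline void ExampleContractUsage( const El::Grid& grid )
{
    // Each process column holds a copy of (its rows of) the contributions
    El::DistMatrix<double,El::MC,El::STAR> A( grid );
    El::Ones( A, 64, 64 );

    // Sum the copies and leave the result distributed as [MC,MR]
    El::DistMatrix<double,El::MC,El::MR> B( grid );
    El::Contract( A, B );
}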
void PartialColScatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::PartialColScatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("A and B must be the same size");
#ifdef EL_CACHE_WARNINGS
    if( A.Width() != 1 && A.Grid().Rank() == 0 )
    {
        cerr <<
          "axpy_contract::PartialColScatter potentially causes a large "
          "amount of cache-thrashing. If possible, avoid it by forming the "
          "(conjugate-)transpose of the [UGath,* ] matrix instead." << endl;
    }
#endif
    if( B.ColAlign() % A.ColStride() == A.ColAlign() )
    {
        const Int colStride = B.ColStride();
        const Int colStridePart = B.PartialColStride();
        const Int colStrideUnion = B.PartialUnionColStride();
        const Int colRankPart = B.PartialColRank();
        const Int colAlign = B.ColAlign();

        const Int height = B.Height();
        const Int width = B.Width();
        const Int localHeight = B.LocalHeight();
        const Int maxLocalHeight = MaxLength( height, colStride );
        const Int recvSize = mpi::Pad( maxLocalHeight*width );
        const Int sendSize = colStrideUnion*recvSize;

        // Allocate the packed send buffer
        vector<T> buffer;
        FastResize( buffer, sendSize );

        // Pack
        copy::util::PartialColStridedPack
        ( height, width,
          colAlign, colStride,
          colStrideUnion, colStridePart, colRankPart,
          A.ColShift(),
          A.LockedBuffer(), A.LDim(),
          buffer.data(), recvSize );

        // Communicate
        mpi::ReduceScatter( buffer.data(), recvSize, B.PartialUnionColComm() );

        // Unpack our received data
        axpy::util::InterleaveMatrixUpdate
        ( alpha, localHeight, width,
          buffer.data(), 1, localHeight,
          B.Buffer(),    1, B.LDim() );
    }
    else
        LogicError("Unaligned PartialColScatter not implemented");
}
void UniformHelmholtzGreens
( ElementalMatrix<Complex<Real>>& A, Int n, Real lambda )
{
    EL_DEBUG_CSE
    typedef Complex<Real> C;
    const Real pi = 4*Atan( Real(1) );
    const Real k0 = 2*pi/lambda;
    const Grid& g = A.Grid();

    // Generate a list of n uniform samples from the 3D unit ball
    DistMatrix<Real,STAR,VR> X_STAR_VR(3,n,g);
    for( Int jLoc=0; jLoc<X_STAR_VR.LocalWidth(); ++jLoc )
    {
        Real x0, x1, x2;
        // Sample uniformly from [-1,+1]^3 until a point is drawn from the ball
        while( true )
        {
            x0 = SampleUniform( Real(-1), Real(1) );
            x1 = SampleUniform( Real(-1), Real(1) );
            x2 = SampleUniform( Real(-1), Real(1) );
            const Real radiusSq = x0*x0 + x1*x1 + x2*x2;
            if( radiusSq > 0 && radiusSq <= 1 )
                break;
        }
        X_STAR_VR.SetLocal( 0, jLoc, x0 );
        X_STAR_VR.SetLocal( 1, jLoc, x1 );
        X_STAR_VR.SetLocal( 2, jLoc, x2 );
    }
    DistMatrix<Real,STAR,STAR> X_STAR_STAR( X_STAR_VR );

    A.Resize( n, n );
    for( Int jLoc=0; jLoc<A.LocalWidth(); ++jLoc )
    {
        const Int j = A.GlobalCol(jLoc);
        const Real xj0 = X_STAR_STAR.GetLocal(0,j);
        const Real xj1 = X_STAR_STAR.GetLocal(1,j);
        const Real xj2 = X_STAR_STAR.GetLocal(2,j);
        for( Int iLoc=0; iLoc<A.LocalHeight(); ++iLoc )
        {
            const Int i = A.GlobalRow(iLoc);
            if( i == j )
            {
                A.SetLocal( iLoc, jLoc, 0 );
            }
            else
            {
                const Real d0 = X_STAR_STAR.GetLocal(0,i)-xj0;
                const Real d1 = X_STAR_STAR.GetLocal(1,i)-xj1;
                const Real d2 = X_STAR_STAR.GetLocal(2,i)-xj2;
                const Real gamma = k0*Sqrt(d0*d0+d1*d1+d2*d2);
                const Real realPart = Cos(gamma)/gamma;
                const Real imagPart = Sin(gamma)/gamma;
                A.SetLocal( iLoc, jLoc, C(realPart,imagPart) );
            }
        }
    }
}
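// Example (not part of the library): a sketch of calling the generator above.
// The matrix size and wavelength are arbitrary illustrative choices.
inline void ExampleHelmholtzGreensUsage( const El::Grid& grid )
{
    El::DistMatrix<El::Complex<double>> A( grid );
    El::UniformHelmholtzGreens( A, 1000, 0.1 );
    // A(i,j) now holds exp(i*k0*r_ij)/(k0*r_ij) for n=1000 points sampled
    // uniformly from the unit ball (with k0 = 2*pi/0.1) and a zero diagonal.
}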
void Scatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::Scatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("Sizes of A and B must match");
    if( !B.Participating() )
        return;

    const Int colStride = B.ColStride();
    const Int rowStride = B.RowStride();
    const Int colAlign = B.ColAlign();
    const Int rowAlign = B.RowAlign();

    const Int height = B.Height();
    const Int width = B.Width();
    const Int localHeight = B.LocalHeight();
    const Int localWidth = B.LocalWidth();
    const Int maxLocalHeight = MaxLength(height,colStride);
    const Int maxLocalWidth = MaxLength(width,rowStride);

    const Int recvSize = mpi::Pad( maxLocalHeight*maxLocalWidth );
    const Int sendSize = colStride*rowStride*recvSize;

    // Allocate the packed send buffer
    vector<T> buffer;
    FastResize( buffer, sendSize );

    // Pack
    copy::util::StridedPack
    ( height, width,
      colAlign, colStride,
      rowAlign, rowStride,
      A.LockedBuffer(), A.LDim(),
      buffer.data(), recvSize );

    // Communicate
    mpi::ReduceScatter( buffer.data(), recvSize, B.DistComm() );

    // Unpack our received data
    axpy::util::InterleaveMatrixUpdate
    ( alpha, localHeight, localWidth,
      buffer.data(), 1, localHeight,
      B.Buffer(),    1, B.LDim() );
}
void MakeExtendedKahan
( ElementalMatrix<F>& A, Base<F> phi, Base<F> mu )
{
    EL_DEBUG_CSE
    typedef Base<F> Real;

    if( A.Height() != A.Width() )
        LogicError("Extended Kahan matrices must be square");
    const Int n = A.Height();
    if( n % 3 != 0 )
        LogicError("Dimension must be an integer multiple of 3");
    const Int l = n / 3;
    if( !l || (l & (l-1)) )
        LogicError("n/3 is not a power of two");
    Int k=0;
    while( Int(1u<<k) < l )
        ++k;

    if( phi <= Real(0) || phi >= Real(1) )
        LogicError("phi must be in (0,1)");
    if( mu <= Real(0) || mu >= Real(1) )
        LogicError("mu must be in (0,1)");

    // Start by setting A to the identity, and then modify the necessary
    // l x l blocks of its 3 x 3 partitioning.
    MakeIdentity( A );
    unique_ptr<ElementalMatrix<F>> ABlock( A.Construct(A.Grid(),A.Root()) );
    View( *ABlock, A, IR(2*l,3*l), IR(2*l,3*l) );
    *ABlock *= mu;
    View( *ABlock, A, IR(0,l), IR(l,2*l) );
    Walsh( *ABlock, k );
    *ABlock *= -phi;
    View( *ABlock, A, IR(l,2*l), IR(2*l,3*l) );
    Walsh( *ABlock, k );
    *ABlock *= phi;

    // Now scale A by S
    const Real zeta = Sqrt(Real(1)-phi*phi);
    auto& ALoc = A.Matrix();
    for( Int iLoc=0; iLoc<A.LocalHeight(); ++iLoc )
    {
        const Int i = A.GlobalRow(iLoc);
        const Real gamma = Pow(zeta,Real(i));
        for( Int jLoc=0; jLoc<A.LocalWidth(); ++jLoc )
            ALoc(iLoc,jLoc) *= gamma;
    }
}
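// Example (not part of the library): MakeExtendedKahan fills an already-sized
// square matrix whose dimension is 3*2^k. The values of phi and mu below are
// arbitrary choices from the required interval (0,1).
inline void ExampleExtendedKahanUsage( const El::Grid& grid )
{
    const El::Int k = 4;                   // n/3 = 2^k = 16
    const El::Int n = 3*(El::Int(1)<<k);   // n = 48
    El::DistMatrix<double> A( grid );
    A.Resize( n, n );
    El::MakeExtendedKahan( A, 0.5, 1e-3 );
}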
void IndexDependentMap
( const ElementalMatrix<S>& A,
        ElementalMatrix<T>& B,
        function<T(Int,Int,S)> func )
{
    DEBUG_CSE
    const Int mLoc = A.LocalHeight();
    const Int nLoc = A.LocalWidth();
    B.AlignWith( A.DistData() );
    B.Resize( A.Height(), A.Width() );
    auto& ALoc = A.LockedMatrix();
    auto& BLoc = B.Matrix();
    for( Int jLoc=0; jLoc<nLoc; ++jLoc )
    {
        const Int j = A.GlobalCol(jLoc);
        for( Int iLoc=0; iLoc<mLoc; ++iLoc )
        {
            const Int i = A.GlobalRow(iLoc);
            BLoc(iLoc,jLoc) = func(i,j,ALoc(iLoc,jLoc));
        }
    }
}
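// Example (not part of the library): apply an index-dependent scaling,
// B(i,j) = (i+j+1) * A(i,j). The explicit std::function wrapper lets the
// template parameters S and T be deduced from a lambda.
inline void ExampleIndexDependentMapUsage( const El::Grid& grid )
{
    El::DistMatrix<double> A( grid ), B( grid );
    El::Uniform( A, 50, 50 );
    std::function<double(El::Int,El::Int,double)> scale =
      []( El::Int i, El::Int j, double alpha )
      { return double(i+j+1)*alpha; };
    El::IndexDependentMap( A, B, scale );
}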
void Gather
( const ElementalMatrix<T>& A,
        DistMatrix<T,CIRC,CIRC>& B )
{
    DEBUG_ONLY(CSE cse("copy::Gather"))
    AssertSameGrids( A, B );
    if( A.DistSize() == 1 && A.CrossSize() == 1 )
    {
        B.Resize( A.Height(), A.Width() );
        if( B.CrossRank() == B.Root() )
            Copy( A.LockedMatrix(), B.Matrix() );
        return;
    }

    const Int height = A.Height();
    const Int width = A.Width();
    B.SetGrid( A.Grid() );
    B.Resize( height, width );

    // Gather the colShifts and rowShifts
    // ==================================
    Int myShifts[2];
    myShifts[0] = A.ColShift();
    myShifts[1] = A.RowShift();
    vector<Int> shifts;
    const Int crossSize = B.CrossSize();
    if( B.CrossRank() == B.Root() )
        shifts.resize( 2*crossSize );
    mpi::Gather( myShifts, 2, shifts.data(), 2, B.Root(), B.CrossComm() );

    // Gather the payload data
    // =======================
    const bool irrelevant = ( A.RedundantRank()!=0 || A.CrossRank()!=A.Root() );
    int totalSend = ( irrelevant ? 0 : A.LocalHeight()*A.LocalWidth() );
    vector<int> recvCounts, recvOffsets;
    if( B.CrossRank() == B.Root() )
        recvCounts.resize( crossSize );
    mpi::Gather( &totalSend, 1, recvCounts.data(), 1, B.Root(), B.CrossComm() );
    int totalRecv = Scan( recvCounts, recvOffsets );

    // Allocate the send and receive buffers
    vector<T> sendBuf, recvBuf;
    FastResize( sendBuf, totalSend );
    FastResize( recvBuf, totalRecv );

    if( !irrelevant )
        copy::util::InterleaveMatrix
        ( A.LocalHeight(), A.LocalWidth(),
          A.LockedBuffer(), 1, A.LDim(),
          sendBuf.data(),   1, A.LocalHeight() );

    mpi::Gather
    ( sendBuf.data(), totalSend,
      recvBuf.data(), recvCounts.data(), recvOffsets.data(),
      B.Root(), B.CrossComm() );

    // Unpack
    // ======
    if( B.Root() == B.CrossRank() )
    {
        for( Int q=0; q<crossSize; ++q )
        {
            if( recvCounts[q] == 0 )
                continue;
            const Int colShift = shifts[2*q+0];
            const Int rowShift = shifts[2*q+1];
            const Int colStride = A.ColStride();
            const Int rowStride = A.RowStride();
            const Int localHeight = Length( height, colShift, colStride );
            const Int localWidth = Length( width, rowShift, rowStride );
            copy::util::InterleaveMatrix
            ( localHeight, localWidth,
              &recvBuf[recvOffsets[q]],    1,         localHeight,
              B.Buffer(colShift,rowShift), colStride, rowStride*B.LDim() );
        }
    }
}
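// Example (not part of the library): the usual way to reach copy::Gather is a
// copy into a [CIRC,CIRC] matrix; afterwards the root process owns the entire
// matrix and can, for example, print it. Sizes are illustrative.
inline void ExampleGatherUsage( const El::Grid& grid )
{
    El::DistMatrix<double> A( grid );
    El::Uniform( A, 20, 20 );

    El::DistMatrix<double,El::CIRC,El::CIRC> B( grid );
    El::Copy( A, B );   // performs a gather onto B's root process
    if( B.CrossRank() == B.Root() )
        El::Print( B.LockedMatrix(), "A gathered on the root" );
}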
void Scatter
( const DistMatrix<T,CIRC,CIRC>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_CSE
    AssertSameGrids( A, B );

    const Int m = A.Height();
    const Int n = A.Width();
    const Int colStride = B.ColStride();
    const Int rowStride = B.RowStride();
    B.Resize( m, n );
    if( B.CrossSize() != 1 || B.RedundantSize() != 1 )
    {
        // TODO:
        // Broadcast over the redundant communicator and use mpi::Translate
        // rank to determine whether a process is the root of the broadcast.
        GeneralPurpose( A, B );
        return;
    }

    const Int pkgSize = mpi::Pad(MaxLength(m,colStride)*MaxLength(n,rowStride));
    const Int recvSize = pkgSize;
    const Int sendSize = B.DistSize()*pkgSize;

    // Translate the root of A into the DistComm of B (if possible)
    const Int root = A.Root();
    const Int target = mpi::Translate( A.CrossComm(), root, B.DistComm() );
    if( target == mpi::UNDEFINED )
        return;

    if( B.DistSize() == 1 )
    {
        Copy( A.LockedMatrix(), B.Matrix() );
        return;
    }

    vector<T> buffer;
    T* recvBuf=0; // some compilers (falsely) warn otherwise
    if( A.CrossRank() == root )
    {
        FastResize( buffer, sendSize+recvSize );
        T* sendBuf = &buffer[0];
        recvBuf = &buffer[sendSize];

        // Pack the send buffer
        copy::util::StridedPack
        ( m, n,
          B.ColAlign(), colStride,
          B.RowAlign(), rowStride,
          A.LockedBuffer(), A.LDim(),
          sendBuf, pkgSize );

        // Scatter from the root
        mpi::Scatter
        ( sendBuf, pkgSize, recvBuf, pkgSize, target, B.DistComm() );
    }
    else
    {
        FastResize( buffer, recvSize );
        recvBuf = &buffer[0];

        // Perform the receiving portion of the scatter from the non-root
        mpi::Scatter
        ( static_cast<T*>(0), pkgSize,
          recvBuf,            pkgSize, target, B.DistComm() );
    }

    // Unpack
    copy::util::InterleaveMatrix
    ( B.LocalHeight(), B.LocalWidth(),
      recvBuf,    1, B.LocalHeight(),
      B.Buffer(), 1, B.LDim() );
}
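// Example (not part of the library): the reverse of the gather above. A matrix
// assembled on a single root is spread over the grid by copying from
// [CIRC,CIRC] into a standard [MC,MR] distribution.
inline void ExampleCircScatterUsage( const El::Grid& grid )
{
    El::DistMatrix<double,El::CIRC,El::CIRC> A( grid );
    El::Uniform( A, 20, 20 );   // only the root stores local data in [CIRC,CIRC]

    El::DistMatrix<double> B( grid );
    El::Copy( A, B );           // performs a scatter from A's root process
}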
void RowScatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::RowScatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("Matrix sizes did not match");
    if( !B.Participating() )
        return;

    const Int width = B.Width();
    const Int colDiff = B.ColAlign()-A.ColAlign();
    if( colDiff == 0 )
    {
        if( width == 1 )
        {
            const Int localHeight = B.LocalHeight();
            const Int portionSize = mpi::Pad( localHeight );
            vector<T> buffer;
            FastResize( buffer, portionSize );

            // Reduce to rowAlign
            const Int rowAlign = B.RowAlign();
            mpi::Reduce
            ( A.LockedBuffer(), buffer.data(), portionSize,
              rowAlign, B.RowComm() );

            if( B.RowRank() == rowAlign )
            {
                axpy::util::InterleaveMatrixUpdate
                ( alpha, localHeight, 1,
                  buffer.data(), 1, localHeight,
                  B.Buffer(),    1, B.LDim() );
            }
        }
        else
        {
            const Int rowStride = B.RowStride();
            const Int rowAlign = B.RowAlign();

            const Int localHeight = B.LocalHeight();
            const Int localWidth = B.LocalWidth();
            const Int maxLocalWidth = MaxLength(width,rowStride);

            const Int portionSize = mpi::Pad( localHeight*maxLocalWidth );
            const Int sendSize = rowStride*portionSize;

            // Pack
            vector<T> buffer;
            FastResize( buffer, sendSize );
            copy::util::RowStridedPack
            ( localHeight, width,
              rowAlign, rowStride,
              A.LockedBuffer(), A.LDim(),
              buffer.data(), portionSize );

            // Communicate
            mpi::ReduceScatter( buffer.data(), portionSize, B.RowComm() );

            // Update with our received data
            axpy::util::InterleaveMatrixUpdate
            ( alpha, localHeight, localWidth,
              buffer.data(), 1, localHeight,
              B.Buffer(),    1, B.LDim() );
        }
    }
    else
    {
#ifdef EL_UNALIGNED_WARNINGS
        if( B.Grid().Rank() == 0 )
            cerr << "Unaligned RowScatter" << endl;
#endif
        const Int colRank = B.ColRank();
        const Int colStride = B.ColStride();

        const Int sendRow = Mod( colRank+colDiff, colStride );
        const Int recvRow = Mod( colRank-colDiff, colStride );

        const Int localHeight = B.LocalHeight();
        const Int localHeightA = A.LocalHeight();

        if( width == 1 )
        {
            vector<T> buffer;
            FastResize( buffer, localHeight+localHeightA );
            T* sendBuf = &buffer[0];
            T* recvBuf = &buffer[localHeightA];

            // Reduce to rowAlign
            const Int rowAlign = B.RowAlign();
            mpi::Reduce
            ( A.LockedBuffer(), sendBuf, localHeightA, rowAlign, B.RowComm() );

            if( B.RowRank() == rowAlign )
            {
                // Perform the realignment
                mpi::SendRecv
                ( sendBuf, localHeightA, sendRow,
                  recvBuf, localHeight,  recvRow, B.ColComm() );

                axpy::util::InterleaveMatrixUpdate
                ( alpha, localHeight, 1,
                  recvBuf,    1, localHeight,
                  B.Buffer(), 1, B.LDim() );
            }
        }
        else
        {
            const Int rowStride = B.RowStride();
            const Int rowAlign = B.RowAlign();

            const Int localWidth = B.LocalWidth();
            const Int maxLocalWidth = MaxLength(width,rowStride);

            const Int recvSize_RS = mpi::Pad( localHeightA*maxLocalWidth );
            const Int sendSize_RS = rowStride * recvSize_RS;
            const Int recvSize_SR = localHeight * localWidth;

            vector<T> buffer;
            FastResize( buffer, recvSize_RS + Max(sendSize_RS,recvSize_SR) );
            T* firstBuf = &buffer[0];
            T* secondBuf = &buffer[recvSize_RS];

            // Pack
            copy::util::RowStridedPack
            ( localHeightA, width,
              rowAlign, rowStride,
              A.LockedBuffer(), A.LDim(),
              secondBuf, recvSize_RS );

            // Reduce-scatter over each process row
            mpi::ReduceScatter( secondBuf, firstBuf, recvSize_RS, B.RowComm() );

            // Trade reduced data with the appropriate process row
            mpi::SendRecv
            ( firstBuf,  localHeightA*localWidth, sendRow,
              secondBuf, localHeight*localWidth,  recvRow, B.ColComm() );

            // Update with our received data
            axpy::util::InterleaveMatrixUpdate
            ( alpha, localHeight, localWidth,
              secondBuf,  1, localHeight,
              B.Buffer(), 1, B.LDim() );
        }
    }
}
void ColScatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::ColScatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("A and B must be the same size");
#ifdef EL_VECTOR_WARNINGS
    if( A.Width() == 1 && B.Grid().Rank() == 0 )
    {
        cerr <<
          "ColScatter does not yet have a specialized vector implementation, "
          "but it would only require a modification of the vector version of "
          "RowScatter" << endl;
    }
#endif
#ifdef EL_CACHE_WARNINGS
    if( A.Width() != 1 && B.Grid().Rank() == 0 )
    {
        cerr <<
          "axpy_contract::ColScatter potentially causes a large "
          "amount of cache-thrashing. If possible, avoid it by forming the "
          "(conjugate-)transpose of the [* ,V] matrix instead." << endl;
    }
#endif
    if( !B.Participating() )
        return;
    const Int height = B.Height();
    const Int localHeight = B.LocalHeight();
    const Int localWidth = B.LocalWidth();

    const Int colAlign = B.ColAlign();
    const Int colStride = B.ColStride();

    const Int rowDiff = B.RowAlign()-A.RowAlign();
    // TODO: Allow for modular equivalence if possible
    if( rowDiff == 0 )
    {
        const Int maxLocalHeight = MaxLength(height,colStride);

        const Int recvSize = mpi::Pad( maxLocalHeight*localWidth );
        const Int sendSize = colStride*recvSize;

        vector<T> buffer;
        FastResize( buffer, sendSize );

        // Pack
        copy::util::ColStridedPack
        ( height, localWidth,
          colAlign, colStride,
          A.LockedBuffer(), A.LDim(),
          buffer.data(), recvSize );

        // Communicate
        mpi::ReduceScatter( buffer.data(), recvSize, B.ColComm() );

        // Update with our received data
        axpy::util::InterleaveMatrixUpdate
        ( alpha, localHeight, localWidth,
          buffer.data(), 1, localHeight,
          B.Buffer(),    1, B.LDim() );
    }
    else
    {
#ifdef EL_UNALIGNED_WARNINGS
        if( B.Grid().Rank() == 0 )
            cerr << "Unaligned ColScatter" << endl;
#endif
        const Int localWidthA = A.LocalWidth();
        const Int maxLocalHeight = MaxLength(height,colStride);

        const Int recvSize_RS = mpi::Pad( maxLocalHeight*localWidthA );
        const Int sendSize_RS = colStride*recvSize_RS;
        const Int recvSize_SR = localHeight*localWidth;

        vector<T> buffer;
        FastResize( buffer, recvSize_RS + Max(sendSize_RS,recvSize_SR) );
        T* firstBuf = &buffer[0];
        T* secondBuf = &buffer[recvSize_RS];

        // Pack
        copy::util::ColStridedPack
        ( height, localWidth,
          colAlign, colStride,
          A.LockedBuffer(), A.LDim(),
          secondBuf, recvSize_RS );

        // Reduce-scatter over each col
        mpi::ReduceScatter( secondBuf, firstBuf, recvSize_RS, B.ColComm() );

        // Trade reduced data with the appropriate col
        const Int sendCol = Mod( B.RowRank()+rowDiff, B.RowStride() );
        const Int recvCol = Mod( B.RowRank()-rowDiff, B.RowStride() );
        mpi::SendRecv
        ( firstBuf,  localHeight*localWidthA, sendCol,
          secondBuf, localHeight*localWidth,  recvCol, B.RowComm() );

        // Update with our received data
        axpy::util::InterleaveMatrixUpdate
        ( alpha, localHeight, localWidth,
          secondBuf,  1, localHeight,
          B.Buffer(), 1, B.LDim() );
    }
}
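// Example (not part of the library): the scatter routines above (ColScatter,
// RowScatter, Scatter, PartialColScatter) are the dispatch targets of
// AxpyContract. A typical use accumulates per-process contributions, held
// redundantly in a [MC,STAR] matrix, into a [MC,MR] sum; sizes are illustrative.
inline void ExampleAxpyContractUsage( const El::Grid& grid )
{
    El::DistMatrix<double,El::MC,El::STAR> partials( grid );
    El::Ones( partials, 64, 64 );          // each process column's contribution

    El::DistMatrix<double,El::MC,El::MR> sum( grid );
    El::Zeros( sum, 64, 64 );
    El::AxpyContract( 1., partials, sum ); // sum += reduction of the copies
}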