void ConsistentlyComputeDecomposition ( DistMatrix<Field,MC,MR,BLOCK>& H, DistMatrix<Complex<Base<Field>>,STAR,STAR>& w, Matrix<Field>& Z, const HessenbergSchurCtrl& ctrl=HessenbergSchurCtrl() ) { EL_DEBUG_CSE // Because double-precision floating-point computation is often // non-deterministic due to extra-precision computation being frequent but // not guaranteed, we must be careful to not allow this non-determinism to // be amplified by the forward instability of Francis sweeps. const Grid& grid = H.Grid(); const int owner = H.Owner(0,0); DistMatrix<Field,CIRC,CIRC> H_CIRC_CIRC( grid, owner ); H_CIRC_CIRC = H; w.Resize( H.Height(), 1 ); if( H_CIRC_CIRC.CrossRank() == H_CIRC_CIRC.Root() ) HessenbergSchur( H_CIRC_CIRC.Matrix(), w.Matrix(), Z, ctrl ); else Z.Resize( H.Height(), H.Height() ); H = H_CIRC_CIRC; El::Broadcast( w.Matrix(), H_CIRC_CIRC.CrossComm(), H_CIRC_CIRC.Root() ); El::Broadcast( Z, H_CIRC_CIRC.CrossComm(), H_CIRC_CIRC.Root() ); }
// Distributed column-wise ColumnMinAbs.
//
// 'mins' is aligned with A and resized to A.Width() x 1. The local overload
// of ColumnMinAbs is applied to the local block of A, and the local results
// are then combined with a MIN all-reduce over A's column communicator.
void ColumnMinAbs
( const DistMatrix<F,U,V>& A,
        DistMatrix<Base<F>,V,STAR>& mins )
{
    EL_DEBUG_CSE
    mins.AlignWith( A );
    mins.Resize( A.Width(), 1 );

    // Local contribution
    ColumnMinAbs( A.LockedMatrix(), mins.Matrix() );

    // Combine the contributions within each process column
    AllReduce( mins.Matrix(), A.ColComm(), mpi::MIN );
}
// Overwrites the distributed matrix J with a Jordan block: the local data is
// zeroed, every locally owned diagonal entry is set to lambda, and every
// locally owned entry directly above the diagonal (i == j-1) is set to one.
inline void MakeJordan( DistMatrix<T,U,V>& J, T lambda )
{
    DEBUG_ONLY(CallStackEntry cse("MakeJordan"))
    Zero( J.Matrix() );

    const Int mLocal = J.LocalHeight();
    const Int nLocal = J.LocalWidth();
    const Int rowStart = J.ColShift();
    const Int colStart = J.RowShift();
    const Int rowStep = J.ColStride();
    const Int colStep = J.RowStride();

    for( Int jLoc=0; jLoc<nLocal; ++jLoc )
    {
        // Global column index of this local column
        const Int j = colStart + jLoc*colStep;
        for( Int iLoc=0; iLoc<mLocal; ++iLoc )
        {
            // Global row index of this local row
            const Int i = rowStart + iLoc*rowStep;
            const Int offset = j - i;
            if( offset == 0 )
                J.SetLocal( iLoc, jLoc, lambda );
            else if( offset == 1 )
                J.SetLocal( iLoc, jLoc, T(1) );
        }
    }
}
// Applies the sequential LU routine to the local matrix of the fully
// replicated ([STAR,STAR]) matrix A. No communication is performed here.
inline void LocalLU( DistMatrix<F,STAR,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry cse("LocalLU");
#endif
    auto& ALoc = A.Matrix();
    LU( ALoc );
}
// Inverts the local matrix of the fully replicated ([STAR,STAR]) matrix A by
// forwarding to the sequential SymmetricInverse with its third argument set
// to true.
// NOTE(review): that flag presumably selects the conjugated (Hermitian)
// variant of the routine -- confirm against SymmetricInverse.
inline void LocalHermitianInverse
( UpperOrLower uplo,
  DistMatrix<F,STAR,STAR>& A,
  LDLPivotType pivotType=BUNCH_KAUFMAN_A )
{
    DEBUG_ONLY(CallStackEntry cse("LocalHermitianInverse"))
    auto& ALoc = A.Matrix();
    SymmetricInverse( uplo, ALoc, true, pivotType );
}
// Forwards the local matrix of the fully replicated ([STAR,STAR]) matrix A
// to the sequential HPDInverse routine for the given triangle.
inline void LocalHPDInverse
( UpperOrLower uplo, DistMatrix<F,STAR,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry cse("LocalHPDInverse");
#endif
    auto& ALoc = A.Matrix();
    HPDInverse( uplo, ALoc );
}
// Forwards the local matrix of the fully replicated ([STAR,STAR]) matrix A
// to the sequential Inverse routine.
inline void LocalInverse( DistMatrix<F,STAR,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry cse("LocalInverse");
#endif
    auto& ALoc = A.Matrix();
    Inverse( ALoc );
}
// In-place conjugation of a distributed matrix: the sequential Conjugate is
// applied to the locally stored portion of A.
inline void Conjugate( DistMatrix<T,U,V>& A )
{
#ifndef RELEASE
    CallStackEntry cse("Conjugate (in-place)");
#endif
    auto& ALoc = A.Matrix();
    Conjugate( ALoc );
}
// Forwards the local matrix of the fully replicated ([STAR,STAR]) matrix A
// to the sequential Trdtrmm routine with the given orientation and triangle.
inline void LocalTrdtrmm
( Orientation orientation,
  UpperOrLower uplo,
  DistMatrix<T,STAR,STAR>& A )
{
#ifndef RELEASE
    CallStackEntry cse("LocalTrdtrmm");
#endif
    auto& ALoc = A.Matrix();
    Trdtrmm( orientation, uplo, ALoc );
}
// Calls the sequential Ger on the local matrices of x, y and A.
// No conformity checks on the three distributions are made here.
inline void LocalGer
( T alpha,
  const DistMatrix<T,xColDist,xRowDist>& x,
  const DistMatrix<T,yColDist,yRowDist>& y,
        DistMatrix<T,AColDist,ARowDist>& A )
{
    DEBUG_ONLY(CallStackEntry cse("LocalGer"))
    // TODO: Add error checking here
    const auto& xLoc = x.LockedMatrix();
    const auto& yLoc = y.LockedMatrix();
    Ger( alpha, xLoc, yLoc, A.Matrix() );
}
// Distributed row-wise RowMaxNorms.
//
// 'norms' is aligned with A and resized to A.Height() x 1. The local
// overload of RowMaxNorms is applied to the local block of A, and the
// results are combined with a MAX all-reduce over A's row communicator.
void RowMaxNorms
( const DistMatrix<F,U,V>& A,
        DistMatrix<Base<F>,U,STAR>& norms )
{
    DEBUG_CSE
    norms.AlignWith( A );
    norms.Resize( A.Height(), 1 );

    // Local contribution
    RowMaxNorms( A.LockedMatrix(), norms.Matrix() );

    // Combine the contributions within each process row
    AllReduce( norms, A.RowComm(), mpi::MAX );
}
inline void DiagonalScale ( LeftOrRight side, Orientation orientation, const DistMatrix<typename Base<T>::type,U,V>& d, DistMatrix<T,W,Z>& X ) { #ifndef RELEASE PushCallStack("DiagonalScale"); #endif typedef typename Base<T>::type R; if( side == LEFT ) { if( U == W && V == STAR && d.ColAlignment() == X.ColAlignment() ) { DiagonalScale( LEFT, orientation, d.LockedMatrix(), X.Matrix() ); } else { DistMatrix<R,W,STAR> d_W_STAR( X.Grid() ); d_W_STAR = d; DiagonalScale ( LEFT, orientation, d_W_STAR.LockedMatrix(), X.Matrix() ); } } else { if( U == Z && V == STAR && d.ColAlignment() == X.RowAlignment() ) { DiagonalScale( RIGHT, orientation, d.LockedMatrix(), X.Matrix() ); } else { DistMatrix<R,Z,STAR> d_Z_STAR( X.Grid() ); d_Z_STAR = d; DiagonalScale ( RIGHT, orientation, d_Z_STAR.LockedMatrix(), X.Matrix() ); } } #ifndef RELEASE PopCallStack(); #endif }
// Forwards the local matrix of the fully replicated ([STAR,STAR]) matrix A
// to the sequential HPDInverse routine for the given triangle. In
// non-release builds the call is bracketed by a call-stack push/pop.
inline void LocalHPDInverse
( UpperOrLower uplo, DistMatrix<F,STAR,STAR>& A )
{
#ifndef RELEASE
    PushCallStack("LocalHPDInverse");
#endif
    auto& ALoc = A.Matrix();
    HPDInverse( uplo, ALoc );
#ifndef RELEASE
    PopCallStack();
#endif
}
// Forwards the local matrix of the fully replicated ([STAR,STAR]) matrix A
// to the sequential Inverse routine. In non-release builds the call is
// bracketed by a call-stack push/pop.
inline void LocalInverse( DistMatrix<F,STAR,STAR>& A )
{
#ifndef RELEASE
    PushCallStack("LocalInverse");
#endif
    auto& ALoc = A.Matrix();
    Inverse( ALoc );
#ifndef RELEASE
    PopCallStack();
#endif
}
// Calls the sequential Ger on the local matrices of x, y and A.
// No conformity checks on the three distributions are made here.
inline void LocalGer
( T alpha,
  const DistMatrix<T,xColDist,xRowDist>& x,
  const DistMatrix<T,yColDist,yRowDist>& y,
        DistMatrix<T,AColDist,ARowDist>& A )
{
#ifndef RELEASE
    CallStackEntry cse("LocalGer");
    // TODO: Add error checking here
#endif
    const auto& xLoc = x.LockedMatrix();
    const auto& yLoc = y.LockedMatrix();
    Ger( alpha, xLoc, yLoc, A.Matrix() );
}
// Applies the sequential Zero routine to the locally stored portion of the
// distributed matrix A. In non-release builds the call is bracketed by a
// call-stack push/pop.
inline void Zero( DistMatrix<T,U,V>& A )
{
#ifndef RELEASE
    PushCallStack("Zero");
#endif
    auto& ALoc = A.Matrix();
    Zero( ALoc );
#ifndef RELEASE
    PopCallStack();
#endif
}
void AllGather ( const DistMatrix<T, U, V >& A, DistMatrix<T,Collect<U>(),Collect<V>()>& B ) { EL_DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); if( A.Participating() ) { if( A.DistSize() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int distStride = colStride*rowStride; const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); vector<T> buf; FastResize( buf, (distStride+1)*portionSize ); T* sendBuf = &buf[0]; T* recvBuf = &buf[portionSize]; // Pack util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf, 1, A.LocalHeight() ); // Communicate mpi::AllGather ( sendBuf, portionSize, recvBuf, portionSize, A.DistComm() ); // Unpack util::StridedUnpack ( height, width, A.ColAlign(), colStride, A.RowAlign(), rowStride, recvBuf, portionSize, B.Buffer(), B.LDim() ); } } if( A.Grid().InGrid() && A.CrossComm() != mpi::COMM_SELF ) El::Broadcast( B, A.CrossComm(), A.Root() ); }
void StackedGeometricColumnScaling ( const DistMatrix<Field, U,V >& A, const DistMatrix<Field, U,V >& B, DistMatrix<Base<Field>,V,STAR>& geomScaling ) { EL_DEBUG_CSE // NOTE: Assuming A.ColComm() == B.ColComm() and that the row alignments // are equal typedef Base<Field> Real; DistMatrix<Real,V,STAR> maxScalingA(A.Grid()), maxScalingB(A.Grid()); ColumnMaxNorms( A, maxScalingA ); ColumnMaxNorms( B, maxScalingB ); const Int mLocalA = A.LocalHeight(); const Int mLocalB = B.LocalHeight(); const Int nLocal = A.LocalWidth(); geomScaling.AlignWith( maxScalingA ); geomScaling.Resize( A.Width(), 1 ); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.LockedMatrix(); auto& geomScalingLoc = geomScaling.Matrix(); auto& maxScalingALoc = maxScalingA.Matrix(); auto& maxScalingBLoc = maxScalingB.Matrix(); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { Real minAbs = Max(maxScalingALoc(jLoc),maxScalingBLoc(jLoc)); for( Int iLoc=0; iLoc<mLocalA; ++iLoc ) { const Real absVal = Abs(ALoc(iLoc,jLoc)); if( absVal > 0 && absVal < minAbs ) minAbs = Min(minAbs,absVal); } for( Int iLoc=0; iLoc<mLocalB; ++iLoc ) { const Real absVal = Abs(BLoc(iLoc,jLoc)); if( absVal > 0 && absVal < minAbs ) minAbs = Min(minAbs,absVal); } geomScalingLoc(jLoc) = minAbs; } mpi::AllReduce( geomScaling.Buffer(), nLocal, mpi::MIN, A.ColComm() ); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { const Real maxAbsA = maxScalingALoc(jLoc); const Real maxAbsB = maxScalingBLoc(jLoc); const Real maxAbs = Max(maxAbsA,maxAbsB); const Real minAbs = geomScalingLoc(jLoc); geomScalingLoc(jLoc) = Sqrt(minAbs*maxAbs); } }
// Distributed RowTwoNorms.
//
// 'norms' is aligned with A and resized to A.Height() x 1. If A has no
// columns the result is simply zeroed; otherwise the work is delegated to
// RowTwoNormsHelper with the local data and A's row communicator.
void RowTwoNorms
( const DistMatrix<F,U,V>& A,
        DistMatrix<Base<F>,U,STAR>& norms )
{
    DEBUG_CSE
    norms.AlignWith( A );
    norms.Resize( A.Height(), 1 );

    const bool hasColumns = ( A.Width() != 0 );
    if( hasColumns )
        RowTwoNormsHelper( A.LockedMatrix(), norms.Matrix(), A.RowComm() );
    else
        Zero( norms );
}
inline void LocalTrmm ( LeftOrRight side, UpperOrLower uplo, Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T,STAR,STAR>& A, DistMatrix<T,BColDist,BRowDist>& B ) { #ifndef RELEASE CallStackEntry entry("LocalTrmm"); if( (side == LEFT && BColDist != STAR) || (side == RIGHT && BRowDist != STAR) ) LogicError ("Distribution of RHS must conform with that of triangle"); #endif Trmm ( side, uplo, orientation, diag, alpha, A.LockedMatrix(), B.Matrix() ); }
void GeometricColumnScaling ( const DistMatrix<Field, U,V >& A, DistMatrix<Base<Field>,V,STAR>& geomScaling ) { EL_DEBUG_CSE typedef Base<Field> Real; DistMatrix<Real,V,STAR> maxScaling(A.Grid()); ColumnMaxNorms( A, maxScaling ); ColumnMinAbsNonzero( A, maxScaling, geomScaling ); const Int nLocal = A.LocalWidth(); auto& maxScalingLoc = maxScaling.Matrix(); auto& geomScalingLoc = geomScaling.Matrix(); for( Int jLoc=0; jLoc<nLocal; ++jLoc ) { const Real maxAbs = maxScalingLoc(jLoc); const Real minAbs = geomScalingLoc(jLoc); geomScalingLoc(jLoc) = Sqrt(minAbs*maxAbs); } }
void GeometricRowScaling ( const DistMatrix<Field, U,V >& A, DistMatrix<Base<Field>,U,STAR>& geomScaling ) { EL_DEBUG_CSE typedef Base<Field> Real; DistMatrix<Real,U,STAR> maxScaling(A.Grid()); RowMaxNorms( A, maxScaling ); RowMinAbsNonzero( A, maxScaling, geomScaling ); const Int mLocal = A.LocalHeight(); auto& maxScalingLoc = maxScaling.Matrix(); auto& geomScalingLoc = geomScaling.Matrix(); for( Int iLoc=0; iLoc<mLocal; ++iLoc ) { const Real maxAbs = maxScalingLoc(iLoc); const Real minAbs = geomScalingLoc(iLoc); geomScalingLoc(iLoc) = Sqrt(minAbs*maxAbs); } }
void IndexDependentMap ( const DistMatrix<S,U,V,wrap>& A, DistMatrix<T,U,V,wrap>& B, function<T(Int,Int,const S&)> func ) { EL_DEBUG_CSE const Int mLoc = A.LocalHeight(); const Int nLoc = A.LocalWidth(); B.AlignWith( A.DistData() ); B.Resize( A.Height(), A.Width() ); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.Matrix(); for( Int jLoc=0; jLoc<nLoc; ++jLoc ) { const Int j = A.GlobalCol(jLoc); for( Int iLoc=0; iLoc<mLoc; ++iLoc ) { const Int i = A.GlobalRow(iLoc); BLoc(iLoc,jLoc) = func(i,j,ALoc(iLoc,jLoc)); } } }
// Overwrites the distributed matrix I with the identity: the local data is
// zeroed and every locally owned diagonal entry is set to one.
inline void MakeIdentity( DistMatrix<T,U,V>& I )
{
    DEBUG_ONLY(CallStackEntry cse("MakeIdentity"))
    Zero( I.Matrix() );

    const Int mLocal = I.LocalHeight();
    const Int nLocal = I.LocalWidth();
    const Int rowStart = I.ColShift();
    const Int colStart = I.RowShift();
    const Int rowStep = I.ColStride();
    const Int colStep = I.RowStride();

    for( Int jLoc=0; jLoc<nLocal; ++jLoc )
    {
        // Global column index of this local column
        const Int j = colStart + jLoc*colStep;
        for( Int iLoc=0; iLoc<mLocal; ++iLoc )
        {
            // Global row index of this local row
            const Int i = rowStart + iLoc*rowStep;
            if( i != j )
                continue;
            I.SetLocal( iLoc, jLoc, T(1) );
        }
    }
}
void SweepHelper ( DistMatrix<Real,MC,MR,BLOCK>& H, DistMatrix<Complex<Real>,STAR,STAR>& shifts, DistMatrix<Real,MC,MR,BLOCK>& Z, const HessenbergSchurCtrl& ctrl ) { EL_DEBUG_CSE const Int n = H.Height(); const Int winBeg = ( ctrl.winBeg==END ? n : ctrl.winBeg ); const Int winEnd = ( ctrl.winEnd==END ? n : ctrl.winEnd ); const Int winSize = winEnd-winBeg; auto ctrlMod( ctrl ); ctrlMod.winBeg = winBeg; ctrlMod.winEnd = winEnd; const Int numShifts = shifts.Height(); multibulge::PairShifts( shifts.Matrix() ); const Int remainder = (numShifts % 2); if( remainder == 1 ) { LogicError ("Remainder shifts are not yet supported for distributed sweeps"); } auto shiftsEven = shifts(IR(remainder,END),ALL); if( winSize >= 4 ) { multibulge::Sweep( H, shiftsEven, Z, ctrlMod ); } else { // Sweep in pairs LogicError("Distributed pair sweeps are not yet supported"); } }
inline void ForwardMany ( const DistMatrix<F,VC,STAR>& L, DistMatrix<F,VC,STAR>& X ) { const Grid& g = L.Grid(); if( g.Size() == 1 ) { FrontLowerForwardSolve( L.LockedMatrix(), X.Matrix() ); return; } // Matrix views DistMatrix<F,VC,STAR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F,VC,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,STAR> X1_STAR_STAR(g); LockedPartitionDownDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionDown ( X, XT, XB, 0 ); while( LTL.Width() < L.Width() ) { LockedRepartitionDownDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); RepartitionDown ( XT, X0, /**/ /**/ X1, XB, X2, L11.Height() ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; // L11[* ,* ] <- L11[VC,* ] X1_STAR_STAR = X1; // X1[* ,* ] <- X1[VC,* ] // X1[* ,* ] := (L11[* ,* ])^-1 X1[* ,* ] LocalTrsm ( LEFT, LOWER, NORMAL, NON_UNIT, F(1), L11_STAR_STAR, X1_STAR_STAR, true ); X1 = X1_STAR_STAR; // X2[VC,* ] -= L21[VC,* ] X1[* ,* ] LocalGemm( NORMAL, NORMAL, F(-1), L21, X1_STAR_STAR, F(1), X2 ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); SlidePartitionDown ( XT, X0, X1, /**/ /**/ XB, X2 ); } }
void ColAllToAllPromote ( const DistMatrix<T, U, V >& A, DistMatrix<T,Partial<U>(),PartialUnionRow<U,V>()>& B ) { DEBUG_CSE AssertSameGrids( A, B ); const Int height = A.Height(); const Int width = A.Width(); B.AlignColsAndResize ( Mod(A.ColAlign(),B.ColStride()), height, width, false, false ); if( !B.Participating() ) return; const Int colStride = A.ColStride(); const Int colStridePart = A.PartialColStride(); const Int colStrideUnion = A.PartialUnionColStride(); const Int colRankPart = A.PartialColRank(); const Int colDiff = B.ColAlign() - Mod(A.ColAlign(),colStridePart); const Int maxLocalHeight = MaxLength(height,colStride); const Int maxLocalWidth = MaxLength(width,colStrideUnion); const Int portionSize = mpi::Pad( maxLocalHeight*maxLocalWidth ); if( colDiff == 0 ) { if( A.PartialUnionColStride() == 1 ) { Copy( A.LockedMatrix(), B.Matrix() ); } else { vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), colStrideUnion, A.LockedBuffer(), A.LDim(), firstBuf, portionSize ); // Simultaneously Gather in columns and Scatter in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, colRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } } else { #ifdef EL_UNALIGNED_WARNINGS if( A.Grid().Rank() == 0 ) cerr << "Unaligned PartialColAllToAllPromote" << endl; #endif const Int sendColRankPart = Mod( colRankPart+colDiff, colStridePart ); const Int recvColRankPart = Mod( colRankPart-colDiff, colStridePart ); vector<T> buffer; FastResize( buffer, 2*colStrideUnion*portionSize ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[colStrideUnion*portionSize]; // Pack util::RowStridedPack ( A.LocalHeight(), width, B.RowAlign(), 
colStrideUnion, A.LockedBuffer(), A.LDim(), secondBuf, portionSize ); // Realign the input mpi::SendRecv ( secondBuf, colStrideUnion*portionSize, sendColRankPart, firstBuf, colStrideUnion*portionSize, recvColRankPart, A.PartialColComm() ); // Simultaneously Scatter in columns and Gather in rows mpi::AllToAll ( firstBuf, portionSize, secondBuf, portionSize, A.PartialUnionColComm() ); // Unpack util::PartialColStridedUnpack ( height, B.LocalWidth(), A.ColAlign(), colStride, colStrideUnion, colStridePart, recvColRankPart, B.ColShift(), secondBuf, portionSize, B.Buffer(), B.LDim() ); } }
void TransformRows ( const Matrix<F>& Z, DistMatrix<F,MC,MR,BLOCK>& H ) { DEBUG_CSE const Int height = H.Height(); const Grid& grid = H.Grid(); const Int blockHeight = H.BlockHeight(); const Int firstBlockHeight = blockHeight - H.ColCut(); if( height <= firstBlockHeight || grid.Height() == 1 ) { if( grid.Row() == H.RowOwner(0) ) { // This process row can locally update its portion of H Matrix<F> HLocCopy( H.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), Z, HLocCopy, H.Matrix() ); } } else if( height <= firstBlockHeight + blockHeight ) { const bool firstRow = H.RowOwner( 0 ); const bool secondRow = H.RowOwner( firstBlockHeight ); if( grid.Row() == firstRow ) { // // Replace H with // // | ZLeft, ZRight |' | HTop |, // | HBottom | // // where HTop is owned by this process row and HBottom by the next. // auto ZLeft = Z( ALL, IR(0,firstBlockHeight) ); // Partition space for the combined matrix Matrix<F> HCombine( height, H.LocalWidth() ); auto HTop = HCombine( IR(0,firstBlockHeight), ALL ); auto HBottom = HCombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix HTop = H.LockedMatrix(); // Exchange the data El::SendRecv( HTop, HBottom, H.ColComm(), secondRow, secondRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), ZLeft, HCombine, H.Matrix() ); } else if( grid.Row() == secondRow ) { // // Replace H with // // | ZLeft, ZRight |' | HTop |, // | HBottom | // // where HTop is owned by the previous process row and HBottom by // this one. 
// auto ZRight = Z( ALL, IR(firstBlockHeight,END) ); // Partition space for the combined matrix Matrix<F> HCombine( height, H.LocalWidth() ); auto HTop = HCombine( IR(0,firstBlockHeight), ALL ); auto HBottom = HCombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix HBottom = H.LockedMatrix(); // Exchange the data El::SendRecv( HBottom, HTop, H.ColComm(), firstRow, firstRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), ZRight, HCombine, H.Matrix() ); } } else { // Fall back to the entire process column interacting. // TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,STAR,MR,BLOCK> H_STAR_MR( H ); Matrix<F> HLocCopy( H_STAR_MR.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), Z, HLocCopy, H_STAR_MR.Matrix() ); H = H_STAR_MR; } }
void TransformColumns ( const Matrix<F>& Z, DistMatrix<F,MC,MR,BLOCK>& H ) { DEBUG_CSE const Int width = H.Width(); const Grid& grid = H.Grid(); const Int blockWidth = H.BlockWidth(); const Int firstBlockWidth = blockWidth - H.RowCut(); if( width <= firstBlockWidth || grid.Width() == 1 ) { if( grid.Col() == H.ColOwner(0) ) { // This process row can locally update its portion of H Matrix<F> HLocCopy( H.Matrix() ); Gemm( NORMAL, NORMAL, F(1), HLocCopy, Z, H.Matrix() ); } } else if( width <= firstBlockWidth + blockWidth ) { const bool firstCol = H.ColOwner( 0 ); const bool secondCol = H.ColOwner( firstBlockWidth ); if( grid.Col() == firstCol ) { // // Replace H with // // | HLeft, HRight | | ZLeft, ZRight |, // // where HLeft is owned by this process column and HRight by the // next. // auto ZLeft = Z( ALL, IR(0,firstBlockWidth) ); // Partition space for the combined matrix Matrix<F> HCombine( H.LocalHeight(), width ); auto HLeft = HCombine( ALL, IR(0,firstBlockWidth) ); auto HRight = HCombine( ALL, IR(firstBlockWidth,END) ); // Copy our portion into the combined matrix HLeft = H.LockedMatrix(); // Exchange the data El::SendRecv( HLeft, HRight, H.RowComm(), secondCol, secondCol ); // Form our portion of the result Gemm( NORMAL, NORMAL, F(1), HCombine, ZLeft, H.Matrix() ); } else if( grid.Col() == secondCol ) { // // Replace H with // // | HLeft, HRight | | ZLeft, ZRight |, // // where HLeft is owned by the previous process column and HRight // by this one. 
// auto ZRight = Z( ALL, IR(firstBlockWidth,END) ); // Partition space for the combined matrix Matrix<F> HCombine( H.LocalHeight(), width ); auto HLeft = HCombine( ALL, IR(0,firstBlockWidth) ); auto HRight = HCombine( ALL, IR(firstBlockWidth,END) ); // Copy our portion into the combined matrix HRight = H.LockedMatrix(); // Exchange the data El::SendRecv( HRight, HLeft, H.RowComm(), firstCol, firstCol ); // Form our portion of the result Gemm( NORMAL, NORMAL, F(1), HCombine, ZRight, H.Matrix() ); } } else { // Fall back to the entire process column interacting. // TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,MC,STAR,BLOCK> H_MC_STAR( H ); Matrix<F> HLocCopy( H_MC_STAR.Matrix() ); Gemm( NORMAL, NORMAL, F(1), HLocCopy, Z, H_MC_STAR.Matrix() ); H = H_MC_STAR; } }
void Gather ( const BlockMatrix<T>& A, DistMatrix<T,CIRC,CIRC,BLOCK>& B ) { DEBUG_ONLY(CSE cse("copy::Gather")) AssertSameGrids( A, B ); if( A.DistSize() == 1 && A.CrossSize() == 1 ) { B.Resize( A.Height(), A.Width() ); if( B.CrossRank() == B.Root() ) Copy( A.LockedMatrix(), B.Matrix() ); return; } const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); // Gather the colShifts and rowShifts // ================================== Int myShifts[2]; myShifts[0] = A.ColShift(); myShifts[1] = A.RowShift(); vector<Int> shifts; const Int crossSize = B.CrossSize(); if( B.CrossRank() == B.Root() ) shifts.resize( 2*crossSize ); mpi::Gather( myShifts, 2, shifts.data(), 2, B.Root(), B.CrossComm() ); // Gather the payload data // ======================= const bool irrelevant = ( A.RedundantRank()!=0 || A.CrossRank()!=A.Root() ); int totalSend = ( irrelevant ? 0 : A.LocalHeight()*A.LocalWidth() ); vector<int> recvCounts, recvOffsets; if( B.CrossRank() == B.Root() ) recvCounts.resize( crossSize ); mpi::Gather( &totalSend, 1, recvCounts.data(), 1, B.Root(), B.CrossComm() ); int totalRecv = Scan( recvCounts, recvOffsets ); //vector<T> sendBuf(totalSend), recvBuf(totalRecv); vector<T> sendBuf, recvBuf; sendBuf.reserve( totalSend ); recvBuf.reserve( totalRecv ); if( !irrelevant ) copy::util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf.data(), 1, A.LocalHeight() ); mpi::Gather ( sendBuf.data(), totalSend, recvBuf.data(), recvCounts.data(), recvOffsets.data(), B.Root(), B.CrossComm() ); // Unpack // ====== const Int mb = A.BlockHeight(); const Int nb = A.BlockWidth(); const Int colCut = A.ColCut(); const Int rowCut = A.RowCut(); if( B.Root() == B.CrossRank() ) { for( Int q=0; q<crossSize; ++q ) { if( recvCounts[q] == 0 ) continue; const Int colShift = shifts[2*q+0]; const Int rowShift = shifts[2*q+1]; const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int 
localHeight = BlockedLength( height, colShift, mb, colCut, colStride ); const Int localWidth = BlockedLength( width, rowShift, nb, rowCut, rowStride ); const T* data = &recvBuf[recvOffsets[q]]; for( Int jLoc=0; jLoc<localWidth; ++jLoc ) { const Int jBefore = rowShift*nb - rowCut; const Int jLocAdj = ( rowShift==0 ? jLoc+rowCut : jLoc ); const Int numFilledLocalBlocks = jLocAdj / nb; const Int jMid = numFilledLocalBlocks*nb*rowStride; const Int jPost = jLocAdj-numFilledLocalBlocks*nb; const Int j = jBefore + jMid + jPost; const T* sourceCol = &data[jLoc*localHeight]; for( Int iLoc=0; iLoc<localHeight; ++iLoc ) { const Int iBefore = colShift*mb - colCut; const Int iLocAdj = (colShift==0 ? iLoc+colCut : iLoc); const Int numFilledLocalBlocks = iLocAdj / mb; const Int iMid = numFilledLocalBlocks*mb*colStride; const Int iPost = iLocAdj-numFilledLocalBlocks*mb; const Int i = iBefore + iMid + iPost; B.SetLocal(i,j,sourceCol[iLoc]); } } } } }