void GatherSubdiagonal ( const DistMatrix<F,MC,MR,BLOCK>& H, const IR& winInd, DistMatrix<Base<F>,STAR,STAR>& hSubWin ) { DEBUG_CSE const Int winSize = winInd.end - winInd.beg; const Int blockSize = H.BlockHeight(); const Grid& grid = H.Grid(); const auto& HLoc = H.LockedMatrix(); DEBUG_ONLY( if( H.BlockHeight() != H.BlockWidth() ) LogicError("Assumed square distribution blocks"); if( H.ColCut() != H.RowCut() ) LogicError("Assumed symmetric cuts"); if( blockSize < 2 ) LogicError("Assumed blocks of size at least two"); )
void TransformRows ( const Matrix<F>& Z, DistMatrix<F,MC,MR,BLOCK>& H ) { DEBUG_CSE const Int height = H.Height(); const Grid& grid = H.Grid(); const Int blockHeight = H.BlockHeight(); const Int firstBlockHeight = blockHeight - H.ColCut(); if( height <= firstBlockHeight || grid.Height() == 1 ) { if( grid.Row() == H.RowOwner(0) ) { // This process row can locally update its portion of H Matrix<F> HLocCopy( H.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), Z, HLocCopy, H.Matrix() ); } } else if( height <= firstBlockHeight + blockHeight ) { const bool firstRow = H.RowOwner( 0 ); const bool secondRow = H.RowOwner( firstBlockHeight ); if( grid.Row() == firstRow ) { // // Replace H with // // | ZLeft, ZRight |' | HTop |, // | HBottom | // // where HTop is owned by this process row and HBottom by the next. // auto ZLeft = Z( ALL, IR(0,firstBlockHeight) ); // Partition space for the combined matrix Matrix<F> HCombine( height, H.LocalWidth() ); auto HTop = HCombine( IR(0,firstBlockHeight), ALL ); auto HBottom = HCombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix HTop = H.LockedMatrix(); // Exchange the data El::SendRecv( HTop, HBottom, H.ColComm(), secondRow, secondRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), ZLeft, HCombine, H.Matrix() ); } else if( grid.Row() == secondRow ) { // // Replace H with // // | ZLeft, ZRight |' | HTop |, // | HBottom | // // where HTop is owned by the previous process row and HBottom by // this one. 
// auto ZRight = Z( ALL, IR(firstBlockHeight,END) ); // Partition space for the combined matrix Matrix<F> HCombine( height, H.LocalWidth() ); auto HTop = HCombine( IR(0,firstBlockHeight), ALL ); auto HBottom = HCombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix HBottom = H.LockedMatrix(); // Exchange the data El::SendRecv( HBottom, HTop, H.ColComm(), firstRow, firstRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), ZRight, HCombine, H.Matrix() ); } } else { // Fall back to the entire process column interacting. // TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,STAR,MR,BLOCK> H_STAR_MR( H ); Matrix<F> HLocCopy( H_STAR_MR.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), Z, HLocCopy, H_STAR_MR.Matrix() ); H = H_STAR_MR; } }
// Distributed multibulge QR iteration on the window [winBeg,winEnd) of the
// Hessenberg matrix H.
//
// Parameters:
//   H    - distributed matrix that is iterated on in place.
//   w    - resized to n x 1; entries are filled in as windows deflate
//          (presumably the computed eigenvalues -- confirm with callers).
//   Z    - forwarded to the sweep / deflation helpers
//          (presumably the accumulated Schur vectors -- confirm).
//   ctrl - control structure; winBeg/winEnd select the window (END maps to
//          n), progress enables diagnostics, demandConverged selects whether
//          hitting the iteration cap is reported through RuntimeError.
//
// Returns a HessenbergSchurInfo whose numIterations counts the sweeps
// performed here plus the iterations reported by redundantly handled
// sub-windows, and whose numUnconverged is the size of the window left when
// the loop ends.
//
// Each pass of the main loop gathers the tridiagonal part of the window,
// locates the lowest irreducible sub-window [iterBeg,winEnd), and either
// deflates it directly (sizes one and two), hands it to the redundant
// sequential scheme (smaller than minMultiBulgeSize), or performs one
// small-bulge sweep on it.
HessenbergSchurInfo
MultiBulge
( DistMatrix<F,MC,MR,BLOCK>& H,
  DistMatrix<Complex<Base<F>>,STAR,STAR>& w,
  DistMatrix<F,MC,MR,BLOCK>& Z,
  const HessenbergSchurCtrl& ctrl )
{
    DEBUG_CSE
    typedef Base<F> Real;
    const Real zero(0);
    const Grid& grid = H.Grid();

    const Int n = H.Height();
    Int winBeg = ( ctrl.winBeg==END ? n : ctrl.winBeg );
    Int winEnd = ( ctrl.winEnd==END ? n : ctrl.winEnd );
    const Int winSize = winEnd - winBeg;
    const Int blockSize = H.BlockHeight();

    // TODO(poulson): Implement a more reasonable/configurable means of deciding
    // when to call the sequential implementation
    Int minMultiBulgeSize = Max( ctrl.minMultiBulgeSize, 2*blockSize );
    // This maximum is meant to account for parallel overheads and needs to be
    // more principled (and perhaps based upon the number of workers and the
    // cluster characteristics)
    // TODO(poulson): Re-enable this
    //minMultiBulgeSize = Max( minMultiBulgeSize, 500 );

    HessenbergSchurInfo info;

    w.Resize( n, 1 );
    if( winSize < minMultiBulgeSize )
    {
        return multibulge::RedundantlyHandleWindow( H, w, Z, ctrl );
    }

    // Control structure handed to the shift computation: it always addresses
    // the full extent of whatever matrix it is given.
    auto ctrlShifts( ctrl );
    ctrlShifts.winBeg = 0;
    ctrlShifts.winEnd = END;
    ctrlShifts.fullTriangle = false;

    Int numIterSinceDeflation = 0;
    const Int numStaleIterBeforeExceptional = 5;
    // Cf. LAPACK's DLAQR0 for this choice
    const Int maxIter =
      Max(30,2*numStaleIterBeforeExceptional) * Max(10,winSize);

    Int iterBegLast=-1, winEndLast=-1;
    DistMatrix<F,STAR,STAR> hMainWin(grid), hSuperWin(grid);
    DistMatrix<Real,STAR,STAR> hSubWin(grid);
    while( winBeg < winEnd )
    {
        if( info.numIterations >= maxIter )
        {
            if( ctrl.demandConverged )
                RuntimeError("MultiBulge QR iteration did not converge");
            else
                break;
        }
        auto winInd = IR(winBeg,winEnd);

        // Detect an irreducible Hessenberg window, [iterBeg,winEnd)
        // ---------------------------------------------------------
        // TODO(poulson): Have the interblock chase from the previous sweep
        // collect the main and sub diagonal of H along the diagonal workers
        // and then broadcast across the "cross" communicator.
        util::GatherTridiagonal( H, winInd, hMainWin, hSubWin, hSuperWin );
        if( ctrl.progress )
        {
            // Diagnostic dump of the current window. This used to run
            // unconditionally on every iteration; it is now guarded like
            // every other diagnostic in this routine.
            // NOTE(review): Print on a distributed matrix may involve
            // communication, so ctrl.progress is assumed to agree across the
            // grid -- confirm.
            Output("winBeg=",winBeg,", winEnd=",winEnd);
            Print( H, "H" );
            Print( hMainWin, "hMainWin" );
            Print( hSubWin, "hSubWin" );
            Print( hSuperWin, "hSuperWin" );
        }

        const Int iterOffset =
          DetectSmallSubdiagonal
          ( hMainWin.Matrix(), hSubWin.Matrix(), hSuperWin.Matrix() );
        const Int iterBeg = winBeg + iterOffset;
        const Int iterWinSize = winEnd-iterBeg;
        if( iterOffset > 0 )
        {
            // Explicitly zero the small subdiagonal entry, both in H and in
            // the gathered copy of the subdiagonal.
            H.Set( iterBeg, iterBeg-1, zero );
            hSubWin.Set( iterOffset-1, 0, zero );
        }
        if( iterWinSize == 1 )
        {
            if( ctrl.progress )
                Output("One-by-one window at ",iterBeg);
            w.Set( iterBeg, 0, hMainWin.GetLocal(iterOffset,0) );

            winEnd = iterBeg;
            numIterSinceDeflation = 0;
            continue;
        }
        else if( iterWinSize == 2 )
        {
            if( ctrl.progress )
                Output("Two-by-two window at ",iterBeg);
            const F eta00 = hMainWin.GetLocal(iterOffset,0);
            const F eta01 = hSuperWin.GetLocal(iterOffset,0);
            const Real eta10 = hSubWin.GetLocal(iterOffset,0);
            const F eta11 = hMainWin.GetLocal(iterOffset+1,0);
            multibulge::TwoByTwo
            ( H, eta00, eta01, eta10, eta11, w, Z, iterBeg, ctrl );

            winEnd = iterBeg;
            numIterSinceDeflation = 0;
            continue;
        }
        else if( iterWinSize < minMultiBulgeSize )
        {
            // The window is small enough to switch to the simple scheme
            if( ctrl.progress )
                Output("Redundantly handling window [",iterBeg,",",winEnd,")");
            auto ctrlIter( ctrl );
            ctrlIter.winBeg = iterBeg;
            ctrlIter.winEnd = winEnd;
            auto iterInfo =
              multibulge::RedundantlyHandleWindow( H, w, Z, ctrlIter );
            info.numIterations += iterInfo.numIterations;

            winEnd = iterBeg;
            numIterSinceDeflation = 0;
            continue;
        }

        const Int numShiftsRec = ctrl.numShifts( n, iterWinSize );
        if( ctrl.progress )
        {
            Output("Iter. ",info.numIterations,": ");
            Output("  window is [",iterBeg,",",winEnd,")");
            Output("  recommending ",numShiftsRec," shifts");
        }

        // NOTE(poulson): In the case where exceptional shifts are used, the
        // main and subdiagonals of H in the window are currently redundantly
        // gathered. It could be worthwhile to pass in hMainWin and hSubWin.
        const Int shiftBeg =
          multibulge::ComputeShifts
          ( H, w, iterBeg, winBeg, winEnd, numShiftsRec,
            numIterSinceDeflation, numStaleIterBeforeExceptional,
            ctrlShifts );
        auto shiftInd = IR(shiftBeg,winEnd);
        auto wShifts = w(shiftInd,ALL);

        // Perform a small-bulge sweep
        auto ctrlSweep( ctrl );
        ctrlSweep.winBeg = iterBeg;
        ctrlSweep.winEnd = winEnd;
        multibulge::Sweep( H, wShifts, Z, ctrlSweep );

        ++info.numIterations;
        // Only count the iteration as stale when the active window did not
        // change since the previous sweep.
        if( iterBeg == iterBegLast && winEnd == winEndLast )
            ++numIterSinceDeflation;
        iterBegLast = iterBeg;
        winEndLast = winEnd;
    }
    info.numUnconverged = winEnd-winBeg;
    return info;
}
void TransformRows ( const Matrix<F>& V, DistMatrix<F,MC,MR,BLOCK>& A ) { DEBUG_CSE const Int height = A.Height(); const Grid& grid = A.Grid(); const Int blockHeight = A.BlockHeight(); const Int firstBlockHeight = blockHeight - A.ColCut(); if( height <= firstBlockHeight || grid.Height() == 1 ) { if( grid.Row() == A.RowOwner(0) ) { // This process row can locally update its portion of A TransformRows( V, A.Matrix() ); } } else if( height <= firstBlockHeight + blockHeight ) { const int firstRow = A.RowOwner( 0 ); const int secondRow = A.RowOwner( firstBlockHeight ); if( grid.Row() == firstRow ) { // // Replace A with // // | VLeft, VRight |' | ATop |, // | ABottom | // // where ATop is owned by this process row and ABottom by the next. // auto VLeft = V( ALL, IR(0,firstBlockHeight) ); // Partition space for the combined matrix Matrix<F> ACombine( height, A.LocalWidth() ); auto ATop = ACombine( IR(0,firstBlockHeight), ALL ); auto ABottom = ACombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix ATop = A.LockedMatrix(); // Exchange the data El::SendRecv( ATop, ABottom, A.ColComm(), secondRow, secondRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), VLeft, ACombine, A.Matrix() ); } else if( grid.Row() == secondRow ) { // // Replace A with // // | VLeft, VRight |' | ATop |, // | ABottom | // // where ATop is owned by the previous process row and ABottom by // this one. 
// auto VRight = V( ALL, IR(firstBlockHeight,END) ); // Partition space for the combined matrix Matrix<F> ACombine( height, A.LocalWidth() ); auto ATop = ACombine( IR(0,firstBlockHeight), ALL ); auto ABottom = ACombine( IR(firstBlockHeight,END), ALL ); // Copy our portion into the combined matrix ABottom = A.LockedMatrix(); // Exchange the data El::SendRecv( ABottom, ATop, A.ColComm(), firstRow, firstRow ); // Form our portion of the result Gemm( ADJOINT, NORMAL, F(1), VRight, ACombine, A.Matrix() ); } } else { // Fall back to the entire process column interacting. // TODO(poulson): Only form the subset of the result that we need. DistMatrix<F,STAR,MR,BLOCK> A_STAR_MR( A ); Matrix<F> ALocCopy( A_STAR_MR.Matrix() ); Gemm( ADJOINT, NORMAL, F(1), V, ALocCopy, A_STAR_MR.Matrix() ); A = A_STAR_MR; } }