void EntrywiseMap ( const ElementalMatrix<S>& A, ElementalMatrix<T>& B, function<T(S)> func ) { if( A.DistData().colDist == B.DistData().colDist && A.DistData().rowDist == B.DistData().rowDist ) { B.AlignWith( A.DistData() ); B.Resize( A.Height(), A.Width() ); EntrywiseMap( A.LockedMatrix(), B.Matrix(), func ); } else { B.Resize( A.Height(), A.Width() ); #define GUARD(CDIST,RDIST) \ B.DistData().colDist == CDIST && B.DistData().rowDist == RDIST #define PAYLOAD(CDIST,RDIST) \ DistMatrix<S,CDIST,RDIST> AProx(B.Grid()); \ AProx.AlignWith( B.DistData() ); \ Copy( A, AProx ); \ EntrywiseMap( AProx.Matrix(), B.Matrix(), func ); #include <El/macros/GuardAndPayload.h> #undef GUARD #undef PAYLOAD } }
void TestCorrectness ( bool print, const ElementalMatrix<Complex<Real>>& A, const ElementalMatrix<Complex<Real>>& w, const ElementalMatrix<Complex<Real>>& V ) { const Int n = A.Height(); const Real eps = limits::Epsilon<Real>(); const Real oneNormA = OneNorm( A ); // Find the residual R = AV-VW DistMatrix<Complex<Real>> R( V.Height(), V.Width(), A.Grid() ); Gemm ( NORMAL, NORMAL, Complex<Real>(1), A, V, Complex<Real>(0), R); DistMatrix<Complex<Real>> VW( V ); DiagonalScale( RIGHT, NORMAL, w, VW ); R -= VW; const Real infError = InfinityNorm( R ); const Real relError = infError / (eps*n*oneNormA); OutputFromRoot (A.Grid().Comm(),"|| A V - V W ||_oo / (eps n || A ||_1) = ",relError); // TODO: A more refined failure condition if( relError > Real(100) ) LogicError("Relative error was unacceptably large"); }
void PartialColScatter ( T alpha, const ElementalMatrix<T>& A, ElementalMatrix<T>& B ) { DEBUG_ONLY(CSE cse("axpy_contract::PartialColScatter")) AssertSameGrids( A, B ); if( A.Height() != B.Height() || A.Width() != B.Width() ) LogicError("A and B must be the same size"); #ifdef EL_CACHE_WARNINGS if( A.Width() != 1 && A.Grid().Rank() == 0 ) { cerr << "axpy_contract::PartialColScatterUpdate potentially causes a large " "amount of cache-thrashing. If possible, avoid it by forming the " "(conjugate-)transpose of the [UGath,* ] matrix instead." << endl; } #endif if( B.ColAlign() % A.ColStride() == A.ColAlign() ) { const Int colStride = B.ColStride(); const Int colStridePart = B.PartialColStride(); const Int colStrideUnion = B.PartialUnionColStride(); const Int colRankPart = B.PartialColRank(); const Int colAlign = B.ColAlign(); const Int height = B.Height(); const Int width = B.Width(); const Int localHeight = B.LocalHeight(); const Int maxLocalHeight = MaxLength( height, colStride ); const Int recvSize = mpi::Pad( maxLocalHeight*width ); const Int sendSize = colStrideUnion*recvSize; //vector<T> buffer( sendSize ); vector<T> buffer; buffer.reserve( sendSize ); // Pack copy::util::PartialColStridedPack ( height, width, colAlign, colStride, colStrideUnion, colStridePart, colRankPart, A.ColShift(), A.LockedBuffer(), A.LDim(), buffer.data(), recvSize ); // Communicate mpi::ReduceScatter( buffer.data(), recvSize, B.PartialUnionColComm() ); // Unpack our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, width, buffer.data(), 1, localHeight, B.Buffer(), 1, B.LDim() ); } else LogicError("Unaligned PartialColScatter not implemented"); }
// Adds alpha times the reduce-scattered contributions of A into B, where the
// reduction runs over B's partial-union row communicator.
//
// A and B must live on the same grid and have identical dimensions. Processes
// for which B is not participating return immediately. B's row alignment must
// be compatible with A's (B.RowAlign() % A.RowStride() == A.RowAlign()); the
// unaligned case raises a LogicError.
void PartialRowScatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::PartialRowScatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("Matrix sizes did not match");
    if( !B.Participating() )
        return;

    if( B.RowAlign() % A.RowStride() == A.RowAlign() )
    {
        const Int rowStride = B.RowStride();
        const Int rowStridePart = B.PartialRowStride();
        const Int rowStrideUnion = B.PartialUnionRowStride();
        const Int rowRankPart = B.PartialRowRank();

        const Int height = B.Height();
        const Int width = B.Width();
        const Int maxLocalWidth = MaxLength( width, rowStride );
        const Int recvSize = mpi::Pad( height*maxLocalWidth );
        const Int sendSize = rowStrideUnion*recvSize;

        // The buffer must actually contain sendSize elements: merely
        // reserving capacity and then writing through data() accesses
        // storage outside [data(), data()+size()), which is undefined
        // behavior and never constructs the elements.
        vector<T> buffer( sendSize );

        // Pack
        copy::util::PartialRowStridedPack
        ( height, width,
          B.RowAlign(), rowStride,
          rowStrideUnion, rowStridePart, rowRankPart,
          A.RowShift(),
          A.LockedBuffer(), A.LDim(),
          buffer.data(),    recvSize );

        // Communicate
        mpi::ReduceScatter( buffer.data(), recvSize, B.PartialUnionRowComm() );

        // Unpack our received data
        axpy::util::InterleaveMatrixUpdate
        ( alpha, height, B.LocalWidth(),
          buffer.data(), 1, height,
          B.Buffer(),    1, B.LDim() );
    }
    else
        LogicError("Unaligned PartialRowScatter not implemented");
}
void Contract ( const ElementalMatrix<T>& A, ElementalMatrix<T>& B ) { DEBUG_ONLY(CSE cse("Contract")) AssertSameGrids( A, B ); const Dist U = B.ColDist(); const Dist V = B.RowDist(); // TODO: Shorten this implementation? if( A.ColDist() == U && A.RowDist() == V ) { Copy( A, B ); } else if( A.ColDist() == U && A.RowDist() == Partial(V) ) { B.AlignAndResize ( A.ColAlign(), A.RowAlign(), A.Height(), A.Width(), false, false ); Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() ); AxpyContract( T(1), A, B ); } else if( A.ColDist() == Partial(U) && A.RowDist() == V ) { B.AlignAndResize ( A.ColAlign(), A.RowAlign(), A.Height(), A.Width(), false, false ); Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() ); AxpyContract( T(1), A, B ); } else if( A.ColDist() == U && A.RowDist() == Collect(V) ) { B.AlignColsAndResize ( A.ColAlign(), A.Height(), A.Width(), false, false ); Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() ); AxpyContract( T(1), A, B ); } else if( A.ColDist() == Collect(U) && A.RowDist() == V ) { B.AlignRowsAndResize ( A.RowAlign(), A.Height(), A.Width(), false, false ); Zeros( B.Matrix(), B.LocalHeight(), B.LocalWidth() ); AxpyContract( T(1), A, B ); } else if( A.ColDist() == Collect(U) && A.RowDist() == Collect(V) ) { Zeros( B, A.Height(), A.Width() ); AxpyContract( T(1), A, B ); } else LogicError("Incompatible distributions"); }
// Adds alpha times the reduce-scattered contributions of A into B, where the
// reduction runs over B's distribution communicator.
//
// A and B must live on the same grid and have identical dimensions. Processes
// for which B is not participating return immediately.
void Scatter
( T alpha,
  const ElementalMatrix<T>& A,
        ElementalMatrix<T>& B )
{
    DEBUG_ONLY(CSE cse("axpy_contract::Scatter"))
    AssertSameGrids( A, B );
    if( A.Height() != B.Height() || A.Width() != B.Width() )
        LogicError("Sizes of A and B must match");

    if( !B.Participating() )
        return;

    const Int colStride = B.ColStride();
    const Int rowStride = B.RowStride();
    const Int colAlign = B.ColAlign();
    const Int rowAlign = B.RowAlign();

    const Int height = B.Height();
    const Int width = B.Width();
    const Int localHeight = B.LocalHeight();
    const Int localWidth = B.LocalWidth();
    const Int maxLocalHeight = MaxLength(height,colStride);
    const Int maxLocalWidth = MaxLength(width,rowStride);

    const Int recvSize = mpi::Pad( maxLocalHeight*maxLocalWidth );
    const Int sendSize = colStride*rowStride*recvSize;

    // The buffer must actually contain sendSize elements: merely reserving
    // capacity and then writing through data() accesses storage outside
    // [data(), data()+size()), which is undefined behavior and never
    // constructs the elements.
    vector<T> buffer( sendSize );

    // Pack
    copy::util::StridedPack
    ( height, width,
      colAlign, colStride,
      rowAlign, rowStride,
      A.LockedBuffer(), A.LDim(),
      buffer.data(),    recvSize );

    // Communicate
    mpi::ReduceScatter( buffer.data(), recvSize, B.DistComm() );

    // Unpack our received data
    axpy::util::InterleaveMatrixUpdate
    ( alpha, localHeight, localWidth,
      buffer.data(), 1, localHeight,
      B.Buffer(),    1, B.LDim() );
}
void LUNMedium ( const ElementalMatrix<F>& UPre, ElementalMatrix<F>& XPre, bool checkIfSingular ) { DEBUG_CSE const Int m = XPre.Height(); const Int bsize = Blocksize(); const Grid& g = UPre.Grid(); DistMatrixReadProxy<F,F,MC,MR> UProx( UPre ); DistMatrixReadWriteProxy<F,F,MC,MR> XProx( XPre ); auto& U = UProx.GetLocked(); auto& X = XProx.Get(); DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,MR, STAR> X1Trans_MR_STAR(g); const Int kLast = LastOffset( m, bsize ); Int k=kLast, kOld=m; while( true ) { const bool in2x2 = ( k>0 && U.Get(k,k-1) != F(0) ); if( in2x2 ) --k; const Int nb = kOld-k; const Range<Int> ind0( 0, k ), ind1( k, k+nb ); auto U01 = U( ind0, ind1 ); auto U11 = U( ind1, ind1 ); auto X0 = X( ind0, ALL ); auto X1 = X( ind1, ALL ); U11_STAR_STAR = U11; // U11[* ,* ] <- U11[MC,MR] X1Trans_MR_STAR.AlignWith( X0 ); Transpose( X1, X1Trans_MR_STAR ); // X1^T[MR,* ] := X1^T[MR,* ] U11^-T[* ,* ] // = (U11^-1[* ,* ] X1[* ,MR])^T LocalQuasiTrsm ( RIGHT, UPPER, TRANSPOSE, F(1), U11_STAR_STAR, X1Trans_MR_STAR, checkIfSingular ); Transpose( X1Trans_MR_STAR, X1 ); U01_MC_STAR.AlignWith( X0 ); U01_MC_STAR = U01; // U01[MC,* ] <- U01[MC,MR] // X0[MC,MR] -= U01[MC,* ] X1[* ,MR] LocalGemm ( NORMAL, TRANSPOSE, F(-1), U01_MC_STAR, X1Trans_MR_STAR, F(1), X0 ); if( k == 0 ) break; kOld = k; k -= Min(bsize,k); } }
void TransposeContract ( const ElementalMatrix<T>& A, ElementalMatrix<T>& B, bool conjugate ) { EL_DEBUG_CSE const Dist U = B.ColDist(); const Dist V = B.RowDist(); if( A.ColDist() == V && A.RowDist() == Partial(U) ) { Transpose( A, B, conjugate ); } else { unique_ptr<ElementalMatrix<T>> ASumFilt( B.ConstructTranspose(B.Grid(),B.Root()) ); if( B.ColConstrained() ) ASumFilt->AlignRowsWith( B, true ); if( B.RowConstrained() ) ASumFilt->AlignColsWith( B, true ); Contract( A, *ASumFilt ); if( !B.ColConstrained() ) B.AlignColsWith( *ASumFilt, false ); if( !B.RowConstrained() ) B.AlignRowsWith( *ASumFilt, false ); // We should have ensured that the alignments match B.Resize( A.Width(), A.Height() ); Transpose( ASumFilt->LockedMatrix(), B.Matrix(), conjugate ); } }
void AugmentedKKT ( const ElementalMatrix<Real>& A, const ElementalMatrix<Real>& x, const ElementalMatrix<Real>& z, ElementalMatrix<Real>& JPre, bool onlyLower ) { EL_DEBUG_CSE const Int m = A.Height(); const Int n = A.Width(); DistMatrixWriteProxy<Real,Real,MC,MR> JProx( JPre ); auto& J = JProx.Get(); Zeros( J, m+n, m+n ); const IR xInd(0,n), yInd(n,n+m); auto Jxx = J(xInd,xInd); auto Jxy = J(xInd,yInd); auto Jyx = J(yInd,xInd); auto Jyy = J(yInd,yInd); DistMatrix<Real,MC,STAR> d( z ); DiagonalSolve( LEFT, NORMAL, x, d ); Diagonal( Jxx, d ); Jyx = A; if( !onlyLower ) Transpose( A, Jxy ); }
// soc::MaxEig -- NOTE(review): only the opening portion of this definition is
// visible here (the body is not closed), so the computation itself is not
// documented.
//
// Visible setup: asserts that xPre, maxEigsPre, orders, and firstIndsPre share
// a grid; wraps xPre and firstIndsPre in read proxies and maxEigsPre in a
// write proxy, all [VC,STAR] with the column alignment constrained to 0; and
// caches the height and local height of x. In debug builds it additionally
// requires x, orders, and firstInds to be column vectors of the same height.
// The cutoff parameter is not used in the visible portion.
void MaxEig ( const ElementalMatrix<Real>& xPre, ElementalMatrix<Real>& maxEigsPre, const ElementalMatrix<Int>& orders, const ElementalMatrix<Int>& firstIndsPre, Int cutoff ) { DEBUG_ONLY(CSE cse("soc::MaxEig")) AssertSameGrids( xPre, maxEigsPre, orders, firstIndsPre ); ElementalProxyCtrl ctrl; ctrl.colConstrain = true; ctrl.colAlign = 0; DistMatrixReadProxy<Real,Real,VC,STAR> xProx( xPre, ctrl ); DistMatrixWriteProxy<Real,Real,VC,STAR> maxEigsProx( maxEigsPre, ctrl ); DistMatrixReadProxy<Int,Int,VC,STAR> firstIndsProx( firstIndsPre, ctrl ); auto& x = xProx.GetLocked(); auto& maxEigs = maxEigsProx.Get(); auto& firstInds = firstIndsProx.GetLocked(); const Int height = x.Height(); const Int localHeight = x.LocalHeight(); DEBUG_ONLY( if( x.Width() != 1 || orders.Width() != 1 || firstInds.Width() != 1 ) LogicError("x, orders, and firstInds should be column vectors"); if( orders.Height() != height || firstInds.Height() != height ) LogicError("orders and firstInds should be of the same height as x"); )
void MakeExtendedKahan ( ElementalMatrix<F>& A, Base<F> phi, Base<F> mu ) { EL_DEBUG_CSE typedef Base<F> Real; if( A.Height() != A.Width() ) LogicError("Extended Kahan matrices must be square"); const Int n = A.Height(); if( n % 3 != 0 ) LogicError("Dimension must be an integer multiple of 3"); const Int l = n / 3; if( !l || (l & (l-1)) ) LogicError("n/3 is not a power of two"); Int k=0; while( Int(1u<<k) < l ) ++k; if( phi <= Real(0) || phi >= Real(1) ) LogicError("phi must be in (0,1)"); if( mu <= Real(0) || mu >= Real(1) ) LogicError("mu must be in (0,1)"); // Start by setting A to the identity, and then modify the necessary // l x l blocks of its 3 x 3 partitioning. MakeIdentity( A ); unique_ptr<ElementalMatrix<F>> ABlock( A.Construct(A.Grid(),A.Root()) ); View( *ABlock, A, IR(2*l,3*l), IR(2*l,3*l) ); *ABlock *= mu; View( *ABlock, A, IR(0,l), IR(l,2*l) ); Walsh( *ABlock, k ); *ABlock *= -phi; View( *ABlock, A, IR(l,2*l), IR(2*l,3*l) ); Walsh( *ABlock, k ); *ABlock *= phi; // Now scale A by S const Real zeta = Sqrt(Real(1)-phi*phi); auto& ALoc = A.Matrix(); for( Int iLoc=0; iLoc<A.LocalHeight(); ++iLoc ) { const Int i = A.GlobalRow(iLoc); const Real gamma = Pow(zeta,Real(i)); for( Int jLoc=0; jLoc<A.LocalWidth(); ++jLoc ) ALoc(iLoc,jLoc) *= gamma; } }
// Overwrites A with the triangular factor of its RQ factorization: A is
// factored in place via Householder and then made upper trapezoidal with
// respect to the diagonal of offset Width-Height.
void ExplicitTriang( ElementalMatrix<F>& A )
{
    DEBUG_ONLY(CSE cse("rq::ExplicitTriang"))
    const Grid& grid = A.Grid();

    // The reflector data is required by the factorization but discarded here.
    DistMatrix<F,MD,STAR> phase(grid);
    DistMatrix<Base<F>,MD,STAR> signature(grid);
    Householder( A, phase, signature );

    const Int diagOffset = A.Width()-A.Height();
    MakeTrapezoidal( UPPER, A, diagOffset );
}
void IPM ( const ElementalMatrix<Real>& A, const ElementalMatrix<Real>& b, Real lambda, ElementalMatrix<Real>& x, const qp::affine::Ctrl<Real>& ctrl ) { DEBUG_CSE const Int m = A.Height(); const Int n = A.Width(); const Grid& g = A.Grid(); const Range<Int> uInd(0,n), vInd(n,2*n), rInd(2*n,2*n+m); DistMatrix<Real> Q(g), c(g), AHat(g), G(g), h(g); // Q := | 0 0 0 | // | 0 0 0 | // | 0 0 I | // ============== Zeros( Q, 2*n+m, 2*n+m ); auto Qrr = Q( rInd, rInd ); FillDiagonal( Qrr, Real(1) ); // c := lambda*[1;1;0] // =================== Zeros( c, 2*n+m, 1 ); auto cuv = c( IR(0,2*n), ALL ); Fill( cuv, lambda ); // \hat A := [A, -A, I] // ==================== Zeros( AHat, m, 2*n+m ); auto AHatu = AHat( IR(0,m), uInd ); auto AHatv = AHat( IR(0,m), vInd ); auto AHatr = AHat( IR(0,m), rInd ); AHatu = A; AHatv -= A; FillDiagonal( AHatr, Real(1) ); // G := | -I 0 0 | // | 0 -I 0 | // ================ Zeros( G, 2*n, 2*n+m ); FillDiagonal( G, Real(-1) ); // h := 0 // ====== Zeros( h, 2*n, 1 ); // Solve the affine QP // =================== DistMatrix<Real> xHat(g), y(g), z(g), s(g); QP( Q, AHat, G, b, c, h, xHat, y, z, s, ctrl ); // x := u - v // ========== x = xHat( uInd, ALL ); x -= xHat( vInd, ALL ); }
void SDC ( UpperOrLower uplo, ElementalMatrix<F>& APre, ElementalMatrix<Base<F>>& wPre, const HermitianSDCCtrl<Base<F>> ctrl ) { DEBUG_CSE typedef Base<F> Real; const Int n = APre.Height(); wPre.Resize( n, 1 ); if( APre.Grid().Size() == 1 ) { HermitianEig( uplo, APre.Matrix(), wPre.Matrix() ); return; } if( n <= ctrl.cutoff ) { HermitianEig( uplo, APre, wPre ); return; } DistMatrixReadWriteProxy<F,F,MC,MR> AProx( APre ); DistMatrixWriteProxy<Real,Real,VR,STAR> wProx( wPre ); auto& A = AProx.Get(); auto& w = wProx.Get(); // Perform this level's split const auto part = SpectralDivide( uplo, A, ctrl ); auto ind1 = IR(0,part.index); auto ind2 = IR(part.index,n); auto ATL = A( ind1, ind1 ); auto ATR = A( ind1, ind2 ); auto ABL = A( ind2, ind1 ); auto ABR = A( ind2, ind2 ); auto wT = w( ind1, ALL ); auto wB = w( ind2, ALL ); if( uplo == LOWER ) Zero( ABL ); else Zero( ATR ); // Recurse on the two subproblems DistMatrix<F> ATLSub, ABRSub; DistMatrix<Real,VR,STAR> wTSub, wBSub; PushSubproblems ( ATL, ABR, ATLSub, ABRSub, wT, wB, wTSub, wBSub, ctrl.progress ); if( ATL.Participating() ) SDC( uplo, ATLSub, wTSub, ctrl ); if( ABR.Participating() ) SDC( uplo, ABRSub, wBSub, ctrl ); PullSubproblems( ATL, ABR, ATLSub, ABRSub, wT, wB, wTSub, wBSub ); }
void LLNMedium ( const ElementalMatrix<F>& LPre, ElementalMatrix<F>& XPre, bool checkIfSingular ) { DEBUG_CSE const Int m = XPre.Height(); const Int bsize = Blocksize(); const Grid& g = LPre.Grid(); DistMatrixReadProxy<F,F,MC,MR> LProx( LPre ); DistMatrixReadWriteProxy<F,F,MC,MR> XProx( XPre ); auto& L = LProx.GetLocked(); auto& X = XProx.Get(); DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,MC, STAR> L21_MC_STAR(g); DistMatrix<F,MR, STAR> X1Trans_MR_STAR(g); for( Int k=0; k<m; k+=bsize ) { const Int nbProp = Min(bsize,m-k); const bool in2x2 = ( k+nbProp<m && L.Get(k+nbProp-1,k+nbProp) != F(0) ); const Int nb = ( in2x2 ? nbProp+1 : nbProp ); const Range<Int> ind1( k, k+nb ), ind2( k+nb, m ); auto L11 = L( ind1, ind1 ); auto L21 = L( ind2, ind1 ); auto X1 = X( ind1, ALL ); auto X2 = X( ind2, ALL ); L11_STAR_STAR = L11; // L11[* ,* ] <- L11[MC,MR] X1Trans_MR_STAR.AlignWith( X2 ); Transpose( X1, X1Trans_MR_STAR ); // X1^T[MR,* ] := X1^T[MR,* ] L11^-T[* ,* ] // = (L11^-1[* ,* ] X1[* ,MR])^T LocalQuasiTrsm ( RIGHT, LOWER, TRANSPOSE, F(1), L11_STAR_STAR, X1Trans_MR_STAR, checkIfSingular ); Transpose( X1Trans_MR_STAR, X1 ); L21_MC_STAR.AlignWith( X2 ); L21_MC_STAR = L21; // L21[MC,* ] <- L21[MC,MR] // X2[MC,MR] -= L21[MC,* ] X1[* ,MR] LocalGemm ( NORMAL, TRANSPOSE, F(-1), L21_MC_STAR, X1Trans_MR_STAR, F(1), X2 ); } }
void LLNLarge ( const ElementalMatrix<F>& LPre, ElementalMatrix<F>& XPre, bool checkIfSingular ) { DEBUG_CSE const Int m = XPre.Height(); const Int bsize = Blocksize(); const Grid& g = LPre.Grid(); DistMatrixReadProxy<F,F,MC,MR> LProx( LPre ); DistMatrixReadWriteProxy<F,F,MC,MR> XProx( XPre ); auto& L = LProx.GetLocked(); auto& X = XProx.Get(); DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,MC, STAR> L21_MC_STAR(g); DistMatrix<F,STAR,MR > X1_STAR_MR(g); DistMatrix<F,STAR,VR > X1_STAR_VR(g); for( Int k=0; k<m; k+=bsize ) { const Int nbProp = Min(bsize,m-k); const bool in2x2 = ( k+nbProp<m && L.Get(k+nbProp-1,k+nbProp) != F(0) ); const Int nb = ( in2x2 ? nbProp+1 : nbProp ); const Range<Int> ind1( k, k+nb ), ind2( k+nb, m ); auto L11 = L( ind1, ind1 ); auto L21 = L( ind2, ind1 ); auto X1 = X( ind1, ALL ); auto X2 = X( ind2, ALL ); // X1[* ,VR] := L11^-1[* ,* ] X1[* ,VR] L11_STAR_STAR = L11; X1_STAR_VR = X1; LocalQuasiTrsm ( LEFT, LOWER, NORMAL, F(1), L11_STAR_STAR, X1_STAR_VR, checkIfSingular ); X1_STAR_MR.AlignWith( X2 ); X1_STAR_MR = X1_STAR_VR; // X1[* ,MR] <- X1[* ,VR] X1 = X1_STAR_MR; // X1[MC,MR] <- X1[* ,MR] L21_MC_STAR.AlignWith( X2 ); L21_MC_STAR = L21; // L21[MC,* ] <- L21[MC,MR] // X2[MC,MR] -= L21[MC,* ] X1[* ,MR] LocalGemm( NORMAL, NORMAL, F(-1), L21_MC_STAR, X1_STAR_MR, F(1), X2 ); } }
// Overwrites A with the triangular factor of its LQ factorization: A is
// factored in place via LQ, truncated to its first min(m,n) columns, and then
// made lower trapezoidal.
void ExplicitTriang( ElementalMatrix<F>& A )
{
    DEBUG_CSE
    const Grid& grid = A.Grid();

    // The reflector data is required by the factorization but discarded here.
    DistMatrix<F,MD,STAR> reflectorScalars(grid);
    DistMatrix<Base<F>,MD,STAR> reflectorSigns(grid);
    LQ( A, reflectorScalars, reflectorSigns );

    const Int height = A.Height();
    const Int keptWidth = Min(height,A.Width());
    A.Resize( height, keptWidth );
    MakeTrapezoidal( LOWER, A );
}
void KKT ( const ElementalMatrix<Real>& A, const ElementalMatrix<Real>& x, const ElementalMatrix<Real>& z, ElementalMatrix<Real>& JPre, bool onlyLower ) { EL_DEBUG_CSE const Int m = A.Height(); const Int n = A.Width(); DistMatrixWriteProxy<Real,Real,MC,MR> JProx( JPre ); auto& J = JProx.Get(); Zeros( J, 2*n+m, 2*n+m ); const IR xInd(0,n), yInd(n,n+m), zInd(n+m,2*n+m); auto Jxx = J(xInd,xInd); auto Jxy = J(xInd,yInd); auto Jxz = J(xInd,zInd); auto Jyx = J(yInd,xInd); auto Jyy = J(yInd,yInd); auto Jyz = J(yInd,zInd); auto Jzx = J(zInd,xInd); auto Jzy = J(zInd,yInd); auto Jzz = J(zInd,zInd); // Jyx := A // ======== Jyx = A; // Jzx := -I // ========= Identity( Jzx, n, n ); Jzx *= -1; // Jzz := - z <> x // =============== DistMatrix<Real,MC,STAR> t(x); DiagonalSolve( LEFT, NORMAL, z, t ); t *= -1; Diagonal( Jzz, t ); if( !onlyLower ) { // Jxy := A^T // ========== Transpose( A, Jxy ); // Jxz := -I // ========= Identity( Jxz, n, n ); Jxz *= -1; } }
void Householder ( ElementalMatrix<F>& APre, ElementalMatrix<F>& phasePre, ElementalMatrix<Base<F>>& signaturePre ) { DEBUG_CSE DEBUG_ONLY(AssertSameGrids( APre, phasePre, signaturePre )) const Int m = APre.Height(); const Int n = APre.Width(); const Int minDim = Min(m,n); const Int iOff = m-minDim; const Int jOff = n-minDim; DistMatrixReadWriteProxy<F,F,MC,MR> AProx( APre ); DistMatrixWriteProxy<F,F,MD,STAR> phaseProx( phasePre ); DistMatrixWriteProxy<Base<F>,Base<F>,MD,STAR> signatureProx( signaturePre ); auto& A = AProx.Get(); auto& phase = phaseProx.Get(); auto& signature = signatureProx.Get(); phase.Resize( minDim, 1 ); signature.Resize( minDim, 1 ); const Int bsize = Blocksize(); const Int kLast = LastOffset( minDim, bsize ); for( Int k=kLast; k>=0; k-=bsize ) { const Int nb = Min(bsize,minDim-k); const Int ki = k + iOff; const Int kj = k + jOff; const Range<Int> ind0Vert( 0, ki ), ind1( k, k+nb ), ind1Vert( ki, ki+nb ), indL( 0, kj+nb ); auto A0L = A( ind0Vert, indL ); auto A1L = A( ind1Vert, indL ); auto phase1 = phase( ind1, ALL ); auto sig1 = signature( ind1, ALL ); PanelHouseholder( A1L, phase1, sig1 ); ApplyQ( RIGHT, ADJOINT, A1L, phase1, sig1, A0L ); } }
void TestCorrectness ( bool print, UpperOrLower uplo, const ElementalMatrix<F>& AOrig, const ElementalMatrix<F>& A, const ElementalMatrix<Base<F>>& w, const ElementalMatrix<F>& Z ) { typedef Base<F> Real; const Grid& g = A.Grid(); const Int n = Z.Height(); const Int k = Z.Width(); const Real eps = limits::Epsilon<Real>(); DistMatrix<F> X(g); Identity( X, k, k ); Herk( uplo, ADJOINT, Real(-1), Z, Real(1), X ); const Real infOrthogError = HermitianInfinityNorm( uplo, X ); const Real relOrthogError = infOrthogError / (eps*n); OutputFromRoot(g.Comm(),"||Z^H Z - I||_oo / (eps n) = ",relOrthogError); // X := AZ X.AlignWith( Z ); Zeros( X, n, k ); Hemm( LEFT, uplo, F(1), AOrig, Z, F(0), X ); // Find the residual ||X-ZW||_oo = ||AZ-ZW||_oo DistMatrix<F> ZW( Z ); DiagonalScale( RIGHT, NORMAL, w, ZW ); X -= ZW; const Real oneNormA = HermitianOneNorm( uplo, AOrig ); if( oneNormA == Real(0) ) LogicError("Tried to test relative accuracy on zero matrix..."); const Real infError = InfinityNorm( X ); const Real relError = infError / (n*eps*oneNormA); OutputFromRoot(g.Comm(),"||A Z - Z W||_oo / (eps n ||A||_1) = ",relError); // TODO: More refined failure conditions if( relOrthogError > Real(200) ) // yes, really LogicError("Relative orthogonality error was unacceptably large"); if( relError > Real(10) ) LogicError("Relative error was unacceptably large"); }
void Householder ( ElementalMatrix<F>& APre, ElementalMatrix<F>& phasePre, ElementalMatrix<Base<F>>& signaturePre ) { DEBUG_CSE DEBUG_ONLY(AssertSameGrids( APre, phasePre, signaturePre )) const Int m = APre.Height(); const Int n = APre.Width(); const Int minDim = Min(m,n); DistMatrixReadWriteProxy<F,F,MC,MR> AProx( APre ); DistMatrixWriteProxy<F,F,MD,STAR> phaseProx( phasePre ); DistMatrixWriteProxy<Base<F>,Base<F>,MD,STAR> signatureProx( signaturePre ); auto& A = AProx.Get(); auto& phase = phaseProx.Get(); auto& signature = signatureProx.Get(); phase.Resize( minDim, 1 ); signature.Resize( minDim, 1 ); const Int bsize = Blocksize(); for( Int k=0; k<minDim; k+=bsize ) { const Int nb = Min(bsize,minDim-k); const Range<Int> ind1( k, k+nb ), indB( k, END ), ind2( k+nb, END ); auto AB1 = A( indB, ind1 ); auto AB2 = A( indB, ind2 ); auto phase1 = phase( ind1, ALL ); auto sig1 = signature( ind1, ALL ); PanelHouseholder( AB1, phase1, sig1 ); ApplyQ( LEFT, ADJOINT, AB1, phase1, sig1, AB2 ); } }
void IndexDependentMap ( const ElementalMatrix<S>& A, ElementalMatrix<T>& B, function<T(Int,Int,S)> func ) { DEBUG_CSE const Int mLoc = A.LocalHeight(); const Int nLoc = A.LocalWidth(); B.AlignWith( A.DistData() ); B.Resize( A.Height(), A.Width() ); auto& ALoc = A.LockedMatrix(); auto& BLoc = B.Matrix(); for( Int jLoc=0; jLoc<nLoc; ++jLoc ) { const Int j = A.GlobalCol(jLoc); for( Int iLoc=0; iLoc<mLoc; ++iLoc ) { const Int i = A.GlobalRow(iLoc); BLoc(iLoc,jLoc) = func(i,j,ALoc(iLoc,jLoc)); } } }
void LocalTrr2kKernel ( UpperOrLower uplo, Orientation orientA, Orientation orientB, Orientation orientC, Orientation orientD, T alpha, const ElementalMatrix<T>& A, const ElementalMatrix<T>& B, T beta, const ElementalMatrix<T>& C, const ElementalMatrix<T>& D, ElementalMatrix<T>& E ) { DEBUG_CSE const bool transA = orientA != NORMAL; const bool transB = orientB != NORMAL; const bool transC = orientC != NORMAL; const bool transD = orientD != NORMAL; // TODO: Stringent distribution and alignment checks typedef ElementalMatrix<T> ADM; auto A0 = unique_ptr<ADM>( A.Construct(A.Grid(),A.Root()) ); auto A1 = unique_ptr<ADM>( A.Construct(A.Grid(),A.Root()) ); auto B0 = unique_ptr<ADM>( B.Construct(B.Grid(),B.Root()) ); auto B1 = unique_ptr<ADM>( B.Construct(B.Grid(),B.Root()) ); auto C0 = unique_ptr<ADM>( C.Construct(C.Grid(),C.Root()) ); auto C1 = unique_ptr<ADM>( C.Construct(C.Grid(),C.Root()) ); auto D0 = unique_ptr<ADM>( D.Construct(D.Grid(),D.Root()) ); auto D1 = unique_ptr<ADM>( D.Construct(D.Grid(),D.Root()) ); auto ETL = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); auto ETR = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); auto EBL = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); auto EBR = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); auto FTL = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); auto FBR = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); const Int half = E.Height() / 2; if( transA ) LockedPartitionRight( A, *A0, *A1, half ); else LockedPartitionDown( A, *A0, *A1, half ); if( transB ) LockedPartitionDown( B, *B0, *B1, half ); else LockedPartitionRight( B, *B0, *B1, half ); if( transC ) LockedPartitionRight( C, *C0, *C1, half ); else LockedPartitionDown( C, *C0, *C1, half ); if( transD ) LockedPartitionDown( D, *D0, *D1, half ); else LockedPartitionRight( D, *D0, *D1, half ); PartitionDownDiagonal( E, *ETL, *ETR, *EBL, *EBR, half ); if( uplo == LOWER ) { Gemm ( orientA, orientB, alpha, A1->LockedMatrix(), B0->LockedMatrix(), T(1), 
EBL->Matrix() ); Gemm ( orientC, orientD, beta, C1->LockedMatrix(), D0->LockedMatrix(), T(1), EBL->Matrix() ); } else { Gemm ( orientA, orientB, alpha, A0->LockedMatrix(), B1->LockedMatrix(), T(1), ETR->Matrix() ); Gemm ( orientC, orientD, beta, C0->LockedMatrix(), D1->LockedMatrix(), T(1), ETR->Matrix() ); } FTL->AlignWith( *ETL ); FTL->Resize( ETL->Height(), ETL->Width() ); Gemm ( orientA, orientB, alpha, A0->LockedMatrix(), B0->LockedMatrix(), T(0), FTL->Matrix() ); Gemm ( orientC, orientD, beta, C0->LockedMatrix(), D0->LockedMatrix(), T(1), FTL->Matrix() ); AxpyTrapezoid( uplo, T(1), *FTL, *ETL ); FBR->AlignWith( *EBR ); FBR->Resize( EBR->Height(), EBR->Width() ); Gemm ( orientA, orientB, alpha, A1->LockedMatrix(), B1->LockedMatrix(), T(0), FBR->Matrix() ); Gemm ( orientC, orientD, beta, C1->LockedMatrix(), D1->LockedMatrix(), T(1), FBR->Matrix() ); AxpyTrapezoid( uplo, T(1), *FBR, *EBR ); }
void LocalTrr2k ( UpperOrLower uplo, Orientation orientA, Orientation orientB, Orientation orientC, Orientation orientD, T alpha, const ElementalMatrix<T>& A, const ElementalMatrix<T>& B, T beta, const ElementalMatrix<T>& C, const ElementalMatrix<T>& D, T gamma, ElementalMatrix<T>& E ) { using namespace trr2k; DEBUG_CSE const bool transA = orientA != NORMAL; const bool transB = orientB != NORMAL; const bool transC = orientC != NORMAL; const bool transD = orientD != NORMAL; // TODO: Stringent distribution and alignment checks ScaleTrapezoid( gamma, uplo, E ); if( E.Height() < E.Grid().Width()*LocalTrr2kBlocksize<T>() ) { LocalTrr2kKernel ( uplo, orientA, orientB, orientC, orientD, alpha, A, B, beta, C, D, E ); } else { typedef ElementalMatrix<T> ADM; // Ugh. This is likely too much overhead. It should be measured. auto A0 = unique_ptr<ADM>( A.Construct(A.Grid(),A.Root()) ); auto A1 = unique_ptr<ADM>( A.Construct(A.Grid(),A.Root()) ); auto B0 = unique_ptr<ADM>( B.Construct(B.Grid(),B.Root()) ); auto B1 = unique_ptr<ADM>( B.Construct(B.Grid(),B.Root()) ); auto C0 = unique_ptr<ADM>( C.Construct(C.Grid(),C.Root()) ); auto C1 = unique_ptr<ADM>( C.Construct(C.Grid(),C.Root()) ); auto D0 = unique_ptr<ADM>( D.Construct(D.Grid(),D.Root()) ); auto D1 = unique_ptr<ADM>( D.Construct(D.Grid(),D.Root()) ); auto ETL = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); auto ETR = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); auto EBL = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); auto EBR = unique_ptr<ADM>( E.Construct(E.Grid(),E.Root()) ); const Int half = E.Height() / 2; if( transA ) LockedPartitionRight( A, *A0, *A1, half ); else LockedPartitionDown( A, *A0, *A1, half ); if( transB ) LockedPartitionDown( B, *B0, *B1, half ); else LockedPartitionRight( B, *B0, *B1, half ); if( transC ) LockedPartitionRight( C, *C0, *C1, half ); else LockedPartitionDown( C, *C0, *C1, half ); if( transD ) LockedPartitionDown( D, *D0, *D1, half ); else LockedPartitionRight( D, *D0, *D1, 
half ); PartitionDownDiagonal( E, *ETL, *ETR, *EBL, *EBR, half ); if( uplo == LOWER ) { Gemm ( orientA, orientB, alpha, A1->LockedMatrix(), B0->LockedMatrix(), T(1), EBL->Matrix() ); Gemm ( orientC, orientD, beta, C1->LockedMatrix(), D0->LockedMatrix(), T(1), EBL->Matrix() ); } else { Gemm ( orientA, orientB, alpha, A0->LockedMatrix(), B1->LockedMatrix(), T(1), ETR->Matrix() ); Gemm ( orientC, orientD, beta, C0->LockedMatrix(), D1->LockedMatrix(), T(1), ETR->Matrix() ); } // Recurse LocalTrr2k ( uplo, orientA, orientB, orientC, orientD, alpha, *A0, *B0, beta, *C0, *D0, T(1), *ETL ); LocalTrr2k ( uplo, orientA, orientB, orientC, orientD, alpha, *A1, *B1, beta, *C1, *D1, T(1), *EBR ); } }
void Gather ( const ElementalMatrix<T>& A, DistMatrix<T,CIRC,CIRC>& B ) { DEBUG_ONLY(CSE cse("copy::Gather")) AssertSameGrids( A, B ); if( A.DistSize() == 1 && A.CrossSize() == 1 ) { B.Resize( A.Height(), A.Width() ); if( B.CrossRank() == B.Root() ) Copy( A.LockedMatrix(), B.Matrix() ); return; } const Int height = A.Height(); const Int width = A.Width(); B.SetGrid( A.Grid() ); B.Resize( height, width ); // Gather the colShifts and rowShifts // ================================== Int myShifts[2]; myShifts[0] = A.ColShift(); myShifts[1] = A.RowShift(); vector<Int> shifts; const Int crossSize = B.CrossSize(); if( B.CrossRank() == B.Root() ) shifts.resize( 2*crossSize ); mpi::Gather( myShifts, 2, shifts.data(), 2, B.Root(), B.CrossComm() ); // Gather the payload data // ======================= const bool irrelevant = ( A.RedundantRank()!=0 || A.CrossRank()!=A.Root() ); int totalSend = ( irrelevant ? 0 : A.LocalHeight()*A.LocalWidth() ); vector<int> recvCounts, recvOffsets; if( B.CrossRank() == B.Root() ) recvCounts.resize( crossSize ); mpi::Gather( &totalSend, 1, recvCounts.data(), 1, B.Root(), B.CrossComm() ); int totalRecv = Scan( recvCounts, recvOffsets ); //vector<T> sendBuf(totalSend), recvBuf(totalRecv); vector<T> sendBuf, recvBuf; sendBuf.reserve( totalSend ); recvBuf.reserve( totalRecv ); if( !irrelevant ) copy::util::InterleaveMatrix ( A.LocalHeight(), A.LocalWidth(), A.LockedBuffer(), 1, A.LDim(), sendBuf.data(), 1, A.LocalHeight() ); mpi::Gather ( sendBuf.data(), totalSend, recvBuf.data(), recvCounts.data(), recvOffsets.data(), B.Root(), B.CrossComm() ); // Unpack // ====== if( B.Root() == B.CrossRank() ) { for( Int q=0; q<crossSize; ++q ) { if( recvCounts[q] == 0 ) continue; const Int colShift = shifts[2*q+0]; const Int rowShift = shifts[2*q+1]; const Int colStride = A.ColStride(); const Int rowStride = A.RowStride(); const Int localHeight = Length( height, colShift, colStride ); const Int localWidth = Length( width, rowShift, rowStride ); 
copy::util::InterleaveMatrix ( localHeight, localWidth, &recvBuf[recvOffsets[q]], 1, localHeight, B.Buffer(colShift,rowShift), colStride, rowStride*B.LDim() ); } } }
void SDC ( UpperOrLower uplo, ElementalMatrix<F>& APre, ElementalMatrix<Base<F>>& wPre, ElementalMatrix<F>& QPre, const HermitianSDCCtrl<Base<F>> ctrl ) { DEBUG_CSE typedef Base<F> Real; const Grid& g = APre.Grid(); const Int n = APre.Height(); wPre.Resize( n, 1 ); QPre.Resize( n, n ); if( APre.Grid().Size() == 1 ) { HermitianEig( uplo, APre.Matrix(), wPre.Matrix(), QPre.Matrix() ); return; } if( n <= ctrl.cutoff ) { HermitianEig( uplo, APre, wPre, QPre ); return; } DistMatrixReadWriteProxy<F,F,MC,MR> AProx( APre ); DistMatrixWriteProxy<F,F,MC,MR> QProx( QPre ); DistMatrixWriteProxy<Real,Real,VR,STAR> wProx( wPre ); auto& A = AProx.Get(); auto& Q = QProx.Get(); auto& w = wProx.Get(); // Perform this level's split const auto part = SpectralDivide( uplo, A, Q, ctrl ); auto ind1 = IR(0,part.index); auto ind2 = IR(part.index,n); auto ATL = A( ind1, ind1 ); auto ATR = A( ind1, ind2 ); auto ABL = A( ind2, ind1 ); auto ABR = A( ind2, ind2 ); auto wT = w( ind1, ALL ); auto wB = w( ind2, ALL ); auto QL = Q( ALL, ind1 ); auto QR = Q( ALL, ind2 ); if( uplo == LOWER ) Zero( ABL ); else Zero( ATR ); // Recurse on the two subproblems DistMatrix<F> ATLSub, ABRSub, ZTSub, ZBSub; DistMatrix<Real,VR,STAR> wTSub, wBSub; PushSubproblems ( ATL, ABR, ATLSub, ABRSub, wT, wB, wTSub, wBSub, ZTSub, ZBSub, ctrl.progress ); if( ATLSub.Participating() ) SDC( uplo, ATLSub, wTSub, ZTSub, ctrl ); if( ABRSub.Participating() ) SDC( uplo, ABRSub, wBSub, ZBSub, ctrl ); // Pull the results back to this grid DistMatrix<F> ZT(g), ZB(g); PullSubproblems ( ATL, ABR, ATLSub, ABRSub, wT, wB, wTSub, wBSub, ZT, ZB, ZTSub, ZBSub ); // Update the eigen vectors auto G( QL ); Gemm( NORMAL, NORMAL, F(1), G, ZT, QL ); G = QR; Gemm( NORMAL, NORMAL, F(1), G, ZB, QR ); }
// Update an existing pivoted LU factorization, stored in place in APre with
// row permutation P, using the vectors u and v (per the inline notes below:
// w := inv(L) P u is formed and then w v' is added into U).
//
// APre:      factored matrix, overwritten with the updated factors; it is
//            required that height(A) <= width(A).
// P:         row permutation, updated whenever adjacent rows are swapped.
// u:         column vector of length height(A).
// v:         column vector of length width(A).
// conjugate: forwarded to Transpose when forming the row vector from v.
// tau:       pivoting threshold; rows i and i+1 are swapped when the
//            magnitude of the current entry is below tau times the magnitude
//            of the candidate computed below.
//
// The routine proceeds in three phases: (1) reduce w to a multiple of e0,
// which introduces a subdiagonal into U that is stored in uSub, (2) add the
// modified w v' into the first row of U, and (3) eliminate the subdiagonal
// held in uSub to return U to upper-triangular form.
void LUMod
( ElementalMatrix<F>& APre,
  DistPermutation& P,
  const ElementalMatrix<F>& u,
  const ElementalMatrix<F>& v,
  bool conjugate,
  Base<F> tau )
{
    DEBUG_CSE
    const Grid& g = APre.Grid();
    typedef Base<F> Real;

    DistMatrixReadWriteProxy<F,F,MC,MR> AProx( APre );
    auto& A = AProx.Get();

    const Int m = A.Height();
    const Int n = A.Width();
    const Int minDim = Min(m,n);
    if( minDim != m )
        LogicError("It is assumed that height(A) <= width(A)");
    if( u.Height() != m || u.Width() != 1 )
        LogicError("u is expected to be a conforming column vector");
    if( v.Height() != n || v.Width() != 1 )
        LogicError("v is expected to be a conforming column vector");
    AssertSameGrids( A, u, v );

    // w := inv(L) P u
    // TODO: Consider locally maintaining all of w to avoid unnecessarily
    //       broadcasting at every iteration.
    DistMatrix<F> w( u );
    P.PermuteRows( w );
    Trsv( LOWER, NORMAL, UNIT, A, w );

    // Maintain an external vector for the temporary subdiagonal of U
    // (rooted and aligned to match the first subdiagonal of A)
    DistMatrix<F,MD,STAR> uSub(g);
    uSub.SetRoot( A.DiagonalRoot(-1) );
    uSub.AlignCols( A.DiagonalAlign(-1) );
    Zeros( uSub, minDim-1, 1 );

    // Reduce w to a multiple of e0
    for( Int i=minDim-2; i>=0; --i )
    {
        // Decide if we should pivot the i'th and i+1'th rows of w
        const F lambdaSub = A.Get(i+1,i);
        const F ups_ii = A.Get(i,i);
        const F omega_i = w.Get( i, 0 );
        const F omega_ip1 = w.Get( i+1, 0 );
        const Real rightTerm = Abs(lambdaSub*omega_i+omega_ip1);
        const bool pivot = ( Abs(omega_i) < tau*rightTerm );

        // Views of the pieces of rows/columns i and i+1 that are updated:
        // the parts of columns i and i+1 below row i+1, and the parts of
        // rows i and i+1 to the right of column i.
        const Range<Int> indB( i+2, m ), indR( i+1, n ),
                         indi( i, i+1 ), indip1( i+1, i+2 );
        auto lBi = A( indB, indi );
        auto lBip1 = A( indB, indip1 );
        auto uiR = A( indi, indR );
        auto uip1R = A( indip1, indR );

        if( pivot )
        {
            // P := P_i P
            P.Swap( i, i+1 );

            // Simultaneously perform
            //   U := P_i U and
            //   L := P_i L P_i^T
            //
            // Then update
            //     L := L T_{i,L}^{-1},
            //     U := T_{i,L} U,
            //     w := T_{i,L} P_i w,
            // where T_{i,L} is the Gauss transform which zeros (P_i w)_{i+1}.
            //
            // More succinctly,
            //     gamma    := w(i) / w(i+1),
            //     w(i)     := w(i+1),
            //     w(i+1)   := 0,
            //     L(:,i)   += gamma L(:,i+1),
            //     U(i+1,:) -= gamma U(i,:).
            const F gamma = omega_i / omega_ip1;
            const F lambda_ii = F(1) + gamma*lambdaSub;
            A.Set( i, i, gamma );
            A.Set( i+1, i, 0 );

            // Copies are taken because the originals are overwritten by the
            // swaps before they are used in the Axpy updates.
            auto lBiCopy = lBi;
            Swap( NORMAL, lBi, lBip1 );
            Axpy( gamma, lBiCopy, lBi );

            auto uip1RCopy = uip1R;
            RowSwap( A, i, i+1 );
            Axpy( -gamma, uip1RCopy, uip1R );

            // Force L back to *unit* lower-triangular form via the transform
            //     L := L T_{i,U}^{-1} D^{-1},
            // where D is diagonal and responsible for forcing L(i,i) and
            // L(i+1,i+1) back to 1. The effect on L is:
            //     eta       := L(i,i+1)/L(i,i),
            //     L(:,i+1)  -= eta L(:,i),
            //     delta_i   := L(i,i),
            //     delta_ip1 := L(i+1,i+1),
            //     L(:,i)   /= delta_i,
            //     L(:,i+1) /= delta_ip1,
            // while the effect on U is
            //     U(i,:)   += eta U(i+1,:)
            //     U(i,:)   *= delta_i,
            //     U(i+1,:) *= delta_{i+1},
            // and the effect on w is
            //     w(i) *= delta_i.
            const F eta = lambdaSub/lambda_ii;
            const F delta_i = lambda_ii;
            const F delta_ip1 = F(1) - eta*gamma;

            Axpy( -eta, lBi, lBip1 );
            A.Set( i+1, i, gamma/delta_i );
            lBi *= F(1)/delta_i;
            lBip1 *= F(1)/delta_ip1;

            A.Set( i, i, eta*ups_ii*delta_i );
            Axpy( eta, uip1R, uiR );
            uiR *= delta_i;
            uip1R *= delta_ip1;
            // Record the new subdiagonal entry of U outside of A
            uSub.Set( i, 0, ups_ii*delta_ip1 );

            // Finally set w(i)
            w.Set( i, 0, omega_ip1*delta_i );
        }
        else
        {
            // Update
            //     L := L T_{i,L}^{-1},
            //     U := T_{i,L} U,
            //     w := T_{i,L} w,
            // where T_{i,L} is the Gauss transform which zeros w_{i+1}.
            //
            // More succinctly,
            //     gamma    := w(i+1) / w(i),
            //     L(:,i)   += gamma L(:,i+1),
            //     U(i+1,:) -= gamma U(i,:),
            //     w(i+1)   := 0.
            const F gamma = omega_ip1 / omega_i;
            A.Update( i+1, i, gamma );
            Axpy( gamma, lBip1, lBi );
            Axpy( -gamma, uiR, uip1R );
            // Record the new subdiagonal entry of U outside of A
            uSub.Set( i, 0, -gamma*ups_ii );
        }
    }

    // Add the modified w v' into U
    // (only the first row of A is touched, scaled by w(0))
    {
        auto a0 = A( IR(0), ALL );
        const F omega_0 = w.Get( 0, 0 );
        DistMatrix<F> vTrans(g);
        vTrans.AlignWith( a0 );
        Transpose( v, vTrans, conjugate );
        Axpy( omega_0, vTrans, a0 );
    }

    // Transform U from upper-Hessenberg to upper-triangular form
    for( Int i=0; i<minDim-1; ++i )
    {
        // Decide if we should pivot the i'th and i+1'th rows U
        const F lambdaSub = A.Get( i+1, i );
        const F ups_ii = A.Get( i, i );
        const F ups_ip1i = uSub.Get( i, 0 );
        const Real rightTerm = Abs(lambdaSub*ups_ii+ups_ip1i);
        const bool pivot = ( Abs(ups_ii) < tau*rightTerm );

        // Same four views as in the first phase
        const Range<Int> indB( i+2, m ), indR( i+1, n ),
                         indi( i, i+1 ), indip1( i+1, i+2 );
        auto lBi = A( indB, indi );
        auto lBip1 = A( indB, indip1 );
        auto uiR = A( indi, indR );
        auto uip1R = A( indip1, indR );

        if( pivot )
        {
            // P := P_i P
            P.Swap( i, i+1 );

            // Simultaneously perform
            //   U := P_i U and
            //   L := P_i L P_i^T
            //
            // Then update
            //     L := L T_{i,L}^{-1},
            //     U := T_{i,L} U,
            // where T_{i,L} is the Gauss transform which zeros U(i+1,i).
            //
            // More succinctly (in terms of the entries read before the swap),
            //     gamma    := ups_ii / ups_ip1i,
            //     L(:,i)   += gamma L(:,i+1),
            //     U(i+1,:) -= gamma U(i,:).
            const F gamma = ups_ii / ups_ip1i;
            const F lambda_ii = F(1) + gamma*lambdaSub;
            A.Set( i+1, i, ups_ip1i );
            A.Set( i, i, gamma );

            // Copies are taken because the originals are overwritten by the
            // swaps before they are used in the Axpy updates.
            auto lBiCopy = lBi;
            Swap( NORMAL, lBi, lBip1 );
            Axpy( gamma, lBiCopy, lBi );

            auto uip1RCopy = uip1R;
            RowSwap( A, i, i+1 );
            Axpy( -gamma, uip1RCopy, uip1R );

            // Force L back to *unit* lower-triangular form via the transform
            //     L := L T_{i,U}^{-1} D^{-1},
            // where D is diagonal and responsible for forcing L(i,i) and
            // L(i+1,i+1) back to 1. The effect on L is:
            //     eta       := L(i,i+1)/L(i,i),
            //     L(:,i+1)  -= eta L(:,i),
            //     delta_i   := L(i,i),
            //     delta_ip1 := L(i+1,i+1),
            //     L(:,i)   /= delta_i,
            //     L(:,i+1) /= delta_ip1,
            // while the effect on U is
            //     U(i,:)   += eta U(i+1,:)
            //     U(i,:)   *= delta_i,
            //     U(i+1,:) *= delta_{i+1}.
            const F eta = lambdaSub/lambda_ii;
            const F delta_i = lambda_ii;
            const F delta_ip1 = F(1) - eta*gamma;

            Axpy( -eta, lBi, lBip1 );
            A.Set( i+1, i, gamma/delta_i );
            lBi *= F(1)/delta_i;
            lBip1 *= F(1)/delta_ip1;

            A.Set( i, i, ups_ip1i*delta_i );
            Axpy( eta, uip1R, uiR );
            uiR *= delta_i;
            uip1R *= delta_ip1;
        }
        else
        {
            // Update
            //     L := L T_{i,L}^{-1},
            //     U := T_{i,L} U,
            // where T_{i,L} is the Gauss transform which zeros U(i+1,i).
            //
            // More succinctly,
            //     gamma    := U(i+1,i)/ U(i,i),
            //     L(:,i)   += gamma L(:,i+1),
            //     U(i+1,:) -= gamma U(i,:).
            const F gamma = ups_ip1i / ups_ii;
            A.Update( i+1, i, gamma );
            Axpy( gamma, lBip1, lBi );
            Axpy( -gamma, uiR, uip1R );
        }
    }
}
void ColScatter ( T alpha, const ElementalMatrix<T>& A, ElementalMatrix<T>& B ) { DEBUG_ONLY(CSE cse("axpy_contract::ColScatter")) AssertSameGrids( A, B ); if( A.Height() != B.Height() || A.Width() != B.Width() ) LogicError("A and B must be the same size"); #ifdef EL_VECTOR_WARNINGS if( A.Width() == 1 && B.Grid().Rank() == 0 ) { cerr << "The vector version of ColScatter does not" " yet have a vector version implemented, but it would only " "require a modification of the vector version of RowScatter" << endl; } #endif #ifdef EL_CACHE_WARNINGS if( A.Width() != 1 && B.Grid().Rank() == 0 ) { cerr << "axpy_contract::ColScatter potentially causes a large " "amount of cache-thrashing. If possible, avoid it by forming the " "(conjugate-)transpose of the [* ,V] matrix instead." << endl; } #endif if( !B.Participating() ) return; const Int height = B.Height(); const Int localHeight = B.LocalHeight(); const Int localWidth = B.LocalWidth(); const Int colAlign = B.ColAlign(); const Int colStride = B.ColStride(); const Int rowDiff = B.RowAlign()-A.RowAlign(); // TODO: Allow for modular equivalence if possible if( rowDiff == 0 ) { const Int maxLocalHeight = MaxLength(height,colStride); const Int recvSize = mpi::Pad( maxLocalHeight*localWidth ); const Int sendSize = colStride*recvSize; //vector<T> buffer( sendSize ); vector<T> buffer; buffer.reserve( sendSize ); // Pack copy::util::ColStridedPack ( height, localWidth, colAlign, colStride, A.LockedBuffer(), A.LDim(), buffer.data(), recvSize ); // Communicate mpi::ReduceScatter( buffer.data(), recvSize, B.ColComm() ); // Update with our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, localWidth, buffer.data(), 1, localHeight, B.Buffer(), 1, B.LDim() ); } else { #ifdef EL_UNALIGNED_WARNINGS if( B.Grid().Rank() == 0 ) cerr << "Unaligned ColScatter" << endl; #endif const Int localWidthA = A.LocalWidth(); const Int maxLocalHeight = MaxLength(height,colStride); const Int recvSize_RS = mpi::Pad( 
maxLocalHeight*localWidthA ); const Int sendSize_RS = colStride*recvSize_RS; const Int recvSize_SR = localHeight*localWidth; //vector<T> buffer( recvSize_RS + Max(sendSize_RS,recvSize_SR) ); vector<T> buffer; buffer.reserve( recvSize_RS + Max(sendSize_RS,recvSize_SR) ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[recvSize_RS]; // Pack copy::util::ColStridedPack ( height, localWidth, colAlign, colStride, A.LockedBuffer(), A.LDim(), secondBuf, recvSize_RS ); // Reduce-scatter over each col mpi::ReduceScatter( secondBuf, firstBuf, recvSize_RS, B.ColComm() ); // Trade reduced data with the appropriate col const Int sendCol = Mod( B.RowRank()+rowDiff, B.RowStride() ); const Int recvCol = Mod( B.RowRank()-rowDiff, B.RowStride() ); mpi::SendRecv ( firstBuf, localHeight*localWidthA, sendCol, secondBuf, localHeight*localWidth, recvCol, B.RowComm() ); // Update with our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, localWidth, secondBuf, 1, localHeight, B.Buffer(), 1, B.LDim() ); } }
void RowScatter ( T alpha, const ElementalMatrix<T>& A, ElementalMatrix<T>& B ) { DEBUG_ONLY(CSE cse("axpy_contract::RowScatter")) AssertSameGrids( A, B ); if( A.Height() != B.Height() || A.Width() != B.Width() ) LogicError("Matrix sizes did not match"); if( !B.Participating() ) return; const Int width = B.Width(); const Int colDiff = B.ColAlign()-A.ColAlign(); if( colDiff == 0 ) { if( width == 1 ) { const Int localHeight = B.LocalHeight(); const Int portionSize = mpi::Pad( localHeight ); //vector<T> buffer( portionSize ); vector<T> buffer; buffer.reserve( portionSize ); // Reduce to rowAlign const Int rowAlign = B.RowAlign(); mpi::Reduce ( A.LockedBuffer(), buffer.data(), portionSize, rowAlign, B.RowComm() ); if( B.RowRank() == rowAlign ) { axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, 1, buffer.data(), 1, localHeight, B.Buffer(), 1, B.LDim() ); } } else { const Int rowStride = B.RowStride(); const Int rowAlign = B.RowAlign(); const Int localHeight = B.LocalHeight(); const Int localWidth = B.LocalWidth(); const Int maxLocalWidth = MaxLength(width,rowStride); const Int portionSize = mpi::Pad( localHeight*maxLocalWidth ); const Int sendSize = rowStride*portionSize; // Pack //vector<T> buffer( sendSize ); vector<T> buffer; buffer.reserve( sendSize ); copy::util::RowStridedPack ( localHeight, width, rowAlign, rowStride, A.LockedBuffer(), A.LDim(), buffer.data(), portionSize ); // Communicate mpi::ReduceScatter( buffer.data(), portionSize, B.RowComm() ); // Update with our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, localWidth, buffer.data(), 1, localHeight, B.Buffer(), 1, B.LDim() ); } } else { #ifdef EL_UNALIGNED_WARNINGS if( B.Grid().Rank() == 0 ) cerr << "Unaligned RowScatter" << endl; #endif const Int colRank = B.ColRank(); const Int colStride = B.ColStride(); const Int sendRow = Mod( colRank+colDiff, colStride ); const Int recvRow = Mod( colRank-colDiff, colStride ); const Int localHeight = B.LocalHeight(); const Int 
localHeightA = A.LocalHeight(); if( width == 1 ) { //vector<T> buffer( localHeight+localHeightA ); vector<T> buffer; buffer.reserve( localHeight+localHeightA ); T* sendBuf = &buffer[0]; T* recvBuf = &buffer[localHeightA]; // Reduce to rowAlign const Int rowAlign = B.RowAlign(); mpi::Reduce ( A.LockedBuffer(), sendBuf, localHeightA, rowAlign, B.RowComm() ); if( B.RowRank() == rowAlign ) { // Perform the realignment mpi::SendRecv ( sendBuf, localHeightA, sendRow, recvBuf, localHeight, recvRow, B.ColComm() ); axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, 1, recvBuf, 1, localHeight, B.Buffer(), 1, B.LDim() ); } } else { const Int rowStride = B.RowStride(); const Int rowAlign = B.RowAlign(); const Int localWidth = B.LocalWidth(); const Int maxLocalWidth = MaxLength(width,rowStride); const Int recvSize_RS = mpi::Pad( localHeightA*maxLocalWidth ); const Int sendSize_RS = rowStride * recvSize_RS; const Int recvSize_SR = localHeight * localWidth; //vector<T> buffer( recvSize_RS + Max(sendSize_RS,recvSize_SR) ); vector<T> buffer; buffer.reserve( recvSize_RS + Max(sendSize_RS,recvSize_SR) ); T* firstBuf = &buffer[0]; T* secondBuf = &buffer[recvSize_RS]; // Pack copy::util::RowStridedPack ( localHeightA, width, rowAlign, rowStride, A.LockedBuffer(), A.LDim(), secondBuf, recvSize_RS ); // Reduce-scatter over each process row mpi::ReduceScatter( secondBuf, firstBuf, recvSize_RS, B.RowComm() ); // Trade reduced data with the appropriate process row mpi::SendRecv ( firstBuf, localHeightA*localWidth, sendRow, secondBuf, localHeight*localWidth, recvRow, B.ColComm() ); // Update with our received data axpy::util::InterleaveMatrixUpdate ( alpha, localHeight, localWidth, secondBuf, 1, localHeight, B.Buffer(), 1, B.LDim() ); } } }
void SolveAfter ( Orientation orientation, const ElementalMatrix<F>& APre, const ElementalMatrix<F>& householderScalars, const ElementalMatrix<Base<F>>& signature, const ElementalMatrix<F>& B, ElementalMatrix<F>& XPre ) { DEBUG_CSE const Int m = APre.Height(); const Int n = APre.Width(); if( m > n ) LogicError("Must have full row rank"); DistMatrixReadProxy<F,F,MC,MR> AProx( APre ); DistMatrixWriteProxy<F,F,MC,MR> XProx( XPre ); auto& A = AProx.GetLocked(); auto& X = XProx.Get(); X.Resize( n, B.Width() ); // TODO: Add scaling auto AL = A( IR(0,m), IR(0,m) ); if( orientation == NORMAL ) { if( m != B.Height() ) LogicError("A and B do not conform"); // Copy B into X auto XT = X( IR(0,m), ALL ); auto XB = X( IR(m,n), ALL ); XT = B; Zero( XB ); if( orientation == TRANSPOSE ) Conjugate( XT ); // Solve against L (checking for singularities) Trsm( LEFT, LOWER, NORMAL, NON_UNIT, F(1), AL, XT, true ); // Apply Q' to X lq::ApplyQ( LEFT, ADJOINT, A, householderScalars, signature, X ); if( orientation == TRANSPOSE ) Conjugate( X ); } else { // Copy B into X X = B; if( orientation == TRANSPOSE ) Conjugate( X ); // Apply Q to X lq::ApplyQ( LEFT, NORMAL, A, householderScalars, signature, X ); // Shrink X to its new height X.Resize( m, X.Width() ); // Solve against L' (check for singularities) Trsm( LEFT, LOWER, ADJOINT, NON_UNIT, F(1), AL, X, true ); if( orientation == TRANSPOSE ) Conjugate( X ); } }