inline void GemmNNB ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::GemmNNB"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal GemmNNB: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> AT(g), A0(g), AB(g), A1(g), A2(g); DistMatrix<T> CT(g), C0(g), CB(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,STAR,MC> A1_STAR_MC(g); DistMatrix<T,MR,STAR> D1Trans_MR_STAR(g); A1_STAR_MC.AlignWith( B ); D1Trans_MR_STAR.AlignWith( B ); // Start the algorithm Scale( beta, C ); LockedPartitionDown ( A, AT, AB, 0 ); PartitionDown ( C, CT, CB, 0 ); while( AB.Height() > 0 ) { LockedRepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); RepartitionDown ( CT, C0, /**/ /**/ C1, CB, C2 ); Zeros( C1.Width(), C1.Height(), D1Trans_MR_STAR ); //--------------------------------------------------------------------// A1_STAR_MC = A1; // A1[*,MC] <- A1[MC,MR] // D1^T[MR,* ] := alpha B^T[MR,MC] A1^T[MC,* ] LocalGemm ( TRANSPOSE, TRANSPOSE, alpha, B, A1_STAR_MC, T(0), D1Trans_MR_STAR ); C1.TransposeSumScatterUpdate( T(1), D1Trans_MR_STAR ); //--------------------------------------------------------------------// SlideLockedPartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); SlidePartitionDown ( CT, C0, C1, /**/ /**/ CB, C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TwoSidedTrsmUVar1( UnitOrNonUnit diag, Matrix<F>& A, const Matrix<F>& U ) { #ifndef RELEASE CallStackEntry entry("internal::TwoSidedTrsmUVar1"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( U.Height() != U.Width() ) LogicError("Triangular matrices must be square"); if( A.Height() != U.Height() ) LogicError("A and U must be the same size"); #endif // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<F> UTL, UTR, U00, U01, U02, UBL, UBR, U10, U11, U12, U20, U21, U22; // Temporary products Matrix<F> Y01; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); //--------------------------------------------------------------------// // Y01 := A00 U01 Zeros( Y01, A01.Height(), A01.Width() ); Hemm( LEFT, UPPER, F(1), A00, U01, F(0), Y01 ); // A01 := inv(U00)' A01 Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U00, A01 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A11 := A11 - (U01' A01 + A01' U01) Her2k( UPPER, ADJOINT, F(-1), U01, A01, F(1), A11 ); // A11 := inv(U11)' A11 inv(U11) TwoSidedTrsmUUnb( diag, A11, U11 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A01 := A01 inv(U11) Trsm( RIGHT, UPPER, NORMAL, diag, F(1), U11, A01 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, 
/*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } }
inline void ApplyPackedReflectorsLUVF ( Conjugation conjugation, int offset, const Matrix<Complex<R> >& H, const Matrix<Complex<R> >& t, Matrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUVF"); if( offset < 0 || offset > H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); #endif typedef Complex<R> C; Matrix<C> HTL, HTR, H00, H01, H02, HPan, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<C> AT, A0, ATop, AB, A1, A2; Matrix<C> tT, t0, tB, t1, t2; Matrix<C> HPanCopy; Matrix<C> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanHeight = H01.Height() + H11.Height(); const int HPanOffset = std::min( H11.Width(), std::max(offset-H00.Width(),0) ); const int HPanWidth = H11.Width()-HPanOffset; HPan.LockedView( H, 0, H00.Width()+HPanOffset, HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanWidth ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); ATop.View2x1( A0, A1 ); Zeros( HPan.Width(), ATop.Width(), Z ); Zeros( HPan.Width(), HPan.Width(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, UPPER, offset, HPanCopy ); SetDiagonalToOne( RIGHT, offset, HPanCopy ); Herk( LOWER, ADJOINT, C(1), HPanCopy, C(0), SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( ADJOINT, NORMAL, C(1), HPanCopy, ATop, C(0), Z ); Trsm( LEFT, LOWER, NORMAL, NON_UNIT, C(1), SInv, Z ); Gemm( 
NORMAL, NORMAL, C(-1), HPanCopy, Z, C(1), ATop ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrmmLLNCOld ( UnitOrNonUnit diag, T alpha, const DistMatrix<T>& L, DistMatrix<T>& X ) { #ifndef RELEASE CallStackEntry entry("internal::TrmmLLNCOld"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( L.Height() != L.Width() || L.Width() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrmmLLNC: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<T> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<T> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<T,STAR,MC > L10_STAR_MC(g); DistMatrix<T,STAR,STAR> L11_STAR_STAR(g); DistMatrix<T,STAR,VR > X1_STAR_VR(g); DistMatrix<T,MR, STAR> D1Trans_MR_STAR(g); DistMatrix<T,MR, MC > D1Trans_MR_MC(g); DistMatrix<T,MC, MR > D1(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); L10_STAR_MC.AlignWith( X0 ); D1Trans_MR_STAR.AlignWith( X1 ); D1Trans_MR_MC.AlignWith( X1 ); D1.AlignWith( X1 ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; X1_STAR_VR = X1; LocalTrmm( LEFT, LOWER, NORMAL, diag, T(1), L11_STAR_STAR, X1_STAR_VR ); X1 = X1_STAR_VR; L10_STAR_MC = L10; LocalGemm ( TRANSPOSE, TRANSPOSE, T(1), X0, L10_STAR_MC, D1Trans_MR_STAR ); D1Trans_MR_MC.SumScatterFrom( D1Trans_MR_STAR ); Zeros( D1, X1.Height(), X1.Width() ); Transpose( D1Trans_MR_MC.Matrix(), D1.Matrix() ); Axpy( T(1), D1, X1 ); 
//--------------------------------------------------------------------// D1.FreeAlignments(); D1Trans_MR_MC.FreeAlignments(); D1Trans_MR_STAR.FreeAlignments(); L10_STAR_MC.FreeAlignments(); SlideLockedPartitionUpDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } }
inline void TrsvUN( UnitOrNonUnit diag, const DistMatrix<F>& U, DistMatrix<F>& x ) { #ifndef RELEASE PushCallStack("internal::TrsvUN"); if( U.Grid() != x.Grid() ) throw std::logic_error("{U,x} must be distributed over the same grid"); if( U.Height() != U.Width() ) throw std::logic_error("U must be square"); if( x.Width() != 1 && x.Height() != 1 ) throw std::logic_error("x must be a vector"); const int xLength = ( x.Width() == 1 ? x.Height() : x.Width() ); if( U.Width() != xLength ) throw std::logic_error("Nonconformal TrsvUN"); #endif const Grid& g = U.Grid(); if( x.Width() == 1 ) { // Matrix views DistMatrix<F> U01(g), U11(g); DistMatrix<F> xT(g), x0(g), xB(g), x1(g), x2(g); // Temporary distributions DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,STAR> x1_STAR_STAR(g); DistMatrix<F,MR, STAR> x1_MR_STAR(g); DistMatrix<F,MC, STAR> z_MC_STAR(g); // Views of z[MC,* ], which will store updates to x DistMatrix<F,MC,STAR> z0_MC_STAR(g), z1_MC_STAR(g); z_MC_STAR.AlignWith( U ); Zeros( x.Height(), 1, z_MC_STAR ); // Start the algorithm PartitionUp ( x, xT, xB, 0 ); while( xT.Height() > 0 ) { RepartitionUp ( xT, x0, x1, /**/ /**/ xB, x2 ); const int n0 = x0.Height(); const int n1 = x1.Height(); LockedView( U01, U, 0, n0, n0, n1 ); LockedView( U11, U, n0, n0, n1, n1 ); View( z0_MC_STAR, z_MC_STAR, 0, 0, n0, 1 ); View( z1_MC_STAR, z_MC_STAR, n0, 0, n1, 1 ); x1_MR_STAR.AlignWith( U01 ); //----------------------------------------------------------------// if( x2.Height() != 0 ) x1.SumScatterUpdate( F(1), z1_MC_STAR ); x1_STAR_STAR = x1; U11_STAR_STAR = U11; Trsv ( UPPER, NORMAL, diag, U11_STAR_STAR.LockedLocalMatrix(), x1_STAR_STAR.LocalMatrix() ); x1 = x1_STAR_STAR; x1_MR_STAR = x1_STAR_STAR; Gemv ( NORMAL, F(-1), U01.LockedLocalMatrix(), x1_MR_STAR.LockedLocalMatrix(), F(1), z0_MC_STAR.LocalMatrix() ); //----------------------------------------------------------------// x1_MR_STAR.FreeAlignments(); SlidePartitionUp ( xT, x0, /**/ /**/ x1, xB, x2 ); } } else { // 
Matrix views DistMatrix<F> U01(g), U11(g); DistMatrix<F> xL(g), xR(g), x0(g), x1(g), x2(g); // Temporary distributions DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,STAR> x1_STAR_STAR(g); DistMatrix<F,STAR,MR > x1_STAR_MR(g); DistMatrix<F,MC, MR > z1(g); DistMatrix<F,MR, MC > z1_MR_MC(g); DistMatrix<F,STAR,MC > z_STAR_MC(g); // Views of z[* ,MC] DistMatrix<F,STAR,MC> z0_STAR_MC(g), z1_STAR_MC(g); z_STAR_MC.AlignWith( U ); Zeros( 1, x.Width(), z_STAR_MC ); // Start the algorithm PartitionLeft( x, xL, xR, 0 ); while( xL.Width() > 0 ) { RepartitionLeft ( xL, /**/ xR, x0, x1, /**/ x2 ); const int n0 = x0.Width(); const int n1 = x1.Width(); LockedView( U01, U, 0, n0, n0, n1 ); LockedView( U11, U, n0, n0, n1, n1 ); View( z0_STAR_MC, z_STAR_MC, 0, 0, 1, n0 ); View( z1_STAR_MC, z_STAR_MC, 0, n0, 1, n1 ); x1_STAR_MR.AlignWith( U01 ); z1.AlignWith( x1 ); //----------------------------------------------------------------// if( x2.Width() != 0 ) { z1_MR_MC.SumScatterFrom( z1_STAR_MC ); z1 = z1_MR_MC; Axpy( F(1), z1, x1 ); } x1_STAR_STAR = x1; U11_STAR_STAR = U11; Trsv ( UPPER, NORMAL, diag, U11_STAR_STAR.LockedLocalMatrix(), x1_STAR_STAR.LocalMatrix() ); x1 = x1_STAR_STAR; x1_STAR_MR = x1_STAR_STAR; Gemv ( NORMAL, F(-1), U01.LockedLocalMatrix(), x1_STAR_MR.LockedLocalMatrix(), F(1), z0_STAR_MC.LocalMatrix() ); //----------------------------------------------------------------// x1_STAR_MR.FreeAlignments(); z1.FreeAlignments(); SlidePartitionLeft ( xL, /**/ xR, x0, /**/ x1, x2 ); } } #ifndef RELEASE PopCallStack(); #endif }
inline void PanelHouseholder( DistMatrix<F>& A, DistMatrix<F,MD,STAR>& t ) { #ifndef RELEASE CallStackEntry entry("lq::PanelHouseholder"); if( A.Grid() != t.Grid() ) LogicError("{A,t} must be distributed over the same grid"); if( t.Height() != Min(A.Height(),A.Width()) || t.Width() != 1 ) LogicError ("t must be a vector of height equal to the minimum dimension of A"); if( !t.AlignedWithDiagonal( A, 0 ) ) LogicError("t must be aligned with A's main diagonal"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), a01(g), A02(g), aTopRow(g), ABottomPan(g), ABL(g), ABR(g), a10(g), alpha11(g), a12(g), A20(g), a21(g), A22(g); DistMatrix<F,MD,STAR> tT(g), t0(g), tB(g), tau1(g), t2(g); // Temporary distributions DistMatrix<F> aTopRowConj(g); DistMatrix<F,STAR,MR > aTopRowConj_STAR_MR(g); DistMatrix<F,MC, STAR> z_MC_STAR(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDown ( t, tT, tB, 0 ); while( ATL.Height() < A.Height() && ATL.Width() < A.Width() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ a01, A02, /*************/ /**********************/ /**/ a10, /**/ alpha11, a12, ABL, /**/ ABR, A20, /**/ a21, A22, 1 ); RepartitionDown ( tT, t0, /**/ /****/ tau1, tB, t2, 1 ); View1x2( aTopRow, alpha11, a12 ); View1x2( ABottomPan, a21, A22 ); aTopRowConj_STAR_MR.AlignWith( ABottomPan ); z_MC_STAR.AlignWith( ABottomPan ); //--------------------------------------------------------------------// // Compute the Householder reflector const F tau = Reflector( alpha11, a12 ); tau1.Set( 0, 0, tau ); // Apply the Householder reflector const bool myDiagonalEntry = ( g.Row() == alpha11.ColAlignment() && g.Col() == alpha11.RowAlignment() ); F alpha = 0; if( myDiagonalEntry ) { alpha = alpha11.GetLocal(0,0); alpha11.SetLocal(0,0,1); } Conjugate( aTopRow, aTopRowConj ); aTopRowConj_STAR_MR = aTopRowConj; Zeros( z_MC_STAR, ABottomPan.Height(), 1 ); LocalGemv ( NORMAL, F(1), ABottomPan, aTopRowConj_STAR_MR, F(0), z_MC_STAR ); 
z_MC_STAR.SumOverRow(); Ger ( -Conj(tau), z_MC_STAR.LockedMatrix(), aTopRowConj_STAR_MR.LockedMatrix(), ABottomPan.Matrix() ); if( myDiagonalEntry ) alpha11.SetLocal(0,0,alpha); //--------------------------------------------------------------------// SlidePartitionDown ( tT, t0, tau1, /**/ /****/ tB, t2 ); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, a01, /**/ A02, /**/ a10, alpha11, /**/ a12, /*************/ /**********************/ ABL, /**/ ABR, A20, a21, /**/ A22 ); } }
void FoxLi( ElementalMatrix<Complex<Real>>& APre, Int n, Real omega ) { DEBUG_CSE typedef Complex<Real> C; const Real pi = 4*Atan( Real(1) ); const C phi = Sqrt( C(0,omega/pi) ); DistMatrixWriteProxy<C,C,MC,MR> AProx( APre ); auto& A = AProx.Get(); // Compute Gauss quadrature points and weights const Grid& g = A.Grid(); DistMatrix<Real,VR,STAR> d(g), e(g); Zeros( d, n, 1 ); e.Resize( n-1, 1 ); auto& eLoc = e.Matrix(); for( Int iLoc=0; iLoc<e.LocalHeight(); ++iLoc ) { const Int i = e.GlobalRow(iLoc); const Real betaInv = 2*Sqrt(1-Pow(i+Real(1),-2)/4); eLoc(iLoc) = 1/betaInv; } DistMatrix<Real,VR,STAR> x(g); DistMatrix<Real,STAR,VR> Z(g); HermitianTridiagEig( d, e, x, Z, UNSORTED ); auto z = Z( IR(0), ALL ); DistMatrix<Real,STAR,VR> sqrtWeights( z ); auto& sqrtWeightsLoc = sqrtWeights.Matrix(); for( Int jLoc=0; jLoc<sqrtWeights.LocalWidth(); ++jLoc ) sqrtWeightsLoc(0,jLoc) = Sqrt(Real(2))*Abs(sqrtWeightsLoc(0,jLoc)); herm_eig::Sort( x, sqrtWeights, ASCENDING ); // Form the integral operator A.Resize( n, n ); DistMatrix<Real,MC,STAR> x_MC( A.Grid() ); DistMatrix<Real,MR,STAR> x_MR( A.Grid() ); x_MC.AlignWith( A ); x_MR.AlignWith( A ); x_MC = x; x_MR = x; auto& ALoc = A.Matrix(); auto& x_MCLoc = x_MC.Matrix(); auto& x_MRLoc = x_MR.Matrix(); for( Int jLoc=0; jLoc<A.LocalWidth(); ++jLoc ) { for( Int iLoc=0; iLoc<A.LocalHeight(); ++iLoc ) { const Real diff = x_MCLoc(iLoc)-x_MRLoc(jLoc); const Real theta = -omega*Pow(diff,2); const Real realPart = Cos(theta); const Real imagPart = Sin(theta); ALoc(iLoc,jLoc) = phi*C(realPart,imagPart); } } // Apply the weighting DistMatrix<Real,VR,STAR> sqrtWeightsTrans(g); Transpose( sqrtWeights, sqrtWeightsTrans ); DiagonalScale( LEFT, NORMAL, sqrtWeightsTrans, A ); DiagonalScale( RIGHT, NORMAL, sqrtWeightsTrans, A ); }
void IPM ( const DistSparseMatrix<Real>& A, const DistMultiVec<Real>& b, Real lambda, DistMultiVec<Real>& x, const qp::affine::Ctrl<Real>& ctrl ) { DEBUG_CSE const Int m = A.Height(); const Int n = A.Width(); mpi::Comm comm = A.Comm(); DistSparseMatrix<Real> Q(comm), AHat(comm), G(comm); DistMultiVec<Real> c(comm), h(comm); // Q := | 0 0 0 | // | 0 0 0 | // | 0 0 I | // ============== Zeros( Q, 2*n+m, 2*n+m ); { Int numLocalUpdates = 0; for( Int iLoc=0; iLoc<Q.LocalHeight(); ++iLoc ) if( Q.GlobalRow(iLoc) >= 2*n ) ++numLocalUpdates; Q.Reserve( numLocalUpdates ); for( Int iLoc=0; iLoc<Q.LocalHeight(); ++iLoc ) if( Q.GlobalRow(iLoc) >= 2*n ) Q.QueueLocalUpdate( iLoc, Q.GlobalRow(iLoc), Real(1) ); Q.ProcessLocalQueues(); } // c := lambda*[1;1;0] // =================== Zeros( c, 2*n+m, 1 ); auto& cLoc = c.Matrix(); for( Int iLoc=0; iLoc<c.LocalHeight(); ++iLoc ) if( c.GlobalRow(iLoc) < 2*n ) cLoc(iLoc) = lambda; // \hat A := [A, -A, I] // ==================== // NOTE: Since A and \hat A are the same height and each distributed within // columns, it is possible to form \hat A from A without communication const Int numLocalEntriesA = A.NumLocalEntries(); Zeros( AHat, m, 2*n+m ); AHat.Reserve( 2*numLocalEntriesA+AHat.LocalHeight() ); for( Int e=0; e<numLocalEntriesA; ++e ) { AHat.QueueUpdate( A.Row(e), A.Col(e), A.Value(e) ); AHat.QueueUpdate( A.Row(e), A.Col(e)+n, -A.Value(e) ); } for( Int iLoc=0; iLoc<AHat.LocalHeight(); ++iLoc ) { const Int i = AHat.GlobalRow(iLoc); AHat.QueueLocalUpdate( iLoc, i+2*n, Real(1) ); } AHat.ProcessLocalQueues(); // G := | -I 0 0 | // | 0 -I 0 | // ================ Zeros( G, 2*n, 2*n+m ); G.Reserve( G.LocalHeight() ); for( Int iLoc=0; iLoc<G.LocalHeight(); ++iLoc ) { const Int i = G.GlobalRow(iLoc); G.QueueLocalUpdate( iLoc, i, Real(-1) ); } G.ProcessLocalQueues(); // h := 0 // ====== Zeros( h, 2*n, 1 ); // Solve the affine QP // =================== DistMultiVec<Real> xHat(comm), y(comm), z(comm), s(comm); QP( Q, AHat, G, b, c, h, xHat, y, 
z, s, ctrl ); // x := u - v // ========== Zeros( x, n, 1 ); Int numRemoteUpdates = 0; for( Int iLoc=0; iLoc<xHat.LocalHeight(); ++iLoc ) if( xHat.GlobalRow(iLoc) < 2*n ) ++numRemoteUpdates; else break; x.Reserve( numRemoteUpdates ); auto& xHatLoc = xHat.LockedMatrix(); for( Int iLoc=0; iLoc<xHat.LocalHeight(); ++iLoc ) { const Int i = xHat.GlobalRow(iLoc); if( i < n ) x.QueueUpdate( i, 0, xHatLoc(iLoc) ); else if( i < 2*n ) x.QueueUpdate( i-n, 0, -xHatLoc(iLoc) ); else break; } x.ProcessQueues(); }
inline void RLHF ( int offset, const DistMatrix<R>& H, DistMatrix<R>& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLHF"); if( H.Grid() != A.Grid() ) throw std::logic_error("{H,A} must be distributed over the same grid"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); #endif const Grid& g = H.Grid(); DistMatrix<R> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<R> ALeft(g); DistMatrix<R,STAR,VR > HPan_STAR_VR(g); DistMatrix<R,STAR,MR > HPan_STAR_MR(g); DistMatrix<R,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<R,STAR,MC > ZTrans_STAR_MC(g); DistMatrix<R,STAR,VC > ZTrans_STAR_VC(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; LockedView ( HPan, H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); View( ALeft, A, 0, 0, A.Height(), HPanWidth ); HPan_STAR_MR.AlignWith( ALeft ); ZTrans_STAR_MC.AlignWith( ALeft ); ZTrans_STAR_VC.AlignWith( ALeft ); Zeros( HPan.Height(), ALeft.Height(), ZTrans_STAR_MC ); Zeros( HPan.Height(), HPan.Height(), SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonal( RIGHT, offset, HPanCopy, R(1) ); HPan_STAR_VR = HPanCopy; Syrk ( UPPER, NORMAL, R(1), HPan_STAR_VR.LockedMatrix(), R(0), SInv_STAR_STAR.Matrix() ); SInv_STAR_STAR.SumOverGrid(); 
HalveMainDiagonal( SInv_STAR_STAR ); HPan_STAR_MR = HPan_STAR_VR; LocalGemm ( NORMAL, TRANSPOSE, R(1), HPan_STAR_MR, ALeft, R(0), ZTrans_STAR_MC ); ZTrans_STAR_VC.SumScatterFrom( ZTrans_STAR_MC ); LocalTrsm ( LEFT, UPPER, TRANSPOSE, NON_UNIT, R(1), SInv_STAR_STAR, ZTrans_STAR_VC ); ZTrans_STAR_MC = ZTrans_STAR_VC; LocalGemm ( TRANSPOSE, NORMAL, R(-1), ZTrans_STAR_MC, HPan_STAR_MR, R(1), ALeft ); //--------------------------------------------------------------------// HPan_STAR_MR.FreeAlignments(); ZTrans_STAR_MC.FreeAlignments(); ZTrans_STAR_VC.FreeAlignments(); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } #ifndef RELEASE PopCallStack(); #endif }
// Modifies, in place, a packed LU factorization held in A (unit
// lower-triangular L strictly below the diagonal, U on and above it) together
// with its row permutation P, using the column vectors u and v.
//
// Outline of what the code below does:
//   1. w := inv(L) P u.
//   2. Sweeping upwards, w is reduced to a multiple of e0 by Gauss transforms
//      (optionally preceded by swapping rows i and i+1); each step introduces
//      one subdiagonal entry of U, which is kept in the side vector uSub.
//   3. w(0) times the (optionally conjugated) transpose of v is added to the
//      first row of U.
//   4. Sweeping downwards, the subdiagonal entries of U are eliminated again,
//      with the same optional row swaps.
//
// Parameters:
//   A         - packed factors, height(A) <= width(A) is required
//   P         - permutation, updated whenever rows are swapped
//   u         - column vector of height m = A.Height()
//   v         - column vector of height n = A.Width()
//   conjugate - forwarded to Transpose when forming the row from v
//   tau       - threshold in the swap decision: rows i and i+1 are swapped
//               when |candidate pivot| < tau * |alternative pivot|
//
// NOTE(review): presumably this realizes a rank-one update of the factored
// matrix by u v' (or u v^T); confirm the sign/orientation against callers.
void LUMod
( Matrix<F>& A,
  Permutation& P,
  const Matrix<F>& u,
  const Matrix<F>& v,
  bool conjugate,
  Base<F> tau )
{
    DEBUG_CSE
    typedef Base<F> Real;
    const Int m = A.Height();
    const Int n = A.Width();
    const Int minDim = Min(m,n);
    if( minDim != m )
        LogicError("It is assumed that height(A) <= width(A)");
    if( u.Height() != m || u.Width() != 1 )
        LogicError("u is expected to be a conforming column vector");
    if( v.Height() != n || v.Width() != 1 )
        LogicError("v is expected to be a conforming column vector");

    // w := inv(L) P u
    auto w( u );
    P.PermuteRows( w );
    Trsv( LOWER, NORMAL, UNIT, A, w );

    // Maintain an external vector for the temporary subdiagonal of U
    // (the strictly-lower part of A is occupied by L, so U(i+1,i) cannot be
    // stored in place)
    Matrix<F> uSub;
    Zeros( uSub, minDim-1, 1 );

    // Reduce w to a multiple of e0
    for( Int i=minDim-2; i>=0; --i )
    {
        // Decide if we should pivot the i'th and i+1'th rows of w
        const F lambdaSub = A(i+1,i);
        const F ups_ii = A(i,i);
        const F omega_i = w(i);
        const F omega_ip1 = w(i+1);
        const Real rightTerm = Abs(lambdaSub*omega_i+omega_ip1);
        const bool pivot = ( Abs(omega_i) < tau*rightTerm );

        // lBi/lBip1: columns i and i+1 of L below row i+1
        // uiR/uip1R: rows i and i+1 of U to the right of column i
        const Range<Int> indi( i, i+1 ),
                         indip1( i+1, i+2 ),
                         indB( i+2, m ),
                         indR( i+1, n );

        auto lBi = A( indB, indi );
        auto lBip1 = A( indB, indip1 );
        auto uiR = A( indi, indR );
        auto uip1R = A( indip1, indR );

        if( pivot )
        {
            // P := P_i P
            P.Swap( i, i+1 );

            // Simultaneously perform
            //   U := P_i U and
            //   L := P_i L P_i^T
            //
            // Then update
            //     L := L T_{i,L}^{-1},
            //     U := T_{i,L} U,
            //     w := T_{i,L} P_i w,
            // where T_{i,L} is the Gauss transform which zeros (P_i w)_{i+1}.
            //
            // More succinctly,
            //     gamma    := w(i) / w(i+1),
            //     w(i)     := w(i+1),
            //     w(i+1)   := 0,
            //     L(:,i)   += gamma L(:,i+1),
            //     U(i+1,:) -= gamma U(i,:).
            const F gamma = omega_i / omega_ip1;
            const F lambda_ii = F(1) + gamma*lambdaSub;
            // These two entries are placeholders: RowSwap below moves them
            // and both positions are overwritten with final values later.
            A(i, i) = gamma;
            A(i+1,i) = 0;

            // Copy before the swap so the pre-swap column i is still
            // available for the Axpy
            auto lBiCopy = lBi;
            Swap( NORMAL, lBi, lBip1 );
            Axpy( gamma, lBiCopy, lBi );

            // Copy before the swap so the pre-swap row i+1 is still
            // available for the Axpy
            auto uip1RCopy = uip1R;
            RowSwap( A, i, i+1 );
            Axpy( -gamma, uip1RCopy, uip1R );

            // Force L back to *unit* lower-triangular form via the transform
            //     L := L T_{i,U}^{-1} D^{-1},
            // where D is diagonal and responsible for forcing L(i,i) and
            // L(i+1,i+1) back to 1. The effect on L is:
            //     eta       := L(i,i+1)/L(i,i),
            //     L(:,i+1)  -= eta L(:,i),
            //     delta_i   := L(i,i),
            //     delta_ip1 := L(i+1,i+1),
            //     L(:,i)   /= delta_i,
            //     L(:,i+1) /= delta_ip1,
            // while the effect on U is
            //     U(i,:)  += eta U(i+1,:)
            //     U(i,:)   *= delta_i,
            //     U(i+1,:) *= delta_{i+1},
            // and the effect on w is
            //     w(i) *= delta_i.
            const F eta = lambdaSub/lambda_ii;
            const F delta_i = lambda_ii;
            const F delta_ip1 = F(1) - eta*gamma;

            Axpy( -eta, lBi, lBip1 );
            A(i+1,i) = gamma/delta_i;
            lBi *= F(1)/delta_i;
            lBip1 *= F(1)/delta_ip1;

            A(i,i) = eta*ups_ii*delta_i;
            Axpy( eta, uip1R, uiR );
            uiR *= delta_i;
            uip1R *= delta_ip1;
            // New subdiagonal entry U(i+1,i), stored out of place
            uSub(i) = ups_ii*delta_ip1;

            // Finally set w(i)
            w(i) = omega_ip1*delta_i;
        }
        else
        {
            // Update
            //     L := L T_{i,L}^{-1},
            //     U := T_{i,L} U,
            //     w := T_{i,L} w,
            // where T_{i,L} is the Gauss transform which zeros w_{i+1}.
            //
            // More succinctly,
            //     gamma    := w(i+1) / w(i),
            //     L(:,i)   += gamma L(:,i+1),
            //     U(i+1,:) -= gamma U(i,:),
            //     w(i+1)   := 0.
            // (w(i+1) is not written explicitly; it is never read again.)
            const F gamma = omega_ip1 / omega_i;
            A(i+1,i) += gamma;
            Axpy( gamma, lBip1, lBi );
            Axpy( -gamma, uiR, uip1R );
            // New subdiagonal entry U(i+1,i), stored out of place
            uSub(i) = -gamma*ups_ii;
        }
    }

    // Add the modified w v' into U
    // (only the first row is affected, since w is now a multiple of e0)
    {
        auto a0 = A( IR(0), ALL );
        const F omega_0 = w(0);
        Matrix<F> vTrans;
        Transpose( v, vTrans, conjugate );
        Axpy( omega_0, vTrans, a0 );
    }

    // Transform U from upper-Hessenberg to upper-triangular form
    for( Int i=0; i<minDim-1; ++i )
    {
        // Decide if we should pivot the i'th and i+1'th rows U
        const F lambdaSub = A(i+1,i);
        const F ups_ii = A(i,i);
        const F ups_ip1i = uSub(i);
        const Real rightTerm = Abs(lambdaSub*ups_ii+ups_ip1i);
        const bool pivot = ( Abs(ups_ii) < tau*rightTerm );

        // Same block views as in the first sweep
        const Range<Int> indi( i, i+1 ),
                         indip1( i+1, i+2 ),
                         indB( i+2, m ),
                         indR( i+1, n );

        auto lBi = A( indB, indi );
        auto lBip1 = A( indB, indip1 );
        auto uiR = A( indi, indR );
        auto uip1R = A( indip1, indR );

        if( pivot )
        {
            // P := P_i P
            P.Swap( i, i+1 );

            // Simultaneously perform
            //   U := P_i U and
            //   L := P_i L P_i^T
            //
            // Then update
            //     L := L T_{i,L}^{-1},
            //     U := T_{i,L} U,
            // where T_{i,L} is the Gauss transform which zeros U(i+1,i).
            //
            // More succinctly (with U(i,i) and U(i+1,i) referring to the
            // entries after the row swap),
            //     gamma    := U(i+1,i) / U(i,i),
            //     L(:,i)   += gamma L(:,i+1),
            //     U(i+1,:) -= gamma U(i,:).
            const F gamma = ups_ii / ups_ip1i;
            const F lambda_ii = F(1) + gamma*lambdaSub;
            // Placeholders: RowSwap below moves them and both positions are
            // overwritten with final values later.
            A(i+1,i) = ups_ip1i;
            A(i, i) = gamma;

            auto lBiCopy = lBi;
            Swap( NORMAL, lBi, lBip1 );
            Axpy( gamma, lBiCopy, lBi );

            auto uip1RCopy = uip1R;
            RowSwap( A, i, i+1 );
            Axpy( -gamma, uip1RCopy, uip1R );

            // Force L back to *unit* lower-triangular form via the transform
            //     L := L T_{i,U}^{-1} D^{-1},
            // where D is diagonal and responsible for forcing L(i,i) and
            // L(i+1,i+1) back to 1. The effect on L is:
            //     eta       := L(i,i+1)/L(i,i),
            //     L(:,i+1)  -= eta L(:,i),
            //     delta_i   := L(i,i),
            //     delta_ip1 := L(i+1,i+1),
            //     L(:,i)   /= delta_i,
            //     L(:,i+1) /= delta_ip1,
            // while the effect on U is
            //     U(i,:)  += eta U(i+1,:)
            //     U(i,:)   *= delta_i,
            //     U(i+1,:) *= delta_{i+1}.
            const F eta = lambdaSub/lambda_ii;
            const F delta_i = lambda_ii;
            const F delta_ip1 = F(1) - eta*gamma;
            Axpy( -eta, lBi, lBip1 );
            A(i+1,i) = gamma/delta_i;
            lBi *= F(1)/delta_i;
            lBip1 *= F(1)/delta_ip1;

            A(i,i) = ups_ip1i*delta_i;
            Axpy( eta, uip1R, uiR );
            uiR *= delta_i;
            uip1R *= delta_ip1;
        }
        else
        {
            // Update
            //     L := L T_{i,L}^{-1},
            //     U := T_{i,L} U,
            // where T_{i,L} is the Gauss transform which zeros U(i+1,i).
            //
            // More succinctly,
            //     gamma    := U(i+1,i)/ U(i,i),
            //     L(:,i)   += gamma L(:,i+1),
            //     U(i+1,:) -= gamma U(i,:).
            const F gamma = ups_ip1i / ups_ii;
            A(i+1,i) += gamma;
            Axpy( gamma, lBip1, lBi );
            Axpy( -gamma, uiR, uip1R );
        }
    }
}
void IPM ( const SparseMatrix<Real>& A, const Matrix<Real>& b, Real lambda, Matrix<Real>& x, const qp::affine::Ctrl<Real>& ctrl ) { DEBUG_CSE const Int m = A.Height(); const Int n = A.Width(); const Range<Int> uInd(0,n), vInd(n,2*n), rInd(2*n,2*n+m); SparseMatrix<Real> Q, AHat, G; Matrix<Real> c, h; // Q := | 0 0 0 | // | 0 0 0 | // | 0 0 I | // ============== Zeros( Q, 2*n+m, 2*n+m ); Q.Reserve( m ); for( Int e=0; e<m; ++e ) Q.QueueUpdate( 2*n+e, 2*n+e, Real(1) ); Q.ProcessQueues(); // c := lambda*[1;1;0] // =================== Zeros( c, 2*n+m, 1 ); auto cuv = c( IR(0,2*n), ALL ); Fill( cuv, lambda ); // \hat A := [A, -A, I] // ==================== const Int numEntriesA = A.NumEntries(); Zeros( AHat, m, 2*n+m ); AHat.Reserve( 2*numEntriesA+m ); for( Int e=0; e<numEntriesA; ++e ) { AHat.QueueUpdate( A.Row(e), A.Col(e), A.Value(e) ); AHat.QueueUpdate( A.Row(e), A.Col(e)+n, -A.Value(e) ); } for( Int e=0; e<m; ++e ) AHat.QueueUpdate( e, e+2*n, Real(1) ); AHat.ProcessQueues(); // G := | -I 0 0 | // | 0 -I 0 | // ================ Zeros( G, 2*n, 2*n+m ); G.Reserve( 2*m ); for( Int e=0; e<2*m; ++e ) G.QueueUpdate( e, e, Real(-1) ); G.ProcessQueues(); // h := 0 // ====== Zeros( h, 2*n, 1 ); // Solve the affine QP // =================== Matrix<Real> xHat, y, z, s; QP( Q, AHat, G, b, c, h, xHat, y, z, s, ctrl ); // x := u - v // ========== x = xHat( uInd, ALL ); x -= xHat( vInd, ALL ); }
inline void TwoSidedTrmmUVar5( UnitOrNonUnit diag, Matrix<F>& A, const Matrix<F>& U ) { #ifndef RELEASE PushCallStack("internal::TwoSidedTrmmUVar5"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( U.Height() != U.Width() ) throw std::logic_error("Triangular matrices must be square"); if( A.Height() != U.Height() ) throw std::logic_error("A and U must be the same size"); #endif // Matrix views Matrix<F> ATL, ATR, A00, A01, A02, ABL, ABR, A10, A11, A12, A20, A21, A22; Matrix<F> UTL, UTR, U00, U01, U02, UBL, UBR, U10, U11, U12, U20, U21, U22; // Temporary products Matrix<F> Y01; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); //--------------------------------------------------------------------// // Y01 := U01 A11 Zeros( A01.Height(), A01.Width(), Y01 ); Hemm( RIGHT, UPPER, F(1), A11, U01, F(0), Y01 ); // A01 := U00 A01 Trmm( LEFT, UPPER, NORMAL, diag, F(1), U00, A01 ); // A01 := A01 + 1/2 Y01 Axpy( F(1)/F(2), Y01, A01 ); // A00 := A00 + (U01 A01' + A01 U01') Her2k( UPPER, NORMAL, F(1), U01, A01, F(1), A00 ); // A01 := A01 + 1/2 Y01 Axpy( F(1)/F(2), Y01, A01 ); // A01 := A01 U11' Trmm( RIGHT, UPPER, ADJOINT, diag, F(1), U11, A01 ); // A11 := U11 A11 U11' TwoSidedTrmmUUnb( diag, A11, U11 ); //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, 
/*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void GemmTTA ( Orientation orientationOfA, Orientation orientationOfB, T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::GemmTTA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( orientationOfA == NORMAL || orientationOfB == NORMAL ) throw std::logic_error ("GemmTTA expects A and B to be (Conjugate)Transposed"); if( A.Width() != C.Height() || B.Height() != C.Width() || A.Height() != B.Width() ) { std::ostringstream msg; msg << "Nonconformal GemmTTA: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> BT(g), B0(g), BB(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,STAR,MC > B1_STAR_MC(g); DistMatrix<T,MR, STAR> D1_MR_STAR(g); DistMatrix<T,MR, MC > D1_MR_MC(g); DistMatrix<T> D1(g); B1_STAR_MC.AlignWith( A ); D1_MR_STAR.AlignWith( A ); // Start the algorithm Scale( beta, C ); LockedPartitionDown ( B, BT, BB, 0 ); PartitionRight( C, CL, CR, 0 ); while( BB.Height() > 0 ) { LockedRepartitionDown ( BT, B0, /**/ /**/ B1, BB, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); D1.AlignWith( C1 ); Zeros( C1.Height(), C1.Width(), D1_MR_STAR ); //--------------------------------------------------------------------// B1_STAR_MC = B1; // B1[*,MC] <- B1[MC,MR] // D1[MR,*] := alpha (A[MC,MR])^T (B1[*,MC])^T // = alpha (A^T)[MR,MC] (B1^T)[MC,*] LocalGemm ( orientationOfA, orientationOfB, alpha, A, B1_STAR_MC, T(0), D1_MR_STAR ); // C1[MC,MR] += scattered & transposed D1[MR,*] summed over grid cols D1_MR_MC.SumScatterFrom( D1_MR_STAR ); D1 = D1_MR_MC; Axpy( T(1), D1, C1 ); 
//--------------------------------------------------------------------// D1.FreeAlignments(); SlideLockedPartitionDown ( BT, B0, B1, /**/ /**/ BB, B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void GemmTTB ( Orientation orientationOfA, Orientation orientationOfB, T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::GemmTTB"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( orientationOfA == NORMAL || orientationOfB == NORMAL ) throw std::logic_error ("GemmTTB expects A and B to be (Conjugate)Transposed"); if( A.Width() != C.Height() || B.Height() != C.Width() || A.Height() != B.Width() ) { std::ostringstream msg; msg << "Nonconformal GemmTTB: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<T> AL(g), AR(g), A0(g), A1(g), A2(g); DistMatrix<T> CT(g), C0(g), CB(g), C1(g), C2(g); // Temporary distributions DistMatrix<T,VR, STAR> A1_VR_STAR(g); DistMatrix<T,STAR,MR > A1AdjOrTrans_STAR_MR(g); DistMatrix<T,STAR,MC > D1_STAR_MC(g); DistMatrix<T,MR, MC > D1_MR_MC(g); DistMatrix<T> D1(g); A1_VR_STAR.AlignWith( B ); A1AdjOrTrans_STAR_MR.AlignWith( B ); D1_STAR_MC.AlignWith( B ); // Start the algorithm Scale( beta, C ); LockedPartitionRight( A, AL, AR, 0 ); PartitionDown ( C, CT, CB, 0 ); while( AR.Width() > 0 ) { LockedRepartitionRight ( AL, /**/ AR, A0, /**/ A1, A2 ); RepartitionDown ( CT, C0, /**/ /**/ C1, CB, C2 ); D1.AlignWith( C1 ); Zeros( C1.Height(), C1.Width(), D1_STAR_MC ); //--------------------------------------------------------------------// A1_VR_STAR = A1; if( orientationOfA == ADJOINT ) A1AdjOrTrans_STAR_MR.AdjointFrom( A1_VR_STAR ); else A1AdjOrTrans_STAR_MR.TransposeFrom( A1_VR_STAR ); // D1[*,MC] := alpha (A1[MR,*])^[T/H] (B[MC,MR])^[T/H] // = alpha (A1^[T/H])[*,MR] (B^[T/H])[MR,MC] LocalGemm ( NORMAL, orientationOfB, alpha, A1AdjOrTrans_STAR_MR, B, T(0), 
D1_STAR_MC ); // C1[MC,MR] += scattered & transposed D1[*,MC] summed over grid rows D1_MR_MC.SumScatterFrom( D1_STAR_MC ); D1 = D1_MR_MC; Axpy( T(1), D1, C1 ); //--------------------------------------------------------------------// D1.FreeAlignments(); SlideLockedPartitionRight ( AL, /**/ AR, A0, A1, /**/ A2 ); SlidePartitionDown ( CT, C0, C1, /**/ /**/ CB, C2 ); } #ifndef RELEASE PopCallStack(); #endif }
// Default constructor: initializes the member t from Zeros().  No other
// member is named in the initializer list.
SE2()
: t(Zeros())
{
}
inline void RLHF ( Conjugation conjugation, int offset, const Matrix<Complex<R> >& H, const Matrix<Complex<R> >& t, Matrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLHF"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); #endif typedef Complex<R> C; Matrix<C> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<C> ALeft; Matrix<C> tT, t0, tB, t1, t2; Matrix<C> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; LockedView ( HPan, H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanHeight ); View( ALeft, A, 0, 0, A.Height(), HPanWidth ); Zeros( ALeft.Height(), HPan.Height(), Z ); Zeros( HPan.Height(), HPan.Height(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonal( RIGHT, offset, HPanCopy, C(1) ); Herk( UPPER, NORMAL, C(1), HPanCopy, C(0), SInv ); FixDiagonal( conjugation, t1, SInv ); Gemm( NORMAL, ADJOINT, C(1), ALeft, HPanCopy, C(0), Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, C(1), SInv, Z ); Gemm( NORMAL, NORMAL, C(-1), Z, HPanCopy, C(1), ALeft ); 
//--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); } #ifndef RELEASE PopCallStack(); #endif }
// Constructs from the given SO2 value: the member t is initialized from
// Zeros() and the member R is copy-initialized from R_.
SE2(const SO2<Scalar>& R_)
: t(Zeros()),
  R(R_)
{
}
inline void RLHF ( Conjugation conjugation, int offset, const DistMatrix<Complex<R> >& H, const DistMatrix<Complex<R>,MD,STAR>& t, DistMatrix<Complex<R> >& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLHF"); if( H.Grid() != t.Grid() || t.Grid() != A.Grid() ) throw std::logic_error ("{H,t,A} must be distributed over the same grid"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); if( t.Height() != H.DiagonalLength( offset ) ) throw std::logic_error("t must be the same length as H's offset diag"); if( !t.AlignedWithDiagonal( H, offset ) ) throw std::logic_error("t must be aligned with H's 'offset' diagonal"); #endif typedef Complex<R> C; const Grid& g = H.Grid(); DistMatrix<C> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<C> ALeft(g); DistMatrix<C,MD,STAR> tT(g), t0(g), tB(g), t1(g), t2(g); DistMatrix<C,STAR,VR > HPan_STAR_VR(g); DistMatrix<C,STAR,MR > HPan_STAR_MR(g); DistMatrix<C,STAR,STAR> t1_STAR_STAR(g); DistMatrix<C,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<C,STAR,MC > ZAdj_STAR_MC(g); DistMatrix<C,STAR,VC > ZAdj_STAR_VC(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; LockedView ( HPan, H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2, HPanHeight ); View( ALeft, A, 0, 0, A.Height(), 
HPanWidth ); HPan_STAR_MR.AlignWith( ALeft ); ZAdj_STAR_MC.AlignWith( ALeft ); ZAdj_STAR_VC.AlignWith( ALeft ); Zeros( HPan.Height(), ALeft.Height(), ZAdj_STAR_MC ); Zeros( HPan.Height(), HPan.Height(), SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonal( RIGHT, offset, HPanCopy, C(1) ); HPan_STAR_VR = HPanCopy; Herk ( UPPER, NORMAL, C(1), HPan_STAR_VR.LockedMatrix(), C(0), SInv_STAR_STAR.Matrix() ); SInv_STAR_STAR.SumOverGrid(); t1_STAR_STAR = t1; FixDiagonal( conjugation, t1_STAR_STAR, SInv_STAR_STAR ); HPan_STAR_MR = HPan_STAR_VR; LocalGemm ( NORMAL, ADJOINT, C(1), HPan_STAR_MR, ALeft, C(0), ZAdj_STAR_MC ); ZAdj_STAR_VC.SumScatterFrom( ZAdj_STAR_MC ); LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, C(1), SInv_STAR_STAR, ZAdj_STAR_VC ); ZAdj_STAR_MC = ZAdj_STAR_VC; LocalGemm ( ADJOINT, NORMAL, C(-1), ZAdj_STAR_MC, HPan_STAR_MR, C(1), ALeft ); //--------------------------------------------------------------------// HPan_STAR_MR.FreeAlignments(); ZAdj_STAR_MC.FreeAlignments(); ZAdj_STAR_VC.FreeAlignments(); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void PanelHouseholder( Matrix<F>& A, Matrix<F>& t ) { #ifndef RELEASE CallStackEntry entry("lq::PanelHouseholder"); if( t.Height() != Min(A.Height(),A.Width()) || t.Width() != 1 ) LogicError ("t must be a vector of height equal to the minimum dimension of A"); #endif Matrix<F> ATL, ATR, A00, a01, A02, aTopRow, ABottomPan, ABL, ABR, a10, alpha11, a12, A20, a21, A22; Matrix<F> tT, t0, tB, tau1, t2; Matrix<F> z, aTopRowConj; PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); PartitionDown ( t, tT, tB, 0 ); while( ATL.Height() < A.Height() && ATL.Width() < A.Width() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ a01, A02, /*************/ /**********************/ /**/ a10, /**/ alpha11, a12, ABL, /**/ ABR, A20, /**/ a21, A22, 1 ); RepartitionDown ( tT, t0, /**/ /****/ tau1, tB, t2, 1 ); View1x2( aTopRow, alpha11, a12 ); View1x2( ABottomPan, a21, A22 ); //--------------------------------------------------------------------// // Compute the Householder reflector const F tau = Reflector( alpha11, a12 ); tau1.Set( 0, 0, tau ); // Apply the Householder reflector const F alpha = alpha11.Get(0,0); alpha11.Set(0,0,1); Conjugate( aTopRow, aTopRowConj ); Zeros( z, ABottomPan.Height(), 1 ); Gemv( NORMAL, F(1), ABottomPan, aTopRowConj, F(0), z ); Ger( -Conj(tau), z, aTopRowConj, ABottomPan ); alpha11.Set(0,0,alpha); //--------------------------------------------------------------------// SlidePartitionDown ( tT, t0, tau1, /**/ /****/ tB, t2 ); SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, a01, /**/ A02, /**/ a10, alpha11, /**/ a12, /*************/ /**********************/ ABL, /**/ ABR, A20, a21, /**/ A22 ); } }
inline void RLHF( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE PushCallStack("apply_packed_reflectors::RLHF"); if( offset > 0 || offset < -H.Width() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Width() ) throw std::logic_error ("Width of transforms must equal width of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> ALeft; Matrix<R> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanWidth = H10.Width() + H11.Width(); const int HPanOffset = std::min( H11.Height(), std::max(-offset-H00.Height(),0) ); const int HPanHeight = H11.Height()-HPanOffset; LockedView ( HPan, H, H00.Height()+HPanOffset, 0, HPanHeight, HPanWidth ); View( ALeft, A, 0, 0, A.Height(), HPanWidth ); Zeros( ALeft.Height(), HPan.Height(), Z ); Zeros( HPan.Height(), HPan.Height(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, LOWER, offset, HPanCopy ); SetDiagonal( RIGHT, offset, HPanCopy, R(1) ); Syrk( UPPER, NORMAL, R(1), HPanCopy, R(0), SInv ); HalveMainDiagonal( SInv ); Gemm( NORMAL, TRANSPOSE, R(1), ALeft, HPanCopy, R(0), Z ); Trsm( RIGHT, UPPER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( NORMAL, NORMAL, R(-1), Z, HPanCopy, R(1), ALeft ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void Symv ( UpperOrLower uplo, T alpha, const DistMatrix<T>& A, const DistMatrix<T>& x, T beta, DistMatrix<T>& y, bool conjugate=false ) { #ifndef RELEASE CallStackEntry entry("Symv"); if( A.Grid() != x.Grid() || x.Grid() != y.Grid() ) throw std::logic_error ("{A,x,y} must be distributed over the same grid"); if( A.Height() != A.Width() ) throw std::logic_error("A must be square"); if( ( x.Width() != 1 && x.Height() != 1 ) || ( y.Width() != 1 && y.Height() != 1 ) ) throw std::logic_error("x and y are assumed to be vectors"); const int xLength = ( x.Width()==1 ? x.Height() : x.Width() ); const int yLength = ( y.Width()==1 ? y.Height() : y.Width() ); if( A.Height() != xLength || A.Height() != yLength ) { std::ostringstream msg; msg << "Nonconformal Symv: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " x ~ " << x.Height() << " x " << x.Width() << "\n" << " y ~ " << y.Height() << " x " << y.Width() << "\n"; throw std::logic_error( msg.str() ); } #endif const Grid& g = A.Grid(); if( x.Width() == 1 && y.Width() == 1 ) { // Temporary distributions DistMatrix<T,MC,STAR> x_MC_STAR(g), z_MC_STAR(g); DistMatrix<T,MR,STAR> x_MR_STAR(g), z_MR_STAR(g); DistMatrix<T,MR,MC > z_MR_MC(g); DistMatrix<T> z(g); // Begin the algoritm Scale( beta, y ); x_MC_STAR.AlignWith( A ); x_MR_STAR.AlignWith( A ); z_MC_STAR.AlignWith( A ); z_MR_STAR.AlignWith( A ); z.AlignWith( y ); Zeros( z_MC_STAR, y.Height(), 1 ); Zeros( z_MR_STAR, y.Height(), 1 ); //--------------------------------------------------------------------// x_MC_STAR = x; x_MR_STAR = x_MC_STAR; if( uplo == LOWER ) { internal::LocalSymvColAccumulateL ( alpha, A, x_MC_STAR, x_MR_STAR, z_MC_STAR, z_MR_STAR, conjugate ); } else { internal::LocalSymvColAccumulateU ( alpha, A, x_MC_STAR, x_MR_STAR, z_MC_STAR, z_MR_STAR, conjugate ); } z_MR_MC.SumScatterFrom( z_MR_STAR ); z = z_MR_MC; z.SumScatterUpdate( T(1), z_MC_STAR ); Axpy( T(1), z, y ); //--------------------------------------------------------------------// 
x_MC_STAR.FreeAlignments(); x_MR_STAR.FreeAlignments(); z_MC_STAR.FreeAlignments(); z_MR_STAR.FreeAlignments(); z.FreeAlignments(); } else if( x.Width() == 1 ) { // Temporary distributions DistMatrix<T,MC,STAR> x_MC_STAR(g), z_MC_STAR(g); DistMatrix<T,MR,STAR> x_MR_STAR(g), z_MR_STAR(g); DistMatrix<T,MR,MC > z_MR_MC(g); DistMatrix<T> z(g), zTrans(g); // Begin the algoritm Scale( beta, y ); x_MC_STAR.AlignWith( A ); x_MR_STAR.AlignWith( A ); z_MC_STAR.AlignWith( A ); z_MR_STAR.AlignWith( A ); z.AlignWith( y ); z_MR_MC.AlignWith( y ); Zeros( z_MC_STAR, y.Width(), 1 ); Zeros( z_MR_STAR, y.Width(), 1 ); //--------------------------------------------------------------------// x_MC_STAR = x; x_MR_STAR = x_MC_STAR; if( uplo == LOWER ) { internal::LocalSymvColAccumulateL ( alpha, A, x_MC_STAR, x_MR_STAR, z_MC_STAR, z_MR_STAR, conjugate ); } else { internal::LocalSymvColAccumulateU ( alpha, A, x_MC_STAR, x_MR_STAR, z_MC_STAR, z_MR_STAR, conjugate ); } z.SumScatterFrom( z_MC_STAR ); z_MR_MC = z; z_MR_MC.SumScatterUpdate( T(1), z_MR_STAR ); Transpose( z_MR_MC, zTrans ); Axpy( T(1), zTrans, y ); //--------------------------------------------------------------------// x_MC_STAR.FreeAlignments(); x_MR_STAR.FreeAlignments(); z_MC_STAR.FreeAlignments(); z_MR_STAR.FreeAlignments(); z.FreeAlignments(); z_MR_MC.FreeAlignments(); } else if( y.Width() == 1 ) { // Temporary distributions DistMatrix<T,STAR,MC> x_STAR_MC(g), z_STAR_MC(g); DistMatrix<T,STAR,MR> x_STAR_MR(g), z_STAR_MR(g); DistMatrix<T,MR, MC> z_MR_MC(g); DistMatrix<T> z(g), zTrans(g); // Begin the algoritm Scale( beta, y ); x_STAR_MC.AlignWith( A ); x_STAR_MR.AlignWith( A ); z_STAR_MC.AlignWith( A ); z_STAR_MR.AlignWith( A ); z.AlignWith( y ); z_MR_MC.AlignWith( y ); Zeros( z_STAR_MC, 1, y.Height() ); Zeros( z_STAR_MR, 1, y.Height() ); //--------------------------------------------------------------------// x_STAR_MR = x; x_STAR_MC = x_STAR_MR; if( uplo == LOWER ) { internal::LocalSymvRowAccumulateL ( alpha, A, x_STAR_MC, 
x_STAR_MR, z_STAR_MC, z_STAR_MR, conjugate ); } else { internal::LocalSymvRowAccumulateU ( alpha, A, x_STAR_MC, x_STAR_MR, z_STAR_MC, z_STAR_MR, conjugate ); } z.SumScatterFrom( z_STAR_MR ); z_MR_MC = z; z_MR_MC.SumScatterUpdate( T(1), z_STAR_MC ); Transpose( z_MR_MC, zTrans ); Axpy( T(1), zTrans, y ); //--------------------------------------------------------------------// x_STAR_MC.FreeAlignments(); x_STAR_MR.FreeAlignments(); z_STAR_MC.FreeAlignments(); z_STAR_MR.FreeAlignments(); z.FreeAlignments(); z_MR_MC.FreeAlignments(); } else { // Temporary distributions DistMatrix<T,STAR,MC> x_STAR_MC(g), z_STAR_MC(g); DistMatrix<T,STAR,MR> x_STAR_MR(g), z_STAR_MR(g); DistMatrix<T,MR, MC> z_MR_MC(g); DistMatrix<T> z(g); // Begin the algoritm Scale( beta, y ); x_STAR_MC.AlignWith( A ); x_STAR_MR.AlignWith( A ); z_STAR_MC.AlignWith( A ); z_STAR_MR.AlignWith( A ); z.AlignWith( y ); z_MR_MC.AlignWith( y ); Zeros( z_STAR_MC, 1, y.Width() ); Zeros( z_STAR_MR, 1, y.Width() ); //--------------------------------------------------------------------// x_STAR_MR = x; x_STAR_MC = x_STAR_MR; if( uplo == LOWER ) { internal::LocalSymvRowAccumulateL ( alpha, A, x_STAR_MC, x_STAR_MR, z_STAR_MC, z_STAR_MR, conjugate ); } else { internal::LocalSymvRowAccumulateU ( alpha, A, x_STAR_MC, x_STAR_MR, z_STAR_MC, z_STAR_MR, conjugate ); } z_MR_MC.SumScatterFrom( z_STAR_MC ); z = z_MR_MC; z.SumScatterUpdate( T(1), z_STAR_MR ); Axpy( T(1), z, y ); //--------------------------------------------------------------------// x_STAR_MC.FreeAlignments(); x_STAR_MR.FreeAlignments(); z_STAR_MC.FreeAlignments(); z_STAR_MR.FreeAlignments(); z.FreeAlignments(); z_MR_MC.FreeAlignments(); } }
inline void RUVF ( Conjugation conjugation, Int offset, const DistMatrix<F>& H, const DistMatrix<F,MD,STAR>& t, DistMatrix<F>& A ) { #ifndef RELEASE CallStackEntry cse("apply_packed_reflectors::RUVF"); if( H.Grid() != t.Grid() || t.Grid() != A.Grid() ) LogicError("{H,t,A} must be distributed over the same grid"); // TODO: Proper dimension checks if( t.Height() != H.DiagonalLength(offset) ) LogicError("t must be the same length as H's offset diag"); if( !t.AlignedWithDiagonal( H, offset ) ) LogicError("t must be aligned with H's 'offset' diagonal"); #endif const Grid& g = H.Grid(); DistMatrix<F> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HPanCopy(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<F> ALeft(g); DistMatrix<F,MD,STAR> tT(g), t0(g), tB(g), t1(g), t2(g); DistMatrix<F,VC, STAR> HPan_VC_STAR(g); DistMatrix<F,MR, STAR> HPan_MR_STAR(g); DistMatrix<F,STAR,STAR> t1_STAR_STAR(g); DistMatrix<F,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<F,STAR,MC > ZAdj_STAR_MC(g); DistMatrix<F,STAR,VC > ZAdj_STAR_VC(g); LockedPartitionDownOffsetDiagonal ( offset, H, HTL, HTR, HBL, HBR, 0 ); LockedPartitionDown ( t, tT, tB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); LockedRepartitionDown ( tT, t0, /**/ /**/ t1, tB, t2 ); LockedView2x1( HPan, H01, H11 ); View( ALeft, A, 0, 0, A.Height(), HPan.Height() ); HPan_MR_STAR.AlignWith( ALeft ); ZAdj_STAR_MC.AlignWith( ALeft ); ZAdj_STAR_VC.AlignWith( ALeft ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( UPPER, HPanCopy, 0, RIGHT ); SetDiagonal( HPanCopy, F(1), 0, RIGHT ); HPan_VC_STAR = HPanCopy; Zeros( SInv_STAR_STAR, HPan.Width(), HPan.Width() ); Herk ( UPPER, ADJOINT, F(1), HPan_VC_STAR.LockedMatrix(), F(0), SInv_STAR_STAR.Matrix() ); 
SInv_STAR_STAR.SumOverGrid(); t1_STAR_STAR = t1; FixDiagonal( conjugation, t1_STAR_STAR, SInv_STAR_STAR ); HPan_MR_STAR = HPan_VC_STAR; LocalGemm( ADJOINT, ADJOINT, F(1), HPan_MR_STAR, ALeft, ZAdj_STAR_MC ); ZAdj_STAR_VC.SumScatterFrom( ZAdj_STAR_MC ); LocalTrsm ( LEFT, UPPER, ADJOINT, NON_UNIT, F(1), SInv_STAR_STAR, ZAdj_STAR_VC ); ZAdj_STAR_MC = ZAdj_STAR_VC; LocalGemm ( ADJOINT, ADJOINT, F(-1), ZAdj_STAR_MC, HPan_MR_STAR, F(1), ALeft ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlideLockedPartitionDown ( tT, t0, t1, /**/ /**/ tB, t2 ); } }
inline void TrsmLLTSmall ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F,VC,STAR>& L, DistMatrix<F,VC,STAR>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLLTSmall"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( orientation == NORMAL ) throw std::logic_error("TrsmLLT expects a (Conjugate)Transpose option"); if( L.Height() != L.Width() || L.Height() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrsmLLT: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } if( L.ColAlignment() != X.ColAlignment() ) throw std::logic_error("L and X must be aligned"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<F,VC,STAR> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F,VC,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,STAR> Z1_STAR_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); //--------------------------------------------------------------------// // X1 -= L21' X2 Zeros( X1.Height(), X1.Width(), Z1_STAR_STAR ); LocalGemm( orientation, NORMAL, F(-1), L21, X2, F(0), Z1_STAR_STAR ); AddInLocalData( X1, Z1_STAR_STAR ); Z1_STAR_STAR.SumOverGrid(); // X1 := L11^-1 X1 L11_STAR_STAR = L11; LocalTrsm ( LEFT, LOWER, orientation, diag, F(1), L11_STAR_STAR, Z1_STAR_STAR, checkIfSingular ); X1 = Z1_STAR_STAR; 
//--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, /**/ LBR, L20, /**/ L21, L22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
// Reinitializes A through Zeros( A, m, n ) and then passes it to Fill with
// the value T(1).
void Ones( DistSparseMatrix<T>& A, Int m, Int n )
{
    EL_DEBUG_CSE

    // Zeros is called first with the requested dimensions; Fill then
    // receives the value one
    Zeros( A, m, n );

    const T one = T(1);
    Fill( A, one );
}
inline void TwoSidedTrsmUVar1 ( UnitOrNonUnit diag, DistMatrix<F>& A, const DistMatrix<F>& U ) { #ifndef RELEASE CallStackEntry entry("internal::TwoSidedTrsmUVar1"); if( A.Height() != A.Width() ) LogicError("A must be square"); if( U.Height() != U.Width() ) LogicError("Triangular matrices must be square"); if( A.Height() != U.Height() ) LogicError("A and U must be the same size"); #endif const Grid& g = A.Grid(); // Matrix views DistMatrix<F> ATL(g), ATR(g), A00(g), A01(g), A02(g), ABL(g), ABR(g), A10(g), A11(g), A12(g), A20(g), A21(g), A22(g); DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); // Temporary distributions DistMatrix<F,STAR,STAR> A11_STAR_STAR(g); DistMatrix<F,VC, STAR> A01_VC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,VC, STAR> U01_VC_STAR(g); DistMatrix<F,VR, STAR> U01_VR_STAR(g); DistMatrix<F,STAR,MR > U01Adj_STAR_MR(g); DistMatrix<F,STAR,STAR> X11_STAR_STAR(g); DistMatrix<F,MR, MC > Z01_MR_MC(g); DistMatrix<F,MC, STAR> Z01_MC_STAR(g); DistMatrix<F,MR, STAR> Z01_MR_STAR(g); DistMatrix<F> Y01(g); PartitionDownDiagonal ( A, ATL, ATR, ABL, ABR, 0 ); LockedPartitionDownDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); while( ATL.Height() < A.Height() ) { RepartitionDownDiagonal ( ATL, /**/ ATR, A00, /**/ A01, A02, /*************/ /******************/ /**/ A10, /**/ A11, A12, ABL, /**/ ABR, A20, /**/ A21, A22 ); LockedRepartitionDownDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); A01_VC_STAR.AlignWith( A01 ); U01_MC_STAR.AlignWith( A00 ); U01_VR_STAR.AlignWith( A00 ); U01_VC_STAR.AlignWith( A00 ); U01Adj_STAR_MR.AlignWith( A00 ); Y01.AlignWith( A01 ); Z01_MR_MC.AlignWith( A01 ); Z01_MC_STAR.AlignWith( A00 ); Z01_MR_STAR.AlignWith( A00 ); //--------------------------------------------------------------------// // Y01 := A00 U01 U01_MC_STAR = 
U01; U01_VR_STAR = U01_MC_STAR; U01Adj_STAR_MR.AdjointFrom( U01_VR_STAR ); Zeros( Z01_MC_STAR, A01.Height(), A01.Width() ); Zeros( Z01_MR_STAR, A01.Height(), A01.Width() ); LocalSymmetricAccumulateLU ( ADJOINT, F(1), A00, U01_MC_STAR, U01Adj_STAR_MR, Z01_MC_STAR, Z01_MR_STAR ); Z01_MR_MC.SumScatterFrom( Z01_MR_STAR ); Y01 = Z01_MR_MC; Y01.SumScatterUpdate( F(1), Z01_MC_STAR ); // A01 := inv(U00)' A01 // // This is the bottleneck because A01 only has blocksize columns Trsm( LEFT, UPPER, ADJOINT, diag, F(1), U00, A01 ); // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A11 := A11 - (U01' A01 + A01' U01) A01_VC_STAR = A01; U01_VC_STAR = U01_MC_STAR; Zeros( X11_STAR_STAR, A11.Height(), A11.Width() ); Her2k ( UPPER, ADJOINT, F(-1), A01_VC_STAR.Matrix(), U01_VC_STAR.Matrix(), F(0), X11_STAR_STAR.Matrix() ); A11.SumScatterUpdate( F(1), X11_STAR_STAR ); // A11 := inv(U11)' A11 inv(U11) A11_STAR_STAR = A11; U11_STAR_STAR = U11; LocalTwoSidedTrsm( UPPER, diag, A11_STAR_STAR, U11_STAR_STAR ); A11 = A11_STAR_STAR; // A01 := A01 - 1/2 Y01 Axpy( F(-1)/F(2), Y01, A01 ); // A01 := A01 inv(U11) A01_VC_STAR = A01; LocalTrsm ( RIGHT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, A01_VC_STAR ); A01 = A01_VC_STAR; //--------------------------------------------------------------------// SlidePartitionDownDiagonal ( ATL, /**/ ATR, A00, A01, /**/ A02, /**/ A10, A11, /**/ A12, /*************/ /******************/ ABL, /**/ ABR, A20, A21, /**/ A22 ); SlideLockedPartitionDownDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); } }
inline void HemmLUA ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::HemmLUA"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); #endif const Grid& g = A.Grid(); DistMatrix<T> BL(g), BR(g), B0(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C0(g), C1(g), C2(g); DistMatrix<T,MC,STAR> B1_MC_STAR(g); DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,MR> B1Adj_STAR_MR(g); DistMatrix<T,MC,STAR> Z1_MC_STAR(g); DistMatrix<T,MR,STAR> Z1_MR_STAR(g); DistMatrix<T,MR,MC > Z1_MR_MC(g); DistMatrix<T> Z1(g); B1_MC_STAR.AlignWith( A ); B1_VR_STAR.AlignWith( A ); B1Adj_STAR_MR.AlignWith( A ); Z1_MC_STAR.AlignWith( A ); Z1_MR_STAR.AlignWith( A ); Scale( beta, C ); LockedPartitionRight ( B, BL, BR, 0 ); PartitionRight ( C, CL, CR, 0 ); while( CL.Width() < C.Width() ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); Z1.AlignWith( C1 ); Zeros( C1.Height(), C1.Width(), Z1_MC_STAR ); Zeros( C1.Height(), C1.Width(), Z1_MR_STAR ); //--------------------------------------------------------------------// B1_MC_STAR = B1; B1_VR_STAR = B1_MC_STAR; B1Adj_STAR_MR.AdjointFrom( B1_VR_STAR ); LocalSymmetricAccumulateLU ( ADJOINT, alpha, A, B1_MC_STAR, B1Adj_STAR_MR, Z1_MC_STAR, Z1_MR_STAR ); Z1_MR_MC.SumScatterFrom( Z1_MR_STAR ); Z1 = Z1_MR_MC; Z1.SumScatterUpdate( T(1), Z1_MC_STAR ); Axpy( T(1), Z1, C1 ); //--------------------------------------------------------------------// Z1.FreeAlignments(); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void ApplyPackedReflectorsLUVF ( int offset, const DistMatrix<R>& H, DistMatrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUVF"); if( H.Grid() != A.Grid() ) throw std::logic_error("{H,A} must be distributed over the same grid"); if( offset < 0 || offset > H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); #endif const Grid& g = H.Grid(); DistMatrix<R> HTL(g), HTR(g), H00(g), H01(g), H02(g), HPan(g), HBL(g), HBR(g), H10(g), H11(g), H12(g), H20(g), H21(g), H22(g); DistMatrix<R> AT(g), A0(g), ATop(g), AB(g), A1(g), A2(g); DistMatrix<R> HPanCopy(g); DistMatrix<R,VC, STAR> HPan_VC_STAR(g); DistMatrix<R,MC, STAR> HPan_MC_STAR(g); DistMatrix<R,STAR,STAR> SInv_STAR_STAR(g); DistMatrix<R,STAR,MR > Z_STAR_MR(g); DistMatrix<R,STAR,VR > Z_STAR_VR(g); LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanHeight = H01.Height() + H11.Height(); const int HPanOffset = std::min( H11.Width(), std::max(offset-H00.Width(),0) ); const int HPanWidth = H11.Width()-HPanOffset; HPan.LockedView( H, 0, H00.Width()+HPanOffset, HPanHeight, HPanWidth ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); ATop.View2x1( A0, A1 ); HPan_MC_STAR.AlignWith( ATop ); Z_STAR_MR.AlignWith( ATop ); Z_STAR_VR.AlignWith( ATop ); Zeros( HPan.Width(), ATop.Width(), Z_STAR_MR ); Zeros( HPan.Width(), HPan.Width(), SInv_STAR_STAR ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, UPPER, offset, HPanCopy ); SetDiagonalToOne( RIGHT, offset, HPanCopy ); HPan_VC_STAR = HPanCopy; Syrk ( LOWER, TRANSPOSE, R(1), 
HPan_VC_STAR.LockedLocalMatrix(), R(0), SInv_STAR_STAR.LocalMatrix() ); SInv_STAR_STAR.SumOverGrid(); HalveMainDiagonal( SInv_STAR_STAR ); HPan_MC_STAR = HPanCopy; LocalGemm ( TRANSPOSE, NORMAL, R(1), HPan_MC_STAR, ATop, R(0), Z_STAR_MR ); Z_STAR_VR.SumScatterFrom( Z_STAR_MR ); LocalTrsm ( LEFT, LOWER, NORMAL, NON_UNIT, R(1), SInv_STAR_STAR, Z_STAR_VR ); Z_STAR_MR = Z_STAR_VR; LocalGemm( NORMAL, NORMAL, R(-1), HPan_MC_STAR, Z_STAR_MR, R(1), ATop ); //--------------------------------------------------------------------// HPan_MC_STAR.FreeAlignments(); Z_STAR_MR.FreeAlignments(); Z_STAR_VR.FreeAlignments(); SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } #ifndef RELEASE PopCallStack(); #endif }
void Ridge ( Orientation orientation, const Matrix<Field>& A, const Matrix<Field>& B, Base<Field> gamma, Matrix<Field>& X, RidgeAlg alg ) { EL_DEBUG_CSE const bool normal = ( orientation==NORMAL ); const Int m = ( normal ? A.Height() : A.Width() ); const Int n = ( normal ? A.Width() : A.Height() ); if( orientation == TRANSPOSE && IsComplex<Field>::value ) LogicError("Transpose version of complex Ridge not yet supported"); if( m >= n ) { Matrix<Field> Z; if( alg == RIDGE_CHOLESKY ) { if( orientation == NORMAL ) Herk( LOWER, ADJOINT, Base<Field>(1), A, Z ); else Herk( LOWER, NORMAL, Base<Field>(1), A, Z ); ShiftDiagonal( Z, Field(gamma*gamma) ); Cholesky( LOWER, Z ); if( orientation == NORMAL ) Gemm( ADJOINT, NORMAL, Field(1), A, B, X ); else Gemm( NORMAL, NORMAL, Field(1), A, B, X ); cholesky::SolveAfter( LOWER, NORMAL, Z, X ); } else if( alg == RIDGE_QR ) { Zeros( Z, m+n, n ); auto ZT = Z( IR(0,m), IR(0,n) ); auto ZB = Z( IR(m,m+n), IR(0,n) ); if( orientation == NORMAL ) ZT = A; else Adjoint( A, ZT ); FillDiagonal( ZB, Field(gamma) ); // NOTE: This QR factorization could exploit the upper-triangular // structure of the diagonal matrix ZB qr::ExplicitTriang( Z ); if( orientation == NORMAL ) Gemm( ADJOINT, NORMAL, Field(1), A, B, X ); else Gemm( NORMAL, NORMAL, Field(1), A, B, X ); cholesky::SolveAfter( LOWER, NORMAL, Z, X ); } else { Matrix<Field> U, V; Matrix<Base<Field>> s; if( orientation == NORMAL ) { SVDCtrl<Base<Field>> ctrl; ctrl.overwrite = false; SVD( A, U, s, V, ctrl ); } else { Matrix<Field> AAdj; Adjoint( A, AAdj ); SVDCtrl<Base<Field>> ctrl; ctrl.overwrite = true; SVD( AAdj, U, s, V, ctrl ); } auto sigmaMap = [=]( const Base<Field>& sigma ) { return sigma / (sigma*sigma + gamma*gamma); }; EntrywiseMap( s, MakeFunction(sigmaMap) ); Gemm( ADJOINT, NORMAL, Field(1), U, B, X ); DiagonalScale( LEFT, NORMAL, s, X ); U = X; Gemm( NORMAL, NORMAL, Field(1), V, U, X ); } } else { LogicError("This case not yet supported"); } }
inline void ApplyPackedReflectorsLUVF ( int offset, const Matrix<R>& H, Matrix<R>& A ) { #ifndef RELEASE PushCallStack("internal::ApplyPackedReflectorsLUVF"); if( offset < 0 || offset > H.Height() ) throw std::logic_error("Transforms out of bounds"); if( H.Width() != A.Height() ) throw std::logic_error ("Width of transforms must equal height of target matrix"); #endif Matrix<R> HTL, HTR, H00, H01, H02, HPan, HPanCopy, HBL, HBR, H10, H11, H12, H20, H21, H22; Matrix<R> AT, A0, ATop, AB, A1, A2; Matrix<R> SInv, Z; LockedPartitionDownDiagonal ( H, HTL, HTR, HBL, HBR, 0 ); PartitionDown ( A, AT, AB, 0 ); while( HTL.Height() < H.Height() && HTL.Width() < H.Width() ) { LockedRepartitionDownDiagonal ( HTL, /**/ HTR, H00, /**/ H01, H02, /*************/ /******************/ /**/ H10, /**/ H11, H12, HBL, /**/ HBR, H20, /**/ H21, H22 ); const int HPanHeight = H01.Height() + H11.Height(); const int HPanOffset = std::min( H11.Width(), std::max(offset-H00.Width(),0) ); const int HPanWidth = H11.Width()-HPanOffset; HPan.LockedView( H, 0, H00.Width()+HPanOffset, HPanHeight, HPanWidth ); RepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); ATop.View2x1( A0, A1 ); Zeros( HPan.Width(), ATop.Width(), Z ); Zeros( HPan.Width(), HPan.Width(), SInv ); //--------------------------------------------------------------------// HPanCopy = HPan; MakeTrapezoidal( RIGHT, UPPER, offset, HPanCopy ); SetDiagonalToOne( RIGHT, offset, HPanCopy ); Syrk( LOWER, TRANSPOSE, R(1), HPanCopy, R(0), SInv ); HalveMainDiagonal( SInv ); Gemm( TRANSPOSE, NORMAL, R(1), HPanCopy, ATop, R(0), Z ); Trsm( LEFT, LOWER, NORMAL, NON_UNIT, R(1), SInv, Z ); Gemm( NORMAL, NORMAL, R(-1), HPanCopy, Z, R(1), ATop ); //--------------------------------------------------------------------// SlideLockedPartitionDownDiagonal ( HTL, /**/ HTR, H00, H01, /**/ H02, /**/ H10, H11, /**/ H12, /*************/ /******************/ HBL, /**/ HBR, H20, H21, /**/ H22 ); SlidePartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); } #ifndef RELEASE 
PopCallStack(); #endif }
inline void GemmNNDot ( T alpha, const DistMatrix<T>& A, const DistMatrix<T>& B, T beta, DistMatrix<T>& C ) { #ifndef RELEASE PushCallStack("internal::GemmNNDot"); if( A.Grid() != B.Grid() || B.Grid() != C.Grid() ) throw std::logic_error ("{A,B,C} must be distributed over the same grid"); if( A.Height() != C.Height() || B.Width() != C.Width() || A.Width() != B.Height() ) { std::ostringstream msg; msg << "Nonconformal GemmNNDot: \n" << " A ~ " << A.Height() << " x " << A.Width() << "\n" << " B ~ " << B.Height() << " x " << B.Width() << "\n" << " C ~ " << C.Height() << " x " << C.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = A.Grid(); if( A.Height() > B.Width() ) { // Matrix views DistMatrix<T> AT(g), AB(g), A0(g), A1(g), A2(g); DistMatrix<T> BL(g), B0(g), BR(g), B1(g), B2(g); DistMatrix<T> CT(g), C0(g), C1L(g), C1R(g), CB(g), C1(g), C10(g), C11(g), C12(g), C2(g); // Temporary distributions DistMatrix<T,STAR,VC> A1_STAR_VC(g); DistMatrix<T,VC,STAR> B1_VC_STAR(g); DistMatrix<T,STAR,STAR> C11_STAR_STAR(g); // Star the algorithm Scale( beta, C ); LockedPartitionDown ( A, AT, AB, 0 ); PartitionDown ( C, CT, CB, 0 ); while( AB.Height() > 0 ) { LockedRepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); RepartitionDown ( CT, C0, /**/ /**/ C1, CB, C2 ); A1_STAR_VC = A1; B1_VC_STAR.AlignWith( A1_STAR_VC ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C1, C1L, C1R, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( C1L, /**/ C1R, C10, /**/ C11, C12 ); Zeros( C11.Height(), C11.Width(), C11_STAR_STAR ); //------------------------------------------------------------// B1_VC_STAR = B1; LocalGemm ( NORMAL, NORMAL, alpha, A1_STAR_VC, B1_VC_STAR, T(0), C11_STAR_STAR ); C11.SumScatterUpdate( T(1), C11_STAR_STAR ); //------------------------------------------------------------// SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( C1L, /**/ C1R, C10, C11, 
/**/ C12 ); } B1_VC_STAR.FreeAlignments(); SlideLockedPartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); SlidePartitionDown ( CT, C0, C1, /**/ /**/ CB, C2 ); } } else { // Matrix views DistMatrix<T> AT(g), AB(g), A0(g), A1(g), A2(g); DistMatrix<T> BL(g), B0(g), BR(g), B1(g), B2(g); DistMatrix<T> CL(g), CR(g), C1T(g), C01(g), C0(g), C1(g), C2(g), C1B(g), C11(g), C21(g); // Temporary distributions DistMatrix<T,STAR,VR> A1_STAR_VR(g); DistMatrix<T,VR,STAR> B1_VR_STAR(g); DistMatrix<T,STAR,STAR> C11_STAR_STAR(g); // Star the algorithm Scale( beta, C ); LockedPartitionRight( B, BL, BR, 0 ); PartitionRight( C, CL, CR, 0 ); while( BR.Width() > 0 ) { LockedRepartitionRight ( BL, /**/ BR, B0, /**/ B1, B2 ); RepartitionRight ( CL, /**/ CR, C0, /**/ C1, C2 ); B1_VR_STAR = B1; A1_STAR_VR.AlignWith( B1_VR_STAR ); LockedPartitionDown ( A, AT, AB, 0 ); PartitionDown ( C1, C1T, C1B, 0 ); while( AB.Height() > 0 ) { LockedRepartitionDown ( AT, A0, /**/ /**/ A1, AB, A2 ); RepartitionDown ( C1T, C01, /***/ /***/ C11, C1B, C21 ); Zeros( C11.Height(), C11.Width(), C11_STAR_STAR ); //------------------------------------------------------------// A1_STAR_VR = A1; LocalGemm ( NORMAL, NORMAL, alpha, A1_STAR_VR, B1_VR_STAR, T(0), C11_STAR_STAR ); C11.SumScatterUpdate( T(1), C11_STAR_STAR ); //------------------------------------------------------------// SlideLockedPartitionDown ( AT, A0, A1, /**/ /**/ AB, A2 ); SlidePartitionDown ( C1T, C01, C11, /***/ /***/ C1B, C21 ); } A1_STAR_VR.FreeAlignments(); SlideLockedPartitionRight ( BL, /**/ BR, B0, B1, /**/ B2 ); SlidePartitionRight ( CL, /**/ CR, C0, C1, /**/ C2 ); } } #ifndef RELEASE PopCallStack(); #endif }