int QDWH ( DistMatrix<F>& A, typename Base<F>::type lowerBound, typename Base<F>::type upperBound ) { #ifndef RELEASE PushCallStack("QDWH"); #endif typedef typename Base<F>::type R; const Grid& g = A.Grid(); const int height = A.Height(); const int width = A.Width(); const R oneHalf = R(1)/R(2); const R oneThird = R(1)/R(3); if( height < width ) throw std::logic_error("Height cannot be less than width"); const R epsilon = lapack::MachineEpsilon<R>(); const R tol = 5*epsilon; const R cubeRootTol = Pow(tol,oneThird); // Form the first iterate Scale( 1/upperBound, A ); int numIts=0; R frobNormADiff; DistMatrix<F> ALast( g ); DistMatrix<F> Q( height+width, width, g ); DistMatrix<F> QT(g), QB(g); PartitionDown( Q, QT, QB, height ); DistMatrix<F> C( g ); DistMatrix<F> ATemp( g ); do { ++numIts; ALast = A; R L2; Complex<R> dd, sqd; if( Abs(1-lowerBound) < tol ) { L2 = 1; dd = 0; sqd = 1; } else { L2 = lowerBound*lowerBound; dd = Pow( 4*(1-L2)/(L2*L2), oneThird ); sqd = Sqrt( 1+dd ); } const Complex<R> arg = 8 - 4*dd + 8*(2-L2)/(L2*sqd); const R a = (sqd + Sqrt( arg )/2).real; const R b = (a-1)*(a-1)/4; const R c = a+b-1; const Complex<R> alpha = a-b/c; const Complex<R> beta = b/c; lowerBound = lowerBound*(a+b*L2)/(1+c*L2); if( c > 100 ) { // // The standard QR-based algorithm // QT = A; Scale( Sqrt(c), QT ); MakeIdentity( QB ); ExplicitQR( Q ); Gemm( NORMAL, ADJOINT, alpha/Sqrt(c), QT, QB, beta, A ); } else { // // Use faster Cholesky-based algorithm since A is well-conditioned // Identity( width, width, C ); Herk( LOWER, ADJOINT, F(c), A, F(1), C ); Cholesky( LOWER, C ); ATemp = A; Trsm( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), C, ATemp ); Trsm( RIGHT, LOWER, NORMAL, NON_UNIT, F(1), C, ATemp ); Scale( beta, A ); Axpy( alpha, ATemp, A ); } Axpy( F(-1), A, ALast ); frobNormADiff = Norm( ALast, FROBENIUS_NORM ); } while( frobNormADiff > cubeRootTol || Abs(1-lowerBound) > tol ); #ifndef RELEASE PopCallStack(); #endif return numIts; }
// Inner QDWH driver: iterates in place on an [MC,MR] read/write proxy of APre
// until the convergence test passes or ctrl.maxIts iterations have run.
//
// Parameters:
//   APre      - matrix whose proxy is overwritten by the iterates; must have
//               at least as many rows as columns.
//   sMinUpper - divided by sqrt(width) to seed the running bound
//               (presumably an upper bound on the smallest singular value;
//               confirm against the caller).
//   ctrl      - supplies maxIts and the colPiv flag forwarded to the QR step.
//
// Returns a QDWHInfo whose numIts, numQRIts and numCholIts counters record how
// many iterations ran in total and in each branch.
QDWHInfo QDWHInner
( AbstractDistMatrix<F>& APre, Base<F> sMinUpper, const QDWHCtrl& ctrl )
{
    EL_DEBUG_CSE

    using Real = Base<F>;
    using Cpx = Complex<Real>;

    DistMatrixReadWriteProxy<F,F,MC,MR> AProx( APre );
    auto& A = AProx.Get();

    const Int numRows = A.Height();
    const Int numCols = A.Width();
    if( numRows < numCols )
        LogicError("Height cannot be less than width");

    // Stopping thresholds.
    const Real oneThird = Real(1)/Real(3);
    const Real eps = limits::Epsilon<Real>();
    const Real tol = 5*eps;
    const Real cubeRootTol = Pow(tol,oneThird);

    // Running bound that the iteration drives towards one.
    Real ell = sMinUpper / Sqrt(Real(numCols));

    QRCtrl<Base<F>> qrCtrl;
    qrCtrl.colPiv = ctrl.colPiv;

    // Workspaces: the stacked matrix for the QR branch (with views of its top
    // numRows rows and the remaining rows), plus temporaries for the
    // Cholesky branch and the previous iterate.
    const Grid& grid = A.Grid();
    DistMatrix<F> Q( numRows+numCols, numCols, grid );
    auto QTop = Q( IR(0,numRows  ), ALL );
    auto QBot = Q( IR(numRows,END), ALL );
    DistMatrix<F> ALast(grid), ATemp(grid), C(grid);

    QDWHInfo info;
    bool converged = false;
    while( !converged && info.numIts < ctrl.maxIts )
    {
        ALast = A;

        // Weights for this iteration, derived from the current bound.
        Real ellSquared;
        Cpx cubeRootTerm, sqrtTerm;
        if( Abs(1-ell) < tol )
        {
            ellSquared = 1;
            cubeRootTerm = 0;
            sqrtTerm = 1;
        }
        else
        {
            ellSquared = ell*ell;
            cubeRootTerm =
              Pow( 4*(1-ellSquared)/(ellSquared*ellSquared), oneThird );
            sqrtTerm = Sqrt( Real(1)+cubeRootTerm );
        }
        const Cpx arg =
          Real(8) - Real(4)*cubeRootTerm +
          Real(8)*(2-ellSquared)/(ellSquared*sqrtTerm);
        const Real a = (sqrtTerm + Sqrt(arg)/Real(2)).real();
        const Real b = (a-1)*(a-1)/4;
        const Real c = a+b-1;
        const Real alpha = a-b/c;
        const Real beta = b/c;

        ell = ell*(a+b*ellSquared)/(1+c*ellSquared);

        if( c > 100 )
        {
            // QR-based update on the stacked matrix.
            QTop = A;
            QTop *= Sqrt(c);
            MakeIdentity( QBot );
            qr::ExplicitUnitary( Q, true, qrCtrl );
            Gemm
            ( NORMAL, ADJOINT,
              F(alpha/Sqrt(c)), QTop, QBot, F(beta), A );
            ++info.numQRIts;
        }
        else
        {
            // Cholesky-based update, used once c is small enough
            // (A is then considered well-conditioned).
            Identity( C, numCols, numCols );
            Herk( LOWER, ADJOINT, c, A, Real(1), C );
            Cholesky( LOWER, C );
            ATemp = A;
            Trsm( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), C, ATemp );
            Trsm( RIGHT, LOWER, NORMAL, NON_UNIT, F(1), C, ATemp );
            A *= beta;
            Axpy( alpha, ATemp, A );
            ++info.numCholIts;
        }
        ++info.numIts;

        // Measure the step taken and decide whether to stop.
        ALast -= A;
        const Real stepNorm = FrobeniusNorm( ALast );
        converged = ( stepNorm <= cubeRootTol && Abs(1-ell) <= tol );
    }

    return info;
}
int Halley ( DistMatrix<F>& A, typename Base<F>::type upperBound ) { #ifndef RELEASE PushCallStack("Halley"); #endif typedef typename Base<F>::type R; const Grid& g = A.Grid(); const int height = A.Height(); const int width = A.Width(); const R oneHalf = R(1)/R(2); const R oneThird = R(1)/R(3); if( height < width ) throw std::logic_error("Height cannot be less than width"); const R epsilon = lapack::MachineEpsilon<R>(); const R tol = 5*epsilon; const R cubeRootTol = Pow(tol,oneThird); const R a = 3; const R b = 1; const R c = 3; // Form the first iterate Scale( 1/upperBound, A ); int numIts=0; R frobNormADiff; DistMatrix<F> ALast( g ); DistMatrix<F> Q( height+width, width, g ); DistMatrix<F> QT(g), QB(g); PartitionDown( Q, QT, QB, height ); DistMatrix<F> C( g ); DistMatrix<F> ATemp( g ); do { if( numIts > 100 ) throw std::runtime_error("Halley iteration did not converge"); ++numIts; ALast = A; // TODO: Come up with a test for when we can use the Cholesky approach if( true ) { // // The standard QR-based algorithm // QT = A; Scale( Sqrt(c), QT ); MakeIdentity( QB ); ExplicitQR( Q ); Gemm( NORMAL, ADJOINT, F(a-b/c)/Sqrt(c), QT, QB, F(b/c), A ); } else { // // Use faster Cholesky-based algorithm since A is well-conditioned // Identity( width, width, C ); Herk( LOWER, ADJOINT, F(c), A, F(1), C ); Cholesky( LOWER, C ); ATemp = A; Trsm( RIGHT, LOWER, ADJOINT, NON_UNIT, F(1), C, ATemp ); Trsm( RIGHT, LOWER, NORMAL, NON_UNIT, F(1), C, ATemp ); Scale( b/c, A ); Axpy( a-b/c, ATemp, A ); } Axpy( F(-1), A, ALast ); frobNormADiff = Norm( ALast, FROBENIUS_NORM ); } while( frobNormADiff > cubeRootTol ); #ifndef RELEASE PopCallStack(); #endif return numIts; }