Example #1
void LBFGSSolver::solve(const Function& function,
                        SolverResults* results) const
{
	double global_start_time = wall_time();

	// Dimension of problem.
	size_t n = function.get_number_of_scalars();

	if (n == 0) {
		results->exit_condition = SolverResults::FUNCTION_TOLERANCE;
		return;
	}

	// Current and previous function values, gradient norms, and step length.
	double fval   = std::numeric_limits<double>::quiet_NaN();
	double fprev  = std::numeric_limits<double>::quiet_NaN();
	double normg0 = std::numeric_limits<double>::quiet_NaN();
	double normg  = std::numeric_limits<double>::quiet_NaN();
	double normdx = std::numeric_limits<double>::quiet_NaN();

	Eigen::VectorXd x, g;

	// Copy the user state to the current point.
	function.copy_user_to_global(&x);
	Eigen::VectorXd x2(n);

	// L-BFGS history.
	std::vector<Eigen::VectorXd>  s_data(this->lbfgs_history_size),
	                              y_data(this->lbfgs_history_size);
	std::vector<Eigen::VectorXd*> s(this->lbfgs_history_size),
	                              y(this->lbfgs_history_size);
	for (int h = 0; h < this->lbfgs_history_size; ++h) {
		s_data[h].resize(function.get_number_of_scalars());
		s_data[h].setZero();
		y_data[h].resize(function.get_number_of_scalars());
		y_data[h].setZero();
		s[h] = &s_data[h];
		y[h] = &y_data[h];
	}

	Eigen::VectorXd rho(this->lbfgs_history_size);
	rho.setZero();

	Eigen::VectorXd alpha(this->lbfgs_history_size);
	alpha.setZero();
	Eigen::VectorXd q(n);
	Eigen::VectorXd r(n);

	// Needed from the previous iteration.
	Eigen::VectorXd x_prev(n), s_tmp(n), y_tmp(n);

	CheckExitConditionsCache exit_condition_cache;

	//
	// START MAIN ITERATION
	//
	results->startup_time   += wall_time() - global_start_time;
	results->exit_condition = SolverResults::INTERNAL_ERROR;
	int iter = 0;
	bool last_iteration_successful = true;
	int number_of_line_search_failures = 0;
	int number_of_restarts = 0;
	while (true) {

		//
		// Evaluate function and derivatives.
		//
		double start_time = wall_time();
		// y[0] should contain the difference between the gradient
		// in this iteration and the gradient from the previous.
		// Therefore, update y before and after evaluating the
		// function.
		if (iter > 0) {
			y_tmp = -g;
		}
		fval = function.evaluate(x, &g);

		normg = std::max(g.maxCoeff(), -g.minCoeff());
		if (iter == 0) {
			normg0 = normg;
		}
		results->function_evaluation_time += wall_time() - start_time;

		//
		// Update history
		//
		start_time = wall_time();

		if (iter > 0 && last_iteration_successful) {
			s_tmp = x - x_prev;
			y_tmp += g;

			double sTy = s_tmp.dot(y_tmp);
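			// Curvature condition: only keep the (s, y) pair if s'y > 0,
			// so that the inverse-Hessian approximation stays positive
			// definite.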
			if (sTy > 1e-16) {
				// Shift all pointers one step back, discarding the oldest one.
				Eigen::VectorXd* sh = s[this->lbfgs_history_size - 1];
				Eigen::VectorXd* yh = y[this->lbfgs_history_size - 1];
				for (int h = this->lbfgs_history_size - 1; h >= 1; --h) {
					s[h]   = s[h - 1];
					y[h]   = y[h - 1];
					rho[h] = rho[h - 1];
				}
				// Reuse the storage of the discarded data for the new data.
				s[0] = sh;
				y[0] = yh;

				*y[0] = y_tmp;
				*s[0] = s_tmp;
				rho[0] = 1.0 / sTy;
			}
		}

		results->lbfgs_update_time += wall_time() - start_time;

		//
		// Test stopping criteria
		//
		start_time = wall_time();
		if (iter > 1 && this->check_exit_conditions(fval, fprev, normg,
		                                            normg0, x.norm(), normdx,
		                                            last_iteration_successful, 
		                                            &exit_condition_cache, results)) {
			break;
		}
		if (iter >= this->maximum_iterations) {
			results->exit_condition = SolverResults::NO_CONVERGENCE;
			break;
		}

		if (this->callback_function) {
			CallbackInformation information;
			information.objective_value = fval;
			information.x = &x;
			information.g = &g;

			if (!callback_function(information)) {
				results->exit_condition = SolverResults::USER_ABORT;
				break;
			}
		}

		results->stopping_criteria_time += wall_time() - start_time;

		//
		// Compute search direction via L-BFGS two-loop recursion.
		//
		start_time = wall_time();
		bool should_restart = false;

		double H0 = 1.0;
		if (iter > 0) {
			// If the gradient is identical two iterations in a row,
			// y will be the zero vector and H0 will be NaN. In this
			// case the line search will fail and L-BFGS will be restarted
			// with a steepest descent step.
			H0 = s[0]->dot(*y[0]) / y[0]->dot(*y[0]);

			// If isinf(H0) || isnan(H0)
			if (H0 ==  std::numeric_limits<double>::infinity() ||
			    H0 == -std::numeric_limits<double>::infinity() ||
			    H0 != H0) {
				should_restart = true;
			}
		}
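
		// The two-loop recursion below turns q = -g into the search
		// direction r = H_k * (-g), where H_k is the inverse-Hessian
		// approximation implicitly defined by the stored (s, y) pairs
		// and seeded with the scaling H0 * I.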

		q = -g;

		for (int h = 0; h < this->lbfgs_history_size; ++h) {
			alpha[h] = rho[h] * s[h]->dot(q);
			q = q - alpha[h] * (*y[h]);
		}

		r = H0 * q;

		for (int h = this->lbfgs_history_size - 1; h >= 0; --h) {
			double beta = rho[h] * y[h]->dot(r);
			r = r + (*s[h]) * (alpha[h] - beta);
		}

		// If the function improves very little, the approximated Hessian
		// might be very bad. If this is the case, it is better to discard
		// the history once in a while. This allows the solver to correctly
		// solve some badly scaled problems.
		double restart_test = std::fabs(fval - fprev) /
		                      (std::fabs(fval) + std::fabs(fprev));
		if (iter > 0 && iter % 100 == 0 && restart_test
		                                   < this->lbfgs_restart_tolerance) {
			should_restart = true;
		}
		if (! last_iteration_successful) {
			should_restart = true;
		}

		if (should_restart) {
			if (this->log_function) {
				char str[1024];
				if (number_of_restarts <= 10) {
					std::sprintf(str, "Restarting: fval = %.3e, deltaf = %.3e, max|g_i| = %.3e, test = %.3e",
								 fval, std::fabs(fval - fprev), normg, restart_test);
					this->log_function(str);
				}
				if (number_of_restarts == 10) {
					this->log_function("NOTE: No more restarts will be reported.");
				}
				number_of_restarts++;
			}
			r = -g;
			for (int h = 0; h < this->lbfgs_history_size; ++h) {
				(*s[h]).setZero();
				(*y[h]).setZero();
			}
			rho.setZero();
			alpha.setZero();
			// H0 is not used, but its value will be printed.
			H0 = std::numeric_limits<double>::quiet_NaN();
		}

		results->lbfgs_update_time += wall_time() - start_time;

		//
		// Perform line search.
		//
		start_time = wall_time();
		double start_alpha = 1.0;
		// In the first iteration, start with a much smaller step
		// length. (heuristic used by e.g. minFunc)
		if (iter == 0) {
			double sumabsg = 0.0;
			for (size_t i = 0; i < n; ++i) {
				sumabsg += std::fabs(g[i]);
			}
			start_alpha = std::min(1.0, 1.0 / sumabsg);
		}
		double alpha_step = this->perform_linesearch(function, x, fval, g,
		                                             r, &x2, start_alpha);

		if (alpha_step <= 0) {
			if (this->log_function) {
				this->log_function("Line search failed.");
				char str[1024];
				std::sprintf(str, "%4d %+.3e %9.3e %.3e %.3e %.3e %.3e",
					iter, fval, std::fabs(fval - fprev), normg, alpha_step, H0, rho[0]);
				this->log_function(str);
			}
			if (! last_iteration_successful || number_of_line_search_failures++ > 10) {
				// This happens quite seldom. Every time it has happened, the function
				// was actually converged to a solution.
				results->exit_condition = SolverResults::GRADIENT_TOLERANCE;
				break;
			}

			last_iteration_successful = false;
		}
		else {
			// Record length of this step.
			normdx = alpha_step * r.norm();
			// Compute new point.
			x_prev = x;
			x = x + alpha_step * r;

			last_iteration_successful = true;
		}

		results->backtracking_time += wall_time() - start_time;

		//
		// Log the results of this iteration.
		//
		start_time = wall_time();

		int log_interval = 1;
		if (iter > 30) {
			log_interval = 10;
		}
		if (iter > 200) {
			log_interval = 100;
		}
		if (iter > 2000) {
			log_interval = 1000;
		}
		if (this->log_function && iter % log_interval == 0) {
			if (iter == 0) {
				this->log_function("Itr       f       deltaf   max|g_i|   alpha      H0       rho");
			}

			this->log_function(
				to_string(
					std::setw(4), iter, " ",
					std::setw(10), std::setprecision(3), std::scientific, std::showpos, fval, std::noshowpos, " ",
					std::setw(9),  std::setprecision(3), std::scientific, std::fabs(fval - fprev), " ",
					std::setw(9),  std::setprecision(3), std::scientific, normg, " ",
					std::setw(9),  std::setprecision(3), std::scientific, alpha_step, " ",
					std::setw(9),  std::setprecision(3), std::scientific, H0, " ",
					std::setw(9),  std::setprecision(3), std::scientific, rho[0]
				)
			);
		}
		results->log_time += wall_time() - start_time;

		fprev = fval;
		iter++;
	}

	function.copy_global_to_user(x);
	results->total_time += wall_time() - global_start_time;

	if (this->log_function) {
		char str[1024];
		std::sprintf(str, " end %+.3e           %.3e", fval, normg);
		this->log_function(str);
	}
}
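
For reference, the two-loop recursion at the core of Example #1 can be exercised on its own. The following is a minimal sketch, assuming Eigen and a history stored newest-first, with only valid (s, y) pairs kept (the solver above instead keeps zero-filled slots whose rho is zero, making those terms no-ops); the function name lbfgs_direction is hypothetical.

// Minimal sketch of the L-BFGS two-loop recursion (hypothetical helper,
// not part of the solver above). History is newest-first: s[0], y[0] come
// from the most recent iteration, and rho[h] = 1 / s[h].dot(y[h]).
#include <Eigen/Dense>
#include <vector>

Eigen::VectorXd lbfgs_direction(const Eigen::VectorXd& g,
                                const std::vector<Eigen::VectorXd>& s,
                                const std::vector<Eigen::VectorXd>& y,
                                const std::vector<double>& rho,
                                double H0)
{
	Eigen::VectorXd q = -g;
	const int m = static_cast<int>(s.size());
	std::vector<double> alpha(m);

	// First loop: walk the history from newest to oldest.
	for (int h = 0; h < m; ++h) {
		alpha[h] = rho[h] * s[h].dot(q);
		q -= alpha[h] * y[h];
	}

	// Apply the initial inverse-Hessian approximation H0 * I.
	Eigen::VectorXd r = H0 * q;

	// Second loop: walk the history from oldest to newest.
	for (int h = m - 1; h >= 0; --h) {
		double beta = rho[h] * y[h].dot(r);
		r += (alpha[h] - beta) * s[h];
	}
	return r; // Approximates -inv(Hessian) * g.
}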
Example #2
  void CG3::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
  {

    // Check to see that we're not trying to invert on a zero-field source    
    const double b2 = norm2(b);
    if(b2 == 0){
      profile.TPSTOP(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      x=b;
      param.true_res = 0.0;
      param.true_res_hq = 0.0;
      return;
    }

    ColorSpinorParam csParam(x);
    csParam.create = QUDA_ZERO_FIELD_CREATE;
  
    
    cudaColorSpinorField x_prev(b, csParam);  
    cudaColorSpinorField r_prev(b, csParam);
    cudaColorSpinorField temp(b, csParam);

    cudaColorSpinorField r(b);
    cudaColorSpinorField w(b);


    mat(r, x, temp);  // r = Mx
    double r2 = xmyNormCuda(b,r); // r = b - Mx
    PrintStats("CG3", 0, r2, b2, 0.0);


    double stop = stopping(param.tol, b2, param.residual_type);
    if(convergence(r2, 0.0, stop, 0.0)) return;
    // First iteration 
    mat(w, r, temp);
    double rAr = reDotProductCuda(r,w);
    double rho = 1.0;
    double gamma_prev = 0.0;
    double gamma = r2/rAr;


    cudaColorSpinorField x_new(x);
    cudaColorSpinorField r_new(r);
    axpyCuda(gamma, r, x_new);  // x_new += gamma*r
    axpyCuda(-gamma, w, r_new); // r_new -= gamma*w
    // end of first iteration  

    // axpbyCuda(a,b,x,y) => y = a*x + b*y

    int k = 1; // First iteration performed above
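
    // Each CG3 step combines the current and previous iterates via the
    // three-term recurrence
    //   x_new = rho * (x + gamma * r) + (1 - rho) * x_prev,
    // with the analogous recurrence (using -gamma * w) for the residual.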

    double r2_prev;
    while(!convergence(r2, 0.0, stop, 0.0) && k<param.maxiter){
      x_prev = x; x = x_new;
      r_prev = r; r = r_new;
      mat(w, r, temp);
      rAr = reDotProductCuda(r,w);
      r2_prev = r2;
      r2 = norm2(r);

      // Need to rearrange this!
      PrintStats("CG3", k, r2, b2, 0.0);

      gamma_prev = gamma;
      gamma = r2/rAr;
      rho = 1.0/(1. - (gamma/gamma_prev)*(r2/r2_prev)*(1.0/rho));
      
      x_new = x;
      axCuda(rho,x_new); 
      axpyCuda(rho*gamma,r,x_new);
      axpyCuda((1. - rho),x_prev,x_new);

      r_new = r;
      axCuda(rho,r_new);
      axpyCuda(-rho*gamma,w,r_new);
      axpyCuda((1.-rho),r_prev,r_new);


      double rr_old = reDotProductCuda(r_new, r);
      printfQuda("rr_old = %1.14lf\n", rr_old);

      k++;
    }


    if(k == param.maxiter)
      warningQuda("Exceeded maximum iterations %d", param.maxiter);

    // compute the true residual
    mat(r, x, temp);
    param.true_res = sqrt(xmyNormCuda(b, r)/b2);

    PrintSummary("CG3", k, r2, b2);

    return;
  }
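
Stripped of the QUDA kernels (axpyCuda, reDotProductCuda, and so on), one step of the CG3 iteration in Example #2 reduces to the three-term recurrence below. This is a rough serial sketch, assuming Eigen and a symmetric positive definite matrix A; it is illustrative, not the QUDA implementation.

// Rough serial sketch of CG3 (conjugate gradient via a three-term
// recurrence), assuming Eigen. Variable names mirror Example #2.
#include <Eigen/Dense>

void cg3_solve(const Eigen::MatrixXd& A, const Eigen::VectorXd& b,
               Eigen::VectorXd& x, double tol, int maxiter)
{
  Eigen::VectorXd r = b - A * x;
  const double b2 = b.squaredNorm();
  double r2 = r.squaredNorm();

  // First iteration: a plain steepest-descent step.
  Eigen::VectorXd w = A * r;
  double gamma = r2 / r.dot(w);
  double rho = 1.0;
  Eigen::VectorXd x_prev = x;
  Eigen::VectorXd r_prev = r;
  x += gamma * r;
  r -= gamma * w;

  for (int k = 1; r.squaredNorm() > tol * tol * b2 && k < maxiter; ++k) {
    w = A * r;
    const double r2_prev = r2;
    r2 = r.squaredNorm();
    const double gamma_prev = gamma;
    gamma = r2 / r.dot(w);
    rho = 1.0 / (1.0 - (gamma / gamma_prev) * (r2 / r2_prev) / rho);

    // Three-term recurrences for the iterate and the residual.
    Eigen::VectorXd x_new = rho * (x + gamma * r) + (1.0 - rho) * x_prev;
    Eigen::VectorXd r_new = rho * (r - gamma * w) + (1.0 - rho) * r_prev;
    x_prev = x; x = x_new;
    r_prev = r; r = r_new;
  }
}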
Example #3
void sLinsysRootAug::solveWithIterRef( sData *prob, SimpleVector& r)
{
  SimpleVector r2(&r[locnx],       locmy);
  SimpleVector r1(&r[0],           locnx);

  //SimpleVector realRhs(&r[0], locnx+locmy);
#ifdef TIMING
  taux=MPI_Wtime();
#endif

  double rhsNorm=r.twonorm(); //r== the initial rhs of the reduced system here

  int myRank; MPI_Comm_rank(mpiComm, &myRank);
  SimpleVector rxy(locnx+locmy); rxy.copyFrom(r);
  SimpleVector   x(locnx+locmy); x.setToZero(); //solution
  SimpleVector  dx(locnx+locmy);                //update from iter refinement
  SimpleVector x_prev(locnx+locmy);
  int refinSteps=0;
  std::vector<double> histResid;
  int maxRefinSteps=(gLackOfAccuracy>0?9:8);
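  // Iterative refinement: repeatedly solve for a correction dx from the
  // current residual and update x, until the relative residual is small
  // enough, starts to diverge, or the step limit is reached.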
  do { //iterative refinement
#ifdef TIMING
    taux=MPI_Wtime();
#endif

    x_prev.copyFrom(x);
    //dx = Ainv * r 
    dx.copyFrom(rxy);
    solver->Dsolve(dx);
    //update x
    x.axpy(1.0,dx);

#ifdef TIMING
    troot_total += (MPI_Wtime()-taux);
#endif  

    if(gLackOfAccuracy<0) break;
    if(refinSteps==maxRefinSteps) break;

    //////////////////////////////////////////////////////////////////////
    //iterative refinement
    //////////////////////////////////////////////////////////////////////
    //compute residual
    
    //if (iAmDistrib) {
    //only one process subtracts [ (Q+Dx0+C'*Dz0*C)*xx + A'*xy ] from r
    //                           [  A*xx                       ]
    if(myRank==0) {
      rxy.copyFrom(r);
      if(locmz>0) {
	SparseSymMatrix* CtDC_sp = dynamic_cast<SparseSymMatrix*>(CtDC);
	CtDC_sp->mult(1.0,&rxy[0],1, 1.0,&x[0],1);
      }
      SparseSymMatrix& Q = prob->getLocalQ();
      Q.mult(1.0,&rxy[0],1, -1.0,&x[0],1);
      
      SimpleVector& xDiagv = dynamic_cast<SimpleVector&>(*xDiag);
      assert(xDiagv.length() == locnx);
      for(int i=0; i<xDiagv.length(); i++)
	rxy[i] -= xDiagv[i]*x[i];
      
      SparseGenMatrix& A=prob->getLocalB();
      A.transMult(1.0,&rxy[0],1, -1.0,&x[locnx],1);
      A.mult(1.0,&rxy[locnx],1, -1.0,&x[0],1);
    } else {
      //other processes set r to zero since they will get this portion from process 0
      rxy.setToZero();
    }

#ifdef TIMING
    taux=MPI_Wtime();
#endif  
    // now children add [0 A^T C^T ]*inv(KKT)*[0;A;C] x
    SimpleVector xx(&x[0], locnx);
    for(size_t it=0; it<children.size(); it++) {
      children[it]->addTermToSchurResidual(prob->children[it],rxy,xx);  
    }
#ifdef TIMING
    tchild_total +=  (MPI_Wtime()-taux);
#endif
    //~done computing residual 

#ifdef TIMING
    taux=MPI_Wtime();
#endif
    //all-reduce residual
    if(iAmDistrib) {
      dx.setToZero(); //we use dx as the recv buffer
      MPI_Allreduce(rxy.elements(), dx.elements(), locnx+locmy, MPI_DOUBLE, MPI_SUM, mpiComm);
      rxy.copyFrom(dx);
    }
#ifdef TIMING
    tcomm_total += (MPI_Wtime()-taux);
#endif

    double relResNorm=rxy.twonorm()/rhsNorm;
    
    if(relResNorm<1.0e-10) {
      break;
    } else {
      double prevRelResNorm=1.0e10;
      if(histResid.size()) 
	prevRelResNorm=histResid[histResid.size()-1];

      //check for stop, divergence or slow convergence conditions
      if(relResNorm>prevRelResNorm) {
	// diverging; restore iteration
	if(myRank==0) {
	  cout << "1st stg - iter refinement diverges relResNorm=" << relResNorm 
	       << "  before was " << prevRelResNorm << endl;
	  cout << "Restoring iterate." << endl;
	}
	x.copyFrom(x_prev);
	break;
      } else {
	// A slow-convergence check over the last iterates (currently just
	// one) was here; it is disabled for now, so refinement continues.
      }
      histResid.push_back(relResNorm);
      if(myRank==0)
	cout << "1st stg - sol does NOT  have enough accuracy (" << relResNorm << ") after " 
	     << refinSteps << " refinement steps" << endl;
    }
    refinSteps++;
  }while(refinSteps<=maxRefinSteps);

#ifdef TIMING
  taux = MPI_Wtime();
#endif

  r1.copyFrom(x);
  r2.copyFromArray(&x[locnx]);

#ifdef TIMING
  troot_total += (MPI_Wtime()-taux);
#endif  
}
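
Without the MPI distribution, Schur-complement terms, and timing, the refinement loop in Example #3 follows the classic iterative-refinement pattern. Below is a minimal serial sketch, assuming Eigen; the LU factorization stands in for whatever inexact solve Dsolve performs, and the function name is hypothetical.

// Minimal serial sketch of iterative refinement, assuming Eigen.
#include <Eigen/Dense>
#include <limits>

Eigen::VectorXd solve_with_refinement(const Eigen::MatrixXd& A,
                                      const Eigen::VectorXd& b,
                                      int maxRefinSteps = 8,
                                      double tol = 1.0e-10)
{
  Eigen::PartialPivLU<Eigen::MatrixXd> lu(A); // plays the role of Dsolve
  const double rhsNorm = b.norm();
  Eigen::VectorXd x = Eigen::VectorXd::Zero(b.size());
  Eigen::VectorXd residual = b;
  double prevRelRes = std::numeric_limits<double>::max();

  for (int step = 0; step <= maxRefinSteps; ++step) {
    Eigen::VectorXd x_prev = x;
    x += lu.solve(residual);            // x <- x + A^{-1} (b - A x)
    residual = b - A * x;               // recompute the residual
    const double relRes = residual.norm() / rhsNorm;
    if (relRes < tol) break;            // accurate enough
    if (relRes > prevRelRes) {          // diverging: restore the iterate
      x = x_prev;
      break;
    }
    prevRelRes = relRes;
  }
  return x;
}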