Example #1
int main( void ) {
  Fbw(init);
  Ap(init);
  while (1) {
    if (sys_time_periodic()) {
      Fbw(periodic_task);
      Ap(periodic_task);
    }
    Fbw(event_task);
    Ap(event_task);
  }
  return 0;
}
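In these Paparazzi-style examples, Fbw() and Ap() are not ordinary functions but dispatch macros that route a task name to the fly-by-wire or autopilot part of the firmware. A minimal sketch of what such macros could look like is shown below; the token-pasting scheme and the FBW/AP guard names are assumptions for illustration, not taken from the sources quoted here.

/* Hypothetical dispatch macros (illustrative only):
 * Fbw(init) would expand to init_fbw() and Ap(event_task) to event_task_ap();
 * a part that is compiled out expands to nothing. */
#ifdef FBW
#define Fbw(f) f ## _fbw()
#else
#define Fbw(f)
#endif

#ifdef AP
#define Ap(f) f ## _ap()
#else
#define Ap(f)
#endif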
Example #2
/*
 * PPRZ/AP thread
 *
 * Call PPRZ AP periodic and event functions
 */
static void thd_ap(void *arg)
{
  (void) arg;
  chRegSetThreadName("AP");

  while (!chThdShouldTerminateX()) {
    Ap(handle_periodic_tasks);
    Ap(event_task);
    chThdSleepMicroseconds(500);
  }

  chThdExit(0);
}
Example #3
int main(int argc, char** argv)
{  
    
//------------------------------------------------------------------------------
//                    Reading data from the external file
//------------------------------------------------------------------------------
    Volumes v1;    
    LeituraDadosProblema (v1);
    
    std::vector<Real>         xFronteiras (v1.NVOL()+1), //face (boundary) locations
                              xCentro     (v1.NVOL()),   //cell-centre locations
                              DistCentro  (v1.NVOL()+1), //distances between centres
                              DistFace    (v1.NVOL());   //distances between adjacent faces
   

    GeracaoMalha (xFronteiras, xCentro, DistCentro, DistFace, v1);
    
    std::vector<Real>         Ap(v1.NVOL()),
                              Ae(v1.NVOL()),
                              Aw(v1.NVOL()),
                              Sp(v1.NVOL());
    
    CalculaCoeficientes (DistFace, DistCentro, Ap, Ae, Aw, Sp);
    
    for (int i=0; i<v1.NVOL(); i++)
    {
        std::cout << "Ap[" << i << "]=" << Ap[i] << std::endl;
    }
    
}    
Example #4
TSIL_COMPLEX Aeps (TSIL_REAL x, TSIL_REAL qq)
{
  TSIL_COMPLEX lnbarx = Ap (x, qq);
  
  if (TSIL_FABS(x) < TSIL_TOL) return 0.0L;
  else return x * (-1.0L - 0.5L*Zeta2 + lnbarx - 0.5L*lnbarx*lnbarx);
}
Example #5
  CGSolve( const ImportType       & import ,
           const SparseMatrixType & A ,
           const VectorType       & b ,
           const VectorType       & x ,
           const size_t             maximum_iteration = 200 ,
           const double             tolerance = std::numeric_limits<double>::epsilon() )
    : iteration(0)
    , iter_time(0)
    , norm_res(0)
  {
    const size_t count_owned = import.count_owned ;
    const size_t count_total = import.count_owned + import.count_receive;

    // Need input vector to matvec to be owned + received
    VectorType pAll ( "cg::p" , count_total );

    VectorType p = Kokkos::subview< VectorType >( pAll , std::pair<size_t,size_t>(0,count_owned) );
    VectorType r ( "cg::r" , count_owned );
    VectorType Ap( "cg::Ap", count_owned );

    /* r = b - A * x ; */

    /* p  = x       */  Kokkos::deep_copy( p , x );
    /* import p     */  import( pAll );
    /* Ap = A * p   */  Kokkos::MV_Multiply( Ap , A , pAll );
    /* b - Ap => r  */  Kokkos::V_Add( r , 1.0 , b , -1.0 , Ap );
    /* p  = r       */  Kokkos::deep_copy( p , r );

    double old_rdot = Kokkos::Example::all_reduce( Kokkos::V_Dot( r , r ) , import.comm );

    norm_res  = sqrt( old_rdot );
    iteration = 0 ;

    Kokkos::Impl::Timer wall_clock ;

    while ( tolerance < norm_res && iteration < maximum_iteration ) {

      /* pAp_dot = dot( p , Ap = A * p ) */

      /* import p    */  import( pAll );
      /* Ap = A * p  */  Kokkos::MV_Multiply( Ap , A , pAll );

      const double pAp_dot = Kokkos::Example::all_reduce( Kokkos::V_Dot( p , Ap ) , import.comm );
      const double alpha   = old_rdot / pAp_dot ;

      /* x +=  alpha * p ;  */ Kokkos::V_Add( x ,  alpha, p  , 1.0 , x );
      /* r += -alpha * Ap ; */ Kokkos::V_Add( r , -alpha, Ap , 1.0 , r );

      const double r_dot = Kokkos::Example::all_reduce( Kokkos::V_Dot( r , r ) , import.comm );
      const double beta  = r_dot / old_rdot ;

      /* p = r + beta * p ; */ Kokkos::V_Add( p , 1.0 , r , beta , p );

      norm_res = sqrt( old_rdot = r_dot );

      ++iteration ;
    }

    iter_time = wall_clock.seconds();
  }
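The inline comments in this constructor trace the standard unpreconditioned conjugate gradient recurrences; written out, with r_k the residual and p_k the search direction:

\alpha_k = \frac{r_k^{\top} r_k}{p_k^{\top} A p_k}, \qquad
x_{k+1} = x_k + \alpha_k p_k, \qquad
r_{k+1} = r_k - \alpha_k A p_k,

\beta_k = \frac{r_{k+1}^{\top} r_{k+1}}{r_k^{\top} r_k}, \qquad
p_{k+1} = r_{k+1} + \beta_k p_k .

Keeping r_k^T r_k in old_rdot means each iteration needs only one matrix-vector product and two global reductions (pAp_dot and r_dot).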
Example #6
void cgsolve(
  const CrsMatrix<AScalarType,Device>  & A ,
  const View<VScalarType*,LayoutRight,Device> & b ,
  const View<VScalarType*,LayoutRight,Device> & x ,
  size_t & iteration ,
  double & normr ,
  double & iter_time ,
  const size_t maximum_iteration = 200 ,
  const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
{
  typedef View<VScalarType*,LayoutRight,Device> vector_type ;

  const size_t count = b.dimension_0();

  vector_type p ( "cg::p" , count );
  vector_type r ( "cg::r" , count );
  vector_type Ap( "cg::Ap", count );

  /* r = b - A * x ; */

  /* p  = x      */ deep_copy( p , x );
  /* Ap = A * p  */ multiply( A , p , Ap );
  /* r  = b - Ap */ waxpby( count , 1.0 , b , -1.0 , Ap , r );
  /* p  = r      */ deep_copy( p , r );

  double old_rdot = dot( count , r );

  normr     = std::sqrt( old_rdot );
  iteration = 0 ;

  Kokkos::Impl::Timer wall_clock ;

  while ( tolerance < normr && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */

    /* Ap = A * p  */ multiply( A , p , Ap );

    const double pAp_dot = dot( count , p , Ap );
    const double alpha   = old_rdot / pAp_dot ;

    /* x += alpha * p ;  */ axpy( count,  alpha, p , x );
    /* r -= alpha * Ap ; */ axpy( count, -alpha, Ap, r );

    const double r_dot = dot( count , r );
    const double beta  = r_dot / old_rdot ;

    /* p = r + beta * p ; */ xpby( count , r , beta , p );

    normr = std::sqrt( old_rdot = r_dot );
    ++iteration ;
  }

  iter_time = wall_clock.seconds();
}
Example #7
static int32_t pprz_thd(void *arg)
{
  /*
     To be compatible with an RTOS architecture, each of these 4 workers should
     be implemented in a different thread, each of them waiting for a job to be done:
     the periodic task should sleep, and the event task should wait for an event
     */
  (void) arg;
  chibios_chRegSetThreadName("pprz big loop");

  while (!chThdShouldTerminate()) {
    Fbw(handle_periodic_tasks);
    Ap(handle_periodic_tasks);
    Fbw(event_task);
    Ap(event_task);
    chibios_chThdSleepMilliseconds(1);
  }

  return 0;
}
Example #8
void RpalParser::Af()
{
  pushProc("Af()");
  Ap();
  while (_nt == "**")
  {
    read_token(_nt);
    Af();
    build("**", 2);
  }
  popProc("Af()");
}
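Af() above is a recursive-descent rule for the right-associative exponentiation operator; the grammar production it appears to implement (an assumption based on the usual RPAL grammar, not stated in this file) is:

// Af -> Ap '**' Af    (right-associative, hence the recursive Af() call)
//     | Ap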
Example #9
rtems_task Init(
  rtems_task_argument ignored
){
#ifndef SERIO_TESTING
  Fbw(init);
  Ap(init);
  while (1) {
    update_bat(12.0);
    Fbw(handle_periodic_tasks);
    Ap(handle_periodic_tasks);
    Fbw(event_task);
    Ap(event_task);
  }
#else
  UART1Init();
  while(1){
	  Ap(event_task);
  }
#endif
  return ;
}
Example #10
File: cg.hpp Project: jonathanzung/faces
 void step() {
   Vector Ap(mA*mP);
   Scalar alpha = mRNorm2 / mP.dot(Ap);
   mx += alpha * mP;
   mR -= alpha * Ap;
   Scalar newRNorm2 = mR.squaredNorm();
   Scalar beta = newRNorm2 / mRNorm2;
   mRNorm2 = newRNorm2;
   mP *= beta;
   mP += mR;
   mIt++;
 }
Example #11
Expression* sintactico::Ap()
{
    if(CurrentToken->tipo == OR)
    {
        CurrentToken = Lexer->NexToken();
        Expression* izq = B();
        Expression* der = Ap();
        if(der != NULL)
            return new ExprOr(izq, der,Lexer->linea);
        return izq;
    }
    return NULL;
}
Example #12
double RkPdf::MfRk(const double& x, const double& tau, const double& dm){
  const double r_ckak = ck/ak;
  const double inv_ak = 1.0/ak;
  const double ndmtau = r_ckak*dm*tau;
  const double fact = 1.0/(1.0+ndmtau*ndmtau);
  double Li = 0.0;
  if(x<0.0){
    const double ndm  = dm/(ak-ck);
    const double ntau = tau*(ak-ck);
    Li = inv_ak*fact*(Mn(x,ntau,ndm)+ndmtau*An(x,ntau,ndm));
  }else{
    const double ndm  = dm/(ak+ck);
    const double ntau = tau*(ak+ck);
    Li = inv_ak*fact*(Mp(x,ntau,ndm)+ndmtau*Ap(x,ntau,ndm));
  }
  return Li;
}
Example #13
/**
 * Main function
 */
int main(void)
{
  // Init
  Fbw(init);
  Ap(init);

  chThdSleepMilliseconds(100);

  // Create threads
  apThdPtr = chThdCreateStatic(wa_thd_ap, sizeof(wa_thd_ap), NORMALPRIO, thd_ap, NULL);
  fbwThdPtr = chThdCreateStatic(wa_thd_fbw, sizeof(wa_thd_fbw), NORMALPRIO, thd_fbw, NULL);

  // Main loop, do nothing
  while (TRUE) {
    chThdSleepMilliseconds(1000);
  }
  return 0;
}
Example #14
SparseVector<double> SparseSolverEigenCustom::cgSolveSparse(const SparseMatrix<double> & A,const SparseVector<double> & b,int iter, double residual)
{
	SparseVector<double> r(b.rows());
	SparseVector<double> p(b.rows());
	SparseVector<double> Ap(b.rows());
	SparseVector<double> x(b.rows());

	r = b - A *x;
	p = r;

	double rTr,pTAp,alpha,beta,rTrnew,rnorm;
	SparseVector<double> vtemp;
	bool isConverged = false;
	for(int k=0;k<iter;k++)
	{
		Ap = A*p;
		vtemp = r.transpose()*r;
		rTr = vtemp.coeff(0);

		vtemp = p.transpose()*Ap;
		pTAp = vtemp.coeff(0);
		alpha = rTr/pTAp;

		x = x + (alpha * p);
		r = r - (alpha * Ap);
		rnorm = r.norm();
		if(rnorm<residual)
		{
			isConverged = true;
			break;
		}

		vtemp = r.transpose()*r;
		rTrnew = vtemp.coeff(0);

		beta = rTrnew / rTr;
		p = r + (beta * p);
	}

	return x;
}
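A hypothetical usage sketch for the routine above, assuming SparseSolverEigenCustom is default-constructible and that Eigen 3 is available; the 2x2 system and the expected solution (roughly 0.0909, 0.6364) are illustrative only.

// #include "SparseSolverEigenCustom.h"   // hypothetical header name
#include <Eigen/Sparse>
#include <iostream>

int main()
{
	// Small SPD system: A = [4 1; 1 3], b = (1, 2)
	Eigen::SparseMatrix<double> A(2, 2);
	A.insert(0, 0) = 4.0; A.insert(0, 1) = 1.0;
	A.insert(1, 0) = 1.0; A.insert(1, 1) = 3.0;
	A.makeCompressed();

	Eigen::SparseVector<double> b(2);
	b.insert(0) = 1.0;
	b.insert(1) = 2.0;

	SparseSolverEigenCustom solver;   // assumed default-constructible
	Eigen::SparseVector<double> x = solver.cgSolveSparse(A, b, 100, 1e-10);

	// Exact solution is (1/11, 7/11)
	std::cout << x.coeff(0) << " " << x.coeff(1) << std::endl;
	return 0;
}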
Example #15
File: CG.hpp Project: m0nT3cR1s70/Solve-
   void solve(T &A, Vector &x, Vector &b,U &M)  //T: input matrix type,
                                                //U: input preconditioner type
   {

      timer.tic();                 //start measuring execution time
      double tol2 = mtol*mtol;
      int n = x.size();
      Vector r(n),z(n),p(n),Ap(n);
      double alpha;
      double beta;
      double rr0,rr;
      int k;
      
      r = b-A*x;                    //initial residual: vector = vector - matrix*vector
      M.solve(z,r);                 //solve the system Mz=r
      p=z;                          //copy vector
      rr0 = z*r;                    //inner product of two vectors
      for(k =0;k<mmaxIts;++k)
      { 
         Ap = A*p;                  //matrix-vector product
         alpha =  (rr0) / (Ap*p);   //inner product in the denominator
         x = x + alpha*p;           //vector = vector + scalar*vector (saxpy in BLAS)
         r = r - alpha*Ap;
         M.solve(z,r);              //solve the system Mz=r
         rr = z*r;                  //inner product
         merror = r*r;              //inner product
         if(merror < tol2)
            break;
         beta = rr / rr0; 
         p = z + beta*p;            //vector = vector + scalar*vector
         rr0 = rr;
      }
      mits = k+1;
      merror = sqrt(merror);
      timer.toc();                 //stop timing
      etime = timer.etime();       //store the execution time in etime
      precond = true;
      namep = M.name();            //store the preconditioner's name
   }
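The translated comments above follow the standard preconditioned conjugate gradient recurrences with preconditioner M (note that the convergence test in the code uses the unpreconditioned residual r^T r):

r_0 = b - A x_0, \qquad M z_0 = r_0, \qquad p_0 = z_0,

\alpha_k = \frac{z_k^{\top} r_k}{p_k^{\top} A p_k}, \qquad
x_{k+1} = x_k + \alpha_k p_k, \qquad
r_{k+1} = r_k - \alpha_k A p_k,

M z_{k+1} = r_{k+1}, \qquad
\beta_k = \frac{z_{k+1}^{\top} r_{k+1}}{z_k^{\top} r_k}, \qquad
p_{k+1} = z_{k+1} + \beta_k p_k .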
Example #16
int main(void)
{
  // Init
  sys_time_init();

  // Init ChibiOS
  sdlogOk = chibios_init();

  // Init PPRZ
  Fbw(init);
  Ap(init);

  chibios_chThdSleepMilliseconds(100);

  launch_pprz_thd(&pprz_thd);
  pprzReady = true;
  // Call PPRZ periodic and event functions
  while (TRUE) {
    chibios_chThdSleepMilliseconds(1000);
  }
  return 0;
}
Example #17
File: CG.hpp Project: m0nT3cR1s70/Solve-
   void solve(T &A, Vector &x, Vector &b)
   {

      timer.tic();
      double tol2 = mtol*mtol;
      int n = x.size();
      Vector r(n);
      Vector p(n);
      Vector Ap(n);
      double alpha;
      double beta;
      double rr0,rr;
      int k;
      
      r = b-A*x;
      rr0 = r*r;
      p=r;
      for(k =0;k<mmaxIts;++k)
      {
         Ap = A*p;
         alpha =  (rr0) / (Ap*p);
         x = x + alpha*p;
         r = r - alpha*Ap;
         rr = r*r;
         merror = rr;
         if(merror < tol2)
            break;
         beta = rr / rr0; 
         p = r + beta*p;
         rr0 = rr;
      }
      mits = k+1;
      merror = sqrt(merror);
      timer.toc();
      etime = timer.etime();
      precond = false;
   }
Example #18
void VanDerWaals::generateTriangle(Data::Mesh& mesh, Data::OMMesh::VertexHandle const& Av,
   Data::OMMesh::VertexHandle const& Bv, Data::OMMesh::VertexHandle const& Cv, int div)
{
   if (div <= 0) {
      mesh.addFace(Cv, Bv, Av);

   }else {

      Data::OMMesh::Point Ap(mesh.vertex(Av));
      Data::OMMesh::Point Bp(mesh.vertex(Bv));
      Data::OMMesh::Point Cp(mesh.vertex(Cv));

      // create 3 new vertices at the edge midpoints
      Data::OMMesh::Point ABp((Ap+Bp)*0.5);
      Data::OMMesh::Point BCp((Bp+Cp)*0.5);
      Data::OMMesh::Point CAp((Cp+Ap)*0.5);

      // Normalize the midpoints to keep them on the sphere
      ABp.normalize();
      BCp.normalize();
      CAp.normalize();

      Data::OMMesh::VertexHandle ABv(mesh.addVertex(ABp));
      Data::OMMesh::VertexHandle BCv(mesh.addVertex(BCp));
      Data::OMMesh::VertexHandle CAv(mesh.addVertex(CAp));

      mesh.setNormal(ABv, ABp);
      mesh.setNormal(BCv, BCp);
      mesh.setNormal(CAv, CAp);

      generateTriangle(mesh, Av,  ABv, CAv, div-1);
      generateTriangle(mesh, Bv,  BCv, ABv, div-1);
      generateTriangle(mesh, Cv,  CAv, BCv, div-1);
      generateTriangle(mesh, ABv, BCv, CAv, div-1);  //<-- Remove for Sierpinski
   }  
}
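Each recursion level replaces one triangle with four (the three corner triangles plus the middle one), so starting from a single face the subdivision depth fixes the face count:

faces(div) = 4^div, e.g. faces(3) = 64; dropping the middle-triangle call, as the comment notes, leaves 3^div faces and produces the Sierpinski pattern.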
Example #19
SGVector<complex128_t> CCGMShiftedFamilySolver::solve_shifted_weighted(
	CLinearOperator<SGVector<float64_t>, SGVector<float64_t> >* A, SGVector<float64_t> b,
	SGVector<complex128_t> shifts, SGVector<complex128_t> weights)
{
	SG_DEBUG("Entering\n");

	// sanity check
	REQUIRE(A, "Operator is NULL!\n");
	REQUIRE(A->get_dimension()==b.vlen, "Dimension mismatch! [%d vs %d]\n",
		A->get_dimension(), b.vlen);
	REQUIRE(shifts.vector,"Shifts are not initialized!\n");
	REQUIRE(weights.vector,"Weights are not initialized!\n");
	REQUIRE(shifts.vlen==weights.vlen, "Number of shifts and number of "
		"weights are not equal! [%d vs %d]\n", shifts.vlen, weights.vlen);

	// the solution matrix, one column per shift, initial guess 0 for all
	MatrixXcd x_sh=MatrixXcd::Zero(b.vlen, shifts.vlen);
	MatrixXcd p_sh=MatrixXcd::Zero(b.vlen, shifts.vlen);

	// non-shifted direction
	SGVector<float64_t> p_(b.vlen);

	// the rest of the part hinges on eigen3 for computing norms
	Map<VectorXd> b_map(b.vector, b.vlen);
	Map<VectorXd> p(p_.vector, p_.vlen);

	// residual r_i=b-Ax_i, here x_0=[0], so r_0=b
	VectorXd r=b_map;

	// initial direction is same as residual
	p=r;
	p_sh=r.replicate(1, shifts.vlen).cast<complex128_t>();

	// non shifted initializers
	float64_t r_norm2=r.dot(r);
	float64_t beta_old=1.0;
	float64_t alpha=1.0;

	// shifted quantities
	SGVector<complex128_t> alpha_sh(shifts.vlen);
	SGVector<complex128_t> beta_sh(shifts.vlen);
	SGVector<complex128_t> zeta_sh_old(shifts.vlen);
	SGVector<complex128_t> zeta_sh_cur(shifts.vlen);
	SGVector<complex128_t> zeta_sh_new(shifts.vlen);

	// shifted initializers
	zeta_sh_old.set_const(1.0);
	zeta_sh_cur.set_const(1.0);

	// the iterator for this iterative solver
	IterativeSolverIterator<float64_t> it(r, m_max_iteration_limit,
		m_relative_tolerence, m_absolute_tolerence);

	// start the timer
	CTime time;
	time.start();

	// set the residuals to zero
	if (m_store_residuals)
		m_residuals.set_const(0.0);

	// CG iteration begins
	for (it.begin(r); !it.end(r); ++it)
	{

		SG_DEBUG("CG iteration %d, residual norm %f\n",
				it.get_iter_info().iteration_count,
				it.get_iter_info().residual_norm);

		if (m_store_residuals)
		{
			m_residuals[it.get_iter_info().iteration_count]
				=it.get_iter_info().residual_norm;
		}

		// apply linear operator to the direction vector
		SGVector<float64_t> Ap_=A->apply(p_);
		Map<VectorXd> Ap(Ap_.vector, Ap_.vlen);

		// compute p^{T}Ap, if zero, failure
		float64_t p_dot_Ap=p.dot(Ap);
		if (p_dot_Ap==0.0)
			break;

		// compute the beta parameter of CG_M
		float64_t beta=-r_norm2/p_dot_Ap;

		// compute the zeta-shifted parameter of CG_M
		compute_zeta_sh_new(zeta_sh_old, zeta_sh_cur, shifts, beta_old, beta,
			alpha, zeta_sh_new);

		// compute beta-shifted parameter of CG_M
		compute_beta_sh(zeta_sh_new, zeta_sh_cur, beta, beta_sh);

		// update the solution vector and residual
		for (index_t i=0; i<shifts.vlen; ++i)
			x_sh.col(i)-=beta_sh[i]*p_sh.col(i);

		// r_{i}=r_{i-1}+\beta_{i}Ap
		r+=beta*Ap;

		// compute new ||r||_{2}, if zero, converged
		float64_t r_norm2_i=r.dot(r);
		if (r_norm2_i==0.0)
			break;

		// compute the alpha parameter of CG_M
		alpha=r_norm2_i/r_norm2;

		// update ||r||_{2}
		r_norm2=r_norm2_i;

		// update direction
		p=r+alpha*p;

		compute_alpha_sh(zeta_sh_new, zeta_sh_cur, beta_sh, beta, alpha, alpha_sh);

		for (index_t i=0; i<shifts.vlen; ++i)
		{
			p_sh.col(i)*=alpha_sh[i];
			p_sh.col(i)+=zeta_sh_new[i]*r;
		}

		// update parameters
		for (index_t i=0; i<shifts.vlen; ++i)
		{
			zeta_sh_old[i]=zeta_sh_cur[i];
			zeta_sh_cur[i]=zeta_sh_new[i];
		}
		beta_old=beta;
	}

	float64_t elapsed=time.cur_time_diff();

	if (!it.succeeded(r))
		SG_WARNING("Did not converge!\n");

	SG_INFO("Iteration took %d times, residual norm=%.20lf, time elapsed=%f\n",
		it.get_iter_info().iteration_count, it.get_iter_info().residual_norm, elapsed);

	// compute the final result vector multiplied by weights
	SGVector<complex128_t> result(b.vlen);
	result.set_const(0.0);
	Map<VectorXcd> x(result.vector, result.vlen);

	for (index_t i=0; i<x_sh.cols(); ++i)
		x+=x_sh.col(i)*weights[i];

	SG_DEBUG("Leaving\n");
	return result;
}
Example #20
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
{
  int k=0;
  int rUpdate = 0;
    
  cudaColorSpinorField r(b);

  ColorSpinorParam param(x);
  param.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, param); 
  
  mat(r, x, y);
  zeroCuda(y);

  double r2 = xmyNormCuda(b, r);
  rUpdate ++;
  
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField Ap(x, param);
  cudaColorSpinorField tmp(x, param);
  cudaColorSpinorField tmp2(x, param); // only needed for clover and twisted mass

  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (invParam.cuda_prec_sloppy == x.Precision()) {
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    param.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, param);
    r_sloppy = new cudaColorSpinorField(r, param);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  cudaColorSpinorField p(rSloppy);

  double r2_old;
  double src_norm = norm2(b);
  double stop = src_norm*invParam.tol*invParam.tol; // stopping condition of solver

  double alpha, beta;
  double pAp;

  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = invParam.reliable_delta;

  if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2);

  quda::blas_flops = 0;

  stopwatchStart();
  while (r2 > stop && k<invParam.maxiter) {

    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp
    
    pAp = reDotProductCuda(p, Ap);
    alpha = r2 / pAp;        
    r2_old = r2;
    r2 = axpyNormCuda(-alpha, Ap, rSloppy);

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
    
    if (!(updateR || updateX)) {
      beta = r2 / r2_old;
      axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);
    } else {
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
      
      xpyCuda(x, y); // swap these around?
      mat(r, y, x); // here we can use x as tmp
      r2 = xmyNormCuda(b, r);
      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);            
      zeroCuda(xSloppy);

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;      
      rUpdate++;

      beta = r2 / r2_old; 
      xpayCuda(rSloppy, beta, p);
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE)
      printfQuda("CG: %d iterations, r2 = %e\n", k, r2);
  }

  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  invParam.secs = stopwatchReadSeconds();

  
  if (k==invParam.maxiter) 
    warningQuda("Exceeded maximum iterations %d", invParam.maxiter);

  if (invParam.verbosity >= QUDA_SUMMARIZE)
    printfQuda("CG: Reliable updates = %d\n", rUpdate);

  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  //  printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
  invParam.gflops = gflops;
  invParam.iter = k;

  quda::blas_flops = 0;

  if (invParam.verbosity >= QUDA_SUMMARIZE){
    mat(r, x, y);
    double true_res = xmyNormCuda(b, r);
    printfQuda("CG: Converged after %d iterations, relative residua: iterated = %e, true = %e\n", 
	       k, sqrt(r2/src_norm), sqrt(true_res / src_norm));    
  }

  if (invParam.cuda_prec_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  return;
}
Example #21
int CrsMatrixTranspose( Epetra_CrsMatrix *In,  Epetra_CrsMatrix *Out ) { 

   
  int iam = In->Comm().MyPID() ;

  int numentries = In->NumGlobalNonzeros();
  int NumRowEntries = 0;
  double *RowValues = 0;
  int *ColIndices = 0;

  int numrows = In->NumGlobalRows();
  int numcols = In->NumGlobalCols();

  std::vector <int> Ap( numcols+1 );       // Column i is stored in Aval(Ap[i]..Ap[i+1]-1)
  std::vector <int> nextAp( numcols+1 );   // Where to store next value in Column i
  std::vector <int> Ai( EPETRA_MAX( numcols, numentries) ) ; //  Row indices
  std::vector <double> Aval( EPETRA_MAX( numcols, numentries) ) ; 

  if ( iam == 0 ) { 

    assert( In->NumMyRows() == In->NumGlobalRows() ) ; 
    //
    //  Count the number of entries in each column
    //
    std::vector <int>RowsPerCol( numcols ) ; 
    for ( int i = 0 ; i < numcols ; i++ ) RowsPerCol[i] = 0 ; 
    for ( int MyRow = 0; MyRow <numrows; MyRow++ ) {
      assert( In->ExtractMyRowView( MyRow, NumRowEntries, RowValues, ColIndices ) == 0 ) ;
      for ( int j = 0; j < NumRowEntries; j++ ) { 
	RowsPerCol[ ColIndices[j] ] ++ ; 
      }
    }
    //
    //  Set Ap and nextAp based on RowsPerCol
    //
    Ap[0] = 0 ; 
    for ( int i = 0 ; i < numcols ; i++ ) {
      Ap[i+1]= Ap[i] + RowsPerCol[i] ; 
      nextAp[i] = Ap[i];
    }
    //
    //  Populate Ai and Aval 
    //
    for ( int MyRow = 0; MyRow <numrows; MyRow++ ) {
      assert( In->ExtractMyRowView( MyRow, NumRowEntries, RowValues, ColIndices ) == 0 ) ;
      for ( int j = 0; j < NumRowEntries; j++ ) { 
	Ai[ nextAp[ ColIndices[j] ] ] = MyRow ; 
	Aval[ nextAp[ ColIndices[j] ] ] = RowValues[j] ; 
	nextAp[ ColIndices[j] ] ++ ; 
      }
    }

    //
    //  Insert values into Out 
    //
    for ( int MyRow = 0; MyRow <numrows; MyRow++ ) {
      int NumInCol = Ap[MyRow+1] -  Ap[MyRow] ;
      Out->InsertGlobalValues( MyRow, NumInCol, &Aval[Ap[MyRow]], 
			   &Ai[Ap[MyRow]] );
      assert( Out->IndicesAreGlobal() ) ; 
    }
  } else {
    assert( In->NumMyRows() == 0 ) ; 
  }


  assert( Out->FillComplete()==0 ) ;
  return 0 ; 
}
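Here Ap, Ai and Aval hold the input matrix column by column (compressed sparse column form): Ap[i] is the offset where column i starts, so column i occupies Aval[Ap[i] .. Ap[i+1]-1] with matching row indices in Ai; reading that layout row-by-row is exactly what fills the transpose Out. A small worked example with illustrative values (not taken from the code above):

        | 4 0 1 |
    A = | 0 3 0 |        Ap   = { 0, 2, 3, 5 }
        | 2 0 5 |        Ai   = { 0, 2, 1, 0, 2 }
                         Aval = { 4, 2, 3, 1, 5 }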
Example #22
// CG
int CG(const  Teuchos::SerialDenseMatrix<int, double> &  A, Teuchos::SerialDenseMatrix<int,double>   X,const Teuchos::SerialDenseMatrix<int,double> &   B, int max_iter, double tolerance, Stokhos::DiagPreconditioner<int,double> prec)

{
  int n; 
  int k=0;
  double resid;
  
  n=A.numRows();
  std::cout << "A= " << A << std::endl;
  std::cout << "B= " << B << std::endl;
  Teuchos::SerialDenseMatrix<int, double> Ax(n,1);
  Ax.multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,1.0, A, X, 0.0);

  Teuchos::SerialDenseMatrix<int, double> r(B);
  r-=Ax;  
  resid=r.normFrobenius(); 
  Teuchos::SerialDenseMatrix<int, double> rho(1,1);
  Teuchos::SerialDenseMatrix<int, double> oldrho(1,1);
  Teuchos::SerialDenseMatrix<int, double> pAp(1,1);
  Teuchos::SerialDenseMatrix<int, double> Ap(n,1);
  
  double b;
  double a;
  Teuchos::SerialDenseMatrix<int, double> p(r);

  
 
  while (resid > tolerance && k < max_iter){
 
     Teuchos::SerialDenseMatrix<int, double> z(r);
     
     //z=M-1r
//     prec.ApplyInverse(r,z);

     rho.multiply(Teuchos::TRANS,Teuchos::NO_TRANS,1.0, r, z, 0.0);
  
     if (k==0){
       p.assign(z);
       rho.multiply(Teuchos::TRANS, Teuchos::NO_TRANS, 1.0, r, z, 0.0);
      }  
      else {
        b=rho(0,0)/oldrho(0,0);
        p.scale(b);
        p+=z;
      }
      Ap.multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,1.0, A, p, 0.0);
      pAp.multiply(Teuchos::TRANS,Teuchos::NO_TRANS,1.0, p, Ap, 0.0);
      a=rho(0,0)/pAp(0,0);
      Teuchos::SerialDenseMatrix<int, double> scalep(p);
      scalep.scale(a);
      X+=scalep;
      Ap.scale(a);
      r-=Ap;
      oldrho.assign(rho);
      resid=r.normFrobenius();
   
 
     k++;
  } 
  
 std::cout << "X=  " << X << std::endl;

 return 0;
}
Example #23
SGVector<float64_t> CConjugateGradientSolver::solve(
	CLinearOperator<float64_t>* A, SGVector<float64_t> b)
{
	SG_DEBUG("CConjugateGradientSolve::solve(): Entering..\n");

	// sanity check
	REQUIRE(A, "Operator is NULL!\n");
	REQUIRE(A->get_dimension()==b.vlen, "Dimension mismatch!\n");

	// the final solution vector, initial guess is 0
	SGVector<float64_t> result(b.vlen);
	result.set_const(0.0);

	// the rest of the part hinges on eigen3 for computing norms
	Map<VectorXd> x(result.vector, result.vlen);
	Map<VectorXd> b_map(b.vector, b.vlen);

	// direction vector
	SGVector<float64_t> p_(result.vlen);
	Map<VectorXd> p(p_.vector, p_.vlen);

	// residual r_i=b-Ax_i, here x_0=[0], so r_0=b
	VectorXd r=b_map;

	// initial direction is same as residual
	p=r;

	// the iterator for this iterative solver
	IterativeSolverIterator<float64_t> it(b_map, m_max_iteration_limit,
		m_relative_tolerence, m_absolute_tolerence);

	// CG iteration begins
	float64_t r_norm2=r.dot(r);

	// start the timer
	CTime time;
	time.start();

	// set the residuals to zero
	if (m_store_residuals)
		m_residuals.set_const(0.0);

	for (it.begin(r); !it.end(r); ++it)
	{
		SG_DEBUG("CG iteration %d, residual norm %f\n",
			it.get_iter_info().iteration_count,
			it.get_iter_info().residual_norm);

		if (m_store_residuals)
		{
			m_residuals[it.get_iter_info().iteration_count]
				=it.get_iter_info().residual_norm;
		}

		// apply linear operator to the direction vector
		SGVector<float64_t> Ap_=A->apply(p_);
		Map<VectorXd> Ap(Ap_.vector, Ap_.vlen);

		// compute p^{T}Ap, if zero, failure
		float64_t p_dot_Ap=p.dot(Ap);
		if (p_dot_Ap==0.0)
			break;

		// compute the alpha parameter of CG
		float64_t alpha=r_norm2/p_dot_Ap;

		// update the solution vector and residual
		// x_{i}=x_{i-1}+\alpha_{i}p
		x+=alpha*p;

		// r_{i}=r_{i-1}-\alpha_{i}Ap
		r-=alpha*Ap;

		// compute new ||r||_{2}, if zero, converged
		float64_t r_norm2_i=r.dot(r);
		if (r_norm2_i==0.0)
			break;

		// compute the beta parameter of CG
		float64_t beta=r_norm2_i/r_norm2;

		// update direction, and ||r||_{2}
		r_norm2=r_norm2_i;
		p=r+beta*p;
	}

	float64_t elapsed=time.cur_time_diff();

	if (!it.succeeded(r))
		SG_WARNING("Did not converge!\n");

	SG_INFO("Iteration took %ld times, residual norm=%.20lf, time elapsed=%lf\n",
		it.get_iter_info().iteration_count, it.get_iter_info().residual_norm, elapsed);

	SG_DEBUG("CConjugateGradientSolve::solve(): Leaving..\n");
	return result;
}
Example #24
inline
void pcgsolve( //const ImportType & import,
              KernelHandle &kh
            ,  const CrsMatrix <typename KernelHandle::nonzero_value_type , typename KernelHandle::row_index_type, typename KernelHandle::HandleExecSpace >      & A
            , const Kokkos::View <typename KernelHandle::nonzero_value_type *,
                                  typename KernelHandle::HandleExecSpace> & b
            , const Kokkos::View <typename KernelHandle::nonzero_value_type * ,
                                  typename KernelHandle::HandleExecSpace > & x
            , const size_t  maximum_iteration = 200
            , const double  tolerance = std::numeric_limits<double>::epsilon()
            , CGSolveResult * result = 0
            , bool use_sgs = true
            )
{
  typedef typename KernelHandle::HandleExecSpace Space;
  //typedef typename KernelHandle::nonzero_value_type MScalar;
  typedef typename KernelHandle::nonzero_value_type VScalar;
  //typedef typename KernelHandle::row_index_type Idx_Type;
  //typedef typename KernelHandle::idx_array_type idx_array_type;
  typedef typename Kokkos::View< VScalar * , Space >  VectorType ;

  //const size_t count_owned = import.count_owned ;
  //const size_t count_total = import.count_owned + import.count_receive;
  const size_t count_owned = A.graph.nv;
  const size_t count_total  = count_owned;

  size_t  iteration = 0 ;
  double  iter_time = 0 ;
  double  matvec_time = 0 ;
  double  norm_res = 0 ;
  double precond_time = 0;
  double precond_init_time = 0;

  Kokkos::Impl::Timer wall_clock ;
  Kokkos::Impl::Timer timer;
  // Need input vector to matvec to be owned + received
  VectorType pAll ( "cg::p" , count_total );

  VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
  VectorType r ( "cg::r" , count_owned );
  VectorType Ap( "cg::Ap", count_owned );

  /* r = b - A * x ; */

  /* p  = x       */  Kokkos::deep_copy( p , x );
  ///* import p     */  import( pAll );
  /* Ap = A * p   */  multiply( count_owned , Ap , A , pAll );
  /* r = b - Ap   */  waxpby( count_owned , r , 1.0 , b , -1.0 , Ap );
  /* p  = r       */  Kokkos::deep_copy( p , r );

  //double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
  double old_rdot = dot( count_owned , r , r );

  norm_res  = sqrt( old_rdot );



  int apply_count = 1;
  VectorType z;
  //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
  double precond_old_rdot = 1;
#ifdef PRECOND_NORM
  double precond_norm_res  = 1;
#endif
  Kokkos::deep_copy( p , z );

  //typename KernelHandle::GaussSeidelHandleType *gsHandler;
  bool owner_handle = false;
  if (use_sgs){
    if (kh.get_gs_handle() == NULL){

      owner_handle = true;
      kh.create_gs_handle();
    }
    //gsHandler = kh.get_gs_handle();
    timer.reset();

    KokkosKernels::Experimental::Graph::gauss_seidel_numeric
      (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff);

    Space::fence();
    precond_init_time += timer.seconds();

    z = VectorType( "pcg::z" , count_owned );
    Space::fence();
    timer.reset();

    KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply
        (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count);

    Space::fence();
    precond_time += timer.seconds();
    //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
    precond_old_rdot = dot( count_owned , r , z );
#ifdef PRECOND_NORM
    precond_norm_res  = sqrt( precond_old_rdot );
#endif

    Kokkos::deep_copy( p , z );
  }

  iteration = 0 ;

#ifdef PRINTRES

  std::cout << "norm_res:" << norm_res << " old_rdot:" << old_rdot<<  std::endl;
#ifdef PRECOND_NORM
  if (use_sgs)
  std::cout << "precond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<<  std::endl;
#endif

#endif
  while ( tolerance < norm_res && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */

    timer.reset();
    ///* import p    */  import( pAll );
    /* Ap = A * p  */  multiply( count_owned , Ap , A , pAll );
    Space::fence();
    matvec_time += timer.seconds();

    //const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm );
    const double pAp_dot = dot( count_owned , p , Ap ) ;

    double alpha  = 0;
    if (use_sgs){
      alpha = precond_old_rdot / pAp_dot ;
    }
    else {
      alpha = old_rdot / pAp_dot ;
    }

    /* x +=  alpha * p ;  */ waxpby( count_owned , x ,  alpha, p  , 1.0 , x );
    /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r );

    //const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
    const double r_dot = dot( count_owned , r , r );
    const double beta_original  = r_dot / old_rdot ;

    double precond_r_dot = 1;
    double precond_beta = 1;
    if (use_sgs){
      Space::fence();
      timer.reset();
      KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply(&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count);

      Space::fence();
      precond_time += timer.seconds();
      //const double precond_r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
      precond_r_dot = dot( count_owned , r , z );
      precond_beta  = precond_r_dot / precond_old_rdot ;
    }

    double beta  = 1;
    if (!use_sgs){
      beta = beta_original;
      /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p );
    }
    else {
      beta = precond_beta;
      waxpby( count_owned , p , 1.0 , z , beta , p );
    }

#ifdef PRINTRES
    std::cout << "\tbeta_original:" << beta_original <<  std::endl;

    if (use_sgs)
    std::cout << "\tprecond_beta:" << precond_beta <<  std::endl;

#endif


    norm_res = sqrt( old_rdot = r_dot );
#ifdef PRECOND_NORM
    if (use_sgs){
      precond_norm_res = sqrt( precond_old_rdot = precond_r_dot );
    }
#else
    precond_old_rdot = precond_r_dot;
#endif

#ifdef PRINTRES
    std::cout << "\tnorm_res:" << norm_res << " old_rdot:" << old_rdot<<  std::endl;
#ifdef PRECOND_NORM

    if (use_sgs)
    std::cout << "\tprecond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<<  std::endl;
#endif
#endif
    ++iteration ;
  }

  Space::fence();
  iter_time = wall_clock.seconds();

  if ( 0 != result ) {
    result->iteration   = iteration ;
    result->iter_time   = iter_time ;
    result->matvec_time = matvec_time ;
    result->norm_res    = norm_res ;
    result->precond_time = precond_time;
    result->precond_init_time = precond_init_time;
  }

  if (use_sgs & owner_handle ){

    kh.destroy_gs_handle();
  }
}
Example #25
/**
 * @function calculateTrifocalTensor
 */
void trifocalTensor::calculateTrifocalTensor() {

  Eigen::JacobiSVD<Eigen::MatrixXf> svd( mEq, Eigen::ComputeThinU | Eigen::ComputeThinV );
  Eigen::MatrixXf V = svd.matrixV();
  printf("* V has %d rows and %d cols \n", V.rows(), V.cols() );


  Eigen::FullPivLU<Eigen::MatrixXf> lu(mEq);
  printf("* Rank of mEq is: %d \n", lu.rank() );
  //std::cout << "Columns are nullspace : " << std::endl;
  //std::cout<< lu.kernel() << std::endl;
  //Eigen::MatrixXf kernel = lu.kernel();
  //mT123 = kernel.col( kernel.cols() - 1 );
  
  // Eigen::MatrixXf Vt = V.transpose(); mT123 = Vt.col( Vt.cols() - 1 );
  mT123 = V.col( V.cols() - 1 );
  printf("mT123: Rows: %d  cols: %d \n", mT123.rows(), mT123.cols() );

  // Saving them properly
  mT.resize(0);
  Eigen::MatrixXf T1(3,3);
  T1(0,0) = mT123(0,0); T1(0,1) = mT123(1,0); T1(0,2) = mT123(2,0);
  T1(1,0) = mT123(3,0); T1(1,1) = mT123(4,0); T1(1,2) = mT123(5,0);
  T1(2,0) = mT123(6,0); T1(2,1) = mT123(7,0); T1(2,2) = mT123(8,0);

  mT.push_back(T1);
  printf("Saved T1 \n");

  Eigen::MatrixXf T2(3,3);
  T2(0,0) = mT123(9,0); T2(0,1) = mT123(10,0); T2(0,2) = mT123(11,0);
  T2(1,0) = mT123(12,0); T2(1,1) = mT123(13,0); T2(1,2) = mT123(14,0);
  T2(2,0) = mT123(15,0); T2(2,1) = mT123(16,0); T2(2,2) = mT123(17,0);

  mT.push_back(T2);
  printf("Saved T2 \n");

  Eigen::MatrixXf T3(3,3);
  T3(0,0) = mT123(18,0); T3(0,1) = mT123(19,0); T3(0,2) = mT123(20,0);
  T3(1,0) = mT123(21,0); T3(1,1) = mT123(22,0); T3(1,2) = mT123(23,0);
  T3(2,0) = mT123(24,0); T3(2,1) = mT123(25,0); T3(2,2) = mT123(26,0);

  mT.push_back(T3);
  printf("Saved T3 \n");

  // Checking
  Eigen::MatrixXf res = mEq*mT123;
  std::cout << "Checking mEq*T: \n"<< res.transpose() << std::endl;

  // Normalize each tensor slice so that its last entry (2,2) equals 1
  
  for( int i = 0; i < mT.size(); ++i ) {

    float temp = mT[i](2,2);

    for( int j = 0; j < 3; ++j ) {
      for( int k = 0; k < 3; ++k ) {

	float orig = mT[i](j,k);
	mT[i](j,k) = orig / temp;

      }
    }
  }
  
  // Visualize
  for( int i = 0; i < mT.size(); ++i ) {
    std::cout << "T("<<i<<"): \n" << mT[i] << std::endl;
  }

  // Test lines
  for( int i = 0; i < mLLL.size(); ++i ) {
    Eigen::VectorXf A(3);
    Eigen::VectorXf B(3);
    Eigen::VectorXf C(3);
    Eigen::VectorXf Ap(3);

    A(0) = mLLL[i][0].x; 
    A(1) = mLLL[i][0].y; 
    A(2) = mLLL[i][0].z;

    B(0) = mLLL[i][1].x; 
    B(1) = mLLL[i][1].y; 
    B(2) = mLLL[i][1].z;
 
    C(0) = mLLL[i][2].x; 
    C(1) = mLLL[i][2].y; 
    C(2) = mLLL[i][2].z;


    Eigen::MatrixXf r0, r1, r2;
    Eigen::MatrixXf Tt;
    Tt = mT[0];
    r0 = ( B.transpose() )*Tt*C; 
    Ap(0) = r0(0,0);
    Tt = mT[1];
    r1 = ( B.transpose() )*Tt*C; 
    Ap(1) = r1(0,0);
    Tt = mT[2];
    r2 = ( B.transpose() )*Tt*C; 
    Ap(2) = r2(0,0);

    // Normalize Ap
    float temp = A(2) / Ap(2);
    float num;
    num = Ap(0)*temp; Ap(0) = num;
    num = Ap(1)*temp; Ap(1) = num;
    num = Ap(2)*temp; Ap(2) = num;

    std::cout <<" ("<<i<<") " <<" A:  " << A.transpose()  << std::endl;
    std::cout <<" ("<<i<<") " <<" Ap: " << Ap.transpose()  << std::endl;
  }
}
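The test loop at the end of this function checks the standard trifocal line-transfer relation from multiple-view geometry: a line l in the first view and its matches l', l'' in the second and third views satisfy, up to scale,

\ell_i = \ell'^{\top} T_i \, \ell'' , \qquad i = 1, 2, 3,

which is what Ap accumulates (Ap(i) = B^T T_i C) before being rescaled to A's scale and printed next to A for comparison.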
Example #26
int main(int argc, char *argv[]){
  
	Params params;
  
	std::map<std::string, std::string> args;
	readArgs(argc, argv, args);
	if(args.find("algo")!=args.end()){
		params.algo = args["algo"];
	}else{
		params.algo = "qdMCNat";
	}

	if(args.find("inst_file")!=args.end())
		setParamsFromFile(args["inst_file"], args, params);
	else   
		setParams(params.algo, args, params);
  
	createLogDir(params.dir_path);
  
	gen.seed(params.seed);

	// Load the dataset
	MyMatrix X_train, X_valid;
	VectorXd Y_train, Y_valid;
	loadMnist(params.ratio_train, X_train, X_valid, Y_train, Y_valid);
	//loadCIFAR10(params.ratio_train, X_train, X_valid, Y_train, Y_valid);
	//loadLightCIFAR10(params.ratio_train, X_train, X_valid, Y_train, Y_valid);
  
	// ConvNet parameters
	std::vector<ConvLayerParams> conv_params;
	ConvLayerParams conv_params1;
	conv_params1.Hf = 5;
	conv_params1.stride = 1;
	conv_params1.n_filter = 20;
	conv_params1.padding = 0;
	conv_params.push_back(conv_params1);
  
	ConvLayerParams conv_params2;
	conv_params2.Hf = 5;
	conv_params2.stride = 1;
	conv_params2.n_filter = 50;
	conv_params2.padding = 0;
	conv_params.push_back(conv_params2);

	std::vector<PoolLayerParams> pool_params;
	PoolLayerParams pool_params1;
	pool_params1.Hf = 2;
	pool_params1.stride = 2;
	pool_params.push_back(pool_params1);

	PoolLayerParams pool_params2;
	pool_params2.Hf = 2;
	pool_params2.stride = 2;
	pool_params.push_back(pool_params2);
  
	const unsigned n_conv_layer = conv_params.size();
  
	for(unsigned l = 0; l < conv_params.size(); l++){

		if(l==0){
			conv_params[l].filter_size = conv_params[l].Hf * conv_params[l].Hf * params.img_depth;
			conv_params[l].N = (params.img_width - conv_params[l].Hf + 2*conv_params[l].padding)/conv_params[l].stride + 1;
		}
		else{
			conv_params[l].filter_size = conv_params[l].Hf * conv_params[l].Hf * conv_params[l-1].n_filter;
			conv_params[l].N = (pool_params[l-1].N - conv_params[l].Hf + 2*conv_params[l].padding)/conv_params[l].stride + 1;
		}
		pool_params[l].N = (conv_params[l].N - pool_params[l].Hf)/pool_params[l].stride + 1;
	}
  
	// Neural Network parameters
	const unsigned n_training = X_train.rows();
	const unsigned n_valid = X_valid.rows();
	const unsigned n_feature = X_train.cols();
	const unsigned n_label = Y_train.maxCoeff() + 1;
  
	params.nn_arch.insert(params.nn_arch.begin(),conv_params[n_conv_layer-1].n_filter * pool_params[n_conv_layer-1].N * pool_params[n_conv_layer-1].N);
	params.nn_arch.push_back(n_label);
	const unsigned n_layers = params.nn_arch.size();
  
	// Optimization parameter
	const int n_train_batch = ceil(n_training/(float)params.train_minibatch_size);
	const int n_valid_batch = ceil(n_valid/(float)params.valid_minibatch_size);
	double prev_loss = std::numeric_limits<double>::max();
	double eta = params.eta;

	// Create the convolutional layer
	std::vector<MyMatrix> conv_W(n_conv_layer);
	std::vector<MyMatrix> conv_W_T(n_conv_layer);
	std::vector<MyVector> conv_B(n_conv_layer);
  
	// Create the neural network
	MyMatrix W_out(params.nn_arch[n_layers-2],n_label);
	std::vector<MySpMatrix> W(n_layers-2);
	std::vector<MySpMatrix> Wt(n_layers-2);
	std::vector<MyVector> B(n_layers-1);

	double init_sigma = 0.;
	ActivationFunction act_func;
	ActivationFunction eval_act_func;
	if(params.act_func_name=="sigmoid"){
		init_sigma = 4.0;
		act_func = std::bind(logistic,true,_1,_2,_3);
		eval_act_func = std::bind(logistic,false,_1,_2,_3);
	}else if(params.act_func_name=="tanh"){
		init_sigma = 1.0;
		act_func = std::bind(my_tanh,true,_1,_2,_3);
		eval_act_func = std::bind(my_tanh,false,_1,_2,_3);
	}else if(params.act_func_name=="relu"){
		init_sigma = 1.0; // TODO: Find the good value
		act_func = std::bind(relu,true,_1,_2,_3);
		eval_act_func = std::bind(relu,false,_1,_2,_3);
	}else{
		std::cout << "Not implemented yet!" << std::endl;
		assert(false);
	}

	std::cout << "Initializing the network... ";
	params.n_params = initNetwork(params.nn_arch, params.act_func_name, params.sparsity, conv_params, pool_params, W_out, W, Wt, B, conv_W, conv_W_T, conv_B); // TODO: Init the conv bias

	// Deep copy of parameters for the adaptive rule
	std::vector<MyMatrix> mu_dW(n_layers-1);
	std::vector<MyVector> mu_dB(n_layers-1);

	MyMatrix pW_out = W_out;
	std::vector<MySpMatrix> pW = W;
	std::vector<MySpMatrix> pWt = Wt;
	std::vector<MyVector> pB = B;

	MyMatrix ppMii_out, ppM0i_out;
	MyVector ppM00_out;
  
	std::vector<MySpMatrix> ppMii,ppM0i;
	std::vector<MyVector> ppM00;

	MyMatrix pMii_out,pM0i_out;
	MyVector pM00_out;
  
	std::vector<MySpMatrix> pMii,pM0i;
	std::vector<MyVector> pM00;

	std::vector<MyMatrix> conv_ppMii, conv_ppM0i;
	std::vector<MyVector> conv_ppM00;

	std::vector<MyMatrix> conv_pMii, conv_pM0i;
	std::vector<MyVector> conv_pM00;
  
	// Convert the labels to one-hot vector
	MyMatrix one_hot = MyMatrix::Zero(n_training, n_label);
	labels2oneHot(Y_train,one_hot);
  
	// Configure the logger 
	std::ostream* logger;
	if(args.find("verbose")!=args.end()){
		getOutput("",logger);
	}else{
		getOutput(params.file_path,logger);
	}

	double cumul_time = 0.;
  
	printDesc(params, logger);
	printConvDesc(params, conv_params, pool_params, logger);
	std::cout << "Starting the learning phase... " << std::endl;
	*logger << "Epoch Time(s) train_loss train_accuracy valid_loss valid_accuracy eta" << std::endl;
  
	for(unsigned i = 0; i < params.n_epoch; i++){
		for(unsigned j = 0; j < n_train_batch; j++){
      
			// Mini-batch creation
			unsigned curr_batch_size = 0;
			MyMatrix X_batch, one_hot_batch;
			getMiniBatch(j, params.train_minibatch_size, X_train, one_hot, params, conv_params[0], curr_batch_size, X_batch, one_hot_batch);
      
			double prev_time = gettime();

			// Forward propagation for conv layer
			std::vector<std::vector<unsigned>> poolIdxX1(n_conv_layer);
			std::vector<std::vector<unsigned>> poolIdxY1(n_conv_layer);
      
			MyMatrix z0;
			std::vector<MyMatrix> conv_A(conv_W.size());
			std::vector<MyMatrix> conv_Ap(conv_W.size());
			convFprop(curr_batch_size, conv_params, pool_params, act_func, conv_W, conv_B, X_batch, conv_A, conv_Ap, z0, poolIdxX1, poolIdxY1);
            
			// Forward propagation
			std::vector<MyMatrix> Z(n_layers-1);
			std::vector<MyMatrix> A(n_layers-2);
			std::vector<MyMatrix> Ap(n_layers-2);
			fprop(params.dropout_flag, act_func, W, W_out, B, z0, Z, A, Ap);
      
			// Compute the output and the error
			MyMatrix out;
			softmax(Z[n_layers-2], out);
      
			std::vector<MyMatrix> gradB(n_layers-1);
			gradB[n_layers-2] = out - one_hot_batch;

			// Backpropagation
			bprop(Wt, W_out, Ap, gradB);

			// Backpropagation for conv layer
			std::vector<MyMatrix> conv_gradB(conv_W.size());
			MyMatrix layer_gradB = (gradB[0] * W[0].transpose());
			MyMatrix pool_gradB;
			layer2pool(curr_batch_size, pool_params[conv_W.size()-1].N, conv_params[conv_W.size()-1].n_filter, layer_gradB, pool_gradB);
      
			convBprop(curr_batch_size, conv_params, pool_params, conv_W_T, conv_Ap, pool_gradB, conv_gradB, poolIdxX1, poolIdxY1);
      
			if(params.algo == "bprop"){
				update(eta, gradB, A, z0, params.regularizer, params.lambda, W_out, W, Wt, B);
				convUpdate(curr_batch_size, eta, conv_params, conv_gradB, conv_A, X_batch, "", 0., conv_W, conv_W_T, conv_B);
	
			}else{

				// Compute the metric
				std::vector<MyMatrix> metric_gradB(n_layers-1);
				std::vector<MyMatrix> metric_conv_gradB(conv_params.size());

				if(params.algo=="qdMCNat"){

					// Monte-Carlo Approximation of the metric
					std::vector<MyMatrix> mc_gradB(n_layers-1);
					computeMcError(out, mc_gradB[n_layers-2]);

					// Backpropagation
					bprop(Wt, W_out, Ap, mc_gradB);

					for(unsigned k = 0; k < gradB.size(); k++){
						metric_gradB[k] = mc_gradB[k].array().square();
					}

					// Backpropagation for conv layer
					std::vector<MyMatrix> mc_conv_gradB(conv_W.size());
					MyMatrix mc_layer_gradB = (mc_gradB[0] * W[0].transpose());
					MyMatrix mc_pool_gradB;
					layer2pool(curr_batch_size, pool_params[conv_W.size()-1].N, conv_params[conv_W.size()-1].n_filter, mc_layer_gradB, mc_pool_gradB);
	  
					convBprop(curr_batch_size, conv_params, pool_params, conv_W_T, conv_Ap, mc_pool_gradB, mc_conv_gradB, poolIdxX1, poolIdxY1);
	  
					for(unsigned k = 0; k < conv_params.size(); k++){
						metric_conv_gradB[k] = mc_conv_gradB[k].array().square();
					}
				}	
				else if(params.algo=="qdop"){

					for(unsigned k = 0; k < conv_params.size(); k++){
						metric_conv_gradB[k] = conv_gradB[k].array().square();
					}
					for(unsigned k = 0; k < gradB.size(); k++){
						metric_gradB[k] = gradB[k].array().square();
					}
				}
				else if(params.algo=="qdNat"){
	  
					for(unsigned k = 0; k < conv_params.size(); k++){
						metric_conv_gradB[k] = conv_gradB[k].array().square();
					}

					for(unsigned k = 0; k < metric_gradB.size(); k++){
						metric_gradB[k] = MyMatrix::Zero(gradB[k].rows(),gradB[k].cols());
					}

					for(unsigned l = 0; l < n_label; l++){
						MyMatrix fisher_ohbatch = MyMatrix::Zero(curr_batch_size, n_label);
						fisher_ohbatch.col(l).setOnes();

						std::vector<MyMatrix> fgradB(n_layers-1);
						fgradB[n_layers-2] = out - fisher_ohbatch;
						bprop(Wt, W_out, Ap, fgradB);

						// Backpropagation for conv layer
						std::vector<MyMatrix> fisher_conv_gradB(conv_W.size());
						MyMatrix fisher_layer_gradB = (fgradB[0] * W[0].transpose());
						MyMatrix fisher_pool_gradB;
						layer2pool(curr_batch_size, pool_params[conv_W.size()-1].N, conv_params[conv_W.size()-1].n_filter, fisher_layer_gradB, fisher_pool_gradB);
	    
						convBprop(curr_batch_size, conv_params, pool_params, conv_W_T, conv_Ap, fisher_pool_gradB, fisher_conv_gradB, poolIdxX1, poolIdxY1);

						for(unsigned k = 0; k < conv_params.size(); k++){
							MyMatrix fisher_conv_gradB_sq = fisher_conv_gradB[k].array().square();
							for(unsigned m = 0; m < out.rows(); m++){
								for(unsigned f = 0; f < conv_params[k].n_filter; f++){
									for(unsigned n = 0; n < conv_params[k].N * conv_params[k].N; n++){
										fisher_conv_gradB_sq(f,m*conv_params[k].N*conv_params[k].N+n) *= out(m,l);
									}
								}
							}
							metric_conv_gradB[k] += fisher_conv_gradB_sq;
						}
	    
						for(unsigned k = 0; k < W.size(); k++){
							const unsigned rev_k = n_layers - k - 2;
							metric_gradB[rev_k] += (fgradB[rev_k].array().square().array().colwise() * out.array().col(l)).matrix();
						}
					}
				}
	
				bool init_flag = false;
				if(i == 0 && j == 0 && !params.init_metric_id){
					init_flag = true;
				}

				std::vector<MyMatrix> conv_Mii(conv_params.size());
				std::vector<MyMatrix> conv_M0i(conv_params.size());
				std::vector<MyVector> conv_M00(conv_params.size());
	
				buildConvQDMetric(curr_batch_size, metric_conv_gradB, conv_A, X_batch, conv_W, params.matrix_reg, conv_Mii, conv_M0i, conv_M00);

				updateConvMetric(init_flag, params.metric_gamma, conv_pMii, conv_pM0i, conv_pM00, conv_Mii, conv_M0i, conv_M00);

				MyMatrix Mii_out, M0i_out;
				MyVector M00_out;
				std::vector<MySpMatrix> Mii(W.size());
				std::vector<MySpMatrix> M0i(W.size());
				std::vector<MyVector> M00(W.size());

				buildQDMetric(metric_gradB, A, z0, W_out, W, params.matrix_reg, Mii_out, M0i_out, M00_out, Mii, M0i, M00);

				updateMetric(init_flag, params.metric_gamma, Mii_out, M0i_out, M00_out, Mii, M0i, M00, pMii_out, pM0i_out, pM00_out, pMii, pM0i, pM00);
				update(eta, gradB, A, z0, params.regularizer, params.lambda, W_out, W, Wt, B, Mii_out, M0i_out, M00_out, Mii, M0i, M00);
			}
      
			double curr_time = gettime();
			cumul_time += curr_time - prev_time;      
      
			if(params.minilog_flag){
	
				double train_loss = 0.;
				double train_accuracy = 0.;
				double valid_loss = 0.;
				double valid_accuracy = 0.;
				evalModel(eval_act_func, params, n_train_batch, n_training, X_train, Y_train, conv_params, pool_params, conv_W, conv_B, W_out, W, B, train_loss, train_accuracy);
				evalModel(eval_act_func, params, n_valid_batch, n_valid, X_valid, Y_valid, conv_params, pool_params, conv_W, conv_B, W_out, W, B, valid_loss, valid_accuracy);
	
				// Logging
				*logger << i + float(j)/n_train_batch << " " << cumul_time << " " << train_loss <<  " " << train_accuracy << " " << valid_loss <<  " " << valid_accuracy << " " << eta << std::endl;
	
			}
		}
		if(!params.minilog_flag || params.adaptive_flag){
			double train_loss = 0.;
			double train_accuracy = 0.;
			double valid_loss = 0.;
			double valid_accuracy = 0.;
			evalModel(eval_act_func, params, n_train_batch, n_training, X_train, Y_train, conv_params, pool_params, conv_W, conv_B, W_out, W, B, train_loss, train_accuracy);
			evalModel(eval_act_func, params, n_valid_batch, n_valid, X_valid, Y_valid, conv_params, pool_params, conv_W, conv_B, W_out, W, B, valid_loss, valid_accuracy);
      
			// if(params.adaptive_flag)
			// 	adaptiveRule(train_loss, prev_loss, eta, W, B, pMii, pM0i, pM00, pW, pB, ppMii, ppM0i, ppM00);
      
			// Logging
			if(!params.minilog_flag){
				*logger << i  << " " << cumul_time << " " << train_loss <<  " " << train_accuracy << " " << valid_loss <<  " " << valid_accuracy << " " << eta << std::endl;
			}
		}
	}
}
Example #27
  void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
  {
    profile.Start(QUDA_PROFILE_INIT);

    // Check to see that we're not trying to invert on a zero-field source    
    const double b2 = norm2(b);
    if(b2 == 0){
      profile.Stop(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      x=b;
      param.true_res = 0.0;
      param.true_res_hq = 0.0;
      return;
    }


    cudaColorSpinorField r(b);

    ColorSpinorParam csParam(x);
    csParam.create = QUDA_ZERO_FIELD_CREATE;
    cudaColorSpinorField y(b, csParam); 
  
    mat(r, x, y);
//    zeroCuda(y);

    double r2 = xmyNormCuda(b, r);
  
    csParam.setPrecision(param.precision_sloppy);
    cudaColorSpinorField Ap(x, csParam);
    cudaColorSpinorField tmp(x, csParam);

    cudaColorSpinorField *tmp2_p = &tmp;
    // tmp only needed for multi-gpu Wilson-like kernels
    if (mat.Type() != typeid(DiracStaggeredPC).name() && 
	mat.Type() != typeid(DiracStaggered).name()) {
      tmp2_p = new cudaColorSpinorField(x, csParam);
    }
    cudaColorSpinorField &tmp2 = *tmp2_p;

    cudaColorSpinorField *x_sloppy, *r_sloppy;
    if (param.precision_sloppy == x.Precision()) {
      csParam.create = QUDA_REFERENCE_FIELD_CREATE;
      x_sloppy = &x;
      r_sloppy = &r;
    } else {
      csParam.create = QUDA_COPY_FIELD_CREATE;
      x_sloppy = new cudaColorSpinorField(x, csParam);
      r_sloppy = new cudaColorSpinorField(r, csParam);
    }

    cudaColorSpinorField &xSloppy = *x_sloppy;
    cudaColorSpinorField &rSloppy = *r_sloppy;
    cudaColorSpinorField p(rSloppy);

    if(&x != &xSloppy){
      copyCuda(y,x);
      zeroCuda(xSloppy);
    }else{
      zeroCuda(y);
    }
    
    const bool use_heavy_quark_res = 
      (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
    
    profile.Stop(QUDA_PROFILE_INIT);
    profile.Start(QUDA_PROFILE_PREAMBLE);

    double r2_old;
    double stop = b2*param.tol*param.tol; // stopping condition of solver

    double heavy_quark_res = 0.0; // heavy quark residual
    if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
    int heavy_quark_check = 10; // how often to check the heavy quark residual

    double alpha=0.0, beta=0.0;
    double pAp;
    int rUpdate = 0;

    double rNorm = sqrt(r2);
    double r0Norm = rNorm;
    double maxrx = rNorm;
    double maxrr = rNorm;
    double delta = param.delta;

    // this parameter determines how many consecutive reliable-update
    // residual increases we tolerate before terminating the solver,
    // i.e., how long we want to keep trying to converge
    int maxResIncrease = 0; // 0 means we have no tolerance 

    profile.Stop(QUDA_PROFILE_PREAMBLE);
    profile.Start(QUDA_PROFILE_COMPUTE);
    blas_flops = 0;

    int k=0;
    
    PrintStats("CG", k, r2, b2, heavy_quark_res);

    int steps_since_reliable = 1;

    while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && 
	    k < param.maxiter) {
      matSloppy(Ap, p, tmp, tmp2); // tmp as tmp
    
      double sigma;

      bool breakdown = false;

      if (param.pipeline) {
	double3 triplet = tripleCGReductionCuda(rSloppy, Ap, p);
	r2 = triplet.x; double Ap2 = triplet.y; pAp = triplet.z;
	r2_old = r2;

	alpha = r2 / pAp;        
	sigma = alpha*(alpha * Ap2 - pAp);
	if (sigma < 0.0 || steps_since_reliable==0) { // sigma condition has broken down
	  r2 = axpyNormCuda(-alpha, Ap, rSloppy);
	  sigma = r2;
	  breakdown = true;
	}

	r2 = sigma;
      } else {
	r2_old = r2;
	pAp = reDotProductCuda(p, Ap);
	alpha = r2 / pAp;        

	// here we are deploying the alternative beta computation 
	Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy);
	r2 = real(cg_norm); // (r_new, r_new)
	sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks
      }

      // reliable update conditions
      rNorm = sqrt(r2);
      if (rNorm > maxrx) maxrx = rNorm;
      if (rNorm > maxrr) maxrr = rNorm;
      int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
      int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
    
      // force a reliable update if we are within target tolerance (only if doing reliable updates)
      if ( convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;

      if ( !(updateR || updateX)) {
	//beta = r2 / r2_old;
	beta = sigma / r2_old; // use the alternative beta computation

	if (param.pipeline && !breakdown) tripleCGUpdateCuda(alpha, beta, Ap, rSloppy, xSloppy, p);
	else axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);

	if (use_heavy_quark_res && k%heavy_quark_check==0) { 
	  copyCuda(tmp,y);
	  heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(xSloppy, tmp, rSloppy).z);
	}

	steps_since_reliable++;
      } else {
	axpyCuda(alpha, p, xSloppy);
	if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
      
	xpyCuda(x, y); // swap these around?
	mat(r, y, x); // here we can use x as tmp
	r2 = xmyNormCuda(b, r);

	if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);            
	zeroCuda(xSloppy);

	// break-out check if we have reached the limit of the precision
	static int resIncrease = 0;
	if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
	  warningQuda("CG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);
	  k++;
	  rUpdate++;
	  if (++resIncrease > maxResIncrease) break; 
	} else {
	  resIncrease = 0;
	}

	rNorm = sqrt(r2);
	maxrr = rNorm;
	maxrx = rNorm;
	r0Norm = rNorm;      
	rUpdate++;

	// explicitly restore the orthogonality of the gradient vector
	double rp = reDotProductCuda(rSloppy, p) / (r2);
	axpyCuda(-rp, rSloppy, p);

	beta = r2 / r2_old; 
	xpayCuda(rSloppy, beta, p);

	if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y,r).z);
	
	steps_since_reliable = 0;
      }

      breakdown = false;
      k++;

      PrintStats("CG", k, r2, b2, heavy_quark_res);
    }

    if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
    xpyCuda(y, x);

    profile.Stop(QUDA_PROFILE_COMPUTE);
    profile.Start(QUDA_PROFILE_EPILOGUE);

    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
    double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
    reduceDouble(gflops);
      param.gflops = gflops;
    param.iter += k;

    if (k==param.maxiter) 
      warningQuda("Exceeded maximum iterations %d", param.maxiter);

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("CG: Reliable updates = %d\n", rUpdate);

    // compute the true residuals
    mat(r, x, y);
    param.true_res = sqrt(xmyNormCuda(b, r) / b2);
#if (__COMPUTE_CAPABILITY__ >= 200)
    param.true_res_hq = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
#else
    param.true_res_hq = 0.0;
#endif      

    PrintSummary("CG", k, r2, b2);

    // reset the flops counters
    quda::blas_flops = 0;
    mat.flops();
    matSloppy.flops();

    profile.Stop(QUDA_PROFILE_EPILOGUE);
    profile.Start(QUDA_PROFILE_FREE);

    if (&tmp2 != &tmp) delete tmp2_p;

    if (param.precision_sloppy != x.Precision()) {
      delete r_sloppy;
      delete x_sloppy;
    }

    profile.Stop(QUDA_PROFILE_FREE);

    return;
  }
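The loop above replaces the textbook beta = r2/r2_old with beta = sigma/r2_old, where sigma = (r_{k+1}, r_{k+1} - r_k) comes out of axpyCGNormCuda; this Polak-Ribiere-style update tolerates the loss of orthogonality introduced by mixed precision and reliable updates, and falls back to the plain norm when the estimate goes negative. A minimal serial sketch of that recurrence (plain std::vector, no QUDA or CUDA dependencies; the 2x2 system and tolerance are illustrative assumptions, not part of the solver):

#include <vector>
#include <cmath>
#include <cstdio>

// Minimal serial sketch of CG with the "alternative" beta used above:
//   sigma = (r_new, r_new - r_old),  beta = sigma / (r_old, r_old).
static double dot(const std::vector<double>& a, const std::vector<double>& b) {
  double s = 0.0;
  for (std::size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
  return s;
}

int main() {
  const double A[2][2] = {{4.0, 1.0}, {1.0, 3.0}};       // SPD matrix (illustrative)
  std::vector<double> b = {1.0, 2.0}, x = {0.0, 0.0};
  std::vector<double> r = b, p = r, Ap(2), r_old(2);      // r = b - A*x with x = 0

  double r2 = dot(r, r);
  for (int k = 0; k < 100 && std::sqrt(r2) > 1e-12; ++k) {
    for (int i = 0; i < 2; ++i) Ap[i] = A[i][0] * p[0] + A[i][1] * p[1];
    const double alpha = r2 / dot(p, Ap);

    r_old = r;
    for (int i = 0; i < 2; ++i) { x[i] += alpha * p[i]; r[i] -= alpha * Ap[i]; }

    const double r2_new = dot(r, r);
    double sigma = r2_new - dot(r, r_old);   // (r_new, r_new - r_old)
    if (sigma < 0.0) sigma = r2_new;         // fall back if the estimate breaks down
    const double beta = sigma / r2;

    for (int i = 0; i < 2; ++i) p[i] = r[i] + beta * p[i];
    r2 = r2_new;
  }
  std::printf("x = (%g, %g)\n", x[0], x[1]);
  return 0;
}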
Example #28
//=============================================================================
int Amesos_Dscpack::PerformSymbolicFactorization()
{
  ResetTimer(0);
  ResetTimer(1);

  MyPID_    = Comm().MyPID();
  NumProcs_ = Comm().NumProc();
  
  Epetra_RowMatrix *RowMatrixA = Problem_->GetMatrix();
  if (RowMatrixA == 0)
    AMESOS_CHK_ERR(-1);

  const Epetra_Map& OriginalMap = RowMatrixA->RowMatrixRowMap() ;
  const Epetra_MpiComm& comm1   = dynamic_cast<const Epetra_MpiComm &> (Comm());
  int numrows                   = RowMatrixA->NumGlobalRows();
  int numentries                = RowMatrixA->NumGlobalNonzeros();

  Teuchos::RCP<Epetra_CrsGraph> Graph;

  Epetra_CrsMatrix* CastCrsMatrixA = 
    dynamic_cast<Epetra_CrsMatrix*>(RowMatrixA); 

  if (CastCrsMatrixA)
  {
    Graph = Teuchos::rcp(const_cast<Epetra_CrsGraph*>(&(CastCrsMatrixA->Graph())), false);
  }
  else
  {
    int MaxNumEntries = RowMatrixA->MaxNumEntries();
    Graph = Teuchos::rcp(new Epetra_CrsGraph(Copy, OriginalMap, MaxNumEntries));

    std::vector<int>    Indices(MaxNumEntries);
    std::vector<double> Values(MaxNumEntries);

    for (int i = 0 ; i < RowMatrixA->NumMyRows() ; ++i)
    {
      int NumEntries;
      RowMatrixA->ExtractMyRowCopy(i, MaxNumEntries, NumEntries,
                                   &Values[0], &Indices[0]);

      for (int j = 0 ; j < NumEntries ; ++j)
        Indices[j] = RowMatrixA->RowMatrixColMap().GID(Indices[j]);

      int GlobalRow = RowMatrixA->RowMatrixRowMap().GID(i);
      Graph->InsertGlobalIndices(GlobalRow, NumEntries, &Indices[0]);
    }

    Graph->FillComplete();
  }

  //
  //  Create a replicated map and graph 
  //
  std::vector<int> AllIDs( numrows ) ; 
  for ( int i = 0; i < numrows ; i++ ) AllIDs[i] = i ; 

  Epetra_Map      ReplicatedMap( -1, numrows, &AllIDs[0], 0, Comm());
  Epetra_Import   ReplicatedImporter(ReplicatedMap, OriginalMap);
  Epetra_CrsGraph ReplicatedGraph( Copy, ReplicatedMap, 0 ); 

  AMESOS_CHK_ERR(ReplicatedGraph.Import(*Graph, ReplicatedImporter, Insert));
  AMESOS_CHK_ERR(ReplicatedGraph.FillComplete());

  //
  //  Convert the matrix to Ap, Ai
  //
  std::vector <int> Replicates(numrows);
  std::vector <int> Ap(numrows + 1);
  std::vector <int> Ai(EPETRA_MAX(numrows, numentries));

  for( int i = 0 ; i < numrows; i++ ) Replicates[i] = 1; 
  
  int NumEntriesPerRow ;
  int *ColIndices = 0 ;
  int Ai_index = 0 ; 
  for ( int MyRow = 0; MyRow <numrows; MyRow++ ) {
    AMESOS_CHK_ERR( ReplicatedGraph.ExtractMyRowView( MyRow, NumEntriesPerRow, ColIndices ) );
    Ap[MyRow] = Ai_index ; 
    for ( int j = 0; j < NumEntriesPerRow; j++ ) { 
      Ai[Ai_index] = ColIndices[j] ; 
      Ai_index++;
    }
  }
  assert( Ai_index == numentries ) ; 
  Ap[ numrows ] = Ai_index ; 
  
  MtxConvTime_ = AddTime("Total matrix conversion time", MtxConvTime_, 0);

  ResetTimer(0);

  //
  //  Call Dscpack Symbolic Factorization
  //  
  int OrderCode = 2;
  std::vector<double> MyANonZ;
  
  NumLocalNonz = 0 ; 
  GlobalStructNewColNum = 0 ; 
  GlobalStructNewNum = 0 ;  
  GlobalStructOwner = 0 ; 
  LocalStructOldNum = 0 ; 
  
  NumGlobalCols = 0 ; 
  
  // MS // Have to define the maximum number of processes to be used
  // MS // This is only a suggestion as Dscpack uses a number of processes that is a power of 2  

  int NumGlobalNonzeros = GetProblem()->GetMatrix()->NumGlobalNonzeros();
  int NumRows = GetProblem()->GetMatrix()->NumGlobalRows(); 

  // optimal value for MaxProcs == -1
  
  int OptNumProcs1 = 1+EPETRA_MAX( NumRows/10000, NumGlobalNonzeros/1000000 );
  OptNumProcs1 = EPETRA_MIN(NumProcs_,OptNumProcs1 );

  // optimal value for MaxProcs == -2

  int OptNumProcs2 = (int)sqrt(1.0 * NumProcs_);
  if( OptNumProcs2 < 1 ) OptNumProcs2 = 1;

  // fix the value of MaxProcs

  switch (MaxProcs_) 
  {
  case -1:
    MaxProcs_ = EPETRA_MIN(OptNumProcs1, NumProcs_);
    break;
  case -2:
    MaxProcs_ = EPETRA_MIN(OptNumProcs2, NumProcs_);
    break;
  case -3:
    MaxProcs_ = NumProcs_;
    break;
  }

#if 0
  if (MyDscRank>=0 && A_and_LU_built) { 
    DSC_ReFactorInitialize(PrivateDscpackData_->MyDSCObject);
  }
#endif
  //  if ( ! A_and_LU_built ) { 
  //    DSC_End( PrivateDscpackData_->MyDSCObject ) ; 
  //    PrivateDscpackData_->MyDSCObject = DSC_Begin() ;
  //  } 

  // MS // here I continue with the old code...
  
  OverheadTime_ = AddTime("Total Amesos overhead time", OverheadTime_, 1);

  DscNumProcs = 1 ; 
  int DscMax = DSC_Analyze( numrows, &Ap[0], &Ai[0], &Replicates[0] );

  while ( DscNumProcs * 2 <=EPETRA_MIN( MaxProcs_, DscMax ) )  DscNumProcs *= 2 ;
  
  MyDscRank = -1; 
  DSC_Open0( PrivateDscpackData_->MyDSCObject_, DscNumProcs, &MyDscRank, comm1.Comm()) ; 
  
  NumLocalCols = 0 ; // This is for those processes not in the Dsc grid
  if ( MyDscRank >= 0 ) { 
    assert( MyPID_ == MyDscRank ) ; 
    AMESOS_CHK_ERR( DSC_Order ( PrivateDscpackData_->MyDSCObject_, OrderCode, numrows, &Ap[0], &Ai[0], 
				&Replicates[0], &NumGlobalCols, &NumLocalStructs, 
				&NumLocalCols, &NumLocalNonz, 
				&GlobalStructNewColNum, &GlobalStructNewNum, 
				&GlobalStructOwner, &LocalStructOldNum ) ) ; 
    assert( NumGlobalCols == numrows ) ; 
    assert( NumLocalCols == NumLocalStructs ) ; 
  }

  if ( MyDscRank >= 0 ) { 
    int MaxSingleBlock; 
    
    const int Limit = 5000000 ;  //  Memory Limit set to 5 Terabytes 
    AMESOS_CHK_ERR( DSC_SFactor ( PrivateDscpackData_->MyDSCObject_, &TotalMemory_, 
				  &MaxSingleBlock, Limit, DSC_LBLAS3, DSC_DBLAS2 ) ) ; 
    
  }
  
  //  A_and_LU_built = true;   // If you uncomment this, TestOptions fails
  
  SymFactTime_ = AddTime("Total symbolic factorization time", SymFactTime_, 0);

  return(0);
}
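The conversion loop above flattens the replicated graph into the row-pointer/column-index pair that DSC_Analyze and DSC_Order expect: Ap[row] holds the offset of the row's first entry in Ai, and Ap[numrows] holds the total nonzero count. A standalone illustration of that layout (plain C++, no Epetra or DSCPACK; the 3x3 sparsity pattern is made up for the example):

#include <cstdio>
#include <vector>

// Build the (Ap, Ai) compressed-row layout used above for a tiny 3x3 pattern:
//   row 0: columns 0,1    row 1: columns 0,1,2    row 2: columns 1,2
int main() {
  const std::vector<std::vector<int>> pattern = {{0, 1}, {0, 1, 2}, {1, 2}};
  const int numrows = static_cast<int>(pattern.size());

  std::vector<int> Ap(numrows + 1), Ai;
  int Ai_index = 0;
  for (int row = 0; row < numrows; ++row) {
    Ap[row] = Ai_index;                 // offset of this row's first entry in Ai
    for (int col : pattern[row]) { Ai.push_back(col); ++Ai_index; }
  }
  Ap[numrows] = Ai_index;               // total number of nonzeros

  for (int row = 0; row < numrows; ++row) {
    std::printf("row %d: Ai[%d..%d) =", row, Ap[row], Ap[row + 1]);
    for (int j = Ap[row]; j < Ap[row + 1]; ++j) std::printf(" %d", Ai[j]);
    std::printf("\n");
  }
  return 0;
}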
Example #29
void cgsolve(
  const ParallelDataMap                 data_map ,
  const CrsMatrix<AScalarType,Device>   A ,
  const View<VScalarType*,Device> b ,
  const View<VScalarType*,Device> x ,
  size_t & iteration ,
  double & normr ,
  double & iter_time ,
  const size_t maximum_iteration = 200 ,
  const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
{
  typedef View<VScalarType*,Device> vector_type ;
  typedef View<VScalarType,  Device> value_type ;

  const size_t count_owned = data_map.count_owned ;
  const size_t count_total = data_map.count_owned + data_map.count_receive ;

  Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A );

  // Need input vector to matvec to be owned + received
  vector_type pAll ( "cg::p" , count_total );

  vector_type p = Kokkos::subview< vector_type >( pAll , std::pair<size_t,size_t>(0,count_owned) );
  vector_type r ( "cg::r" , count_owned );
  vector_type Ap( "cg::Ap", count_owned );

  /* r = b - A * x ; */

  /* p  = x      */ deep_copy( p , x );
  /* Ap = A * p  */ matrix_operator.apply( pAll , Ap );
  /* r  = b - Ap */ waxpby( count_owned , 1.0 , b , -1.0 , Ap , r );
  /* p  = r      */ deep_copy( p , r );

  double old_rdot = dot( count_owned , r , data_map.machine );

  normr     = sqrt( old_rdot );
  iteration = 0 ;

  Kokkos::Impl::Timer wall_clock ;

  while ( tolerance < normr && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */

    /* Ap = A * p  */ matrix_operator.apply( pAll , Ap );

    const double pAp_dot = dot( count_owned , p , Ap , data_map.machine );
    const double alpha   = old_rdot / pAp_dot ;

    /* x += alpha * p ;  */ axpy( count_owned,  alpha, p , x );
    /* r -= alpha * Ap ; */ axpy( count_owned, -alpha, Ap, r );

    const double r_dot = dot( count_owned , r , data_map.machine );
    const double beta  = r_dot / old_rdot ;

    /* p = r + beta * p ; */ xpby( count_owned , r , beta , p );

    normr = sqrt( old_rdot = r_dot );
    ++iteration ;
  }

  iter_time = wall_clock.seconds();
}
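cgsolve above is the textbook unpreconditioned recurrence built from a handful of BLAS-1 style kernels: waxpby (w = a*x + b*y), axpy (y += a*x), xpby (y = x + b*y) and a dot-product reduction, with only the matvec input vector (pAll) carrying the imported halo entries. Serial stand-ins for those kernels, to make the calls readable (illustrative signatures, not the Kokkos implementations):

#include <cstdio>
#include <vector>

// Serial stand-ins for the vector kernels used in cgsolve above.
// The Kokkos versions run these loops as parallel_for / parallel_reduce.
using Vec = std::vector<double>;

void waxpby(double a, const Vec& x, double b, const Vec& y, Vec& w) {
  for (std::size_t i = 0; i < w.size(); ++i) w[i] = a * x[i] + b * y[i];  // w = a*x + b*y
}
void axpy(double a, const Vec& x, Vec& y) {
  for (std::size_t i = 0; i < y.size(); ++i) y[i] += a * x[i];            // y += a*x
}
void xpby(const Vec& x, double b, Vec& y) {
  for (std::size_t i = 0; i < y.size(); ++i) y[i] = x[i] + b * y[i];      // y = x + b*y
}
double dot(const Vec& x, const Vec& y) {
  double s = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) s += x[i] * y[i];
  return s;
}

int main() {
  Vec b = {1.0, 2.0}, Ap = {0.5, 0.5}, r(2), p(2);
  waxpby(1.0, b, -1.0, Ap, r);                  // r = b - Ap
  xpby(r, 0.25, p);                             // p = r + 0.25*p (p starts at zero)
  std::printf("dot(r, p) = %g\n", dot(r, p));
  return 0;
}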
ordinal_type
Stokhos::CGDivisionExpansionStrategy<ordinal_type,value_type,node_type>::
CG(const Teuchos::SerialDenseMatrix<ordinal_type, value_type> & A, 
   Teuchos::SerialDenseMatrix<ordinal_type,value_type> & X, 
   const Teuchos::SerialDenseMatrix<ordinal_type,value_type> & B, 
   ordinal_type max_iter, 
   value_type tolerance, 
   ordinal_type prec_iter, 
   ordinal_type order , 
   ordinal_type m, 
   ordinal_type PrecNum, 
   const Teuchos::SerialDenseMatrix<ordinal_type, value_type> & M, 
   ordinal_type diag)

{
  ordinal_type n = A.numRows();
  ordinal_type k=0;
  value_type resid;
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> Ax(n,1);
  Ax.multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,1.0, A, X, 0.0);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> r(Teuchos::Copy,B);
  r-=Ax;
  resid=r.normFrobenius();
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> p(r);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> rho(1,1);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> oldrho(1,1);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> pAp(1,1);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> Ap(n,1);
  value_type b;
  value_type a;
  while (resid > tolerance && k < max_iter){
    Teuchos::SerialDenseMatrix<ordinal_type, value_type> z(r);
    //Solve Mz=r
    if (PrecNum != 0){
      if (PrecNum == 1){
	Stokhos::DiagPreconditioner<ordinal_type, value_type> precond(M);
	precond.ApplyInverse(r,z,prec_iter);
      }
      else if (PrecNum == 2){
	Stokhos::JacobiPreconditioner<ordinal_type, value_type> precond(M);
	precond.ApplyInverse(r,z,2);
      }
      else if (PrecNum == 3){
	Stokhos::GSPreconditioner<ordinal_type, value_type> precond(M,0);
	precond.ApplyInverse(r,z,1);
      }
      else if (PrecNum == 4){
	Stokhos::SchurPreconditioner<ordinal_type, value_type> precond(M, order, m, diag);
	precond.ApplyInverse(r,z,prec_iter);            
      }
    }
    rho.multiply(Teuchos::TRANS,Teuchos::NO_TRANS,1.0, r, z, 0.0);
    

    if (k==0){
      p.assign(z);
    }
    else {
      b=rho(0,0)/oldrho(0,0);
      p.scale(b);
      p+=z; 
    }
    Ap.multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,1.0, A, p, 0.0);
    pAp.multiply(Teuchos::TRANS,Teuchos::NO_TRANS,1.0, p, Ap, 0.0);
    a=rho(0,0)/pAp(0,0);
    Teuchos::SerialDenseMatrix<ordinal_type, value_type> scalep(p);
    scalep.scale(a);
    X+=scalep;
    Ap.scale(a);
    r-=Ap;
    oldrho.assign(rho);
    resid=r.normFrobenius();
    k++;
  }                      
 
  //std::cout << "iteration count  " << k << std::endl;
  return 0; 
}
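This last variant is preconditioned CG on small dense Teuchos matrices: each iteration solves Mz = r with the preconditioner selected by PrecNum (diagonal, Jacobi, Gauss-Seidel or Schur), then uses rho = r^T z in place of r^T r in the alpha and beta recurrences, reducing to plain CG when PrecNum == 0 (z = r). A minimal sketch of the diagonal solve assumed by the PrecNum == 1 branch (illustrative only, not the Stokhos implementation):

#include <cstdio>
#include <vector>

// Illustrative diagonal preconditioner apply: z = D^{-1} r with D = diag(M).
// Stokhos::DiagPreconditioner::ApplyInverse wraps the same idea for
// SerialDenseMatrix operands; this sketch assumes a nonzero diagonal.
void apply_diag_inverse(const std::vector<std::vector<double>>& M,
                        const std::vector<double>& r,
                        std::vector<double>& z) {
  for (std::size_t i = 0; i < r.size(); ++i) z[i] = r[i] / M[i][i];
}

int main() {
  const std::vector<std::vector<double>> M = {{4.0, 1.0}, {1.0, 3.0}};
  const std::vector<double> r = {1.0, 2.0};
  std::vector<double> z(2);
  apply_diag_inverse(M, r, z);                  // z = (0.25, 0.666...)
  std::printf("z = (%g, %g)\n", z[0], z[1]);
  return 0;
}

With z in hand, the loop forms rho = r^T z, beta = rho/rho_old and p = z + beta*p, so the only change relative to the unpreconditioned examples is where the inner products are taken.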