Example #1
0
void cgsolve(
  const CrsMatrix<AScalarType,Device>  & A ,
  const View<VScalarType*,LayoutRight,Device> & b ,
  const View<VScalarType*,LayoutRight,Device> & x ,
  size_t & iteration ,
  double & normr ,
  double & iter_time ,
  const size_t maximum_iteration = 200 ,
  const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
{
  typedef View<VScalarType*,LayoutRight,Device> vector_type ;

  const size_t count = b.dimension_0();

  vector_type p ( "cg::p" , count );
  vector_type r ( "cg::r" , count );
  vector_type Ap( "cg::Ap", count );

  /* r = b - A * x ; */

  /* p  = x      */ deep_copy( p , x );
  /* Ap = A * p  */ multiply( A , p , Ap );
  /* r  = b - Ap */ waxpby( count , 1.0 , b , -1.0 , Ap , r );
  /* p  = r      */ deep_copy( p , r );

  double old_rdot = dot( count , r );

  normr     = std::sqrt( old_rdot );
  iteration = 0 ;

  Kokkos::Impl::Timer wall_clock ;

  while ( tolerance < normr && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */

    /* Ap = A * p  */ multiply( A , p , Ap );

    const double pAp_dot = dot( count , p , Ap );
    const double alpha   = old_rdot / pAp_dot ;

    /* x += alpha * p ;  */ axpy( count,  alpha, p , x );
    /* r -= alpha * Ap ; */ axpy( count, -alpha, Ap, r );

    const double r_dot = dot( count , r );
    const double beta  = r_dot / old_rdot ;

    /* p = r + beta * p ; */ xpby( count , r , beta , p );

    normr = std::sqrt( old_rdot = r_dot );
    ++iteration ;
  }

  iter_time = wall_clock.seconds();
}
void cgsolve(
  const ParallelDataMap                 data_map ,
  const CrsMatrix<AScalarType,Device>   A ,
  const View<VScalarType*,Device> b ,
  const View<VScalarType*,Device> x ,
  size_t & iteration ,
  double & normr ,
  double & iter_time ,
  const size_t maximum_iteration = 200 ,
  const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
{
  typedef View<VScalarType*,Device> vector_type ;
  typedef View<VScalarType,  Device> value_type ;

  const size_t count_owned = data_map.count_owned ;
  const size_t count_total = data_map.count_owned + data_map.count_receive ;

  Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A );

  // Need input vector to matvec to be owned + received
  vector_type pAll ( "cg::p" , count_total );

  vector_type p = Kokkos::subview< vector_type >( pAll , std::pair<size_t,size_t>(0,count_owned) );
  vector_type r ( "cg::r" , count_owned );
  vector_type Ap( "cg::Ap", count_owned );

  /* r = b - A * x ; */

  /* p  = x      */ deep_copy( p , x );
  /* Ap = A * p  */ matrix_operator.apply( pAll , Ap );
  /* r  = b - Ap */ waxpby( count_owned , 1.0 , b , -1.0 , Ap , r );
  /* p  = r      */ deep_copy( p , r );

  double old_rdot = dot( count_owned , r , data_map.machine );

  normr     = sqrt( old_rdot );
  iteration = 0 ;

  Kokkos::Impl::Timer wall_clock ;

  while ( tolerance < normr && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */

    /* Ap = A * p  */ matrix_operator.apply( pAll , Ap );

    const double pAp_dot = dot( count_owned , p , Ap , data_map.machine );
    const double alpha   = old_rdot / pAp_dot ;

    /* x += alpha * p ;  */ axpy( count_owned,  alpha, p , x );
    /* r -= alpha * Ap ; */ axpy( count_owned, -alpha, Ap, r );

    const double r_dot = dot( count_owned , r , data_map.machine );
    const double beta  = r_dot / old_rdot ;

    /* p = r + beta * p ; */ xpby( count_owned , r , beta , p );

    normr = sqrt( old_rdot = r_dot );
    ++iteration ;
  }

  iter_time = wall_clock.seconds();
}
/*
 * Driver: builds a 3D Poisson matrix of size N, solves A * x = b with an
 * unpreconditioned conjugate-gradient loop, and prints the iteration
 * count and the wall time of the loop.
 *
 * Returns 0.
 */
int main() {
  vector x,b;
  vector r,p,Ap;
  matrix A;
  
  double one=1.0, zero=0.0;
  double normr, rtrans, oldtrans, p_ap_dot , alpha, beta;
  int iter=0;

  //create matrix
  allocate_3d_poission_matrix(A,N);
    
  printf("Rows: %d, nnz: %d\n", A.num_rows, A.row_offsets[A.num_rows]);

  allocate_vector(x,A.num_rows);
  allocate_vector(Ap,A.num_rows);
  allocate_vector(r,A.num_rows);
  allocate_vector(p,A.num_rows);
  allocate_vector(b,A.num_rows);

  initialize_vector(x,100000);
  initialize_vector(b,1);
 

  // Initial residual: p = x, Ap = A*p, r = b - Ap.
  waxpby(one, x, zero, x, p);
  matvec(A,p,Ap);
  waxpby(one, b, -one, Ap, r);
  
  rtrans=dot(r,r);
  normr=sqrt(rtrans);
  
  double st = omp_get_wtime();
  do {
    if(iter==0) {
      // First search direction is the residual itself.
      waxpby(one,r,zero,r,p);
    } else {
      // p = r + beta * p with beta = (r.r)_new / (r.r)_old
      oldtrans=rtrans;
      rtrans = dot(r,r);
      beta = rtrans/oldtrans;
      waxpby(one,r,beta,p,p);
    }
    
    // normr reflects the residual from before this iteration's update of
    // x and r, so the loop test below lags the true residual by one step.
    normr=sqrt(rtrans);
  
    matvec(A,p,Ap);
    p_ap_dot = dot(Ap,p);

    alpha = rtrans/p_ap_dot;

    // x += alpha * p ; r -= alpha * Ap
    waxpby(one,x,alpha,p,x);
    waxpby(one,r,-alpha,Ap,r);

    if(iter%10==0)
      printf("Iteration: %d, Tolerance: %.4e\n", iter, normr);
    iter++;
  } while(iter<MAX_ITERS && normr>TOL);
  double et = omp_get_wtime();

  printf("Total Iterations: %d\n", iter);
  printf("Total Time: %lf s\n", (et-st));

  // Release every vector that was allocated above; 'b' was previously
  // allocated but never freed.
  free_vector(x);
  free_vector(b);
  free_vector(r);
  free_vector(p);
  free_vector(Ap);
  free_matrix(A);

  return 0;
}
Example #4
0
/*
 * Conjugate-gradient solve of A * x = b, optionally preconditioned with
 * symmetric Gauss-Seidel (SGS) from KokkosKernels.
 *
 *   kh                : kernel handle; a Gauss-Seidel handle is created on it
 *                       (and destroyed before returning) only when use_sgs is
 *                       set and the handle has none yet.
 *   A , b             : system matrix and right-hand side.
 *   x                 : initial guess on entry; updated in the loop.
 *   maximum_iteration : upper bound on the number of iterations.
 *   tolerance         : the loop keeps running while tolerance < norm_res,
 *                       where norm_res = sqrt( dot(r,r) ).
 *   result            : optional; when non-null receives the iteration
 *                       count, timings and the final residual norm.
 *   use_sgs           : apply the SGS preconditioner when true, run plain
 *                       CG otherwise.
 *
 * The commented-out import/all_reduce lines are the distributed variants of
 * the corresponding local calls and are kept for reference.
 */
inline
void pcgsolve( //const ImportType & import,
              KernelHandle &kh
            ,  const CrsMatrix <typename KernelHandle::nonzero_value_type , typename KernelHandle::row_index_type, typename KernelHandle::HandleExecSpace >      & A
            , const Kokkos::View <typename KernelHandle::nonzero_value_type *,
                                  typename KernelHandle::HandleExecSpace> & b
            , const Kokkos::View <typename KernelHandle::nonzero_value_type * ,
                                  typename KernelHandle::HandleExecSpace > & x
            , const size_t  maximum_iteration = 200
            , const double  tolerance = std::numeric_limits<double>::epsilon()
            , CGSolveResult * result = 0
            , bool use_sgs = true
            )
{
  typedef typename KernelHandle::HandleExecSpace Space;
  typedef typename KernelHandle::nonzero_value_type VScalar;
  typedef typename Kokkos::View< VScalar * , Space >  VectorType ;

  //const size_t count_owned = import.count_owned ;
  //const size_t count_total = import.count_owned + import.count_receive;
  const size_t count_owned = A.graph.nv;
  const size_t count_total  = count_owned;

  size_t  iteration = 0 ;
  double  iter_time = 0 ;
  double  matvec_time = 0 ;
  double  norm_res = 0 ;
  double precond_time = 0;
  double precond_init_time = 0;

  // wall_clock spans the whole routine (setup included); timer is reset
  // around the individual phases that are reported separately.
  Kokkos::Impl::Timer wall_clock ;
  Kokkos::Impl::Timer timer;
  // Need input vector to matvec to be owned + received
  VectorType pAll ( "cg::p" , count_total );

  VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
  VectorType r ( "cg::r" , count_owned );
  VectorType Ap( "cg::Ap", count_owned );

  /* r = b - A * x ; */

  /* p  = x       */  Kokkos::deep_copy( p , x );
  ///* import p     */  import( pAll );
  /* Ap = A * p   */  multiply( count_owned , Ap , A , pAll );
  /* r = b - Ap   */  waxpby( count_owned , r , 1.0 , b , -1.0 , Ap );
  /* p  = r       */  Kokkos::deep_copy( p , r );

  //double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
  double old_rdot = dot( count_owned , r , r );

  norm_res  = sqrt( old_rdot );

  int apply_count = 1;
  // z holds the preconditioned residual; it is only allocated when the
  // preconditioner is enabled.
  VectorType z;
  //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
  double precond_old_rdot = 1;
#ifdef PRECOND_NORM
  double precond_norm_res  = 1;
#endif
  // Note: no deep_copy( p , z ) here. z is still unallocated at this point,
  // so copying it into p (extent count_owned) is an extent mismatch; p
  // already holds r, which is the correct first direction for plain CG.

  bool owner_handle = false;
  if (use_sgs){
    if (kh.get_gs_handle() == NULL){

      owner_handle = true;
      kh.create_gs_handle();
    }
    timer.reset();

    KokkosKernels::Experimental::Graph::gauss_seidel_numeric
      (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff);

    Space::fence();
    precond_init_time += timer.seconds();

    z = VectorType( "pcg::z" , count_owned );
    Space::fence();
    timer.reset();

    KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply
        (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count);

    Space::fence();
    precond_time += timer.seconds();
    //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
    precond_old_rdot = dot( count_owned , r , z );
#ifdef PRECOND_NORM
    precond_norm_res  = sqrt( precond_old_rdot );
#endif

    // With preconditioning the first search direction is z, not r.
    Kokkos::deep_copy( p , z );
  }

  iteration = 0 ;

#ifdef PRINTRES

  std::cout << "norm_res:" << norm_res << " old_rdot:" << old_rdot<<  std::endl;
#ifdef PRECOND_NORM
  if (use_sgs)
  std::cout << "precond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<<  std::endl;
#endif

#endif
  while ( tolerance < norm_res && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */

    timer.reset();
    ///* import p    */  import( pAll );
    /* Ap = A * p  */  multiply( count_owned , Ap , A , pAll );
    Space::fence();
    matvec_time += timer.seconds();

    //const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm );
    const double pAp_dot = dot( count_owned , p , Ap ) ;

    // Step length uses (r,z) when preconditioned and (r,r) otherwise.
    double alpha  = 0;
    if (use_sgs){
      alpha = precond_old_rdot / pAp_dot ;
    }
    else {
      alpha = old_rdot / pAp_dot ;
    }

    /* x +=  alpha * p ;  */ waxpby( count_owned , x ,  alpha, p  , 1.0 , x );
    /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r );

    //const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
    const double r_dot = dot( count_owned , r , r );
    const double beta_original  = r_dot / old_rdot ;

    double precond_r_dot = 1;
    double precond_beta = 1;
    if (use_sgs){
      Space::fence();
      timer.reset();
      KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply(&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count);

      Space::fence();
      precond_time += timer.seconds();
      //const double precond_r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
      precond_r_dot = dot( count_owned , r , z );
      precond_beta  = precond_r_dot / precond_old_rdot ;
    }

    double beta  = 1;
    if (!use_sgs){
      beta = beta_original;
      /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p );
    }
    else {
      beta = precond_beta;
      /* p = z + beta * p ; */ waxpby( count_owned , p , 1.0 , z , beta , p );
    }

#ifdef PRINTRES
    std::cout << "\tbeta_original:" << beta_original <<  std::endl;

    if (use_sgs)
    std::cout << "\tprecond_beta:" << precond_beta <<  std::endl;

#endif


    // Convergence is always judged on the unpreconditioned residual norm.
    norm_res = sqrt( old_rdot = r_dot );
#ifdef PRECOND_NORM
    if (use_sgs){
      precond_norm_res = sqrt( precond_old_rdot = precond_r_dot );
    }
#else
    precond_old_rdot = precond_r_dot;
#endif

#ifdef PRINTRES
    std::cout << "\tnorm_res:" << norm_res << " old_rdot:" << old_rdot<<  std::endl;
#ifdef PRECOND_NORM

    if (use_sgs)
    std::cout << "\tprecond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<<  std::endl;
#endif
#endif
    ++iteration ;
  }

  Space::fence();
  iter_time = wall_clock.seconds();

  if ( 0 != result ) {
    result->iteration   = iteration ;
    result->iter_time   = iter_time ;
    result->matvec_time = matvec_time ;
    result->norm_res    = norm_res ;
    result->precond_time = precond_time;
    result->precond_init_time = precond_init_time;
  }

  // Only tear down the Gauss-Seidel handle if this call created it.
  if (use_sgs && owner_handle ){

    kh.destroy_gs_handle();
  }
}
Example #5
0
/*
 * Unpreconditioned conjugate-gradient solve of A * x = b (miniFE style).
 *
 *   A           : operator; must have has_local_indices set, otherwise an
 *                 error is printed and the function returns immediately.
 *   b           : right-hand side.
 *   x           : initial guess on entry, updated in place.
 *   matvec      : callable invoked as matvec(A, p, Ap).
 *   max_iter    : maximum number of iterations.
 *   tolerance   : iteration continues while normr > tolerance.
 *   num_iters   : out - index of the last iteration completed; not written
 *                 if no iteration completes.
 *   normr       : out - sqrt of the most recently computed r.r.
 *   my_cg_times : out - indexed by WAXPY, DOT, MATVEC, MATVECDOT, TOTAL.
 *
 * NOTE(review): t0 is not referenced directly here; presumably the
 * TICK()/TOCK() macros use it - confirm against their definition.
 */
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
       << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  // p is sized by the column count because it is the matvec input;
  // r and Ap are sized by the local row count.
  VectorType r(b.startIndex, nrows, 256);
  VectorType p(0, ncols, 512);
  VectorType Ap(b.startIndex, nrows, 64);

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  // Print roughly ten progress lines, clamped to every 1..50 iterations.
  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq>50) print_freq = 50;
  if (print_freq<1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  // Initial residual: p = x, Ap = A*p, r = b - Ap.
  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);

//  print_vec(p.coefs, "p");

  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot_r2(r); TOCK(tDOT);

//std::cout << "rtrans="<<rtrans<<std::endl;

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = "<< normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

#ifdef MINIFE_DEBUG_OPENMP
  std::cout << "Starting CG Solve Phase..." << std::endl;
#endif

  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      // First search direction: p = r.
      //TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
      TICK(); daxpby(one, r, zero, p); TOCK(tWAXPY);
    }
    else {
      // p = r + beta * p with beta = (r.r)_new / (r.r)_old
      oldrtrans = rtrans;
      TICK(); rtrans = dot_r2(r); TOCK(tDOT);
      const magnitude_type beta = rtrans/oldrtrans;
      TICK(); daxpby(one, r, beta, p); TOCK(tWAXPY);
    }

    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    // A negative p.Ap, or one flagged by breakdown(), aborts the solve;
    // otherwise the breakdown threshold is lowered and iteration goes on.
    if (p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!"<<std::endl;
#endif
        //update the timers before jumping out.
        my_cg_times[WAXPY] = tWAXPY;
        my_cg_times[DOT] = tDOT;
        my_cg_times[MATVEC] = tMATVEC;
        // Report MATVECDOT here as well so every timer slot is written on
        // both exit paths.
        my_cg_times[MATVECDOT] = tMATVECDOT;
        my_cg_times[TOTAL] = mytimer() - total_time;
        return;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    // x += alpha * p ; r -= alpha * Ap (both timed as one WAXPY interval)
    TICK(); daxpby(alpha, p, one, x);
            daxpby(-alpha, Ap, one, r); TOCK(tWAXPY);

    num_iters = k;
  }

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
Example #6
0
/*
 * Solves A * x = b by handing the system to the NVAMG library, keeping the
 * miniFE cg_solve signature.
 *
 *   A           : operator; must have has_local_indices set, otherwise an
 *                 error is printed and the function returns immediately.
 *   b           : right-hand side.
 *   x           : overwritten with all ones as the starting guess, then
 *                 with the solution downloaded from the solver.
 *   matvec      : callable used only to evaluate the residual before and
 *                 after the NVAMG solve.
 *   max_iter    : used only to derive print_freq in this variant.
 *   tolerance   : not referenced in this variant.
 *   num_iters   : out - iteration count reported by the NVAMG solver.
 *   normr       : out - sqrt(r.r) of the final residual b - A*x.
 *   my_cg_times : out - indexed by WAXPY, DOT, MATVEC, MATVECDOT, TOTAL.
 *
 * The GPU is chosen from CUDA_NGPU / CUDA_SKIP_GPU and the local rank found
 * in SLURM_LOCALID, MV2_COMM_WORLD_LOCAL_RANK or OMPI_COMM_WORLD_LOCAL_RANK
 * (the last one set wins).
 *
 * NOTE(review): the NVAMG handles, the duplicated communicator and the
 * library initialisation are never released in this function - confirm
 * whether teardown happens elsewhere.
 * NOTE(review): NVAMG_resources_create is the only NVAMG call not wrapped
 * in NVAMG_SAFE_CALL - confirm whether that is intentional.
 */
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
       << std::endl;
    return;
  }

  // Device selection from the environment.
  char* str;
  int ngpu = 2;
  int local_rank = 0;
  int device = 0;
  int skip_gpu = 99999;
  if((str = getenv("CUDA_NGPU")) != NULL) {
    ngpu = atoi(str);
  }
  if((str = getenv("CUDA_SKIP_GPU")) != NULL) {
    skip_gpu = atoi(str);
  }
  if((str = getenv("SLURM_LOCALID")) != NULL) {
    local_rank = atoi(str);
    device = local_rank % ngpu;
    if(device >= skip_gpu) device++;
  }
  if((str = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) {
    local_rank = atoi(str);
    device = local_rank % ngpu;
    if(device >= skip_gpu) device++;
  }
  if((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) {
    local_rank = atoi(str);
    device = local_rank % ngpu;
    if(device >= skip_gpu) device++;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  NVAMG_SAFE_CALL(NVAMG_initialize());
  NVAMG_SAFE_CALL(NVAMG_initialize_plugins());
  NVAMG_matrix_handle matrix;
  NVAMG_vector_handle rhs;
  NVAMG_vector_handle soln;
  NVAMG_resources_handle rsrc = NULL;
  NVAMG_solver_handle solver = NULL;
  NVAMG_config_handle config;
  NVAMG_SAFE_CALL(NVAMG_config_create_from_file(&config,"NVAMG_CONFIG" ));

  MPI_Comm nvamg_comm;
  MPI_Comm_dup(MPI_COMM_WORLD, &nvamg_comm);
  int devices[] = {device};

  NVAMG_resources_create(&rsrc, config, &nvamg_comm, 1, devices);
  NVAMG_SAFE_CALL(NVAMG_solver_create(&solver, rsrc, NVAMG_mode_dDDI, config));
  NVAMG_SAFE_CALL(NVAMG_matrix_create(&matrix, rsrc, NVAMG_mode_dDDI));
  NVAMG_SAFE_CALL(NVAMG_vector_create(&rhs, rsrc, NVAMG_mode_dDDI));
  NVAMG_SAFE_CALL(NVAMG_vector_create(&soln, rsrc, NVAMG_mode_dDDI));

  //Generating communication Maps for NVAMG
  if(A.neighbors.size()>0) {
    const size_t num_neighbors = A.neighbors.size();
    int** send_map = new int*[num_neighbors];
    int** recv_map = new int*[num_neighbors];
    int send_offset = 0;
    // Received entries are numbered after the locally owned rows.
    int recv_offset = A.row_offsets.size()-1;
    for(size_t i = 0; i<num_neighbors; i++) {
      // send_map rows alias A.elements_to_send; recv_map rows are owned here.
      send_map[i] = &A.elements_to_send[send_offset];
      send_offset += A.send_length[i];
      recv_map[i] = new int[A.recv_length[i]];
      for(int j=0; j<A.recv_length[i]; j++)
        recv_map[i][j] = recv_offset+j;
      recv_offset += A.recv_length[i];
    }
    const int** send_map_c = (const int**) send_map;
    const int** recv_map_c = (const int**) recv_map;
    NVAMG_SAFE_CALL(NVAMG_matrix_comm_from_maps_one_ring(
      matrix, 1, A.neighbors.size(),A.neighbors.data(),
      A.send_length.data(), send_map_c,
      A.recv_length.data(), recv_map_c));
    NVAMG_SAFE_CALL(NVAMG_vector_bind(rhs,matrix));
    NVAMG_SAFE_CALL(NVAMG_vector_bind(soln,matrix));
    for(size_t i=0; i<num_neighbors; i++)
      delete [] recv_map[i];
    // Release the outer pointer tables too; previously only the inner
    // recv_map rows were freed.
    delete [] recv_map;
    delete [] send_map;
  }

  // Starting guess: all ones.
  for(size_t i=0;i<x.coefs.size();i++) x.coefs[i]=1;

  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq>50) print_freq = 50;
  if (print_freq<1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  // Initial residual: p = x, Ap = A*p, r = b - Ap.
  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);

  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot_r2(r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = "<< normr << std::endl;
  }
  {

    //Matrix upload needs to happen before vector, otherwise it crashes
    NVAMG_SAFE_CALL(NVAMG_matrix_upload_all(matrix,A.row_offsets.size()-1, A.packed_coefs.size(),1,1, &A.row_offsets[0],&A.packed_cols[0],&A.packed_coefs[0], NULL));
    NVAMG_SAFE_CALL(NVAMG_vector_upload(soln, p.coefs.size(), 1, &p.coefs[0]));
    NVAMG_SAFE_CALL(NVAMG_vector_upload(rhs, b.coefs.size(), 1, &b.coefs[0]));

    NVAMG_SAFE_CALL(NVAMG_solver_setup(solver, matrix));
    NVAMG_SAFE_CALL(NVAMG_solver_solve(solver, rhs, soln));
    NVAMG_SAFE_CALL(NVAMG_vector_download(soln, &x.coefs[0]));

    int niter;
    NVAMG_SAFE_CALL(NVAMG_solver_get_iterations_number(solver, &niter));
    // Hand the solver's iteration count back through the out-parameter,
    // which was previously left untouched.
    num_iters = niter;

    // Recompute the residual on the host from the downloaded solution.
    TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
    TICK();
    matvec(A, p, Ap);
    TOCK(tMATVEC);

    TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

    TICK(); rtrans = dot_r2(r); TOCK(tDOT);

    normr = std::sqrt(rtrans);

    if (myproc == 0) {
      std::cout << "Final Residual = "<< normr << " after " << niter << " iterations" << std::endl;
    }
   }

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
Example #7
0
// Assembles and solves a nonlinear finite-element problem on 'mesh' with a
// Newton iteration whose linear solves use cgsolve(), then compares the
// nodal solution against the manufactured exact solution.
//
//   mesh          : finite-element mesh, including its parallel_data_map.
//   global_max_x,
//   global_max_y  : not referenced in this function body.
//   global_max_z  : passed as 'zmax' to the manufactured solution.
//   print_error   : when true, nodes whose relative error exceeds 0.02 are
//                   written to std::cout.
//
// Returns a PerformanceData with the graph time, the per-Newton-iteration
// element / gather-fill / boundary-condition times, the CG time per CG
// iteration, the iteration counts, and the maximum relative error.
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
                     const int global_max_x ,
                     const int global_max_y ,
                     const int global_max_z ,
                     const bool print_error )
{
    typedef Scalar                              scalar_type ;
    typedef FixtureType                         fixture_type ;
    typedef typename fixture_type::device_type  device_type;
    typedef typename device_type::size_type     size_type ;

    typedef typename fixture_type::FEMeshType mesh_type ;
    typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;

    enum { ElementNodeCount = fixture_type::element_node_count };

    const comm::Machine machine = mesh.parallel_data_map.machine ;

    const size_t element_count = mesh.elem_node_ids.dimension(0);

    //------------------------------------
    // The amount of nonlinearity is proportional to the ratio
    // between T(zmax) and T(zmin).  For the manufactured solution
    // 0 < T(zmin) and 0 < T(zmax)

    const ManufacturedSolution
    exact_solution( /* zmin */ 0 ,
                               /* zmax */ global_max_z ,
                               /* T(zmin) */ 1 ,
                               /* T(zmax) */ 20 );

    //-----------------------------------
    // Convergence Criteria and perf data:

    const size_t cg_iteration_limit = 200 ;
    const double cg_tolerance = 1e-14 ;

    const size_t newton_iteration_limit = 150 ;
    const double newton_tolerance = 1e-14 ;

    size_t cg_iteration_count_total = 0 ;
    double cg_iteration_time = 0 ;

    size_t newton_iteration_count = 0 ;
    double residual_norm_init = 0 ;
    double residual_norm = 0 ;

    PerformanceData perf_data ;

    //------------------------------------
    // Sparse linear system types:

    typedef KokkosArray::View< Scalar[] , device_type >     vector_type ;
    typedef KokkosArray::CrsMatrix< Scalar , device_type >  matrix_type ;
    typedef typename matrix_type::graph_type                matrix_graph_type ;
    typedef typename matrix_type::coefficients_type         matrix_coefficients_type ;

    typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;

    //------------------------------------
    // Problem setup types:

    typedef ElementComputation < mesh_type , Scalar > ElementFunctor ;
    typedef DirichletSolution  < mesh_type , Scalar > DirichletSolutionFunctor ;
    typedef DirichletResidual  < mesh_type , Scalar > DirichletResidualFunctor ;

    typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
    typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;

    typedef GatherFill< matrix_type ,
            mesh_type ,
            elem_matrices_type ,
            elem_vectors_type > GatherFillFunctor ;

    //------------------------------------

    matrix_type jacobian ;
    vector_type residual ;
    vector_type delta ;
    vector_type nodal_solution ;

    typename graph_factory::element_map_type element_map ;

    //------------------------------------
    // Timer reused (via reset()) for every phase measured below.

    KokkosArray::Impl::Timer wall_clock ;

    //------------------------------------
    // Generate sparse matrix graph and element->graph map.

    wall_clock.reset();

    graph_factory::create( mesh , jacobian.graph , element_map );

    device_type::fence();

    perf_data.graph_time = comm::max( machine , wall_clock.seconds() );

    //------------------------------------
    // Allocate linear system coefficients and rhs:

    const size_t local_owned_length = jacobian.graph.row_map.dimension(0) - 1 ;
    const size_t local_total_length = mesh.node_coords.dimension(0);

    jacobian.coefficients =
        matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension(0) );

    // Nonlinear residual for owned nodes:
    residual = vector_type( "residual" , local_owned_length );

    // Nonlinear solution for owned and ghosted nodes:
    nodal_solution = vector_type( "solution" , local_total_length );

    // Nonlinear solution update for owned nodes:
    delta = vector_type( "delta" , local_owned_length );

    //------------------------------------
    // Allocation of arrays to fill the linear system

    elem_matrices_type elem_matrices ; // Jacobian matrices
    elem_vectors_type  elem_vectors ;  // Residual vectors

    if ( element_count ) {
        elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
        elem_vectors = elem_vectors_type( std::string("elem_vectors"), element_count );
    }

    //------------------------------------
    // For boundary condition set the correct values in the solution vector
    //   The 'zmin' face is assigned to 'T_zmin'.
    //   The 'zmax' face is assigned to 'T_zmax'.
    //   The resulting solution is one dimensional along the 'Z' axis.

    DirichletSolutionFunctor::apply( nodal_solution , mesh ,
                                     exact_solution.zmin ,
                                     exact_solution.zmax ,
                                     exact_solution.T_zmin ,
                                     exact_solution.T_zmax );

    for(;;) { // Nonlinear loop

#if defined( HAVE_MPI )

        {   //------------------------------------
            // Import off-processor nodal solution values
            // for residual and jacobian computations

            KokkosArray::AsyncExchange< typename vector_type::value_type , device_type ,
                        KokkosArray::ParallelDataMap >
                        exchange( mesh.parallel_data_map , 1 );

            KokkosArray::PackArray< vector_type >
            ::pack( exchange.buffer() ,
                    mesh.parallel_data_map.count_interior ,
                    mesh.parallel_data_map.count_send ,
                    nodal_solution );

            exchange.setup();

            exchange.send_receive();

            KokkosArray::UnpackArray< vector_type >
            ::unpack( nodal_solution , exchange.buffer() ,
                      mesh.parallel_data_map.count_owned ,
                      mesh.parallel_data_map.count_receive );
        }

#endif

        //------------------------------------
        // Compute element matrices and vectors:

        wall_clock.reset();

        ElementFunctor( mesh ,
                        elem_matrices ,
                        elem_vectors ,
                        nodal_solution ,
                        exact_solution.K );

        device_type::fence();
        perf_data.elem_time += comm::max( machine , wall_clock.seconds() );

        //------------------------------------
        // Fill linear system coefficients:

        wall_clock.reset();

        fill( 0 , jacobian.coefficients );
        fill( 0 , residual );

        GatherFillFunctor::apply( jacobian ,
                                  residual ,
                                  mesh ,
                                  element_map ,
                                  elem_matrices ,
                                  elem_vectors );

        device_type::fence();
        perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );

        // Apply boundary conditions:

        wall_clock.reset();

        // Updates jacobian matrix to 1 on the diagonal, zero elsewhere,
        // and 0 in the residual due to the solution vector having the correct value
        DirichletResidualFunctor::apply( jacobian, residual, mesh ,
                                         exact_solution.zmin ,
                                         exact_solution.zmax );

        device_type::fence();
        perf_data.matrix_boundary_condition_time +=
            comm::max( machine , wall_clock.seconds() );

        //------------------------------------
        // Has the residual converged?
        // Convergence is relative to the residual norm of the first
        // Newton iteration.
        // NOTE(review): if residual_norm_init is 0 the ratio below is 0/0,
        // the comparison is false, and the loop only ends through the
        // iteration limit - confirm whether that case can occur.

        residual_norm = sqrt( dot(mesh.parallel_data_map, residual) );

        if ( 0 == newton_iteration_count ) {
            residual_norm_init = residual_norm ;
        }

        if ( residual_norm / residual_norm_init < newton_tolerance ) {
            break ;
        }

        //------------------------------------
        // Solve linear system

        size_t cg_iteration_count = 0 ;
        double cg_residual_norm = 0 ;

        cgsolve( mesh.parallel_data_map ,
                 jacobian , residual , delta ,
                 cg_iteration_count ,
                 cg_residual_norm ,
                 cg_iteration_time ,
                 cg_iteration_limit , cg_tolerance ) ;

        perf_data.cg_iteration_time += cg_iteration_time ;
        cg_iteration_count_total += cg_iteration_count ;

        // Update non-linear solution with delta...
        // delta is : - Dx = [Jacobian]^-1 * Residual which is the negative update
        // LaTeX:
        // \vec {x}_{n+1} = \vec {x}_{n} - ( - \Delta \vec{x}_{n} )
        // text:
        // x[n+1] = x[n] + Dx

        waxpby( mesh.parallel_data_map,
                1.0, nodal_solution,
                -1.0, delta, nodal_solution);

        ++newton_iteration_count ;

        if ( newton_iteration_limit < newton_iteration_count ) {
            break ;
        }
    };

    // Convert accumulated times into averages: per Newton iteration for the
    // assembly phases, per CG iteration for the linear solve.
    if ( newton_iteration_count ) {
        perf_data.elem_time /= newton_iteration_count ;
        perf_data.matrix_gather_fill_time /= newton_iteration_count ;
        perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
    }

    if ( cg_iteration_count_total ) {
        perf_data.cg_iteration_time /= cg_iteration_count_total ;
    }

    perf_data.newton_iteration_count = newton_iteration_count ;
    perf_data.cg_iteration_count = cg_iteration_count_total ;

    //------------------------------------

    {
        // For extracting the nodal solution and its coordinates:

        typename mesh_type::node_coords_type::HostMirror node_coords_host =
            KokkosArray::create_mirror( mesh.node_coords );

        typename vector_type::HostMirror nodal_solution_host =
            KokkosArray::create_mirror( nodal_solution );

        KokkosArray::deep_copy( node_coords_host , mesh.node_coords );
        KokkosArray::deep_copy( nodal_solution_host , nodal_solution );

        // Largest relative error over the owned nodes of this process.
        double tmp = 0 ;

        for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
            const coordinate_scalar_type x = node_coords_host(i,0);
            const coordinate_scalar_type y = node_coords_host(i,1);
            const coordinate_scalar_type z = node_coords_host(i,2);

            // Tx: exact value at height z, Ts: computed value,
            // Te: relative error between the two.
            const double Tx = exact_solution(z);
            const double Ts = nodal_solution_host(i);
            const double Te = std::abs( Tx - Ts ) / std::abs( Tx );

            tmp = std::max( tmp , Te );

            if ( print_error && 0.02 < Te ) {
                std::cout << "  node( " << x << " " << y << " " << z << " ) = "
                          << Ts << " != exact_solution " << Tx
                          << std::endl ;
            }
        }
        perf_data.error_max = comm::max( machine , tmp );
    }

    return perf_data ;
}
Example #8
0
/**
 * Conjugate-gradient solve of A*x = b, with per-kernel timing.
 *
 * OperatorType, VectorType and Matvec are not declared in this fragment;
 * they are presumably template parameters of the enclosing declaration.
 * NOTE(review): waxpby(a, x, b, y, w) is assumed to compute w = a*x + b*y
 * and dot(u, v) the (global) inner product -- confirm against their
 * definitions.
 *
 * @param A            Operator. A.has_local_indices must be true; otherwise an
 *                     error is printed and the function returns without
 *                     writing any output argument.
 * @param b            Right-hand side.
 * @param x            Initial guess on entry; updated in place each iteration.
 * @param matvec       Callable invoked as matvec(A, p, Ap).
 * @param max_iter     Upper bound on the iteration counter k (k runs 1..max_iter).
 * @param tolerance    Iteration continues while normr > tolerance.
 * @param num_iters    Set to 0 before iterating, then to k at the end of every
 *                     completed iteration.
 * @param normr        sqrt(dot(r, r)). It is refreshed at the top of each
 *                     iteration, so on exit it describes the residual as it
 *                     was before the final update of x and r.
 * @param my_cg_times  Array indexed by WAXPY, DOT, MATVEC, MATVECDOT and TOTAL;
 *                     receives the accumulated timings. No TOCK in this
 *                     function targets tMATVECDOT, so that slot receives 0.
 */
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  // t0 is not referenced by name below; it is presumably the scratch
  // variable used by the TICK()/TOCK() macros -- confirm before removing.
  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
       << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  // Work vectors: r (residual) and Ap have nrows entries, p (search
  // direction) has ncols entries.
  nvtxRangeId_t r1=nvtxRangeStartA("Allocation of Temporary Vectors");
  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);
  nvtxRangeEnd(r1);

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  // Page-lock the host buffers for the duration of the solve; they are
  // unregistered again in the epilogue after the iteration loop.
  cudaHostRegister(&p.coefs[0],ncols*sizeof(typename VectorType::ScalarType),0);
  cudaCheckError();
  if(A.send_buffer.size()>0) cudaHostRegister(&A.send_buffer[0],A.send_buffer.size()*sizeof(typename VectorType::ScalarType),0);
  cudaCheckError();
#endif
#endif

  normr = 0;
  // Give the iteration count a defined value even when the loop below runs
  // zero times (already converged, or max_iter < 1).
  num_iters = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  // Progress is printed roughly ten times per solve, at least every 50
  // iterations and never with a zero period (which would break k%print_freq).
  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq>50) print_freq = 50;
  if (print_freq<1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  // p = x
  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);

  // Ap = A * p
  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  // r = b - Ap  (initial residual)
  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot(r, r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = "<< normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      // First search direction is the residual itself: p = r
      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
    }
    else {
      // beta = (r.r)_new / (r.r)_old ;  p = r + beta * p
      oldrtrans = rtrans;
      TICK(); rtrans = dot(r, r); TOCK(tDOT);
      magnitude_type beta = rtrans/oldrtrans;
      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
    }

    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);

    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    //TODO remove false below
    // The breakdown check is currently disabled by the literal `false`.
    if (false && p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!"<<std::endl;
#endif
        // Leave the loop rather than returning, so the epilogue below
        // unregisters the pinned host buffers and fills in every timer slot.
        break;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    // x = x + alpha * p ;  r = r - alpha * Ap
    TICK(); waxpby(one, x, alpha, p, x);
            waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY);
    num_iters = k;
  }

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  // Undo the cudaHostRegister calls made before the iteration started.
  cudaHostUnregister(&p.coefs[0]);
  cudaCheckError();
  if(A.send_buffer.size()>0) cudaHostUnregister(&A.send_buffer[0]);
  cudaCheckError();
#endif
#endif

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}