void cgsolve( const CrsMatrix<AScalarType,Device> & A , const View<VScalarType*,LayoutRight,Device> & b , const View<VScalarType*,LayoutRight,Device> & x , size_t & iteration , double & normr , double & iter_time , const size_t maximum_iteration = 200 , const double tolerance = std::numeric_limits<VScalarType>::epsilon() ) { typedef View<VScalarType*,LayoutRight,Device> vector_type ; const size_t count = b.dimension_0(); vector_type p ( "cg::p" , count ); vector_type r ( "cg::r" , count ); vector_type Ap( "cg::Ap", count ); /* r = b - A * x ; */ /* p = x */ deep_copy( p , x ); /* Ap = A * p */ multiply( A , p , Ap ); /* r = b - Ap */ waxpby( count , 1.0 , b , -1.0 , Ap , r ); /* p = r */ deep_copy( p , r ); double old_rdot = dot( count , r ); normr = std::sqrt( old_rdot ); iteration = 0 ; Kokkos::Impl::Timer wall_clock ; while ( tolerance < normr && iteration < maximum_iteration ) { /* pAp_dot = dot( p , Ap = A * p ) */ /* Ap = A * p */ multiply( A , p , Ap ); const double pAp_dot = dot( count , p , Ap ); const double alpha = old_rdot / pAp_dot ; /* x += alpha * p ; */ axpy( count, alpha, p , x ); /* r -= alpha * Ap ; */ axpy( count, -alpha, Ap, r ); const double r_dot = dot( count , r ); const double beta = r_dot / old_rdot ; /* p = r + beta * p ; */ xpby( count , r , beta , p ); normr = std::sqrt( old_rdot = r_dot ); ++iteration ; } iter_time = wall_clock.seconds(); }
void cgsolve( const ParallelDataMap data_map , const CrsMatrix<AScalarType,Device> A , const View<VScalarType*,Device> b , const View<VScalarType*,Device> x , size_t & iteration , double & normr , double & iter_time , const size_t maximum_iteration = 200 , const double tolerance = std::numeric_limits<VScalarType>::epsilon() ) { typedef View<VScalarType*,Device> vector_type ; typedef View<VScalarType, Device> value_type ; const size_t count_owned = data_map.count_owned ; const size_t count_total = data_map.count_owned + data_map.count_receive ; Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A ); // Need input vector to matvec to be owned + received vector_type pAll ( "cg::p" , count_total ); vector_type p = Kokkos::subview< vector_type >( pAll , std::pair<size_t,size_t>(0,count_owned) ); vector_type r ( "cg::r" , count_owned ); vector_type Ap( "cg::Ap", count_owned ); /* r = b - A * x ; */ /* p = x */ deep_copy( p , x ); /* Ap = A * p */ matrix_operator.apply( pAll , Ap ); /* r = b - Ap */ waxpby( count_owned , 1.0 , b , -1.0 , Ap , r ); /* p = r */ deep_copy( p , r ); double old_rdot = dot( count_owned , r , data_map.machine ); normr = sqrt( old_rdot ); iteration = 0 ; Kokkos::Impl::Timer wall_clock ; while ( tolerance < normr && iteration < maximum_iteration ) { /* pAp_dot = dot( p , Ap = A * p ) */ /* Ap = A * p */ matrix_operator.apply( pAll , Ap ); const double pAp_dot = dot( count_owned , p , Ap , data_map.machine ); const double alpha = old_rdot / pAp_dot ; /* x += alpha * p ; */ axpy( count_owned, alpha, p , x ); /* r -= alpha * Ap ; */ axpy( count_owned, -alpha, Ap, r ); const double r_dot = dot( count_owned , r , data_map.machine ); const double beta = r_dot / old_rdot ; /* p = r + beta * p ; */ xpby( count_owned , r , beta , p ); normr = sqrt( old_rdot = r_dot ); ++iteration ; } iter_time = wall_clock.seconds(); }
int main() { vector x,b; vector r,p,Ap; matrix A; double one=1.0, zero=0.0; double normr, rtrans, oldtrans, p_ap_dot , alpha, beta; int iter=0; //create matrix allocate_3d_poission_matrix(A,N); printf("Rows: %d, nnz: %d\n", A.num_rows, A.row_offsets[A.num_rows]); allocate_vector(x,A.num_rows); allocate_vector(Ap,A.num_rows); allocate_vector(r,A.num_rows); allocate_vector(p,A.num_rows); allocate_vector(b,A.num_rows); initialize_vector(x,100000); initialize_vector(b,1); waxpby(one, x, zero, x, p); matvec(A,p,Ap); waxpby(one, b, -one, Ap, r); rtrans=dot(r,r); normr=sqrt(rtrans); double st = omp_get_wtime(); do { if(iter==0) { waxpby(one,r,zero,r,p); } else { oldtrans=rtrans; rtrans = dot(r,r); beta = rtrans/oldtrans; waxpby(one,r,beta,p,p); } normr=sqrt(rtrans); matvec(A,p,Ap); p_ap_dot = dot(Ap,p); alpha = rtrans/p_ap_dot; waxpby(one,x,alpha,p,x); waxpby(one,r,-alpha,Ap,r); if(iter%10==0) printf("Iteration: %d, Tolerance: %.4e\n", iter, normr); iter++; } while(iter<MAX_ITERS && normr>TOL); double et = omp_get_wtime(); printf("Total Iterations: %d\n", iter); printf("Total Time: %lf s\n", (et-st)); free_vector(x); free_vector(r); free_vector(p); free_vector(Ap); free_matrix(A); return 0; }
inline void pcgsolve( //const ImportType & import, KernelHandle &kh , const CrsMatrix <typename KernelHandle::nonzero_value_type , typename KernelHandle::row_index_type, typename KernelHandle::HandleExecSpace > & A , const Kokkos::View <typename KernelHandle::nonzero_value_type *, typename KernelHandle::HandleExecSpace> & b , const Kokkos::View <typename KernelHandle::nonzero_value_type * , typename KernelHandle::HandleExecSpace > & x , const size_t maximum_iteration = 200 , const double tolerance = std::numeric_limits<double>::epsilon() , CGSolveResult * result = 0 , bool use_sgs = true ) { typedef typename KernelHandle::HandleExecSpace Space; //typedef typename KernelHandle::nonzero_value_type MScalar; typedef typename KernelHandle::nonzero_value_type VScalar; //typedef typename KernelHandle::row_index_type Idx_Type; //typedef typename KernelHandle::idx_array_type idx_array_type; typedef typename Kokkos::View< VScalar * , Space > VectorType ; //const size_t count_owned = import.count_owned ; //const size_t count_total = import.count_owned + import.count_receive; const size_t count_owned = A.graph.nv; const size_t count_total = count_owned; size_t iteration = 0 ; double iter_time = 0 ; double matvec_time = 0 ; double norm_res = 0 ; double precond_time = 0; double precond_init_time = 0; Kokkos::Impl::Timer wall_clock ; Kokkos::Impl::Timer timer; // Need input vector to matvec to be owned + received VectorType pAll ( "cg::p" , count_total ); VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) ); VectorType r ( "cg::r" , count_owned ); VectorType Ap( "cg::Ap", count_owned ); /* r = b - A * x ; */ /* p = x */ Kokkos::deep_copy( p , x ); ///* import p */ import( pAll ); /* Ap = A * p */ multiply( count_owned , Ap , A , pAll ); /* r = b - Ap */ waxpby( count_owned , r , 1.0 , b , -1.0 , Ap ); /* p = r */ Kokkos::deep_copy( p , r ); //double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm ); double old_rdot = 
dot( count_owned , r , r ); norm_res = sqrt( old_rdot ); int apply_count = 1; VectorType z; //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm ); double precond_old_rdot = 1; #ifdef PRECOND_NORM double precond_norm_res = 1; #endif Kokkos::deep_copy( p , z ); //typename KernelHandle::GaussSeidelHandleType *gsHandler; bool owner_handle = false; if (use_sgs){ if (kh.get_gs_handle() == NULL){ owner_handle = true; kh.create_gs_handle(); } //gsHandler = kh.get_gs_handle(); timer.reset(); KokkosKernels::Experimental::Graph::gauss_seidel_numeric (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff); Space::fence(); precond_init_time += timer.seconds(); z = VectorType( "pcg::z" , count_owned ); Space::fence(); timer.reset(); KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count); Space::fence(); precond_time += timer.seconds(); //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm ); precond_old_rdot = dot( count_owned , r , z ); #ifdef PRECOND_NORM precond_norm_res = sqrt( precond_old_rdot ); #endif Kokkos::deep_copy( p , z ); } iteration = 0 ; #ifdef PRINTRES std::cout << "norm_res:" << norm_res << " old_rdot:" << old_rdot<< std::endl; #ifdef PRECOND_NORM if (use_sgs) std::cout << "precond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<< std::endl; #endif #endif while ( tolerance < norm_res && iteration < maximum_iteration ) { /* pAp_dot = dot( p , Ap = A * p ) */ timer.reset(); ///* import p */ import( pAll ); /* Ap = A * p */ multiply( count_owned , Ap , A , pAll ); Space::fence(); matvec_time += timer.seconds(); //const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm ); const double pAp_dot = dot( count_owned , p , Ap ) ; double alpha = 0; if (use_sgs){ alpha = precond_old_rdot / 
pAp_dot ; } else { alpha = old_rdot / pAp_dot ; } /* x += alpha * p ; */ waxpby( count_owned , x , alpha, p , 1.0 , x ); /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r ); //const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm ); const double r_dot = dot( count_owned , r , r ); const double beta_original = r_dot / old_rdot ; double precond_r_dot = 1; double precond_beta = 1; if (use_sgs){ Space::fence(); timer.reset(); KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply(&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count); Space::fence(); precond_time += timer.seconds(); //const double precond_r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm ); precond_r_dot = dot( count_owned , r , z ); precond_beta = precond_r_dot / precond_old_rdot ; } double beta = 1; if (!use_sgs){ beta = beta_original; /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p ); } else { beta = precond_beta; waxpby( count_owned , p , 1.0 , z , beta , p ); } #ifdef PRINTRES std::cout << "\tbeta_original:" << beta_original << std::endl; if (use_sgs) std::cout << "\tprecond_beta:" << precond_beta << std::endl; #endif norm_res = sqrt( old_rdot = r_dot ); #ifdef PRECOND_NORM if (use_sgs){ precond_norm_res = sqrt( precond_old_rdot = precond_r_dot ); } #else precond_old_rdot = precond_r_dot; #endif #ifdef PRINTRES std::cout << "\tnorm_res:" << norm_res << " old_rdot:" << old_rdot<< std::endl; #ifdef PRECOND_NORM if (use_sgs) std::cout << "\tprecond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<< std::endl; #endif #endif ++iteration ; } Space::fence(); iter_time = wall_clock.seconds(); if ( 0 != result ) { result->iteration = iteration ; result->iter_time = iter_time ; result->matvec_time = matvec_time ; result->norm_res = norm_res ; result->precond_time = precond_time; result->precond_init_time = 
precond_init_time; } if (use_sgs & owner_handle ){ kh.destroy_gs_handle(); } }
void cg_solve(OperatorType& A, const VectorType& b, VectorType& x, Matvec matvec, typename OperatorType::LocalOrdinalType max_iter, typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance, typename OperatorType::LocalOrdinalType& num_iters, typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr, timer_type* my_cg_times) { typedef typename OperatorType::ScalarType ScalarType; typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType; typedef typename OperatorType::LocalOrdinalType LocalOrdinalType; typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type; timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0; timer_type total_time = mytimer(); int myproc = 0; #ifdef HAVE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &myproc); #endif if (!A.has_local_indices) { std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means " << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve." 
<< std::endl; return; } size_t nrows = A.rows.size(); LocalOrdinalType ncols = A.num_cols; VectorType r(b.startIndex, nrows, 256); VectorType p(0, ncols, 512); VectorType Ap(b.startIndex, nrows, 64); normr = 0; magnitude_type rtrans = 0; magnitude_type oldrtrans = 0; LocalOrdinalType print_freq = max_iter/10; if (print_freq>50) print_freq = 50; if (print_freq<1) print_freq = 1; ScalarType one = 1.0; ScalarType zero = 0.0; TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY); // print_vec(p.coefs, "p"); TICK(); matvec(A, p, Ap); TOCK(tMATVEC); TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY); TICK(); rtrans = dot_r2(r); TOCK(tDOT); //std::cout << "rtrans="<<rtrans<<std::endl; normr = std::sqrt(rtrans); if (myproc == 0) { std::cout << "Initial Residual = "<< normr << std::endl; } magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon(); #ifdef MINIFE_DEBUG std::ostream& os = outstream(); os << "brkdown_tol = " << brkdown_tol << std::endl; #endif #ifdef MINIFE_DEBUG_OPENMP std::cout << "Starting CG Solve Phase..." 
<< std::endl; #endif for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) { if (k == 1) { //TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY); TICK(); daxpby(one, r, zero, p); TOCK(tWAXPY); } else { oldrtrans = rtrans; TICK(); rtrans = dot_r2(r); TOCK(tDOT); const magnitude_type beta = rtrans/oldrtrans; TICK(); daxpby(one, r, beta, p); TOCK(tWAXPY); } normr = sqrt(rtrans); if (myproc == 0 && (k%print_freq==0 || k==max_iter)) { std::cout << "Iteration = "<<k<<" Residual = "<<normr<<std::endl; } magnitude_type alpha = 0; magnitude_type p_ap_dot = 0; TICK(); matvec(A, p, Ap); TOCK(tMATVEC); TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT); #ifdef MINIFE_DEBUG os << "iter " << k << ", p_ap_dot = " << p_ap_dot; os.flush(); #endif if (p_ap_dot < brkdown_tol) { if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) { std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl; #ifdef MINIFE_DEBUG os << "ERROR, numerical breakdown!"<<std::endl; #endif //update the timers before jumping out. my_cg_times[WAXPY] = tWAXPY; my_cg_times[DOT] = tDOT; my_cg_times[MATVEC] = tMATVEC; my_cg_times[TOTAL] = mytimer() - total_time; return; } else brkdown_tol = 0.1 * p_ap_dot; } alpha = rtrans/p_ap_dot; #ifdef MINIFE_DEBUG os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl; #endif TICK(); daxpby(alpha, p, one, x); daxpby(-alpha, Ap, one, r); TOCK(tWAXPY); num_iters = k; } my_cg_times[WAXPY] = tWAXPY; my_cg_times[DOT] = tDOT; my_cg_times[MATVEC] = tMATVEC; my_cg_times[MATVECDOT] = tMATVECDOT; my_cg_times[TOTAL] = mytimer() - total_time; }
void cg_solve(OperatorType& A, const VectorType& b, VectorType& x, Matvec matvec, typename OperatorType::LocalOrdinalType max_iter, typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance, typename OperatorType::LocalOrdinalType& num_iters, typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr, timer_type* my_cg_times) { typedef typename OperatorType::ScalarType ScalarType; typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType; typedef typename OperatorType::LocalOrdinalType LocalOrdinalType; typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type; timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0; timer_type total_time = mytimer(); int myproc = 0; #ifdef HAVE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &myproc); #endif if (!A.has_local_indices) { std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means " << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve." 
<< std::endl; return; } char* str; int ngpu = 2; int local_rank = 0; int device = 0; int skip_gpu = 99999; if((str = getenv("CUDA_NGPU")) != NULL) { ngpu = atoi(str); } if((str = getenv("CUDA_SKIP_GPU")) != NULL) { skip_gpu = atoi(str); } if((str = getenv("SLURM_LOCALID")) != NULL) { local_rank = atoi(str); device = local_rank % ngpu; if(device >= skip_gpu) device++; } if((str = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) { local_rank = atoi(str); device = local_rank % ngpu; if(device >= skip_gpu) device++; } if((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) { local_rank = atoi(str); device = local_rank % ngpu; if(device >= skip_gpu) device++; } size_t nrows = A.rows.size(); LocalOrdinalType ncols = A.num_cols; NVAMG_SAFE_CALL(NVAMG_initialize()); NVAMG_SAFE_CALL(NVAMG_initialize_plugins()); NVAMG_matrix_handle matrix; NVAMG_vector_handle rhs; NVAMG_vector_handle soln; NVAMG_resources_handle rsrc = NULL; NVAMG_solver_handle solver = NULL; NVAMG_config_handle config; NVAMG_SAFE_CALL(NVAMG_config_create_from_file(&config,"NVAMG_CONFIG" )); MPI_Comm nvamg_comm; MPI_Comm_dup(MPI_COMM_WORLD, &nvamg_comm); int devices[] = {device}; NVAMG_resources_create(&rsrc, config, &nvamg_comm, 1, devices); NVAMG_SAFE_CALL(NVAMG_solver_create(&solver, rsrc, NVAMG_mode_dDDI, config)); NVAMG_SAFE_CALL(NVAMG_matrix_create(&matrix, rsrc, NVAMG_mode_dDDI)); NVAMG_SAFE_CALL(NVAMG_vector_create(&rhs, rsrc, NVAMG_mode_dDDI)); NVAMG_SAFE_CALL(NVAMG_vector_create(&soln, rsrc, NVAMG_mode_dDDI)); //Generating communication Maps for NVAMG if(A.neighbors.size()>0) { int** send_map = new int*[A.neighbors.size()]; int** recv_map = new int*[A.neighbors.size()]; int send_offset = 0; int recv_offset = A.row_offsets.size()-1;; for(int i = 0; i<A.neighbors.size();i++) { send_map[i] = &A.elements_to_send[send_offset]; send_offset += A.send_length[i]; recv_map[i] = new int[A.recv_length[i]]; for(int j=0; j<A.recv_length[i]; j++) recv_map[i][j] = recv_offset+j; recv_offset += A.recv_length[i]; } 
const int** send_map_c = (const int**) send_map; const int** recv_map_c = (const int**) recv_map; NVAMG_SAFE_CALL(NVAMG_matrix_comm_from_maps_one_ring( matrix, 1, A.neighbors.size(),A.neighbors.data(), A.send_length.data(), send_map_c, A.recv_length.data(), recv_map_c)); NVAMG_SAFE_CALL(NVAMG_vector_bind(rhs,matrix)); NVAMG_SAFE_CALL(NVAMG_vector_bind(soln,matrix)); for(int i=0; i<A.neighbors.size(); i++) delete [] recv_map[i]; } for(int i=0;i<x.coefs.size();i++) x.coefs[i]=1; VectorType r(b.startIndex, nrows); VectorType p(0, ncols); VectorType Ap(b.startIndex, nrows); normr = 0; magnitude_type rtrans = 0; magnitude_type oldrtrans = 0; LocalOrdinalType print_freq = max_iter/10; if (print_freq>50) print_freq = 50; if (print_freq<1) print_freq = 1; ScalarType one = 1.0; ScalarType zero = 0.0; TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY); TICK(); matvec(A, p, Ap); TOCK(tMATVEC); TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY); TICK(); rtrans = dot_r2(r); TOCK(tDOT); normr = std::sqrt(rtrans); if (myproc == 0) { std::cout << "Initial Residual = "<< normr << std::endl; } { //Matrix upload needs to happen before vector, otherwise it crashes NVAMG_SAFE_CALL(NVAMG_matrix_upload_all(matrix,A.row_offsets.size()-1, A.packed_coefs.size(),1,1, &A.row_offsets[0],&A.packed_cols[0],&A.packed_coefs[0], NULL)); NVAMG_SAFE_CALL(NVAMG_vector_upload(soln, p.coefs.size(), 1, &p.coefs[0])); NVAMG_SAFE_CALL(NVAMG_vector_upload(rhs, b.coefs.size(), 1, &b.coefs[0])); int n = 0; int bsize_x = 0, bsize_y = 0; NVAMG_SAFE_CALL(NVAMG_solver_setup(solver, matrix)); NVAMG_SAFE_CALL(NVAMG_solver_solve(solver, rhs, soln)); NVAMG_SAFE_CALL(NVAMG_vector_download(soln, &x.coefs[0])); int niter; NVAMG_SAFE_CALL(NVAMG_solver_get_iterations_number(solver, &niter)); TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY); TICK(); matvec(A, p, Ap); TOCK(tMATVEC); TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY); TICK(); rtrans = dot_r2(r); TOCK(tDOT); normr = std::sqrt(rtrans); if (myproc == 0) { std::cout 
<< "Final Residual = "<< normr << " after " << niter << " iterations" << std::endl; } } my_cg_times[WAXPY] = tWAXPY; my_cg_times[DOT] = tDOT; my_cg_times[MATVEC] = tMATVEC; my_cg_times[MATVECDOT] = tMATVECDOT; my_cg_times[TOTAL] = mytimer() - total_time; }
/* Newton / CG driver for the nonlinear manufactured-solution problem.
 *
 * Each Newton step (1) imports off-processor solution values when HAVE_MPI
 * is defined, (2) computes element matrices and vectors, (3) gathers them
 * into the Jacobian and residual, (4) applies the Dirichlet boundary
 * conditions, (5) tests the relative residual norm and (6) solves
 * Jacobian * delta = residual with cgsolve and subtracts delta from the
 * nodal solution.
 *
 * mesh         : finite element mesh with its parallel data map.
 * global_max_z : upper z bound passed to the manufactured solution
 *                (global_max_x / global_max_y are not used here).
 * print_error  : when true, nodes whose relative error exceeds 0.02 are
 *                printed.
 *
 * Returns timings (per-phase times are averaged over Newton iterations, CG
 * time over total CG iterations), iteration counts and the maximum relative
 * error against the exact solution.
 */
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
                     const int global_max_x ,
                     const int global_max_y ,
                     const int global_max_z ,
                     const bool print_error )
{
  typedef Scalar scalar_type ;
  typedef FixtureType fixture_type ;
  typedef typename fixture_type::device_type device_type;
  typedef typename device_type::size_type size_type ;

  typedef typename fixture_type::FEMeshType mesh_type ;
  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;

  enum { ElementNodeCount = fixture_type::element_node_count };

  const comm::Machine machine = mesh.parallel_data_map.machine ;

  const size_t element_count = mesh.elem_node_ids.dimension(0);

  //------------------------------------
  // The amount of nonlinearity is proportional to the ratio
  // between T(zmax) and T(zmin).  For the manufactured solution
  // 0 < T(zmin) and 0 < T(zmax)

  const ManufacturedSolution
    exact_solution( /* zmin */ 0 ,
                    /* zmax */ global_max_z ,
                    /* T(zmin) */ 1 ,
                    /* T(zmax) */ 20 );

  //-----------------------------------
  // Convergence Criteria and perf data:

  const size_t cg_iteration_limit = 200 ;
  const double cg_tolerance = 1e-14 ;

  const size_t newton_iteration_limit = 150 ;
  const double newton_tolerance = 1e-14 ;

  size_t cg_iteration_count_total = 0 ;
  double cg_iteration_time = 0 ;

  size_t newton_iteration_count = 0 ;
  double residual_norm_init = 0 ;
  double residual_norm = 0 ;

  PerformanceData perf_data ;

  //------------------------------------
  // Sparse linear system types:

  typedef KokkosArray::View< Scalar[] , device_type > vector_type ;
  typedef KokkosArray::CrsMatrix< Scalar , device_type > matrix_type ;
  typedef typename matrix_type::graph_type matrix_graph_type ;
  typedef typename matrix_type::coefficients_type matrix_coefficients_type ;

  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;

  //------------------------------------
  // Problem setup types:

  typedef ElementComputation < mesh_type , Scalar > ElementFunctor ;
  typedef DirichletSolution < mesh_type , Scalar > DirichletSolutionFunctor ;
  typedef DirichletResidual < mesh_type , Scalar > DirichletResidualFunctor ;

  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
  typedef typename ElementFunctor::elem_vectors_type elem_vectors_type ;

  typedef GatherFill< matrix_type ,
                      mesh_type ,
                      elem_matrices_type ,
                      elem_vectors_type > GatherFillFunctor ;

  //------------------------------------

  matrix_type jacobian ;
  vector_type residual ;
  vector_type delta ;
  vector_type nodal_solution ;

  typename graph_factory::element_map_type element_map ;

  //------------------------------------
  // Generate mesh and corresponding sparse matrix graph

  KokkosArray::Impl::Timer wall_clock ;

  //------------------------------------
  // Generate sparse matrix graph and element->graph map.

  wall_clock.reset();

  graph_factory::create( mesh , jacobian.graph , element_map );

  device_type::fence();

  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );

  //------------------------------------
  // Allocate linear system coefficients and rhs:

  const size_t local_owned_length = jacobian.graph.row_map.dimension(0) - 1 ;
  const size_t local_total_length = mesh.node_coords.dimension(0);

  jacobian.coefficients =
    matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension(0) );

  // Nonlinear residual for owned nodes:
  residual = vector_type( "residual" , local_owned_length );

  // Nonlinear solution for owned and ghosted nodes:
  nodal_solution = vector_type( "solution" , local_total_length );

  // Nonlinear solution update for owned nodes:
  delta = vector_type( "delta" , local_owned_length );

  //------------------------------------
  // Allocation of arrays to fill the linear system

  elem_matrices_type elem_matrices ; // Jacobian matrices
  elem_vectors_type elem_vectors ;   // Residual vectors

  if ( element_count ) {
    elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
    elem_vectors = elem_vectors_type( std::string("elem_vectors"), element_count );
  }

  //------------------------------------
  // For boundary condition set the correct values in the solution vector
  //   The 'zmin' face is assigned to 'T_zmin'.
  //   The 'zmax' face is assigned to 'T_zmax'.
  // The resulting solution is one dimensional along the 'Z' axis.

  DirichletSolutionFunctor::apply( nodal_solution , mesh ,
                                   exact_solution.zmin ,
                                   exact_solution.zmax ,
                                   exact_solution.T_zmin ,
                                   exact_solution.T_zmax );

  for(;;) { // Nonlinear loop

#if defined( HAVE_MPI )

    { //------------------------------------
      // Import off-processor nodal solution values
      // for residual and jacobian computations

      KokkosArray::AsyncExchange< typename vector_type::value_type , device_type ,
                                  KokkosArray::ParallelDataMap >
        exchange( mesh.parallel_data_map , 1 );

      KokkosArray::PackArray< vector_type >
        ::pack( exchange.buffer() ,
                mesh.parallel_data_map.count_interior ,
                mesh.parallel_data_map.count_send ,
                nodal_solution );

      exchange.setup();

      exchange.send_receive();

      KokkosArray::UnpackArray< vector_type >
        ::unpack( nodal_solution , exchange.buffer() ,
                  mesh.parallel_data_map.count_owned ,
                  mesh.parallel_data_map.count_receive );
    }

#endif

    //------------------------------------
    // Compute element matrices and vectors:

    wall_clock.reset();

    ElementFunctor( mesh ,
                    elem_matrices ,
                    elem_vectors ,
                    nodal_solution ,
                    exact_solution.K );

    device_type::fence();
    perf_data.elem_time += comm::max( machine , wall_clock.seconds() );

    //------------------------------------
    // Fill linear system coefficients:

    wall_clock.reset();

    fill( 0 , jacobian.coefficients );
    fill( 0 , residual );

    GatherFillFunctor::apply( jacobian ,
                              residual ,
                              mesh ,
                              element_map ,
                              elem_matrices ,
                              elem_vectors );

    device_type::fence();
    perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );

    // Apply boundary conditions:

    wall_clock.reset();

    // Updates jacobian matrix to 1 on the diagonal, zero elsewhere,
    // and 0 in the residual due to the solution vector having the correct value
    DirichletResidualFunctor::apply( jacobian, residual, mesh ,
                                     exact_solution.zmin ,
                                     exact_solution.zmax );

    device_type::fence();
    perf_data.matrix_boundary_condition_time +=
      comm::max( machine , wall_clock.seconds() );

    //------------------------------------
    // Has the residual converged?

    residual_norm = sqrt( dot(mesh.parallel_data_map, residual) );

    if ( 0 == newton_iteration_count ) {
      residual_norm_init = residual_norm ;
    }

    // A zero initial residual means the starting solution is already exact.
    // Without this guard the ratio below is 0/0 = NaN, the comparison is
    // false and the loop runs until the Newton iteration limit.
    if ( 0 == residual_norm_init ||
         residual_norm / residual_norm_init < newton_tolerance ) {
      break ;
    }

    //------------------------------------
    // Solve linear system

    size_t cg_iteration_count = 0 ;
    double cg_residual_norm = 0 ;

    cgsolve( mesh.parallel_data_map ,
             jacobian , residual , delta ,
             cg_iteration_count ,
             cg_residual_norm ,
             cg_iteration_time ,
             cg_iteration_limit , cg_tolerance ) ;

    perf_data.cg_iteration_time += cg_iteration_time ;
    cg_iteration_count_total += cg_iteration_count ;

    // Update non-linear solution with delta...
    // delta is : - Dx = [Jacobian]^-1 * Residual which is the negative update
    // LaTeX:
    // \vec {x}_{n+1} = \vec {x}_{n} - ( - \Delta \vec{x}_{n} )
    // text:
    // x[n+1] = x[n] + Dx

    waxpby( mesh.parallel_data_map,
            1.0, nodal_solution,
            -1.0, delta, nodal_solution);

    ++newton_iteration_count ;

    if ( newton_iteration_limit < newton_iteration_count ) {
      break ;
    }
  }

  // Report per-iteration averages rather than totals.
  if ( newton_iteration_count ) {
    perf_data.elem_time /= newton_iteration_count ;
    perf_data.matrix_gather_fill_time /= newton_iteration_count ;
    perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
  }

  if ( cg_iteration_count_total ) {
    perf_data.cg_iteration_time /= cg_iteration_count_total ;
  }

  perf_data.newton_iteration_count = newton_iteration_count ;
  perf_data.cg_iteration_count = cg_iteration_count_total ;

  //------------------------------------

  {
    // For extracting the nodal solution and its coordinates:

    typename mesh_type::node_coords_type::HostMirror node_coords_host =
      KokkosArray::create_mirror( mesh.node_coords );

    typename vector_type::HostMirror nodal_solution_host =
      KokkosArray::create_mirror( nodal_solution );

    KokkosArray::deep_copy( node_coords_host , mesh.node_coords );
    KokkosArray::deep_copy( nodal_solution_host , nodal_solution );

    // Maximum relative error over the owned nodes.
    double tmp = 0 ;

    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
      const coordinate_scalar_type x = node_coords_host(i,0);
      const coordinate_scalar_type y = node_coords_host(i,1);
      const coordinate_scalar_type z = node_coords_host(i,2);

      const double Tx = exact_solution(z);
      const double Ts = nodal_solution_host(i);
      const double Te = std::abs( Tx - Ts ) / std::abs( Tx );

      tmp = std::max( tmp , Te );

      if ( print_error && 0.02 < Te ) {
        std::cout << " node( " << x << " " << y << " " << z << " ) = "
                  << Ts << " != exact_solution " << Tx
                  << std::endl ;
      }
    }
    perf_data.error_max = comm::max( machine , tmp );
  }

  return perf_data ;
}
void cg_solve(OperatorType& A, const VectorType& b, VectorType& x, Matvec matvec, typename OperatorType::LocalOrdinalType max_iter, typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance, typename OperatorType::LocalOrdinalType& num_iters, typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr, timer_type* my_cg_times) { typedef typename OperatorType::ScalarType ScalarType; typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType; typedef typename OperatorType::LocalOrdinalType LocalOrdinalType; typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type; timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0; timer_type total_time = mytimer(); int myproc = 0; #ifdef HAVE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &myproc); #endif if (!A.has_local_indices) { std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means " << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve." << std::endl; return; } size_t nrows = A.rows.size(); LocalOrdinalType ncols = A.num_cols; nvtxRangeId_t r1=nvtxRangeStartA("Allocation of Temporary Vectors"); VectorType r(b.startIndex, nrows); VectorType p(0, ncols); VectorType Ap(b.startIndex, nrows); nvtxRangeEnd(r1); #ifdef HAVE_MPI #ifndef GPUDIRECT //TODO move outside? 
cudaHostRegister(&p.coefs[0],ncols*sizeof(typename VectorType::ScalarType),0); cudaCheckError(); if(A.send_buffer.size()>0) cudaHostRegister(&A.send_buffer[0],A.send_buffer.size()*sizeof(typename VectorType::ScalarType),0); cudaCheckError(); #endif #endif normr = 0; magnitude_type rtrans = 0; magnitude_type oldrtrans = 0; LocalOrdinalType print_freq = max_iter/10; if (print_freq>50) print_freq = 50; if (print_freq<1) print_freq = 1; ScalarType one = 1.0; ScalarType zero = 0.0; TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY); TICK(); matvec(A, p, Ap); TOCK(tMATVEC); TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY); TICK(); rtrans = dot(r, r); TOCK(tDOT); normr = std::sqrt(rtrans); if (myproc == 0) { std::cout << "Initial Residual = "<< normr << std::endl; } magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon(); #ifdef MINIFE_DEBUG std::ostream& os = outstream(); os << "brkdown_tol = " << brkdown_tol << std::endl; #endif for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) { if (k == 1) { TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY); } else { oldrtrans = rtrans; TICK(); rtrans = dot(r, r); TOCK(tDOT); magnitude_type beta = rtrans/oldrtrans; TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY); } normr = std::sqrt(rtrans); if (myproc == 0 && (k%print_freq==0 || k==max_iter)) { std::cout << "Iteration = "<<k<<" Residual = "<<normr<<std::endl; } magnitude_type alpha = 0; magnitude_type p_ap_dot = 0; TICK(); matvec(A, p, Ap); TOCK(tMATVEC); TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT); #ifdef MINIFE_DEBUG os << "iter " << k << ", p_ap_dot = " << p_ap_dot; os.flush(); #endif //TODO remove false below if (false && p_ap_dot < brkdown_tol) { if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) { std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl; #ifdef MINIFE_DEBUG os << "ERROR, numerical breakdown!"<<std::endl; #endif //update the timers before jumping out. 
my_cg_times[WAXPY] = tWAXPY; my_cg_times[DOT] = tDOT; my_cg_times[MATVEC] = tMATVEC; my_cg_times[TOTAL] = mytimer() - total_time; return; } else brkdown_tol = 0.1 * p_ap_dot; } alpha = rtrans/p_ap_dot; #ifdef MINIFE_DEBUG os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl; #endif TICK(); waxpby(one, x, alpha, p, x); waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY); num_iters = k; } #ifdef HAVE_MPI #ifndef GPUDIRECT //TODO move outside? cudaHostUnregister(&p.coefs[0]); cudaCheckError(); if(A.send_buffer.size()>0) cudaHostUnregister(&A.send_buffer[0]); cudaCheckError(); #endif #endif my_cg_times[WAXPY] = tWAXPY; my_cg_times[DOT] = tDOT; my_cg_times[MATVEC] = tMATVEC; my_cg_times[MATVECDOT] = tMATVECDOT; my_cg_times[TOTAL] = mytimer() - total_time; }