Perf fenl_assembly( const Teuchos::RCP<const Teuchos::Comm<int> >& comm , const int use_print , const int use_trials , const int use_nodes[] , Kokkos::Example::FENL::DeviceConfig dev_config , Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual) { using Teuchos::RCP; using Teuchos::rcp; using Teuchos::rcpFromRef; using Teuchos::arrayView; using Teuchos::ParameterList; typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ; typedef Kokkos::Example::FENL::CrsMatrix< Scalar , Device > LocalMatrixType ; typedef typename LocalMatrixType::StaticCrsGraphType LocalGraphType ; typedef Kokkos::Example::FENL::NodeNodeGraph< typename FixtureType::elem_node_type , LocalGraphType , FixtureType::ElemNode > NodeNodeGraphType ; //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType; typedef Kokkos::Example::FENL::ExponentialKLCoefficient< Scalar, double, Device > CoeffFunctionType; typedef Kokkos::Example::FENL::ElementComputation< FixtureType , LocalMatrixType , Method, CoeffFunctionType > ElementComputationType ; // typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , LocalMatrixType > // DirichletComputationType ; typedef typename ElementComputationType::vector_type VectorType ; typedef Kokkos::Example::VectorImport< typename FixtureType::comm_list_type , typename FixtureType::send_nodeid_type , VectorType > ImportType ; //------------------------------------ const int print_flag = use_print && Kokkos::Impl::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ; const int comm_rank = comm->getRank(); const int comm_size = comm->getSize(); // Decompose by node to avoid parallel communication in assembly const double bubble_x = 1.0 ; const double bubble_y = 1.0 ; const double bubble_z = 1.0 ; const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode , comm_size , comm_rank , use_nodes[0] , use_nodes[1] , use_nodes[2] , bubble_x , bubble_y , bubble_z ); //------------------------------------ const ImportType comm_nodal_import( comm , fixture.recv_node() , fixture.send_node() , fixture.send_nodeid() , fixture.node_count_owned() , fixture.node_count() - fixture.node_count_owned() ); //------------------------------------ // const double bc_lower_value = 1 ; // const double bc_upper_value = 2 ; //CoeffFunctionType diffusion_coefficient( 1.0 ); CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 ); Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 ); //------------------------------------ if ( print_flag ) { std::cout << "ElemNode {" << std::endl ; for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) { std::cout << " elem[" << ielem << "]{" ; for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) { std::cout << " " << fixture.elem_node(ielem,inode); } std::cout << " }" << std::endl ; } std::cout << "}" << std::endl ; } //------------------------------------ Kokkos::Impl::Timer wall_clock ; Perf perf_stats = Perf() ; for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) { Perf perf = Perf() ; perf.global_elem_count = fixture.elem_count_global(); perf.global_node_count = fixture.node_count_global(); //---------------------------------- // Create the local sparse matrix graph and element-to-graph map // from the element->to->node identifier array. // The graph only has rows for the owned nodes. typename NodeNodeGraphType::Times graph_times; const NodeNodeGraphType mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(), graph_times ); // Create the local sparse matrix from the graph: LocalMatrixType jacobian( mesh_to_graph.graph ); //---------------------------------- // Allocate solution vector for each node in the mesh and residual vector for each owned node VectorType nodal_solution( "nodal_solution" , fixture.node_count() ); nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() ); // Get DeviceConfig structs used by some functors Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc; Kokkos::Example::FENL::CreateDeviceConfigs<Scalar>::eval( dev_config_elem, dev_config_bc ); // Create element computation functor const ElementComputationType elemcomp( fixture , diffusion_coefficient , nodal_solution , mesh_to_graph.elem_graph , jacobian , nodal_residual , dev_config_elem ); // Create boundary condition functor // const DirichletComputationType dirichlet( // fixture , nodal_solution , jacobian , nodal_residual , // 2 /* apply at 'z' ends */ , // bc_lower_value , // bc_upper_value , // dev_config_bc ); Kokkos::deep_copy( nodal_solution , Scalar(1) ); //-------------------------------- wall_clock.reset(); comm_nodal_import( nodal_solution ); Device::fence(); perf.import_time = maximum( comm , wall_clock.seconds() ); //-------------------------------- // Element contributions to residual and jacobian wall_clock.reset(); Kokkos::deep_copy( nodal_residual , Scalar(0) ); Kokkos::deep_copy( jacobian.values , Scalar(0) ); elemcomp.apply(); //-------------------------------- // Apply boundary conditions //dirichlet.apply(); Device::fence(); perf.fill_time = maximum( comm , wall_clock.seconds() ); //-------------------------------- perf_stats.increment(perf); } return perf_stats ; }
Perf fenl( MPI_Comm comm , const int use_print , const int use_trials , const int use_atomic , const int use_elems[] ) { typedef Kokkos::Example::BoxElemFixture< Space , ElemOrder > FixtureType ; typedef Kokkos::Example::CrsMatrix< double , Space > SparseMatrixType ; typedef typename SparseMatrixType::StaticCrsGraphType SparseGraphType ; typedef Kokkos::Example::FENL::NodeNodeGraph< typename FixtureType::elem_node_type , SparseGraphType , FixtureType::ElemNode > NodeNodeGraphType ; typedef Kokkos::Example::FENL::ElementComputation< FixtureType , SparseMatrixType > ElementComputationType ; typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , SparseMatrixType > DirichletComputationType ; typedef NodeElemGatherFill< ElementComputationType > NodeElemGatherFillType ; typedef typename ElementComputationType::vector_type VectorType ; typedef Kokkos::Example::VectorImport< typename FixtureType::comm_list_type , typename FixtureType::send_nodeid_type , VectorType > ImportType ; //------------------------------------ const unsigned newton_iteration_limit = 10 ; const double newton_iteration_tolerance = 1e-7 ; const unsigned cg_iteration_limit = 200 ; const double cg_iteration_tolerance = 1e-7 ; //------------------------------------ const int print_flag = use_print && std::is_same< Kokkos::HostSpace , typename Space::memory_space >::value ; int comm_rank ; int comm_size ; MPI_Comm_rank( comm , & comm_rank ); MPI_Comm_size( comm , & comm_size ); // Decompose by node to avoid mpi-communication for assembly const float bubble_x = 1.0 ; const float bubble_y = 1.0 ; const float bubble_z = 1.0 ; const FixtureType fixture( BoxElemPart::DecomposeNode , comm_size , comm_rank , use_elems[0] , use_elems[1] , use_elems[2] , bubble_x , bubble_y , bubble_z ); { int global_error = ! fixture.ok(); #if defined( KOKKOS_ENABLE_MPI ) int local_error = global_error ; global_error = 0 ; MPI_Allreduce( & local_error , & global_error , 1 , MPI_INT , MPI_SUM , comm ); #endif if ( global_error ) { throw std::runtime_error(std::string("Error generating finite element fixture")); } } //------------------------------------ const ImportType comm_nodal_import( comm , fixture.recv_node() , fixture.send_node() , fixture.send_nodeid() , fixture.node_count_owned() , fixture.node_count() - fixture.node_count_owned() ); //------------------------------------ const double bc_lower_value = 1 ; const double bc_upper_value = 2 ; const Kokkos::Example::FENL::ManufacturedSolution manufactured_solution( 0 , 1 , bc_lower_value , bc_upper_value ); //------------------------------------ for ( int k = 0 ; k < comm_size && use_print ; ++k ) { if ( k == comm_rank ) { typename FixtureType::node_grid_type::HostMirror h_node_grid = Kokkos::create_mirror_view( fixture.node_grid() ); typename FixtureType::node_coord_type::HostMirror h_node_coord = Kokkos::create_mirror_view( fixture.node_coord() ); typename FixtureType::elem_node_type::HostMirror h_elem_node = Kokkos::create_mirror_view( fixture.elem_node() ); Kokkos::deep_copy( h_node_grid , fixture.node_grid() ); Kokkos::deep_copy( h_node_coord , fixture.node_coord() ); Kokkos::deep_copy( h_elem_node , fixture.elem_node() ); std::cout << "MPI[" << comm_rank << "]" << std::endl ; std::cout << "Node grid {" ; for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) { std::cout << " (" << h_node_grid(inode,0) << "," << h_node_grid(inode,1) << "," << h_node_grid(inode,2) << ")" ; } std::cout << " }" << std::endl ; std::cout << "Node coord {" ; for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) { std::cout << " (" << h_node_coord(inode,0) << "," << h_node_coord(inode,1) << "," << h_node_coord(inode,2) << ")" ; } std::cout << " }" << std::endl ; std::cout << "Manufactured solution" << " a[" << manufactured_solution.a << "]" << " b[" << manufactured_solution.b << "]" << " K[" << manufactured_solution.K << "]" << " {" ; for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) { std::cout << " " << manufactured_solution( h_node_coord( inode , 2 ) ); } std::cout << " }" << std::endl ; std::cout << "ElemNode {" << std::endl ; for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) { std::cout << " elem[" << ielem << "]{" ; for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) { std::cout << " " << h_elem_node(ielem,inode); } std::cout << " }{" ; for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) { std::cout << " (" << h_node_grid(h_elem_node(ielem,inode),0) << "," << h_node_grid(h_elem_node(ielem,inode),1) << "," << h_node_grid(h_elem_node(ielem,inode),2) << ")" ; } std::cout << " }" << std::endl ; } std::cout << "}" << std::endl ; } std::cout.flush(); MPI_Barrier( comm ); } //------------------------------------ Kokkos::Timer wall_clock ; Perf perf_stats = Perf() ; for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) { Perf perf = Perf() ; perf.global_elem_count = fixture.elem_count_global(); perf.global_node_count = fixture.node_count_global(); //---------------------------------- // Create the sparse matrix graph and element-to-graph map // from the element->to->node identifier array. // The graph only has rows for the owned nodes. typename NodeNodeGraphType::Times graph_times; const NodeNodeGraphType mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(), graph_times ); perf.map_ratio = maximum(comm, graph_times.ratio); perf.fill_node_set = maximum(comm, graph_times.fill_node_set); perf.scan_node_count = maximum(comm, graph_times.scan_node_count); perf.fill_graph_entries = maximum(comm, graph_times.fill_graph_entries); perf.sort_graph_entries = maximum(comm, graph_times.sort_graph_entries); perf.fill_element_graph = maximum(comm, graph_times.fill_element_graph); wall_clock.reset(); // Create the sparse matrix from the graph: SparseMatrixType jacobian( mesh_to_graph.graph ); Space::fence(); perf.create_sparse_matrix = maximum( comm , wall_clock.seconds() ); //---------------------------------- for ( int k = 0 ; k < comm_size && print_flag ; ++k ) { if ( k == comm_rank ) { const unsigned nrow = jacobian.graph.numRows(); std::cout << "MPI[" << comm_rank << "]" << std::endl ; std::cout << "JacobianGraph {" << std::endl ; for ( unsigned irow = 0 ; irow < nrow ; ++irow ) { std::cout << " row[" << irow << "]{" ; const unsigned entry_end = jacobian.graph.row_map(irow+1); for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) { std::cout << " " << jacobian.graph.entries(entry); } std::cout << " }" << std::endl ; } std::cout << "}" << std::endl ; std::cout << "ElemGraph {" << std::endl ; for ( unsigned ielem = 0 ; ielem < mesh_to_graph.elem_graph.dimension_0() ; ++ielem ) { std::cout << " elem[" << ielem << "]{" ; for ( unsigned irow = 0 ; irow < mesh_to_graph.elem_graph.dimension_1() ; ++irow ) { std::cout << " {" ; for ( unsigned icol = 0 ; icol < mesh_to_graph.elem_graph.dimension_2() ; ++icol ) { std::cout << " " << mesh_to_graph.elem_graph(ielem,irow,icol); } std::cout << " }" ; } std::cout << " }" << std::endl ; } std::cout << "}" << std::endl ; } std::cout.flush(); MPI_Barrier( comm ); } //---------------------------------- // Allocate solution vector for each node in the mesh and residual vector for each owned node const VectorType nodal_solution( "nodal_solution" , fixture.node_count() ); const VectorType nodal_residual( "nodal_residual" , fixture.node_count_owned() ); const VectorType nodal_delta( "nodal_delta" , fixture.node_count_owned() ); // Create element computation functor const ElementComputationType elemcomp( use_atomic ? ElementComputationType( fixture , manufactured_solution.K , nodal_solution , mesh_to_graph.elem_graph , jacobian , nodal_residual ) : ElementComputationType( fixture , manufactured_solution.K , nodal_solution ) ); const NodeElemGatherFillType gatherfill( use_atomic ? NodeElemGatherFillType() : NodeElemGatherFillType( fixture.elem_node() , mesh_to_graph.elem_graph , nodal_residual , jacobian , elemcomp.elem_residuals , elemcomp.elem_jacobians ) ); // Create boundary condition functor const DirichletComputationType dirichlet( fixture , nodal_solution , jacobian , nodal_residual , 2 /* apply at 'z' ends */ , manufactured_solution.T_zmin , manufactured_solution.T_zmax ); //---------------------------------- // Nonlinear Newton iteration: double residual_norm_init = 0 ; for ( perf.newton_iter_count = 0 ; perf.newton_iter_count < newton_iteration_limit ; ++perf.newton_iter_count ) { //-------------------------------- comm_nodal_import( nodal_solution ); //-------------------------------- // Element contributions to residual and jacobian wall_clock.reset(); Kokkos::deep_copy( nodal_residual , double(0) ); Kokkos::deep_copy( jacobian.coeff , double(0) ); elemcomp.apply(); if ( ! use_atomic ) { gatherfill.apply(); } Space::fence(); perf.fill_time = maximum( comm , wall_clock.seconds() ); //-------------------------------- // Apply boundary conditions wall_clock.reset(); dirichlet.apply(); Space::fence(); perf.bc_time = maximum( comm , wall_clock.seconds() ); //-------------------------------- // Evaluate convergence const double residual_norm = std::sqrt( Kokkos::Example::all_reduce( Kokkos::Example::dot( fixture.node_count_owned() , nodal_residual, nodal_residual ) , comm ) ); perf.newton_residual = residual_norm ; if ( 0 == perf.newton_iter_count ) { residual_norm_init = residual_norm ; } if ( residual_norm < residual_norm_init * newton_iteration_tolerance ) { break ; } //-------------------------------- // Solve for nonlinear update CGSolveResult cg_result ; Kokkos::Example::cgsolve( comm_nodal_import , jacobian , nodal_residual , nodal_delta , cg_iteration_limit , cg_iteration_tolerance , & cg_result ); // Update solution vector Kokkos::Example::waxpby( fixture.node_count_owned() , nodal_solution , -1.0 , nodal_delta , 1.0 , nodal_solution ); perf.cg_iter_count += cg_result.iteration ; perf.matvec_time += cg_result.matvec_time ; perf.cg_time += cg_result.iter_time ; //-------------------------------- if ( print_flag ) { const double delta_norm = std::sqrt( Kokkos::Example::all_reduce( Kokkos::Example::dot( fixture.node_count_owned() , nodal_delta, nodal_delta ) , comm ) ); if ( 0 == comm_rank ) { std::cout << "Newton iteration[" << perf.newton_iter_count << "]" << " residual[" << perf.newton_residual << "]" << " update[" << delta_norm << "]" << " cg_iteration[" << cg_result.iteration << "]" << " cg_residual[" << cg_result.norm_res << "]" << std::endl ; } for ( int k = 0 ; k < comm_size ; ++k ) { if ( k == comm_rank ) { const unsigned nrow = jacobian.graph.numRows(); std::cout << "MPI[" << comm_rank << "]" << std::endl ; std::cout << "Residual {" ; for ( unsigned irow = 0 ; irow < nrow ; ++irow ) { std::cout << " " << nodal_residual(irow); } std::cout << " }" << std::endl ; std::cout << "Delta {" ; for ( unsigned irow = 0 ; irow < nrow ; ++irow ) { std::cout << " " << nodal_delta(irow); } std::cout << " }" << std::endl ; std::cout << "Solution {" ; for ( unsigned irow = 0 ; irow < nrow ; ++irow ) { std::cout << " " << nodal_solution(irow); } std::cout << " }" << std::endl ; std::cout << "Jacobian[ " << jacobian.graph.numRows() << " x " << Kokkos::maximum_entry( jacobian.graph ) << " ] {" << std::endl ; for ( unsigned irow = 0 ; irow < nrow ; ++irow ) { std::cout << " {" ; const unsigned entry_end = jacobian.graph.row_map(irow+1); for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) { std::cout << " (" << jacobian.graph.entries(entry) << "," << jacobian.coeff(entry) << ")" ; } std::cout << " }" << std::endl ; } std::cout << "}" << std::endl ; } std::cout.flush(); MPI_Barrier( comm ); } } //-------------------------------- } // Evaluate solution error if ( 0 == itrial ) { const typename FixtureType::node_coord_type::HostMirror h_node_coord = Kokkos::create_mirror_view( fixture.node_coord() ); const typename VectorType::HostMirror h_nodal_solution = Kokkos::create_mirror_view( nodal_solution ); Kokkos::deep_copy( h_node_coord , fixture.node_coord() ); Kokkos::deep_copy( h_nodal_solution , nodal_solution ); double error_max = 0 ; for ( unsigned inode = 0 ; inode < fixture.node_count_owned() ; ++inode ) { const double answer = manufactured_solution( h_node_coord( inode , 2 ) ); const double error = ( h_nodal_solution(inode) - answer ) / answer ; if ( error_max < fabs( error ) ) { error_max = fabs( error ); } } perf.error_max = std::sqrt( Kokkos::Example::all_reduce_max( error_max , comm ) ); perf_stats = perf ; } else { perf_stats.fill_node_set = std::min( perf_stats.fill_node_set , perf.fill_node_set ); perf_stats.scan_node_count = std::min( perf_stats.scan_node_count , perf.scan_node_count ); perf_stats.fill_graph_entries = std::min( perf_stats.fill_graph_entries , perf.fill_graph_entries ); perf_stats.sort_graph_entries = std::min( perf_stats.sort_graph_entries , perf.sort_graph_entries ); perf_stats.fill_element_graph = std::min( perf_stats.fill_element_graph , perf.fill_element_graph ); perf_stats.create_sparse_matrix = std::min( perf_stats.create_sparse_matrix , perf.create_sparse_matrix ); perf_stats.fill_time = std::min( perf_stats.fill_time , perf.fill_time ); perf_stats.bc_time = std::min( perf_stats.bc_time , perf.bc_time ); perf_stats.cg_time = std::min( perf_stats.cg_time , perf.cg_time ); } } return perf_stats ; }
Perf fenl_assembly( const Teuchos::RCP<const Teuchos::Comm<int> >& comm , const int use_print , const int use_trials , const int use_atomic , const int use_nodes[] , Kokkos::DeviceConfig dev_config , Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual) { using Teuchos::RCP; using Teuchos::rcp; using Teuchos::rcpFromRef; using Teuchos::arrayView; using Teuchos::ParameterList; typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ; typedef Kokkos::CrsMatrix< Scalar , unsigned , Device > //typedef typename GlobalMatrixType::k_local_matrix_type LocalMatrixType ; typedef typename LocalMatrixType::StaticCrsGraphType LocalGraphType ; typedef Kokkos::Example::FENL::NodeNodeGraph< typename FixtureType::elem_node_type , LocalGraphType , FixtureType::ElemNode > NodeNodeGraphType ; typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType; typedef Kokkos::Example::FENL::ElementComputation< FixtureType , LocalMatrixType , CoeffFunctionType > ElementComputationType ; typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , LocalMatrixType > DirichletComputationType ; typedef Kokkos::Example::FENL::NodeElemGatherFill< ElementComputationType > NodeElemGatherFillType ; typedef typename ElementComputationType::vector_type VectorType ; typedef Kokkos::Example::VectorImport< typename FixtureType::comm_list_type , typename FixtureType::send_nodeid_type , VectorType > ImportType ; //------------------------------------ const int print_flag = use_print && Kokkos::Impl::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ; const int comm_rank = comm->getRank(); const int comm_size = comm->getSize(); // Decompose by node to avoid parallel communication in assembly const float bubble_x = 1.0 ; const float bubble_y = 1.0 ; const float bubble_z = 1.0 ; const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode , comm_size , comm_rank , use_nodes[0] , use_nodes[1] , use_nodes[2] , bubble_x , bubble_y , bubble_z ); //------------------------------------ const ImportType comm_nodal_import( comm , fixture.recv_node() , fixture.send_node() , fixture.send_nodeid() , fixture.node_count_owned() , fixture.node_count() - fixture.node_count_owned() ); //------------------------------------ const double bc_lower_value = 1 ; const double bc_upper_value = 2 ; CoeffFunctionType diffusion_coefficient( 1.0 ); //------------------------------------ if ( print_flag ) { std::cout << "ElemNode {" << std::endl ; for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) { std::cout << " elem[" << ielem << "]{" ; for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) { std::cout << " " << fixture.elem_node(ielem,inode); } std::cout << " }" << std::endl ; } std::cout << "}" << std::endl ; } //------------------------------------ Kokkos::Impl::Timer wall_clock ; Perf perf_stats = Perf() ; for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) { Perf perf = Perf() ; perf.global_elem_count = fixture.elem_count_global(); perf.global_node_count = fixture.node_count_global(); //---------------------------------- // Create the local sparse matrix graph and element-to-graph map // from the element->to->node identifier array. // The graph only has rows for the owned nodes. typename NodeNodeGraphType::Times graph_times; const NodeNodeGraphType mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(), graph_times ); // Create the local sparse matrix from the graph: LocalMatrixType jacobian( "jacobian" , mesh_to_graph.graph ); //---------------------------------- if ( print_flag ) { const unsigned nrow = jacobian.numRows(); std::cout << "JacobianGraph[ " << jacobian.numRows() << " x " << jacobian.numCols() << " ] {" << std::endl ; for ( unsigned irow = 0 ; irow < nrow ; ++irow ) { std::cout << " row[" << irow << "]{" ; const unsigned entry_end = jacobian.graph.row_map(irow+1); for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) { std::cout << " " << jacobian.graph.entries(entry); } std::cout << " }" << std::endl ; } std::cout << "}" << std::endl ; std::cout << "ElemGraph {" << std::endl ; for ( unsigned ielem = 0 ; ielem < mesh_to_graph.elem_graph.dimension_0() ; ++ielem ) { std::cout << " elem[" << ielem << "]{" ; for ( unsigned irow = 0 ; irow < mesh_to_graph.elem_graph.dimension_1() ; ++irow ) { std::cout << " {" ; for ( unsigned icol = 0 ; icol < mesh_to_graph.elem_graph.dimension_2() ; ++icol ) { std::cout << " " << mesh_to_graph.elem_graph(ielem,irow,icol); } std::cout << " }" ; } std::cout << " }" << std::endl ; } std::cout << "}" << std::endl ; } //---------------------------------- // Allocate solution vector for each node in the mesh and residual vector for each owned node VectorType nodal_solution( "nodal_solution" , fixture.node_count() ); nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() ); // Get DeviceConfig structs used by some functors Kokkos::DeviceConfig dev_config_elem, dev_config_gath, dev_config_bc; Kokkos::Example::FENL::CreateDeviceConfigs<Scalar>::eval( dev_config_elem, dev_config_gath, dev_config_bc ); // Create element computation functor const ElementComputationType elemcomp( use_atomic ? ElementComputationType( fixture , diffusion_coefficient , nodal_solution , mesh_to_graph.elem_graph , jacobian , nodal_residual , dev_config_elem ) : ElementComputationType( fixture , diffusion_coefficient , nodal_solution , dev_config_elem ) ); const NodeElemGatherFillType gatherfill( use_atomic ? NodeElemGatherFillType() : NodeElemGatherFillType( fixture.elem_node() , mesh_to_graph.elem_graph , nodal_residual , jacobian , elemcomp.elem_residuals , elemcomp.elem_jacobians , dev_config_gath) ); // Create boundary condition functor const DirichletComputationType dirichlet( fixture , nodal_solution , jacobian , nodal_residual , 2 /* apply at 'z' ends */ , bc_lower_value , bc_upper_value , dev_config_bc ); Kokkos::deep_copy( nodal_solution , Scalar(1) ); //-------------------------------- wall_clock.reset(); comm_nodal_import( nodal_solution ); Device::fence(); perf.import_time = maximum( comm , wall_clock.seconds() ); //-------------------------------- // Element contributions to residual and jacobian wall_clock.reset(); Kokkos::deep_copy( nodal_residual , Scalar(0) ); Kokkos::deep_copy( jacobian.values , Scalar(0) ); elemcomp.apply(); if ( ! use_atomic ) { gatherfill.apply(); } Device::fence(); perf.fill_time = maximum( comm , wall_clock.seconds() ); //-------------------------------- // Apply boundary conditions wall_clock.reset(); dirichlet.apply(); Device::fence(); perf.bc_time = maximum( comm , wall_clock.seconds() ); //-------------------------------- perf_stats.increment(perf); } return perf_stats ; }