int m2_timer_load_state(struct m2_timer *timer, struct save_state *state) { size_t size; uint8_t *buf; if (save_state_find_chunk(state, "M2 ", &buf, &size) < 0) return -1; unpack_state(timer, m2_timer_state_items, buf); timer->prescaler_mask = (1 << timer->prescaler_size) - 1; timer->mask = (1 << timer->size) - 1; return 0; }
PerformanceData run( const typename FixtureType::FEMeshType & mesh , const int global_max_x , const int global_max_y , const int global_max_z , const unsigned uq_count , const int steps , const int print_sample ) { typedef Scalar scalar_type ; typedef FixtureType fixture_type ; typedef typename fixture_type::device_type device_type ; enum { ElementNodeCount = fixture_type::element_node_count }; const int total_num_steps = steps ; const Scalar user_dt = 5.0e-6; //const Scalar end_time = 0.0050; // element block parameters const Scalar lin_bulk_visc = 0.0; const Scalar quad_bulk_visc = 0.0; // const Scalar lin_bulk_visc = 0.06; // const Scalar quad_bulk_visc = 1.2; // const Scalar hg_stiffness = 0.0; // const Scalar hg_viscosity = 0.0; // const Scalar hg_stiffness = 0.03; // const Scalar hg_viscosity = 0.001; // material properties const Scalar youngs_modulus=1.0e6; const Scalar poissons_ratio=0.0; const Scalar density = 8.0e-4; const comm::Machine machine = mesh.parallel_data_map.machine ; PerformanceData perf_data ; Kokkos::Impl::Timer wall_clock ; //------------------------------------ // Generate fields typedef Fields< scalar_type , device_type > fields_type ; fields_type mesh_fields( mesh , uq_count , lin_bulk_visc , quad_bulk_visc , youngs_modulus , poissons_ratio , density ); typename fields_type::node_coords_type::HostMirror model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords ); typename fields_type::spatial_precise_view::HostMirror displacement_h = Kokkos::create_mirror( mesh_fields.displacement ); typename fields_type::spatial_precise_view::HostMirror velocity_h = Kokkos::create_mirror( mesh_fields.velocity ); Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords ); //------------------------------------ // Initialization initialize_element( mesh_fields ); initialize_node( mesh_fields ); const Scalar x_bc = global_max_x ; // Initial condition on velocity to initiate a pulse along the X axis { const unsigned X = 0; for ( unsigned inode = 0; inode< mesh_fields.num_nodes; ++inode) { for ( unsigned kq = 0 ; kq < uq_count ; ++kq ) { if ( model_coords_h(inode,X) == 0 ) { velocity_h(inode,kq,X) = 1000 + 100 * kq ; velocity_h(inode,kq,X) = 1000 + 100 * kq ; } } } } Kokkos::deep_copy( mesh_fields.velocity , velocity_h ); Kokkos::deep_copy( mesh_fields.velocity_new , velocity_h ); //-------------------------------------------------------------------------- // We will call a sequence of functions. These functions have been // grouped into several functors to balance the number of global memory // accesses versus requiring too many registers or too much L1 cache. // Global memory accees have read/write cost and memory subsystem contention cost. //-------------------------------------------------------------------------- perf_data.init_time = comm::max( machine , wall_clock.seconds() ); // Parameters required for the internal force computations. perf_data.number_of_steps = total_num_steps ; typedef typename fields_type::spatial_precise_view::scalar_type comm_value_type ; const unsigned comm_value_count = 6 ; Kokkos::AsyncExchange< comm_value_type , device_type , Kokkos::ParallelDataMap > comm_exchange( mesh.parallel_data_map , comm_value_count * uq_count ); for ( int step = 0; step < total_num_steps; ++step ) { //------------------------------------------------------------------------ // rotate the state variable views. swap( mesh_fields.dt , mesh_fields.dt_new ); swap( mesh_fields.displacement , mesh_fields.displacement_new ); swap( mesh_fields.velocity , mesh_fields.velocity_new ); swap( mesh_fields.rotation , mesh_fields.rotation_new ); //------------------------------------------------------------------------ // Communicate "send" nodes' displacement and velocity next_state // to the ghosted nodes. // buffer packages: { { dx , dy , dz , vx , vy , vz }_node } wall_clock.reset(); pack_state( mesh_fields , comm_exchange.buffer(), mesh.parallel_data_map.count_interior , mesh.parallel_data_map.count_send ); comm_exchange.setup(); comm_exchange.send_receive(); unpack_state( mesh_fields , comm_exchange.buffer() , mesh.parallel_data_map.count_owned , mesh.parallel_data_map.count_receive ); device_type::fence(); perf_data.comm_time += comm::max( machine , wall_clock.seconds() ); //------------------------------------------------------------------------ wall_clock.reset(); // First kernel 'grad_hgop' combines two functions: // gradient, velocity gradient gradient( mesh_fields ); // Combine tensor decomposition and rotation functions. decomp_rotate( mesh_fields ); internal_force( mesh_fields , user_dt ); device_type::fence(); perf_data.internal_force_time += comm::max( machine , wall_clock.seconds() ); //------------------------------------------------------------------------ // Assembly of elements' contributions to nodal force into // a nodal force vector. Update the accelerations, velocities, // displacements. // The same pattern can be used for matrix-free residual computations. wall_clock.reset(); nodal_update( mesh_fields , x_bc ); device_type::fence(); perf_data.central_diff += comm::max( machine , wall_clock.seconds() ); if ( print_sample && 0 == step % 100 ) { Kokkos::deep_copy( displacement_h , mesh_fields.displacement_new ); Kokkos::deep_copy( velocity_h , mesh_fields.velocity_new ); if ( 1 == print_sample ) { for ( unsigned kp = 0 ; kp < uq_count ; ++kp ) { std::cout << "step " << step << " : displacement({*,0,0}," << kp << ",0) =" ; for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) { if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) { std::cout << " " << displacement_h(i,kp,0); } } std::cout << std::endl ; const float tol = 1.0e-6 ; const int yb = global_max_y ; const int zb = global_max_z ; std::cout << "step " << step << " : displacement({*," << yb << "," << zb << "}," << kp << ",0) =" ; for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) { if ( fabs( model_coords_h(i,1) - yb ) < tol && fabs( model_coords_h(i,2) - zb ) < tol ) { std::cout << " " << displacement_h(i,kp,0); } } std::cout << std::endl ; } } else if ( 2 == print_sample ) { const unsigned kp = 0 ; const float tol = 1.0e-6 ; const int xb = global_max_x / 2 ; const int yb = global_max_y / 2 ; const int zb = global_max_z / 2 ; for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) { if ( fabs( model_coords_h(i,0) - xb ) < tol && fabs( model_coords_h(i,1) - yb ) < tol && fabs( model_coords_h(i,2) - zb ) < tol ) { std::cout << "step " << step << " : displacement(" << xb << "," << yb << "," << zb << ") = {" << std::setprecision(6) << " " << displacement_h(i,kp,0) << std::setprecision(2) << " " << displacement_h(i,kp,1) << std::setprecision(2) << " " << displacement_h(i,kp,2) << " }" << std::endl ; } } } } } return perf_data ; }