KOKKOS_INLINE_FUNCTION void operator()(int inode) const { // Getting count as per 'CSR-like' data structure const int element_offset = node_elem_offset(inode); const int element_count = node_elem_offset(inode + 1) - element_offset ; double local_force[] = {0.0, 0.0, 0.0}; // for each element that a node belongs to for(int i = 0; i < element_count ; i++){ // node_elem_offset is a cumulative structure, so // node_elem_offset(inode) should be the index where // a particular row's elem_IDs begin const int nelem = node_elem_ids( element_offset + i, 0); // find the row in an element's stiffness matrix // that corresponds to inode const int elem_node_index = node_elem_ids( element_offset + i, 1); local_force[0] += element_force(nelem, 0, elem_node_index); local_force[1] += element_force(nelem, 1, elem_node_index); local_force[2] += element_force(nelem, 2, elem_node_index); } internal_force(inode, 0) = local_force[0]; internal_force(inode, 1) = local_force[1]; internal_force(inode, 2) = local_force[2]; Scalar v_new[3]; Scalar a_current[3]; const Scalar tol = 1.0e-7; if ( fabs(model_coords(inode,0)-x_bc) > tol ) { //not on x boundary acceleration(inode,0) = a_current[0] = -local_force[0] / nodal_mass(inode); acceleration(inode,1) = a_current[1] = -local_force[1] / nodal_mass(inode); acceleration(inode,2) = a_current[2] = -local_force[2] / nodal_mass(inode); } else { //enforce fixed BC acceleration(inode,0) = a_current[0] = 0; acceleration(inode,1) = a_current[1] = 0; acceleration(inode,2) = a_current[2] = 0; } velocity(inode,0,next_state) = v_new[0] = velocity(inode,0,current_state) + (*prev_dt+*dt)/2.0*a_current[0]; velocity(inode,1,next_state) = v_new[1] = velocity(inode,1,current_state) + (*prev_dt+*dt)/2.0*a_current[1]; velocity(inode,2,next_state) = v_new[2] = velocity(inode,2,current_state) + (*prev_dt+*dt)/2.0*a_current[2]; displacement(inode,0,next_state) = displacement(inode,0,current_state) + *dt*v_new[0]; displacement(inode,1,next_state) = displacement(inode,1,current_state) + *dt*v_new[1]; displacement(inode,2,next_state) = displacement(inode,2,current_state) + *dt*v_new[2]; }
PerformanceData run( const typename FixtureType::FEMeshType & mesh , const int global_max_x , const int global_max_y , const int global_max_z , const unsigned uq_count , const int steps , const int print_sample ) { typedef Scalar scalar_type ; typedef FixtureType fixture_type ; typedef typename fixture_type::device_type device_type ; enum { ElementNodeCount = fixture_type::element_node_count }; const int total_num_steps = steps ; const Scalar user_dt = 5.0e-6; //const Scalar end_time = 0.0050; // element block parameters const Scalar lin_bulk_visc = 0.0; const Scalar quad_bulk_visc = 0.0; // const Scalar lin_bulk_visc = 0.06; // const Scalar quad_bulk_visc = 1.2; // const Scalar hg_stiffness = 0.0; // const Scalar hg_viscosity = 0.0; // const Scalar hg_stiffness = 0.03; // const Scalar hg_viscosity = 0.001; // material properties const Scalar youngs_modulus=1.0e6; const Scalar poissons_ratio=0.0; const Scalar density = 8.0e-4; const comm::Machine machine = mesh.parallel_data_map.machine ; PerformanceData perf_data ; Kokkos::Impl::Timer wall_clock ; //------------------------------------ // Generate fields typedef Fields< scalar_type , device_type > fields_type ; fields_type mesh_fields( mesh , uq_count , lin_bulk_visc , quad_bulk_visc , youngs_modulus , poissons_ratio , density ); typename fields_type::node_coords_type::HostMirror model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords ); typename fields_type::spatial_precise_view::HostMirror displacement_h = Kokkos::create_mirror( mesh_fields.displacement ); typename fields_type::spatial_precise_view::HostMirror velocity_h = Kokkos::create_mirror( mesh_fields.velocity ); Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords ); //------------------------------------ // Initialization initialize_element( mesh_fields ); initialize_node( mesh_fields ); const Scalar x_bc = global_max_x ; // Initial condition on velocity to initiate a pulse along the X axis { const unsigned X = 0; for ( unsigned inode = 0; inode< mesh_fields.num_nodes; ++inode) { for ( unsigned kq = 0 ; kq < uq_count ; ++kq ) { if ( model_coords_h(inode,X) == 0 ) { velocity_h(inode,kq,X) = 1000 + 100 * kq ; velocity_h(inode,kq,X) = 1000 + 100 * kq ; } } } } Kokkos::deep_copy( mesh_fields.velocity , velocity_h ); Kokkos::deep_copy( mesh_fields.velocity_new , velocity_h ); //-------------------------------------------------------------------------- // We will call a sequence of functions. These functions have been // grouped into several functors to balance the number of global memory // accesses versus requiring too many registers or too much L1 cache. // Global memory accees have read/write cost and memory subsystem contention cost. //-------------------------------------------------------------------------- perf_data.init_time = comm::max( machine , wall_clock.seconds() ); // Parameters required for the internal force computations. perf_data.number_of_steps = total_num_steps ; typedef typename fields_type::spatial_precise_view::scalar_type comm_value_type ; const unsigned comm_value_count = 6 ; Kokkos::AsyncExchange< comm_value_type , device_type , Kokkos::ParallelDataMap > comm_exchange( mesh.parallel_data_map , comm_value_count * uq_count ); for ( int step = 0; step < total_num_steps; ++step ) { //------------------------------------------------------------------------ // rotate the state variable views. swap( mesh_fields.dt , mesh_fields.dt_new ); swap( mesh_fields.displacement , mesh_fields.displacement_new ); swap( mesh_fields.velocity , mesh_fields.velocity_new ); swap( mesh_fields.rotation , mesh_fields.rotation_new ); //------------------------------------------------------------------------ // Communicate "send" nodes' displacement and velocity next_state // to the ghosted nodes. // buffer packages: { { dx , dy , dz , vx , vy , vz }_node } wall_clock.reset(); pack_state( mesh_fields , comm_exchange.buffer(), mesh.parallel_data_map.count_interior , mesh.parallel_data_map.count_send ); comm_exchange.setup(); comm_exchange.send_receive(); unpack_state( mesh_fields , comm_exchange.buffer() , mesh.parallel_data_map.count_owned , mesh.parallel_data_map.count_receive ); device_type::fence(); perf_data.comm_time += comm::max( machine , wall_clock.seconds() ); //------------------------------------------------------------------------ wall_clock.reset(); // First kernel 'grad_hgop' combines two functions: // gradient, velocity gradient gradient( mesh_fields ); // Combine tensor decomposition and rotation functions. decomp_rotate( mesh_fields ); internal_force( mesh_fields , user_dt ); device_type::fence(); perf_data.internal_force_time += comm::max( machine , wall_clock.seconds() ); //------------------------------------------------------------------------ // Assembly of elements' contributions to nodal force into // a nodal force vector. Update the accelerations, velocities, // displacements. // The same pattern can be used for matrix-free residual computations. wall_clock.reset(); nodal_update( mesh_fields , x_bc ); device_type::fence(); perf_data.central_diff += comm::max( machine , wall_clock.seconds() ); if ( print_sample && 0 == step % 100 ) { Kokkos::deep_copy( displacement_h , mesh_fields.displacement_new ); Kokkos::deep_copy( velocity_h , mesh_fields.velocity_new ); if ( 1 == print_sample ) { for ( unsigned kp = 0 ; kp < uq_count ; ++kp ) { std::cout << "step " << step << " : displacement({*,0,0}," << kp << ",0) =" ; for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) { if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) { std::cout << " " << displacement_h(i,kp,0); } } std::cout << std::endl ; const float tol = 1.0e-6 ; const int yb = global_max_y ; const int zb = global_max_z ; std::cout << "step " << step << " : displacement({*," << yb << "," << zb << "}," << kp << ",0) =" ; for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) { if ( fabs( model_coords_h(i,1) - yb ) < tol && fabs( model_coords_h(i,2) - zb ) < tol ) { std::cout << " " << displacement_h(i,kp,0); } } std::cout << std::endl ; } } else if ( 2 == print_sample ) { const unsigned kp = 0 ; const float tol = 1.0e-6 ; const int xb = global_max_x / 2 ; const int yb = global_max_y / 2 ; const int zb = global_max_z / 2 ; for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) { if ( fabs( model_coords_h(i,0) - xb ) < tol && fabs( model_coords_h(i,1) - yb ) < tol && fabs( model_coords_h(i,2) - zb ) < tol ) { std::cout << "step " << step << " : displacement(" << xb << "," << yb << "," << zb << ") = {" << std::setprecision(6) << " " << displacement_h(i,kp,0) << std::setprecision(2) << " " << displacement_h(i,kp,1) << std::setprecision(2) << " " << displacement_h(i,kp,2) << " }" << std::endl ; } } } } } return perf_data ; }