KOKKOS_INLINE_FUNCTION
    void operator()(int inode) const {

      // Getting count as per 'CSR-like' data structure
      const int element_offset = node_elem_offset(inode);
      const int element_count  = node_elem_offset(inode + 1) - element_offset ;

      double local_force[] = {0.0, 0.0, 0.0};

      //  for each element that a node belongs to
      for(int i = 0; i < element_count ; i++){

        //  node_elem_offset is a cumulative structure, so
        //  node_elem_offset(inode) should be the index where
        //  a particular row's elem_IDs begin
        const int nelem = node_elem_ids( element_offset + i, 0);

        //  find the row in an element's stiffness matrix
        //  that corresponds to inode
        const int elem_node_index = node_elem_ids( element_offset + i, 1);

        local_force[0] += element_force(nelem, 0, elem_node_index);
        local_force[1] += element_force(nelem, 1, elem_node_index);
        local_force[2] += element_force(nelem, 2, elem_node_index);
      }

      internal_force(inode, 0) = local_force[0];
      internal_force(inode, 1) = local_force[1];
      internal_force(inode, 2) = local_force[2];


      Scalar v_new[3];
      Scalar a_current[3];

      const Scalar tol = 1.0e-7;

      if ( fabs(model_coords(inode,0)-x_bc) > tol ) { //not on x boundary
        acceleration(inode,0) = a_current[0] = -local_force[0] / nodal_mass(inode);
        acceleration(inode,1) = a_current[1] = -local_force[1] / nodal_mass(inode);
        acceleration(inode,2) = a_current[2] = -local_force[2] / nodal_mass(inode);
      } else { //enforce fixed BC
        acceleration(inode,0) = a_current[0] = 0;
        acceleration(inode,1) = a_current[1] = 0;
        acceleration(inode,2) = a_current[2] = 0;
      }

      velocity(inode,0,next_state) = v_new[0] = velocity(inode,0,current_state) + (*prev_dt+*dt)/2.0*a_current[0];
      velocity(inode,1,next_state) = v_new[1] = velocity(inode,1,current_state) + (*prev_dt+*dt)/2.0*a_current[1];
      velocity(inode,2,next_state) = v_new[2] = velocity(inode,2,current_state) + (*prev_dt+*dt)/2.0*a_current[2];

      displacement(inode,0,next_state) = displacement(inode,0,current_state) + *dt*v_new[0];
      displacement(inode,1,next_state) = displacement(inode,1,current_state) + *dt*v_new[1];
      displacement(inode,2,next_state) = displacement(inode,2,current_state) + *dt*v_new[2];

    }
Beispiel #2
0
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
                     const int global_max_x ,
                     const int global_max_y ,
                     const int global_max_z ,
                     const unsigned uq_count ,
                     const int steps ,
                     const int print_sample )
{
  typedef Scalar                              scalar_type ;
  typedef FixtureType                         fixture_type ;
  typedef typename fixture_type::device_type  device_type ;

  enum { ElementNodeCount = fixture_type::element_node_count };

  const int total_num_steps = steps ;

  const Scalar user_dt = 5.0e-6;
  //const Scalar  end_time = 0.0050;

  // element block parameters
  const Scalar  lin_bulk_visc = 0.0;
  const Scalar  quad_bulk_visc = 0.0;

  // const Scalar  lin_bulk_visc = 0.06;
  // const Scalar  quad_bulk_visc = 1.2;
  // const Scalar  hg_stiffness = 0.0;
  // const Scalar  hg_viscosity = 0.0;
  // const Scalar  hg_stiffness = 0.03;
  // const Scalar  hg_viscosity = 0.001;

  // material properties
  const Scalar youngs_modulus=1.0e6;
  const Scalar poissons_ratio=0.0;
  const Scalar  density = 8.0e-4;

  const comm::Machine machine = mesh.parallel_data_map.machine ;

  PerformanceData perf_data ;

  Kokkos::Impl::Timer wall_clock ;

  //------------------------------------
  // Generate fields

  typedef Fields< scalar_type , device_type > fields_type ;

  fields_type mesh_fields( mesh , uq_count ,
                           lin_bulk_visc ,
                           quad_bulk_visc ,
                           youngs_modulus ,
                           poissons_ratio ,
                           density );

  typename fields_type::node_coords_type::HostMirror
    model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );

  typename fields_type::spatial_precise_view::HostMirror
    displacement_h = Kokkos::create_mirror( mesh_fields.displacement );

  typename fields_type::spatial_precise_view::HostMirror
    velocity_h = Kokkos::create_mirror( mesh_fields.velocity );

  Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );

  //------------------------------------
  // Initialization

  initialize_element( mesh_fields );
  initialize_node(    mesh_fields );

  const Scalar x_bc = global_max_x ;

  // Initial condition on velocity to initiate a pulse along the X axis
  {
    const unsigned X = 0;
    for ( unsigned inode = 0; inode< mesh_fields.num_nodes; ++inode) {
      for ( unsigned kq = 0 ; kq < uq_count ; ++kq ) {
        if ( model_coords_h(inode,X) == 0 ) {
          velocity_h(inode,kq,X) = 1000 + 100 * kq ;
          velocity_h(inode,kq,X) = 1000 + 100 * kq ;
        }
      }
    }
  }

  Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
  Kokkos::deep_copy( mesh_fields.velocity_new , velocity_h );

  //--------------------------------------------------------------------------
  // We will call a sequence of functions.  These functions have been
  // grouped into several functors to balance the number of global memory
  // accesses versus requiring too many registers or too much L1 cache.
  // Global memory accees have read/write cost and memory subsystem contention cost.
  //--------------------------------------------------------------------------

  perf_data.init_time = comm::max( machine , wall_clock.seconds() );

  // Parameters required for the internal force computations.

  perf_data.number_of_steps = total_num_steps ;

  typedef typename
    fields_type::spatial_precise_view::scalar_type  comm_value_type ;

  const unsigned comm_value_count = 6 ;

  Kokkos::AsyncExchange< comm_value_type , device_type ,
                              Kokkos::ParallelDataMap >
    comm_exchange( mesh.parallel_data_map , comm_value_count * uq_count );

  for ( int step = 0; step < total_num_steps; ++step ) {

    //------------------------------------------------------------------------
    // rotate the state variable views.

    swap( mesh_fields.dt ,           mesh_fields.dt_new );
    swap( mesh_fields.displacement , mesh_fields.displacement_new );
    swap( mesh_fields.velocity ,     mesh_fields.velocity_new );
    swap( mesh_fields.rotation ,     mesh_fields.rotation_new );

    //------------------------------------------------------------------------
    // Communicate "send" nodes' displacement and velocity next_state
    // to the ghosted nodes.
    // buffer packages: { { dx , dy , dz , vx , vy , vz }_node }

    wall_clock.reset();

    pack_state( mesh_fields ,
                comm_exchange.buffer(),
                mesh.parallel_data_map.count_interior ,
                mesh.parallel_data_map.count_send );

    comm_exchange.setup();

    comm_exchange.send_receive();

    unpack_state( mesh_fields ,
                  comm_exchange.buffer() ,
                  mesh.parallel_data_map.count_owned ,
                  mesh.parallel_data_map.count_receive );

    device_type::fence();

    perf_data.comm_time += comm::max( machine , wall_clock.seconds() );

    //------------------------------------------------------------------------

    wall_clock.reset();

    // First kernel 'grad_hgop' combines two functions:
    // gradient, velocity gradient
    gradient( mesh_fields );

    // Combine tensor decomposition and rotation functions.
    decomp_rotate( mesh_fields );

    internal_force( mesh_fields , user_dt );

    device_type::fence();

    perf_data.internal_force_time +=
      comm::max( machine , wall_clock.seconds() );

    //------------------------------------------------------------------------
    // Assembly of elements' contributions to nodal force into
    // a nodal force vector.  Update the accelerations, velocities,
    // displacements.
    // The same pattern can be used for matrix-free residual computations.

    wall_clock.reset();

    nodal_update( mesh_fields , x_bc );

    device_type::fence();

    perf_data.central_diff +=
      comm::max( machine , wall_clock.seconds() );

    if ( print_sample && 0 == step % 100 ) {
      Kokkos::deep_copy( displacement_h , mesh_fields.displacement_new );
      Kokkos::deep_copy( velocity_h ,     mesh_fields.velocity_new );

      if ( 1 == print_sample ) {
        for ( unsigned kp = 0 ; kp < uq_count ; ++kp ) {
          std::cout << "step " << step
                    << " : displacement({*,0,0}," << kp << ",0) =" ;
          for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
            if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
                std::cout << " " << displacement_h(i,kp,0);
            }
          }
          std::cout << std::endl ;

          const float tol = 1.0e-6 ;
          const int yb = global_max_y ;
          const int zb = global_max_z ;
          std::cout << "step " << step
                    << " : displacement({*," << yb << "," << zb << "}," << kp << ",0) =" ;
          for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
            if ( fabs( model_coords_h(i,1) - yb ) < tol &&
                 fabs( model_coords_h(i,2) - zb ) < tol ) {
              std::cout << " " << displacement_h(i,kp,0);
            }
          }
          std::cout << std::endl ;
        }
      }
      else if ( 2 == print_sample ) {
        const unsigned kp = 0 ;

        const float tol = 1.0e-6 ;
        const int xb = global_max_x / 2 ;
        const int yb = global_max_y / 2 ;
        const int zb = global_max_z / 2 ;

        for ( unsigned i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
          if ( fabs( model_coords_h(i,0) - xb ) < tol &&
               fabs( model_coords_h(i,1) - yb ) < tol &&
               fabs( model_coords_h(i,2) - zb ) < tol ) {
            std::cout << "step " << step
                      << " : displacement("
                      << xb << "," << yb << "," << zb << ") = {" 
                      << std::setprecision(6)
                      << " " << displacement_h(i,kp,0)
                      << std::setprecision(2)
                      << " " << displacement_h(i,kp,1)
                      << std::setprecision(2)
                      << " " << displacement_h(i,kp,2)
                      << " }" << std::endl ;
          }
        }
      }
    }
  }

  return perf_data ;
}