Example #1
0
void OpenMPExec::clear_thread_data()
{
  // Tear down and release every thread's HostThreadTeamData instance.
  // Each pool member was allocated as one contiguous chunk:
  //   [ HostThreadTeamData header | scratch memory ]
  // so a single deallocate of (member_bytes + scratch_bytes) frees it.

  const size_t member_bytes =
    sizeof(int64_t) *
    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );

  // Size of each member's original allocation.  Must be size_t: the
  // previous 'const int' narrowed a size_t byte count and could
  // overflow for large scratch sizes (resize_thread_data already
  // computes the same quantity as size_t).
  const size_t old_alloc_bytes =
    m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;

  OpenMP::memory_space space ;

  // Each OpenMP thread disbands and frees its own pool member.
  #pragma omp parallel num_threads( m_pool_size )
  {
    const int rank = omp_get_thread_num();

    if ( 0 != m_pool[rank] ) {

      m_pool[rank]->disband_pool();

      space.deallocate( m_pool[rank] , old_alloc_bytes );

      m_pool[rank] = 0 ;
    }
  }
/* END #pragma omp parallel */
}
void OpenMP::partition_master( F const& f
                             , int num_partitions
                             , int partition_size
                             )
{
  // Split the pool into 'num_partitions' partitions of 'partition_size'
  // threads each and invoke f once per partition, each partition with
  // its own OpenMPExec instance.  Requires nested OpenMP parallelism;
  // without it, f is invoked once, serially, as f(0,1).
  if (omp_get_nested()) {
    using Exec = Impl::OpenMPExec;

    // Save the current thread-local instance so it can be restored
    // after all partitions have completed.
    Exec * prev_instance = Impl::t_openmp_instance;

    Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size );

    OpenMP::memory_space space;

    // One OpenMP thread per partition; each becomes the master of its
    // own nested pool of 'partition_size' threads.
    #pragma omp parallel num_threads(num_partitions)
    {
      // Placement-new a per-partition Exec into memory obtained from
      // the OpenMP memory space, and install it thread-locally.
      void * const ptr = space.allocate( sizeof(Exec) );

      Impl::t_openmp_instance = new (ptr) Exec( partition_size );

      // Default scratch sizes, scaled by the partition's thread count.
      size_t pool_reduce_bytes  =   32 * partition_size ;
      size_t team_reduce_bytes  =   32 * partition_size ;
      size_t team_shared_bytes  = 1024 * partition_size ;
      size_t thread_local_bytes = 1024 ;

      Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
                                                 , team_reduce_bytes
                                                 , team_shared_bytes
                                                 , thread_local_bytes
                                                 );

      // Nested parallel regions spawned inside f will use
      // 'partition_size' threads.  Here omp_get_thread_num() is the
      // partition id and omp_get_num_threads() == num_partitions.
      omp_set_num_threads(partition_size);
      f( omp_get_thread_num(), omp_get_num_threads() );

      // Destroy and free this partition's Exec; clear the thread-local
      // pointer so no stale instance is visible afterwards.
      Impl::t_openmp_instance->~Exec();
      space.deallocate( Impl::t_openmp_instance, sizeof(Exec) );
      Impl::t_openmp_instance = nullptr;
    }

    // Restore the master's original instance.
    Impl::t_openmp_instance  = prev_instance;
  }
  else {
    // nested openmp not enabled
    f(0,1);
  }
}
Example #3
0
// Finalize the OpenMP backend: destroy the thread-local OpenMPExec
// instance, reset per-thread state, and shut down profiling.
// Throws via throw_runtime_exception if called from inside a parallel
// region (finalization must happen from serial code).
void OpenMP::impl_finalize()
#endif  // closes a conditional-compilation branch whose #if is above this excerpt
{
  if ( omp_in_parallel() )
  {
    // Build a diagnostic listing every violated precondition, then throw.
    std::string msg("Kokkos::OpenMP::finalize ERROR ");
    if( !Impl::t_openmp_instance ) msg.append(": not initialized");
    if( omp_in_parallel() ) msg.append(": in parallel");
    Kokkos::Impl::throw_runtime_exception(msg);
  }

  if ( Impl::t_openmp_instance ) {
    // Silence Cuda Warning
    // NOTE(review): despite the comment above, this computes
    // max(m_pool_size, g_openmp_hardware_max_threads) so the cleanup
    // parallel region below covers every thread that may hold state.
    const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads
                       ? Impl::g_openmp_hardware_max_threads
                       : Impl::t_openmp_instance->m_pool_size;
    (void) nthreads;

    // Destroy the instance, then return its memory to the OpenMP space.
    using Exec = Impl::OpenMPExec;
    Exec * instance = Impl::t_openmp_instance;
    instance->~Exec();

    OpenMP::memory_space space;
    space.deallocate( instance, sizeof(Exec) );

    // Reset each thread's thread-local state and disable allocation
    // tracking on worker threads.
    #pragma omp parallel num_threads(nthreads)
    {
      Impl::t_openmp_hardware_id = 0;
      Impl::t_openmp_instance    = nullptr;
      Impl::SharedAllocationRecord< void, void >::tracking_disable();
    }

    // allow main thread to track
    Impl::SharedAllocationRecord< void, void >::tracking_enable();

    Impl::g_openmp_hardware_max_threads = 1;
  }

  #if defined(KOKKOS_ENABLE_PROFILING)
    Kokkos::Profiling::finalize();
  #endif
}
Example #4
0
// Ensure every pool member's scratch allocation is at least the
// requested sizes.  If any requested size exceeds the current
// allocation, each thread frees its old HostThreadTeamData (if any)
// and allocates a new one sized to the max of old and requested
// values, then the pool is re-organized.
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
                                   , size_t team_reduce_bytes
                                   , size_t team_shared_bytes
                                   , size_t thread_local_bytes )
{
  // Header size, rounded up to int64_t alignment; each member's
  // allocation is [ header | scratch ].
  const size_t member_bytes =
    sizeof(int64_t) *
    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );

  // Rank-0 member is representative of all members' current sizes.
  HostThreadTeamData * root = m_pool[0] ;

  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0 ;
  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0 ;
  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0 ;
  const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
  const size_t old_alloc_bytes  = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;

  // Allocate if any of the old allocation is too small:

  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
                        ( old_team_reduce  < team_reduce_bytes ) ||
                        ( old_team_shared  < team_shared_bytes ) ||
                        ( old_thread_local < thread_local_bytes );

  if ( allocate ) {

    // Never shrink: carry forward any old size that exceeds the request.
    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }

    const size_t alloc_bytes =
      member_bytes +
      HostThreadTeamData::scratch_size( pool_reduce_bytes
                                      , team_reduce_bytes
                                      , team_shared_bytes
                                      , thread_local_bytes );

    OpenMP::memory_space space ;

    memory_fence();

    // Each thread replaces its own member: disband + free the old
    // allocation, then placement-new a fresh HostThreadTeamData into
    // a new allocation and bind its scratch partitions.
    #pragma omp parallel num_threads(m_pool_size)
    {
      const int rank = omp_get_thread_num();

      if ( 0 != m_pool[rank] ) {

        m_pool[rank]->disband_pool();

        space.deallocate( m_pool[rank] , old_alloc_bytes );
      }

      void * const ptr = space.allocate( alloc_bytes );

      m_pool[ rank ] = new( ptr ) HostThreadTeamData();

      // Scratch memory begins immediately after the aligned header.
      m_pool[ rank ]->
        scratch_assign( ((char *)ptr) + member_bytes
                      , alloc_bytes
                      , pool_reduce_bytes
                      , team_reduce_bytes
                      , team_shared_bytes
                      , thread_local_bytes
                      );

      // Make this thread's writes visible before pool organization.
      memory_fence();
    }
/* END #pragma omp parallel */

    // Re-link the freshly created members into a single pool.
    HostThreadTeamData::organize_pool( m_pool , m_pool_size );
  }
}