예제 #1
0
// Builds one execution record, either for a spawned pool thread or for the
// process-wide instance 's_threads_process'.
//
// Every member starts zeroed with the state set to Terminating.  Then:
//  - If 'this' is not 's_threads_process' the object is treated as a spawned
//    thread: it selects a slot in 's_threads_exec', tries to claim it with a
//    compare-exchange, fills in its pool data, and reports the outcome to the
//    spawning code through 's_threads_process.m_pool_state'
//    (Active on success, Terminating on failure).
//  - If 'this' is 's_threads_process' it becomes a pool of size one, rank zero,
//    in the Inactive state.
ThreadsExec::ThreadsExec()
  : m_pool_base(0)
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
  , m_scratch()
#else
  , m_scratch(0)
#endif
  , m_scratch_reduce_end(0)
  , m_scratch_thread_end(0)
  , m_numa_rank(0)
  , m_numa_core_rank(0)
  , m_pool_rank(0)
  , m_pool_size(0)
  , m_pool_fan_size(0)
  , m_pool_state( ThreadsExec::Terminating )
{
  if ( & s_threads_process != this ) {

    // A spawned thread

    ThreadsExec * const nil = 0 ;

    // Which entry in 's_threads_exec' to claim.
    // If the spawner passed an in-range index through 's_current_function_arg'
    // use it directly; otherwise let hwloc bind this thread and use the value
    // it returns.
    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
                    ? ((size_t)s_current_function_arg)
                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));

    // Given a good entry set this thread in the 's_threads_exec' array.
    // 'entry' was narrowed from size_t to int, so a value that does not fit
    // may have become negative.  A signed upper-bound comparison alone would
    // accept it and index before the start of 's_threads_exec'; reject it
    // explicitly so the failure branch below is taken instead.
    if ( 0 <= entry &&
         entry < s_thread_pool_size[0] &&
         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {

      const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();

      m_numa_rank       = coord.first ;
      m_numa_core_rank  = coord.second ;
      m_pool_base       = s_threads_exec ;
      // Rank is the entry index counted from the end of the pool.
      m_pool_rank       = s_thread_pool_size[0] - ( entry + 1 );
      m_pool_size       = s_thread_pool_size[0] ;
      m_pool_fan_size   = fan_size( m_pool_rank , m_pool_size );
      m_pool_state      = ThreadsExec::Active ;

      s_threads_pid[ m_pool_rank ] = pthread_self();

      // Inform spawning process that the threads_exec entry has been set.
      s_threads_process.m_pool_state = ThreadsExec::Active ;
    }
    else {
      // Inform spawning process that the threads_exec entry could not be set.
      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
    }
  }
  else {
    // Enables 'parallel_for' to execute on an uninitialized Threads device
    m_pool_rank  = 0 ;
    m_pool_size  = 1 ;
    m_pool_state = ThreadsExec::Inactive ;

    s_threads_pid[ m_pool_rank ] = pthread_self();
  }
}
예제 #2
0
// Creates the thread pool for the Threads device.
//
// Parameters:
//   thread_count                  - requested number of pool threads.
//   use_numa_count                - forwarded to hwloc::thread_mapping; also the
//                                   divisor for s_thread_pool_size[1].
//   use_cores_per_numa            - forwarded to hwloc::thread_mapping; also the
//                                   divisor for s_thread_pool_size[2].
//   allow_asynchronous_threadpool - forwarded to hwloc::thread_mapping.
//
// Errors: if the pool is already initialized (s_thread_pool_size[0] != 0) or
// if any pool slot was left unclaimed after spawning, a message is passed to
// Kokkos::Impl::throw_runtime_exception.  On spawn failure the three
// s_thread_pool_size entries are reset to zero first.
void ThreadsExec::initialize( unsigned thread_count ,
                              unsigned use_numa_count ,
                              unsigned use_cores_per_numa ,
                              bool allow_asynchronous_threadpool )
{
  // NOTE(review): the role of this function-local static is not visible in
  // this block; it is only constructed here on the first call.
  static const Sentinel sentinel ;

  const bool is_initialized = 0 != s_thread_pool_size[0] ;

  unsigned thread_spawn_failed = 0 ;

  if ( ! is_initialized ) {

    // Start from an empty pool table.  This reset is performed only when the
    // pool is not yet initialized: a redundant call must report
    // "already initialized" without wiping the entries of the live pool.
    for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
      s_threads_exec[i] = NULL;

    // If thread_count, use_numa_count, or use_cores_per_numa are zero
    // then they will be given default values based upon hwloc detection
    // and allowed asynchronous execution.

    const bool hwloc_avail = hwloc::available();

    const unsigned thread_spawn_begin =
      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
                             allow_asynchronous_threadpool ,
                             thread_count ,
                             use_numa_count ,
                             use_cores_per_numa ,
                             s_threads_coord );

    // Remember the process coordinate before entry #0 is possibly overwritten.
    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;

    if ( thread_spawn_begin ) {
      // Synchronous with s_threads_coord[0] as the process core
      // Claim entry #0 for binding the process core.
      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
    }

    // NOTE(review): the divisions below assume thread_mapping has replaced any
    // zero argument with a non-zero default, as stated above - confirm.
    s_thread_pool_size[0]    = thread_count ;
    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
    s_current_function = & execute_function_noop ; // Initialization work function

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {

      // Reset the handshake flag; the spawned thread's constructor overwrites
      // it with Active or Terminating.
      s_threads_process.m_pool_state = ThreadsExec::Inactive ;

      // If hwloc available then spawned thread will
      // choose its own entry in 's_threads_coord'
      // otherwise specify the entry.
      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );

      // Spawn thread executing the 'driver()' function.
      // Wait until spawned thread has attempted to initialize.
      // If spawning and initialization is successful then
      // an entry in 's_threads_exec' will be assigned.
      if ( ThreadsExec::spawn() ) {
        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
      }
      // A thread that could not claim an entry ends the spawning loop early.
      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
    }

    // Wait for all spawned threads to deactivate before zeroing the function.
    // Every slot that is still empty counts as one failed spawn.

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
      // Try to protect against cache coherency failure by casting to volatile.
      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
      if ( th ) {
        wait_yield( th->m_pool_state , ThreadsExec::Active );
      }
      else {
        ++thread_spawn_failed ;
      }
    }

    s_current_function     = 0 ;
    s_current_function_arg = 0 ;
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;

    if ( ! thread_spawn_failed ) {
      // Bind process to the core on which it was located before spawning occurred
      Kokkos::hwloc::bind_this_thread( proc_coord );

      if ( thread_spawn_begin ) { // Include process in pool.
        s_threads_exec[0]                 = & s_threads_process ;
        s_threads_process.m_pool_base     = s_threads_exec ;
        s_threads_process.m_pool_rank     = thread_count - 1 ; // Reversed for scan-compatible reductions
        s_threads_process.m_pool_size     = thread_count ;
        s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
      }
      else {
        // The process is not a member of the pool.
        s_threads_process.m_pool_base = 0 ;
        s_threads_process.m_pool_rank = 0 ;
        s_threads_process.m_pool_size = 0 ;
        s_threads_process.m_pool_fan_size = 0 ;
      }

      // Initial allocations:
      ThreadsExec::resize_scratch( 1024 , 1024 );
    }
    else {
      // Mark the pool as uninitialized again.
      s_thread_pool_size[0]    = 0 ;
      s_thread_pool_size[1] = 0 ;
      s_thread_pool_size[2] = 0 ;
    }
  }

  if ( is_initialized || thread_spawn_failed ) {

    std::ostringstream msg ;

    msg << "Kokkos::Threads::initialize ERROR" ;

    if ( is_initialized ) {
      msg << " : already initialized" ;
    }
    if ( thread_spawn_failed ) {
      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
    }

    Kokkos::Impl::throw_runtime_exception( msg.str() );
  }
}
예제 #3
0
// Creates the thread pool for the Threads device.
//
// Parameters:
//   thread_count                  - requested number of pool threads; zero selects
//                                   numa count * cores per numa * threads per core
//                                   as detected by hwloc, or 1 when hwloc is
//                                   unavailable.
//   use_numa_count                - forwarded to hwloc::thread_mapping; also the
//                                   divisor for s_thread_pool_size[1].
//   use_cores_per_numa            - forwarded to hwloc::thread_mapping; also the
//                                   divisor for s_thread_pool_size[2].
//   allow_asynchronous_threadpool - forwarded to hwloc::thread_mapping.
//
// Errors: if the pool is already initialized (s_thread_pool_size[0] != 0) or
// if any pool slot was left unclaimed after spawning, a message is passed to
// Kokkos::Impl::throw_runtime_exception.  On spawn failure the three
// s_thread_pool_size entries are reset to zero first.
//
// After the error check the function prints an over-subscription warning
// when applicable, initializes the host-space lock array, and starts
// profiling when KOKKOS_ENABLE_PROFILING is set.
void ThreadsExec::initialize( unsigned thread_count ,
                              unsigned use_numa_count ,
                              unsigned use_cores_per_numa ,
                              bool allow_asynchronous_threadpool )
{
  // NOTE(review): the role of this function-local static is not visible in
  // this block; it is only constructed here on the first call.
  static const Sentinel sentinel ;

  const bool is_initialized = 0 != s_thread_pool_size[0] ;

  unsigned thread_spawn_failed = 0 ;

  if ( ! is_initialized ) {

    // Start from an empty pool table.  This reset is performed only when the
    // pool is not yet initialized: a redundant call must report
    // "already initialized" without wiping the entries of the live pool.
    for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
      s_threads_exec[i] = NULL;

    // If thread_count, use_numa_count, or use_cores_per_numa are zero
    // then they will be given default values based upon hwloc detection
    // and allowed asynchronous execution.

    const bool hwloc_avail = Kokkos::hwloc::available();
    const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();

    if ( thread_count == 0 ) {
      thread_count = hwloc_avail
      ? Kokkos::hwloc::get_available_numa_count() *
        Kokkos::hwloc::get_available_cores_per_numa() *
        Kokkos::hwloc::get_available_threads_per_core()
      : 1 ;
    }

    const unsigned thread_spawn_begin =
      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
                             allow_asynchronous_threadpool ,
                             thread_count ,
                             use_numa_count ,
                             use_cores_per_numa ,
                             s_threads_coord );

    // Remember the process coordinate before entry #0 is possibly overwritten.
    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;

    if ( thread_spawn_begin ) {
      // Synchronous with s_threads_coord[0] as the process core
      // Claim entry #0 for binding the process core.
      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
    }

    // NOTE(review): the divisions below assume thread_mapping has replaced any
    // zero argument with a non-zero default, as stated above - confirm.
    s_thread_pool_size[0] = thread_count ;
    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
    s_current_function = & execute_function_noop ; // Initialization work function

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {

      // Reset the handshake flag; the spawned thread's constructor overwrites
      // it with Active or Terminating.
      s_threads_process.m_pool_state = ThreadsExec::Inactive ;

      // If hwloc can bind threads then the spawned thread will
      // choose its own entry in 's_threads_coord'
      // otherwise specify the entry.
      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );

      // Make sure all outstanding memory writes are complete
      // before spawning the new thread.
      memory_fence();

      // Spawn thread executing the 'driver()' function.
      // Wait until spawned thread has attempted to initialize.
      // If spawning and initialization is successful then
      // an entry in 's_threads_exec' will be assigned.
      if ( ThreadsExec::spawn() ) {
        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
      }
      // A thread that could not claim an entry ends the spawning loop early.
      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
    }

    // Wait for all spawned threads to deactivate before zeroing the function.
    // Every slot that is still empty counts as one failed spawn.

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
      // Try to protect against cache coherency failure by casting to volatile.
      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
      if ( th ) {
        wait_yield( th->m_pool_state , ThreadsExec::Active );
      }
      else {
        ++thread_spawn_failed ;
      }
    }

    s_current_function     = 0 ;
    s_current_function_arg = 0 ;
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;

    memory_fence();

    if ( ! thread_spawn_failed ) {
      // Bind process to the core on which it was located before spawning occurred
      if (hwloc_can_bind) {
        Kokkos::hwloc::bind_this_thread( proc_coord );
      }

      if ( thread_spawn_begin ) { // Include process in pool.
        const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();

        s_threads_exec[0]                   = & s_threads_process ;
        s_threads_process.m_numa_rank       = coord.first ;
        s_threads_process.m_numa_core_rank  = coord.second ;
        s_threads_process.m_pool_base       = s_threads_exec ;
        s_threads_process.m_pool_rank       = thread_count - 1 ; // Reversed for scan-compatible reductions
        s_threads_process.m_pool_size       = thread_count ;
        s_threads_process.m_pool_fan_size   = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
      }
      else {
        // The process is not a member of the pool.
        s_threads_process.m_pool_base = 0 ;
        s_threads_process.m_pool_rank = 0 ;
        s_threads_process.m_pool_size = 0 ;
        s_threads_process.m_pool_fan_size = 0 ;
      }

      // Initial allocations:
      ThreadsExec::resize_scratch( 1024 , 1024 );
    }
    else {
      // Mark the pool as uninitialized again.
      s_thread_pool_size[0] = 0 ;
      s_thread_pool_size[1] = 0 ;
      s_thread_pool_size[2] = 0 ;
    }
  }

  if ( is_initialized || thread_spawn_failed ) {

    std::ostringstream msg ;

    msg << "Kokkos::Threads::initialize ERROR" ;

    if ( is_initialized ) {
      msg << " : already initialized" ;
    }
    if ( thread_spawn_failed ) {
      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
    }

    Kokkos::Impl::throw_runtime_exception( msg.str() );
  }

  // Check for over-subscription
  if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
    std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
    std::cout << "                                    Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
    std::cout << "                                    Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
    std::cout << "                                    Requested: " << thread_count << " threads per process." << std::endl;
  }

  // Initialize the lock array used for arbitrarily sized atomics
  Impl::init_lock_array_host_space();

  #if (KOKKOS_ENABLE_PROFILING)
    Kokkos::Profiling::initialize();
  #endif
}