Example #1
void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
{
  s_current_function = func ;
  s_current_function_arg = & s_threads_process ;

  // Make sure function and arguments are written before activating threads.
  memory_fence();

  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;

  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
    ThreadsExec & th = * s_threads_exec[ --i ];

    th.m_pool_state = ThreadsExec::Active ;

    wait_yield( th.m_pool_state , ThreadsExec::Active );
  }

  if ( s_threads_process.m_pool_base ) {
    s_threads_process.m_pool_state = ThreadsExec::Active ;
    (*func)( s_threads_process , 0 );
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }

  s_current_function_arg = 0 ;
  s_current_function = 0 ;

  // Make sure function and arguments are cleared before proceeding.
  memory_fence();
}
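The handshake above relies on wait_yield() to spin until a thread's m_pool_state leaves a given value. That helper is not part of this excerpt; a minimal sketch, assuming it simply polls the flag and yields the processor between polls (sketch name, not the Kokkos API), could look like this:

#include <sched.h>

// Sketch of a spin-wait helper in the spirit of wait_yield(): block the
// caller until 'flag' no longer holds 'value', yielding between polls so
// peer threads on the same core can make progress.
static void wait_yield_sketch( volatile int & flag , const int value )
{
  while ( value == flag ) { sched_yield(); }
}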
Example #2
void ThreadsExec::driver(void)
{
  ThreadsExec this_thread ;

  while ( ThreadsExec::Active == this_thread.m_pool_state ) {

    (*s_current_function)( this_thread , s_current_function_arg );

    // Deactivate thread and wait for reactivation
    this_thread.m_pool_state = ThreadsExec::Inactive ;

    wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
  }
}
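driver() is the worker half of the activate/deactivate handshake that execute_serial() drives from the master side. A self-contained sketch of that handshake using std::atomic and std::thread (illustrative names only, not the Kokkos internals):

#include <atomic>
#include <cstdio>
#include <thread>

enum State { Inactive , Active , Terminating };

int main()
{
  std::atomic<int> pool_state( Active );

  // Worker: mirrors driver() -- run work while Active, then deactivate
  // and spin until the master changes the state again.
  std::thread worker( [&]() {
    while ( Active == pool_state.load() ) {
      std::puts( "worker: running current function" );
      pool_state.store( Inactive );                 // deactivate
      while ( Inactive == pool_state.load() ) {     // wait for reactivation
        std::this_thread::yield();
      }
    }
  } );

  // Master: wait for deactivation, reactivate once, then terminate.
  auto wait_for = [&]( int value ) {
    while ( value != pool_state.load() ) { std::this_thread::yield(); }
  };

  wait_for( Inactive );
  pool_state.store( Active );       // second activation
  wait_for( Inactive );
  pool_state.store( Terminating );  // worker's outer loop condition now fails
  worker.join();
  return 0;
}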
Example #3
void ThreadsExec::finalize()
{
  verify_is_process("ThreadsExec::finalize",false);

  fence();

  resize_scratch(0,0);

  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;

  for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {

    if ( s_threads_exec[i] ) {

      s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;

      wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );

      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
    }

    s_threads_pid[i] = 0 ;
  }

  if ( s_threads_process.m_pool_base ) {
    ( & s_threads_process )->~ThreadsExec();
    s_threads_exec[0] = 0 ;
  }

  if ( Kokkos::hwloc::can_bind_threads() ) {
    Kokkos::hwloc::unbind_this_thread();
  }

  s_thread_pool_size[0] = 0 ;
  s_thread_pool_size[1] = 0 ;
  s_thread_pool_size[2] = 0 ;

  // Reset master thread to run solo.
  s_threads_process.m_numa_rank       = 0 ;
  s_threads_process.m_numa_core_rank  = 0 ;
  s_threads_process.m_pool_base       = 0 ;
  s_threads_process.m_pool_rank       = 0 ;
  s_threads_process.m_pool_size       = 1 ;
  s_threads_process.m_pool_fan_size   = 0 ;
  s_threads_process.m_pool_state      = ThreadsExec::Inactive ;

  #if (KOKKOS_ENABLE_PROFILING)
    Kokkos::Profiling::finalize();
  #endif
}
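The teardown loop uses the unsigned-safe reverse idiom 'begin < i--', which visits pool entries from the top down to 'begin' without wrapping the counter. A small stand-alone check of that idiom:

#include <cstdio>

int main()
{
  const unsigned begin = 1 , N = 4 ;  // e.g. master occupies entry 0
  for ( unsigned i = N ; begin < i-- ; ) {
    std::printf( "terminating pool entry %u\n" , i );  // prints 3, 2, 1
  }
  return 0;
}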
Example #4
void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
{
  s_current_function = func ;
  s_current_function_arg = & s_threads_process ;

  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;

  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
    ThreadsExec & th = * s_threads_exec[ --i ];

    th.m_pool_state = ThreadsExec::Active ;

    wait_yield( th.m_pool_state , ThreadsExec::Active );
  }

  if ( s_threads_process.m_pool_base ) {
    s_threads_process.m_pool_state = ThreadsExec::Active ;
    (*func)( s_threads_process , 0 );
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }

  s_current_function_arg = 0 ;
  s_current_function = 0 ;
}
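This variant of execute_serial() is identical to Example #1 except that it omits the memory_fence() calls around publishing and clearing s_current_function and s_current_function_arg. A portable stand-in for such a full fence, assuming standard C++ atomics are acceptable (a sketch, not the Kokkos implementation of memory_fence()):

#include <atomic>

// Full fence: keep all preceding writes ordered before any subsequent
// accesses, as the fences in Example #1 intend.
inline void full_memory_fence_sketch()
{
  std::atomic_thread_fence( std::memory_order_seq_cst );
}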
Example #5
void ThreadsExec::initialize( unsigned thread_count ,
                              unsigned use_numa_count ,
                              unsigned use_cores_per_numa ,
                              bool allow_asynchronous_threadpool )
{
  static const Sentinel sentinel ;

  const bool is_initialized = 0 != s_thread_pool_size[0] ;

  unsigned thread_spawn_failed = 0 ;

  for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
    s_threads_exec[i] = NULL;

  if ( ! is_initialized ) {

    // If thread_count, use_numa_count, or use_cores_per_numa is zero
    // then it will be given a default value based upon hwloc detection
    // and whether asynchronous execution is allowed.

    const bool hwloc_avail = hwloc::available();

    const unsigned thread_spawn_begin =
      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
                             allow_asynchronous_threadpool ,
                             thread_count ,
                             use_numa_count ,
                             use_cores_per_numa ,
                             s_threads_coord );

    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;

    if ( thread_spawn_begin ) {
      // Synchronous with s_threads_coord[0] as the process core
      // Claim entry #0 for binding the process core.
      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
    }

    s_thread_pool_size[0] = thread_count ;
    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
    s_current_function = & execute_function_noop ; // Initialization work function

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {

      s_threads_process.m_pool_state = ThreadsExec::Inactive ;

      // If hwloc is available then the spawned thread will
      // choose its own entry in 's_threads_coord';
      // otherwise specify the entry here.
      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );

      // Spawn thread executing the 'driver()' function.
      // Wait until spawned thread has attempted to initialize.
      // If spawning and initialization are successful then
      // an entry in 's_threads_exec' will be assigned.
      if ( ThreadsExec::spawn() ) {
        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
      }
      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
    }

    // Wait for all spawned threads to deactivate before zeroing the function.

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
      // Try to protect against cache coherency failure by casting to volatile.
      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
      if ( th ) {
        wait_yield( th->m_pool_state , ThreadsExec::Active );
      }
      else {
        ++thread_spawn_failed ;
      }
    }

    s_current_function     = 0 ;
    s_current_function_arg = 0 ;
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;

    if ( ! thread_spawn_failed ) {
      // Bind process to the core on which it was located before spawning occurred
      Kokkos::hwloc::bind_this_thread( proc_coord );

      if ( thread_spawn_begin ) { // Include process in pool.
        s_threads_exec[0]                 = & s_threads_process ;
        s_threads_process.m_pool_base     = s_threads_exec ;
        s_threads_process.m_pool_rank     = thread_count - 1 ; // Reversed for scan-compatible reductions
        s_threads_process.m_pool_size     = thread_count ;
        s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
      }
      else {
        s_threads_process.m_pool_base = 0 ;
        s_threads_process.m_pool_rank = 0 ;
        s_threads_process.m_pool_size = 0 ;
        s_threads_process.m_pool_fan_size = 0 ;
      }

      // Initial allocations:
      ThreadsExec::resize_scratch( 1024 , 1024 );
    }
    else {
      s_thread_pool_size[0] = 0 ;
      s_thread_pool_size[1] = 0 ;
      s_thread_pool_size[2] = 0 ;
    }
  }

  if ( is_initialized || thread_spawn_failed ) {

    std::ostringstream msg ;

    msg << "Kokkos::Threads::initialize ERROR" ;

    if ( is_initialized ) {
      msg << " : already initialized" ;
    }
    if ( thread_spawn_failed ) {
      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
    }

    Kokkos::Impl::throw_runtime_exception( msg.str() );
  }
}
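initialize() assigns the master thread the highest pool rank ("Reversed for scan-compatible reductions") and computes its fan-in width with fan_size(), which is not part of this excerpt. One plausible sketch, assuming a binary fan-in tree over the reversed ranks (an assumption, not necessarily the Kokkos implementation):

// Count how many peers fan into 'rank' in a binary reduction tree over
// the reversed ranks: offsets of 1, 2, 4, ... are accepted while the
// corresponding low-order bit of the reversed rank is clear.
inline int fan_size_sketch( const int rank , const int size )
{
  const int rank_rev = size - ( rank + 1 );
  int count = 0 ;
  for ( int n = 1 ; ( rank_rev + n < size ) && ( 0 == ( n & rank_rev ) ) ; n <<= 1 ) { ++count ; }
  return count ;
}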
Example #6
void ThreadsExec::initialize( unsigned thread_count ,
                              unsigned use_numa_count ,
                              unsigned use_cores_per_numa ,
                              bool allow_asynchronous_threadpool )
{
  static const Sentinel sentinel ;

  const bool is_initialized = 0 != s_thread_pool_size[0] ;

  unsigned thread_spawn_failed = 0 ;

  for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
    s_threads_exec[i] = NULL;

  if ( ! is_initialized ) {

    // If thread_count, use_numa_count, or use_cores_per_numa is zero
    // then it will be given a default value based upon hwloc detection
    // and whether asynchronous execution is allowed.

    const bool hwloc_avail = Kokkos::hwloc::available();
    const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();

    if ( thread_count == 0 ) {
      thread_count = hwloc_avail
      ? Kokkos::hwloc::get_available_numa_count() *
        Kokkos::hwloc::get_available_cores_per_numa() *
        Kokkos::hwloc::get_available_threads_per_core()
      : 1 ;
    }

    const unsigned thread_spawn_begin =
      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
                             allow_asynchronous_threadpool ,
                             thread_count ,
                             use_numa_count ,
                             use_cores_per_numa ,
                             s_threads_coord );

    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;

    if ( thread_spawn_begin ) {
      // Synchronous with s_threads_coord[0] as the process core
      // Claim entry #0 for binding the process core.
      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
    }

    s_thread_pool_size[0] = thread_count ;
    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
    s_current_function = & execute_function_noop ; // Initialization work function

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {

      s_threads_process.m_pool_state = ThreadsExec::Inactive ;

      // If hwloc can bind threads then the spawned thread will
      // choose its own entry in 's_threads_coord';
      // otherwise specify the entry here.
      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );

      // Make sure all outstanding memory writes are complete
      // before spawning the new thread.
      memory_fence();

      // Spawn thread executing the 'driver()' function.
      // Wait until spawned thread has attempted to initialize.
      // If spawning and initialization are successful then
      // an entry in 's_threads_exec' will be assigned.
      if ( ThreadsExec::spawn() ) {
        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
      }
      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
    }

    // Wait for all spawned threads to deactivate before zeroing the function.

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
      // Try to protect against cache coherency failure by casting to volatile.
      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
      if ( th ) {
        wait_yield( th->m_pool_state , ThreadsExec::Active );
      }
      else {
        ++thread_spawn_failed ;
      }
    }

    s_current_function     = 0 ;
    s_current_function_arg = 0 ;
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;

    memory_fence();

    if ( ! thread_spawn_failed ) {
      // Bind process to the core on which it was located before spawning occurred
      if (hwloc_can_bind) {
        Kokkos::hwloc::bind_this_thread( proc_coord );
      }

      if ( thread_spawn_begin ) { // Include process in pool.
        const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();

        s_threads_exec[0]                   = & s_threads_process ;
        s_threads_process.m_numa_rank       = coord.first ;
        s_threads_process.m_numa_core_rank  = coord.second ;
        s_threads_process.m_pool_base       = s_threads_exec ;
        s_threads_process.m_pool_rank       = thread_count - 1 ; // Reversed for scan-compatible reductions
        s_threads_process.m_pool_size       = thread_count ;
        s_threads_process.m_pool_fan_size   = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
      }
      else {
        s_threads_process.m_pool_base = 0 ;
        s_threads_process.m_pool_rank = 0 ;
        s_threads_process.m_pool_size = 0 ;
        s_threads_process.m_pool_fan_size = 0 ;
      }

      // Initial allocations:
      ThreadsExec::resize_scratch( 1024 , 1024 );
    }
    else {
      s_thread_pool_size[0] = 0 ;
      s_thread_pool_size[1] = 0 ;
      s_thread_pool_size[2] = 0 ;
    }
  }

  if ( is_initialized || thread_spawn_failed ) {

    std::ostringstream msg ;

    msg << "Kokkos::Threads::initialize ERROR" ;

    if ( is_initialized ) {
      msg << " : already initialized" ;
    }
    if ( thread_spawn_failed ) {
      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
    }

    Kokkos::Impl::throw_runtime_exception( msg.str() );
  }

  // Check for over-subscription
  if ( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
    std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
    std::cout << "                                    Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
    std::cout << "                                    Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
    std::cout << "                                    Requested: " << thread_count << " threads per process." << std::endl;
  }

  // Init the array used for arbitrarily sized atomics
  Impl::init_lock_array_host_space();

  #if (KOKKOS_ENABLE_PROFILING)
    Kokkos::Profiling::initialize();
  #endif
}
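The over-subscription warning above fires when the node-wide thread demand exceeds the available processors. A stand-alone illustration of that arithmetic (the detected values here are made up for the example):

#include <iostream>

int main()
{
  const long mpi_ranks_per_node  = 2 ;   // e.g. detected MPI ranks on the node
  const long threads_per_process = 16 ;  // requested thread_count
  const long processors_per_node = 16 ;  // e.g. hardware threads on the node

  if ( mpi_ranks_per_node * threads_per_process > processors_per_node ) {
    std::cout << "WARNING: likely oversubscribing CPU cores: "
              << mpi_ranks_per_node * threads_per_process << " threads requested on "
              << processors_per_node << " processors" << std::endl;
  }
  return 0;
}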