// Construct the execution record for one thread of the Threads pool.
//
// Every member starts zeroed with the state 'Terminating'.  What happens
// next depends on which object is being constructed:
//
//  - Any object other than 's_threads_process' is treated as belonging to a
//    freshly spawned thread.  It tries to claim one slot of 's_threads_exec'
//    and reports the outcome to the spawning process by writing
//    's_threads_process.m_pool_state' (Active on success, Terminating on
//    failure).
//
//  - 's_threads_process' itself is set up as a pool of size one in the
//    'Inactive' state so that work can run before the pool is initialized.
ThreadsExec::ThreadsExec()
  : m_pool_base(0)
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
  , m_scratch()
#else
  , m_scratch(0)
#endif
  , m_scratch_reduce_end(0)
  , m_scratch_thread_end(0)
  , m_numa_rank(0)
  , m_numa_core_rank(0)
  , m_pool_rank(0)
  , m_pool_size(0)
  , m_pool_fan_size(0)
  , m_pool_state( ThreadsExec::Terminating )
{
  if ( & s_threads_process != this ) {

    // A spawned thread

    ThreadsExec * const nil = 0 ;

    // Which entry in 's_threads_exec', possibly determined from hwloc binding.
    // The spawner passes the entry through 's_current_function_arg'; when that
    // value is not a valid index the entry is taken from the hwloc binding.
    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
                    ? ((size_t)s_current_function_arg)
                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));

    // Given a good entry set this thread in the 's_threads_exec' array.
    //
    // 'entry' is a size_t narrowed into an int, so an out-of-range value can
    // arrive here as a negative number.  The lower-bound test keeps such a
    // value from indexing in front of 's_threads_exec'.
    // NOTE(review): bind_this_thread presumably reports failure as ~0u (the
    // sentinel this file uses for coordinates) -- confirm against hwloc code.
    if ( 0 <= entry &&
         entry < s_thread_pool_size[0] &&
         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {

      const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();

      m_numa_rank       = coord.first ;
      m_numa_core_rank  = coord.second ;
      m_pool_base       = s_threads_exec ;
      // Pool ranks run in the opposite direction to the entry index.
      m_pool_rank       = s_thread_pool_size[0] - ( entry + 1 );
      m_pool_size       = s_thread_pool_size[0] ;
      m_pool_fan_size   = fan_size( m_pool_rank , m_pool_size );
      m_pool_state      = ThreadsExec::Active ;

      s_threads_pid[ m_pool_rank ] = pthread_self();

      // Inform spawning process that the threads_exec entry has been set.
      s_threads_process.m_pool_state = ThreadsExec::Active ;
    }
    else {
      // Inform spawning process that the threads_exec entry could not be set.
      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
    }
  }
  else {
    // Enables 'parallel_for' to execute on uninitialized Threads device
    m_pool_rank  = 0 ;
    m_pool_size  = 1 ;
    m_pool_state = ThreadsExec::Inactive ;

    s_threads_pid[ m_pool_rank ] = pthread_self();
  }
}
void ThreadsExec::initialize( unsigned thread_count , unsigned use_numa_count , unsigned use_cores_per_numa , bool allow_asynchronous_threadpool ) { static const Sentinel sentinel ; const bool is_initialized = 0 != s_thread_pool_size[0] ; unsigned thread_spawn_failed = 0 ; for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++) s_threads_exec[i] = NULL; if ( ! is_initialized ) { // If thread_count, use_numa_count, or use_cores_per_numa are zero // then they will be given default values based upon hwloc detection // and allowed asynchronous execution. const bool hwloc_avail = hwloc::available(); const unsigned thread_spawn_begin = hwloc::thread_mapping( "Kokkos::Threads::initialize" , allow_asynchronous_threadpool , thread_count , use_numa_count , use_cores_per_numa , s_threads_coord ); const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ; if ( thread_spawn_begin ) { // Synchronous with s_threads_coord[0] as the process core // Claim entry #0 for binding the process core. s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u); } s_thread_pool_size[0] = thread_count ; s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ; s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ; s_current_function = & execute_function_noop ; // Initialization work function for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) { s_threads_process.m_pool_state = ThreadsExec::Inactive ; // If hwloc available then spawned thread will // choose its own entry in 's_threads_coord' // otherwise specify the entry. s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith ); // Spawn thread executing the 'driver()' function. // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successfull then // an entry in 's_threads_exec' will be assigned. 
if ( ThreadsExec::spawn() ) { wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive ); } if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ; } // Wait for all spawned threads to deactivate before zeroing the function. for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) { // Try to protect against cache coherency failure by casting to volatile. ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ; if ( th ) { wait_yield( th->m_pool_state , ThreadsExec::Active ); } else { ++thread_spawn_failed ; } } s_current_function = 0 ; s_current_function_arg = 0 ; s_threads_process.m_pool_state = ThreadsExec::Inactive ; if ( ! thread_spawn_failed ) { // Bind process to the core on which it was located before spawning occured Kokkos::hwloc::bind_this_thread( proc_coord ); if ( thread_spawn_begin ) { // Include process in pool. s_threads_exec[0] = & s_threads_process ; s_threads_process.m_pool_base = s_threads_exec ; s_threads_process.m_pool_rank = thread_count - 1 ; // Reversed for scan-compatible reductions s_threads_process.m_pool_size = thread_count ; s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size ); s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self(); } else { s_threads_process.m_pool_base = 0 ; s_threads_process.m_pool_rank = 0 ; s_threads_process.m_pool_size = 0 ; s_threads_process.m_pool_fan_size = 0 ; } // Initial allocations: ThreadsExec::resize_scratch( 1024 , 1024 ); } else { s_thread_pool_size[0] = 0 ; s_thread_pool_size[1] = 0 ; s_thread_pool_size[2] = 0 ; } } if ( is_initialized || thread_spawn_failed ) { std::ostringstream msg ; msg << "Kokkos::Threads::initialize ERROR" ; if ( is_initialized ) { msg << " : already initialized" ; } if ( thread_spawn_failed ) { msg << " : failed to spawn " << thread_spawn_failed << " threads" ; } Kokkos::Impl::throw_runtime_exception( msg.str() ); } }
void ThreadsExec::initialize( unsigned thread_count , unsigned use_numa_count , unsigned use_cores_per_numa , bool allow_asynchronous_threadpool ) { static const Sentinel sentinel ; const bool is_initialized = 0 != s_thread_pool_size[0] ; unsigned thread_spawn_failed = 0 ; for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++) s_threads_exec[i] = NULL; if ( ! is_initialized ) { // If thread_count, use_numa_count, or use_cores_per_numa are zero // then they will be given default values based upon hwloc detection // and allowed asynchronous execution. const bool hwloc_avail = Kokkos::hwloc::available(); const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads(); if ( thread_count == 0 ) { thread_count = hwloc_avail ? Kokkos::hwloc::get_available_numa_count() * Kokkos::hwloc::get_available_cores_per_numa() * Kokkos::hwloc::get_available_threads_per_core() : 1 ; } const unsigned thread_spawn_begin = hwloc::thread_mapping( "Kokkos::Threads::initialize" , allow_asynchronous_threadpool , thread_count , use_numa_count , use_cores_per_numa , s_threads_coord ); const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ; if ( thread_spawn_begin ) { // Synchronous with s_threads_coord[0] as the process core // Claim entry #0 for binding the process core. s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u); } s_thread_pool_size[0] = thread_count ; s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ; s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ; s_current_function = & execute_function_noop ; // Initialization work function for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) { s_threads_process.m_pool_state = ThreadsExec::Inactive ; // If hwloc available then spawned thread will // choose its own entry in 's_threads_coord' // otherwise specify the entry. s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? 
~0u : ith ); // Make sure all outstanding memory writes are complete // before spawning the new thread. memory_fence(); // Spawn thread executing the 'driver()' function. // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successfull then // an entry in 's_threads_exec' will be assigned. if ( ThreadsExec::spawn() ) { wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive ); } if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ; } // Wait for all spawned threads to deactivate before zeroing the function. for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) { // Try to protect against cache coherency failure by casting to volatile. ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ; if ( th ) { wait_yield( th->m_pool_state , ThreadsExec::Active ); } else { ++thread_spawn_failed ; } } s_current_function = 0 ; s_current_function_arg = 0 ; s_threads_process.m_pool_state = ThreadsExec::Inactive ; memory_fence(); if ( ! thread_spawn_failed ) { // Bind process to the core on which it was located before spawning occured if (hwloc_can_bind) { Kokkos::hwloc::bind_this_thread( proc_coord ); } if ( thread_spawn_begin ) { // Include process in pool. 
const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate(); s_threads_exec[0] = & s_threads_process ; s_threads_process.m_numa_rank = coord.first ; s_threads_process.m_numa_core_rank = coord.second ; s_threads_process.m_pool_base = s_threads_exec ; s_threads_process.m_pool_rank = thread_count - 1 ; // Reversed for scan-compatible reductions s_threads_process.m_pool_size = thread_count ; s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size ); s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self(); } else { s_threads_process.m_pool_base = 0 ; s_threads_process.m_pool_rank = 0 ; s_threads_process.m_pool_size = 0 ; s_threads_process.m_pool_fan_size = 0 ; } // Initial allocations: ThreadsExec::resize_scratch( 1024 , 1024 ); } else { s_thread_pool_size[0] = 0 ; s_thread_pool_size[1] = 0 ; s_thread_pool_size[2] = 0 ; } } if ( is_initialized || thread_spawn_failed ) { std::ostringstream msg ; msg << "Kokkos::Threads::initialize ERROR" ; if ( is_initialized ) { msg << " : already initialized" ; } if ( thread_spawn_failed ) { msg << " : failed to spawn " << thread_spawn_failed << " threads" ; } Kokkos::Impl::throw_runtime_exception( msg.str() ); } // Check for over-subscription if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) { std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl; std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl; std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl; std::cout << " Requested: " << thread_count << " threads per process." << std::endl; } // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); #if (KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::initialize(); #endif }