void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
{
  s_current_function     = func ;
  s_current_function_arg = & s_threads_process ;

  // Make sure function and arguments are written before activating threads.
  memory_fence();

  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;

  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
    ThreadsExec & th = * s_threads_exec[ --i ];

    th.m_pool_state = ThreadsExec::Active ;

    wait_yield( th.m_pool_state , ThreadsExec::Active );
  }

  if ( s_threads_process.m_pool_base ) {
    s_threads_process.m_pool_state = ThreadsExec::Active ;

    (*func)( s_threads_process , 0 );

    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }

  s_current_function_arg = 0 ;
  s_current_function     = 0 ;

  // Make sure function and arguments are cleared before proceeding.
  memory_fence();
}
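// For reference: execute_serial relies on wait_yield to block until another
// thread changes a state flag. A minimal sketch of such a spin-wait, assuming
// a volatile int flag and POSIX sched_yield() (illustrative only, not
// necessarily the backend's exact definition):
//
//   #include <sched.h>
//
//   inline void example_wait_yield( volatile int & flag , const int value )
//   {
//     // Spin while the flag still holds 'value', yielding the CPU on each
//     // iteration so the thread that must change the flag can make progress.
//     while ( value == flag ) { sched_yield(); }
//   }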
void ThreadsExec::driver(void)
{
  ThreadsExec this_thread ;

  while ( ThreadsExec::Active == this_thread.m_pool_state ) {

    (*s_current_function)( this_thread , s_current_function_arg );

    // Deactivate thread and wait for reactivation
    this_thread.m_pool_state = ThreadsExec::Inactive ;

    wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
  }
}
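// driver() and execute_serial() form a two-phase handshake: the master flips
// a worker's state to Active and spins until the worker flips it back to
// Inactive. A self-contained sketch of that protocol using std::thread and
// std::atomic in place of the backend's volatile state words and spawn
// machinery (all names here are illustrative):
//
//   #include <atomic>
//   #include <thread>
//   #include <cstdio>
//
//   std::atomic<int> pool_state( 0 );  // 0 = Inactive, 1 = Active
//
//   void worker_driver()
//   {
//     // Await activation, mirroring the spin in driver().
//     while ( pool_state.load() != 1 ) { std::this_thread::yield(); }
//     std::printf( "worker: running the staged function\n" );
//     pool_state.store( 0 );  // Deactivate: signal completion to the master.
//   }
//
//   int main()
//   {
//     std::thread worker( worker_driver );
//     pool_state.store( 1 );  // Activate the worker.
//     // Master-side wait, mirroring wait_yield() in execute_serial().
//     while ( pool_state.load() != 0 ) { std::this_thread::yield(); }
//     worker.join();
//     return 0 ;
//   }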
void ThreadsExec::finalize()
{
  verify_is_process("ThreadsExec::finalize",false);

  fence();

  resize_scratch(0,0);

  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;

  for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {

    if ( s_threads_exec[i] ) {

      s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;

      wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );

      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
    }

    s_threads_pid[i] = 0 ;
  }

  if ( s_threads_process.m_pool_base ) {
    ( & s_threads_process )->~ThreadsExec();
    s_threads_exec[0] = 0 ;
  }

  if ( Kokkos::hwloc::can_bind_threads() ) {
    Kokkos::hwloc::unbind_this_thread();
  }

  s_thread_pool_size[0] = 0 ;
  s_thread_pool_size[1] = 0 ;
  s_thread_pool_size[2] = 0 ;

  // Reset master thread to run solo.
  s_threads_process.m_numa_rank      = 0 ;
  s_threads_process.m_numa_core_rank = 0 ;
  s_threads_process.m_pool_base      = 0 ;
  s_threads_process.m_pool_rank      = 0 ;
  s_threads_process.m_pool_size      = 1 ;
  s_threads_process.m_pool_fan_size  = 0 ;
  s_threads_process.m_pool_state     = ThreadsExec::Inactive ;

#if (KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::finalize();
#endif
}
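// Typical user-facing lifecycle that ultimately drives ThreadsExec::initialize()
// and ThreadsExec::finalize(). A minimal sketch, assuming a build with the
// Threads backend enabled and using the standard Kokkos entry points:
//
//   #include <Kokkos_Core.hpp>
//
//   int main( int argc , char ** argv )
//   {
//     Kokkos::initialize( argc , argv );  // Spawns and binds the thread pool.
//     {
//       // ... construct Views and dispatch parallel kernels here ...
//     }
//     Kokkos::finalize();  // Terminates the pool via ThreadsExec::finalize().
//     return 0 ;
//   }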
void ThreadsExec::initialize( unsigned thread_count ,
                              unsigned use_numa_count ,
                              unsigned use_cores_per_numa ,
                              bool allow_asynchronous_threadpool )
{
  static const Sentinel sentinel ;

  const bool is_initialized = 0 != s_thread_pool_size[0] ;

  unsigned thread_spawn_failed = 0 ;

  for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++ )
    s_threads_exec[i] = NULL;

  if ( ! is_initialized ) {

    // If thread_count, use_numa_count, or use_cores_per_numa are zero
    // then they will be given default values based upon hwloc detection
    // and allowed asynchronous execution.

    const bool hwloc_avail    = Kokkos::hwloc::available();
    const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();

    if ( thread_count == 0 ) {
      thread_count = hwloc_avail
                   ? Kokkos::hwloc::get_available_numa_count() *
                     Kokkos::hwloc::get_available_cores_per_numa() *
                     Kokkos::hwloc::get_available_threads_per_core()
                   : 1 ;
    }

    const unsigned thread_spawn_begin =
      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
                             allow_asynchronous_threadpool ,
                             thread_count ,
                             use_numa_count ,
                             use_cores_per_numa ,
                             s_threads_coord );

    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;

    if ( thread_spawn_begin ) {
      // Synchronous with s_threads_coord[0] as the process core.
      // Claim entry #0 for binding the process core.
      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
    }

    s_thread_pool_size[0] = thread_count ;
    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;

    s_current_function = & execute_function_noop ; // Initialization work function

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {

      s_threads_process.m_pool_state = ThreadsExec::Inactive ;

      // If hwloc is available then the spawned thread will
      // choose its own entry in 's_threads_coord',
      // otherwise specify the entry.
      s_current_function_arg =
        (void*) static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );

      // Make sure all outstanding memory writes are complete
      // before spawning the new thread.
      memory_fence();

      // Spawn thread executing the 'driver()' function.
      // Wait until the spawned thread has attempted to initialize.
      // If spawning and initialization are successful then
      // an entry in 's_threads_exec' will be assigned.
      if ( ThreadsExec::spawn() ) {
        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
      }

      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
    }

    // Wait for all spawned threads to deactivate before zeroing the function.

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
      // Try to protect against cache coherency failure by casting to volatile.
      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
      if ( th ) {
        wait_yield( th->m_pool_state , ThreadsExec::Active );
      }
      else {
        ++thread_spawn_failed ;
      }
    }

    s_current_function     = 0 ;
    s_current_function_arg = 0 ;
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;

    memory_fence();

    if ( ! thread_spawn_failed ) {

      // Bind process to the core on which it was located before spawning occurred.
      if ( hwloc_can_bind ) {
        Kokkos::hwloc::bind_this_thread( proc_coord );
      }

      if ( thread_spawn_begin ) { // Include process in pool.
        const std::pair<unsigned,unsigned> coord =
          Kokkos::hwloc::get_this_thread_coordinate();

        s_threads_exec[0] = & s_threads_process ;

        s_threads_process.m_numa_rank      = coord.first ;
        s_threads_process.m_numa_core_rank = coord.second ;
        s_threads_process.m_pool_base      = s_threads_exec ;
        s_threads_process.m_pool_rank      = thread_count - 1 ; // Reversed for scan-compatible reductions
        s_threads_process.m_pool_size      = thread_count ;
        s_threads_process.m_pool_fan_size  =
          fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
      }
      else {
        s_threads_process.m_pool_base     = 0 ;
        s_threads_process.m_pool_rank     = 0 ;
        s_threads_process.m_pool_size     = 0 ;
        s_threads_process.m_pool_fan_size = 0 ;
      }

      // Initial allocations:
      ThreadsExec::resize_scratch( 1024 , 1024 );
    }
    else {
      s_thread_pool_size[0] = 0 ;
      s_thread_pool_size[1] = 0 ;
      s_thread_pool_size[2] = 0 ;
    }
  }

  if ( is_initialized || thread_spawn_failed ) {
    std::ostringstream msg ;

    msg << "Kokkos::Threads::initialize ERROR" ;

    if ( is_initialized ) { msg << " : already initialized" ; }
    if ( thread_spawn_failed ) {
      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
    }

    Kokkos::Impl::throw_runtime_exception( msg.str() );
  }

  // Check for over-subscription
  if ( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
    std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
    std::cout << "  Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
    std::cout << "  Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
    std::cout << "  Requested: " << thread_count << " threads per process." << std::endl;
  }

  // Init the array used for arbitrarily sized atomics
  Impl::init_lock_array_host_space();

#if (KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::initialize();
#endif
}
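// The pool rank is reversed (the process thread gets rank 'thread_count - 1')
// so that fan-in reductions are scan-compatible. fan_size( rank , size ) is
// defined elsewhere in the backend; a plausible sketch of the power-of-two
// fan-in count it computes, shown here for illustration only:
//
//   inline unsigned example_fan_size( const unsigned rank , const unsigned size )
//   {
//     const unsigned rank_rev = size - ( rank + 1 );  // reversed rank
//     unsigned count = 0 ;
//     // Count fan-in partners: each step n (1,2,4,...) contributes while the
//     // reversed rank is aligned to n and the partner rank stays in range.
//     for ( unsigned n = 1 ; ( 0 == ( rank_rev & n ) ) && ( rank_rev + n < size ) ; n <<= 1 ) { ++count ; }
//     return count ;
//   }
//
//   // e.g. size = 8: reversed ranks 0..7 get fan sizes 3,0,1,0,2,0,1,0.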