/** \brief  Begin execution of the asynchronous functor
 *
 *  Records \c func and \c arg as the current function and argument, issues
 *  a memory fence, sets the pool state of every entry of the thread pool to
 *  Active and, when the process entry has a non-zero pool size, calls
 *  \c func on the process entry before returning.
 *
 *  \param func  Function called as <tt>func( exec , arg )</tt>.
 *  \param arg   Opaque pointer passed unchanged to \c func.
 *
 *  Failure is reported through Kokkos::Impl::throw_runtime_exception when a
 *  current function or current argument is already recorded.
 */
void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
{
  verify_is_process( "ThreadsExec::start" , true );

  const bool already_executing = s_current_function || s_current_function_arg ;

  if ( already_executing ) {
    Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
  }

  s_current_function     = func ;
  s_current_function_arg = arg ;

  // The function and its argument have to be written before any pool
  // entry is switched to Active.
  memory_fence();

  // Switch every pool entry to Active, highest index first.
  int remaining = s_thread_pool_size[0] ;
  while ( 0 < remaining ) {
    --remaining ;
    s_threads_exec[ remaining ]->m_pool_state = ThreadsExec::Active ;
  }

  if ( s_threads_process.m_pool_size ) {
    // The master process is the root thread: run the function right here,
    // then put the process entry back to Inactive.
    func( s_threads_process , arg );
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }
}
void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size ) { enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; fence(); const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end ; const size_t old_thread_size = s_threads_process.m_scratch_thread_end - s_threads_process.m_scratch_reduce_end ; reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ; thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ; // Increase size or deallocate completely. if ( ( old_reduce_size < reduce_size ) || ( old_thread_size < thread_size ) || ( ( reduce_size == 0 && thread_size == 0 ) && ( old_reduce_size != 0 || old_thread_size != 0 ) ) ) { verify_is_process( "ThreadsExec::resize_scratch" , true ); s_threads_process.m_scratch_reduce_end = reduce_size ; s_threads_process.m_scratch_thread_end = reduce_size + thread_size ; execute_serial( & execute_resize_scratch ); s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ; } return s_threads_process.m_scratch ; }
/** \brief  Fork driver_exec_all onto the shepherds and run it on the caller.
 *
 *  Records \c func and \c arg as the active function and argument, forks
 *  driver_exec_all once per worker slot of every shepherd (one slot fewer on
 *  the shepherd reported by qthread_shep(), presumably because the calling
 *  process occupies it), calls driver_exec_all directly, and finally clears
 *  the active function and argument.
 *
 *  The unnamed Qthread argument is not used.
 *
 *  \param func  Function recorded in s_active_function.
 *  \param arg   Pointer recorded in s_active_function_arg.
 */
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
  verify_is_process("QthreadExec::exec_all(...)",true);

  s_active_function     = func ;
  s_active_function_arg = arg ;

  // Need to query which shepherd this main 'process' is running...
  const int main_shep = qthread_shep();

  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {

    // Skip one worker slot on the main shepherd.
    const int first_worker = ( jshep != main_shep ) ? 0 : 1 ;

    for ( int i = first_worker ; i < s_number_workers_per_shepherd ; ++i ) {

      // Unit tests hang with this call:
      //
      // qthread_fork_to_local_priority( driver_exec_all , NULL , NULL , jshep );
      //

      qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
    }
  }

  // The calling process takes part as well.
  driver_exec_all( NULL );

  s_active_function     = 0 ;
  s_active_function_arg = 0 ;
}
void ThreadsExec::finalize() { verify_is_process("ThreadsExec::finalize",false); fence(); resize_scratch(0,0); const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ; for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) { if ( s_threads_exec[i] ) { s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ; wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive ); s_threads_process.m_pool_state = ThreadsExec::Inactive ; } s_threads_pid[i] = 0 ; } if ( s_threads_process.m_pool_base ) { ( & s_threads_process )->~ThreadsExec(); s_threads_exec[0] = 0 ; } if (Kokkos::hwloc::can_bind_threads() ) { Kokkos::hwloc::unbind_this_thread(); } s_thread_pool_size[0] = 0 ; s_thread_pool_size[1] = 0 ; s_thread_pool_size[2] = 0 ; // Reset master thread to run solo. s_threads_process.m_numa_rank = 0 ; s_threads_process.m_numa_core_rank = 0 ; s_threads_process.m_pool_base = 0 ; s_threads_process.m_pool_rank = 0 ; s_threads_process.m_pool_size = 1 ; s_threads_process.m_pool_fan_size = 0 ; s_threads_process.m_pool_state = ThreadsExec::Inactive ; #if (KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::finalize(); #endif }
/** \brief  Leave the sleep state entered through execute_sleep.
 *
 *  Does nothing unless the current function is execute_sleep.  Otherwise
 *  the global lock is released, execute_sleep is run on the process entry
 *  when it has a pool base, and the call finishes with a fence.
 *
 *  \return  false when the current function is not execute_sleep,
 *           true otherwise.
 */
bool ThreadsExec::wake()
{
  verify_is_process( "ThreadsExec::wake" , true );

  const bool is_sleeping = ( s_current_function == & execute_sleep );

  if ( ! is_sleeping ) {
    return false ;
  }

  ThreadsExec::global_unlock();

  if ( s_threads_process.m_pool_base ) {
    // The process entry runs execute_sleep itself and is then marked
    // Inactive.
    execute_sleep( s_threads_process , 0 );
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }

  fence();

  return true ;
}
/** \brief  Fork driver_exec_all onto the shepherds and run it on the caller.
 *
 *  Records \c func and \c arg as the active function and argument, forks
 *  driver_exec_all once per worker slot of every shepherd (one slot fewer on
 *  the shepherd reported by qthread_shep(), presumably because the calling
 *  process occupies it), calls driver_exec_all directly, and finally clears
 *  the active function and argument.
 *
 *  The unnamed Qthread argument is not used.
 *
 *  \param func  Function recorded in s_active_function.
 *  \param arg   Pointer recorded in s_active_function_arg.
 */
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
  verify_is_process("QthreadExec::exec_all(...)",true);

/*
  fprintf( stdout , "QthreadExec::exec_all\n");
  fflush(stdout);
*/

  s_active_function     = func ;
  s_active_function_arg = arg ;

  // Need to query which shepherd this main 'process' is running...
  const int main_shep = qthread_shep();

#if 1

  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {

    // Skip one worker slot on the main shepherd.
    const int first_worker = ( jshep != main_shep ) ? 0 : 1 ;

    for ( int i = first_worker ; i < s_number_workers_per_shepherd ; ++i ) {
      qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
    }
  }

#else

  // If this function is used before the 'qthread.task_policy' unit test
  // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.

  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {

    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd
                                             : s_number_workers_per_shepherd - 1 ;

    if ( num_clone ) {
      const int ret = qthread_fork_clones_to_local_priority
        ( driver_exec_all   /* function */
        , NULL              /* function data block */
        , NULL              /* pointer to return value feb */
        , jshep             /* shepherd number */
        , num_clone - 1     /* number of instances - 1 */
        );

      assert(ret == QTHREAD_SUCCESS);

      // 'ret' is otherwise unused when assert() is compiled out.
      (void) ret ;
    }
  }

#endif

  // The calling process takes part as well.
  driver_exec_all( NULL );

  s_active_function     = 0 ;
  s_active_function_arg = 0 ;
}
bool ThreadsExec::sleep() { verify_is_process("ThreadsExec::sleep", true ); if ( & execute_sleep == s_current_function ) return false ; fence(); ThreadsExec::global_lock(); s_current_function = & execute_sleep ; // Activate threads: for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) { s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ; } return true ; }
void ThreadsExec::print_configuration( std::ostream & s , const bool detail ) { verify_is_process("ThreadsExec::print_configuration",false); fence(); const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); // Forestall compiler warnings for unused variables. (void) numa_count; (void) cores_per_numa; (void) threads_per_core; s << "Kokkos::Threads" ; #if defined( KOKKOS_HAVE_PTHREAD ) s << " KOKKOS_HAVE_PTHREAD" ; #endif #if defined( KOKKOS_HAVE_HWLOC ) s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ; #endif if ( s_thread_pool_size[0] ) { s << " threads[" << s_thread_pool_size[0] << "]" << " threads_per_numa[" << s_thread_pool_size[1] << "]" << " threads_per_core[" << s_thread_pool_size[2] << "]" ; if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; } s << " ReduceScratch[" << s_current_reduce_size << "]" << " SharedScratch[" << s_current_shared_size << "]" ; s << std::endl ; if ( detail ) { execute_serial( & execute_get_binding ); for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) { ThreadsExec * const th = s_threads_exec[i] ; s << " Thread hwloc(" << s_threads_coord[i].first << "." << s_threads_coord[i].second << ")" ; s_threads_coord[i].first = ~0u ; s_threads_coord[i].second = ~0u ; if ( th ) { const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 ); s << " rank(" << th->m_pool_rank << ")" ; if ( th->m_pool_fan_size ) { s << " Fan{" ; for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) { s << " " << th->m_pool_base[rank_rev+(1<<j)]->m_pool_rank ; } s << " }" ; } if ( th == & s_threads_process ) { s << " is_process" ; } } s << std::endl ; } } } else { s << " not initialized" << std::endl ; } }