void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
{
  s_current_function     = func ;
  s_current_function_arg = & s_threads_process ;

  // Make sure function and arguments are written before activating threads.
  memory_fence();

  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;

  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
    ThreadsExec & th = * s_threads_exec[ --i ];

    th.m_pool_state = ThreadsExec::Active ;

    wait_yield( th.m_pool_state , ThreadsExec::Active );
  }

  if ( s_threads_process.m_pool_base ) {
    s_threads_process.m_pool_state = ThreadsExec::Active ;

    (*func)( s_threads_process , 0 );

    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }

  s_current_function_arg = 0 ;
  s_current_function     = 0 ;

  // Make sure function and arguments are cleared before proceeding.
  memory_fence();
}
inline
void barrier()
{
  const int rev_rank = m_pool_size - ( m_pool_rank + 1 );

  memory_fence();

  // Fan-in with highest ranking thread as the root
  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
    // Wait: Active -> Rendezvous
    Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
  }

  if ( rev_rank ) {
    m_pool_state = ThreadsExec::Rendezvous ;
    // Wait: Rendezvous -> Active
    Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
  }
  else {
    // Root thread releases all other threads
    memory_fence();

    for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
      get_thread( rank )->m_pool_state = ThreadsExec::Active ;
    }
  }
}
template< typename Type >
inline
Type shepherd_scan( const int team_size
                  , const Type & value
                  ,       Type * const global_value = 0 ) const
{
  *shepherd_team_scratch_value<Type>() = value ;

  memory_fence();

  const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );

  int n , j ;

  for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
    Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
  }

  if ( rev_rank ) {
    m_worker_state = QthreadExec::Inactive ;
    Impl::spinwait( m_worker_state , QthreadExec::Inactive );
  }
  else {
    // Root thread scans across values before releasing threads.
    // Worker data is in reverse order, so m_shepherd_base[0] is the
    // highest ranking thread.

    // Copy from lower ranking to higher ranking worker.
    Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
    for ( int i = 1 ; i < team_size ; ++i ) {
      const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
      accum += tmp ;
      * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
    }

    * m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
      global_value ? atomic_fetch_add( global_value , accum ) : 0 ;

    // Join from lower ranking to higher ranking worker.
    for ( int i = team_size ; --i ; ) {
      * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() +=
        * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
    }

    memory_fence();
  }

  for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
    m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
  }

  return *shepherd_team_scratch_value<Type>();
}
// Get a work index.  Claim from the owned range until it's exhausted,
// then steal from another thread.
inline
long get_work_index( int team_size = 0 )
{
  long work_index = -1;

  if ( ! m_stealing ) work_index = get_work_index_begin();

  if ( work_index == -1 ) {
    memory_fence();
    m_stealing = true;
    work_index = steal_work_index( team_size );
  }

  m_team_work_index = work_index;
  memory_fence();
  return work_index;
}
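// Hedged usage sketch (not from the original source): the driver loop that
// get_work_index() supports.  'Exec', 'exec', and 'process_chunk' are
// illustrative placeholders, not names from the original code.
template< class Exec >
void drain_work( Exec & exec )
{
  long w ;
  // Indices come from the thread's own range first, then from stealing;
  // -1 means every range has been exhausted.
  while ( -1 != ( w = exec.get_work_index() ) ) {
    process_chunk( w );
  }
}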
inline
typename size_to_int<size>::type atomic_read( const volatile void * ptr )
{
  typedef typename size_to_int<size>::type value_type;

  value_type result = *static_cast<const volatile value_type*>(ptr);
  memory_fence();
  return result;
}
T *FixedVector<T, Size>::pop_back(std::size_t *out_index) {
  const int kRetryDelay = 1;
  while (true) {
    Word length = length_.nobarrier_load();
    if (length == 0) return reinterpret_cast<T *>(kOutOfRange);

    Word index = length - 1;
    T *value;

    // pop_back "primes" the value it is about to pop by setting a
    // bit.  It is illegal to pop "past" a primed element.
    if (unlikely(!buffer_[index].cas_prime(&value))) {
      Platform::Sleep(kRetryDelay);
      continue;
    }

    // We can't let this load be reordered to after modifying the
    // length -- we might end up reading a completely different value.
    memory_fence();

    if (unlikely(!length_.boolean_cas(length, length - 1))) {
      // Something's changed, undo priming and retry.
      buffer_[index].nobarrier_store(value);
      continue;
    }

    if (out_index != NULL) *out_index = index;
    return value;
  }
}
/** \brief  Begin execution of the asynchronous functor */
void ThreadsExec::start( void (*func)( ThreadsExec & , const void * )
                       , const void * arg )
{
  verify_is_process( "ThreadsExec::start" , true );

  if ( s_current_function || s_current_function_arg ) {
    Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
  }

  s_current_function     = func ;
  s_current_function_arg = arg ;

  // Make sure function and arguments are written before activating threads.
  memory_fence();

  // Activate threads:
  for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
    s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
  }

  if ( s_threads_process.m_pool_size ) {
    // Master process is the root thread, run it:
    (*func)( s_threads_process , arg );
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }
}
inline
int all_reduce( const int value )
{
  // Make sure there is enough scratch space:
  const int rev_rank = m_pool_size - ( m_pool_rank + 1 );

  *((volatile int*) reduce_memory()) = value ;

  memory_fence();

  // Fan-in reduction with highest ranking thread as the root
  for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
    // Wait: Active -> Rendezvous
    Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
  }

  if ( rev_rank ) {
    m_pool_state = ThreadsExec::Rendezvous ;
    // Wait: Rendezvous -> Active
    Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
  }
  else {
    // Root thread does the reduction and broadcast
    int accum = 0 ;

    for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
      accum += *((volatile int *) get_thread( rank )->reduce_memory());
    }

    for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
      *((volatile int *) get_thread( rank )->reduce_memory()) = accum ;
    }

    memory_fence();

    for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
      get_thread( rank )->m_pool_state = ThreadsExec::Active ;
    }
  }

  return *((volatile int*) reduce_memory());
}
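// Hedged usage sketch (not from the original source): a typical call site for a
// fan-in all_reduce like the one above.  'do_local_work' and the function name
// are illustrative placeholders, not names from the original code.
void report_status( ThreadsExec & exec )
{
  const int local_flag = do_local_work() ? 1 : 0 ;
  const int global_sum = exec.all_reduce( local_flag );
  // Every thread in the pool returns from all_reduce() holding the same sum,
  // published by the root thread before it reactivates the pool.
  (void) global_sum ;
}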
template< class JoinOp >
inline
typename JoinOp::value_type
shepherd_reduce( const int team_size
               , const typename JoinOp::value_type & value
               , const JoinOp & op ) const
{
  typedef typename JoinOp::value_type Type ;

  *shepherd_team_scratch_value<Type>() = value ;

  memory_fence();

  const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );

  int n , j ;

  for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
    Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
  }

  if ( rev_rank ) {
    m_worker_state = QthreadExec::Inactive ;
    Impl::spinwait( m_worker_state , QthreadExec::Inactive );
  }
  else {
    volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
    for ( int i = 1 ; i < team_size ; ++i ) {
      op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
    }
    for ( int i = 1 ; i < team_size ; ++i ) {
      * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
    }

    memory_fence();
  }

  for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
    m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
  }

  return *shepherd_team_scratch_value<Type>();
}
template< typename Type >
inline
void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
{
  if ( m_shepherd_base ) {
    Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
    if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
    memory_fence();
    shepherd_barrier( team_size );
    value = *shared_value ;
  }
}
// Wait for root thread to become inactive
void ThreadsExec::fence()
{
  if ( s_thread_pool_size[0] ) {
    // Wait for the root thread to complete:
    Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
  }

  s_current_function     = 0 ;
  s_current_function_arg = 0 ;

  // Make sure function and arguments are cleared before
  // potentially re-activating threads with a subsequent launch.
  memory_fence();
}
template< typename Type >
inline
Type shepherd_reduce( const int team_size , const Type & value ) const
{
  *shepherd_team_scratch_value<Type>() = value ;

  memory_fence();

  const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );

  int n , j ;

  for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
    Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
  }

  if ( rev_rank ) {
    m_worker_state = QthreadExec::Inactive ;
    Impl::spinwait( m_worker_state , QthreadExec::Inactive );
  }
  else {
    Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
    // Reduce and broadcast across the whole team (not just the fan-in width).
    for ( int i = 1 ; i < team_size ; ++i ) {
      accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
    }
    for ( int i = 1 ; i < team_size ; ++i ) {
      * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
    }

    memory_fence();
  }

  for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
    m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
  }

  return *shepherd_team_scratch_value<Type>();
}
void Window::TextureDisplayer_SetTextureObject( TextureObject Texture )
{
  if ( TextureDisplayer_Inited ) {
    //BorrowContext();
    ConsoleEcho("Entering");
    Texture.Load(0);
    ConsoleEcho("Binding");
    Texture.Bind(1);
    ConsoleEcho("Bound");

    int   dara  = 1;
    float dara2 = 1.0f;

    TextureDisplayer_Texture = Texture.GetTextureIndex();

    ConsoleEcho("Sending the texture");
    SetUniform(GL_INT,   1, &dara,  TextureDisplayer_Adress, "Tex1");
    memory_fence();
    SetUniform(GL_FLOAT, 1, &dara2, TextureDisplayer_Adress, "ColorTest");
  }
}
template< class ValueType >
KOKKOS_INLINE_FUNCTION
void team_broadcast( ValueType & value , const int & thread_id ) const
{
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
  { }
#else
  // Make sure there is enough scratch space:
  typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE , ValueType , void >::type type ;

  type * const local_value = ((type*) m_exec.scratch_thread());
  if ( team_rank() == thread_id ) *local_value = value;
  memory_fence();
  team_barrier();
  value = *local_value;
#endif
}
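// Hedged usage sketch (not from the original source): how a team_broadcast such
// as the one above is reached from user code via Kokkos::TeamPolicy.  The league
// and team sizes and the value computed by rank 0 are illustrative placeholders.
#include <Kokkos_Core.hpp>

void broadcast_example()
{
  using policy = Kokkos::TeamPolicy<> ;

  Kokkos::parallel_for( policy( /*league_size=*/ 4 , /*team_size=*/ 8 ) ,
    KOKKOS_LAMBDA( const policy::member_type & member ) {
      double value = 0 ;
      if ( member.team_rank() == 0 ) { value = 1.0 + member.league_rank(); }
      // Rank 0 supplies 'value'; after the call every thread in the team sees
      // that value, via the scratch write, fence, and barrier shown above.
      member.team_broadcast( value , 0 );
    } );
}

int main( int argc , char * argv[] )
{
  Kokkos::ScopeGuard guard( argc , argv );
  broadcast_example();
  return 0 ;
}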
KOKKOS_INLINE_FUNCTION
void push_work( const std::int32_t w ) const noexcept
{
  const std::int32_t N = m_graph.numRows();

  std::int32_t volatile * const ready_queue = & m_queue[0] ;
  std::int32_t volatile * const end_hint    = & m_queue[2*N+1] ;

  // Push work to end of queue
  const std::int32_t j = atomic_fetch_add( end_hint , 1 );

  if ( ( N <= j ) || ( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
    // ERROR: past the end of queue or did not replace END_TOKEN
    Kokkos::abort("WorkGraphPolicy push_work error");
  }

  memory_fence();
}
std::size_t FixedVector<T, Size>::push_back(T *value) {
  while (true) {
    Word index = length_.nobarrier_load();
    if (index >= Size) return -1;

    // We "make space" for the element we are going to insert by
    // incrementing the index.  We can't use an atomic add here since
    // we need to ensure we don't bump the index out of bounds and
    // make the container inconsistent.  There may be some clever way
    // around that, though; might be worth thinking about if atomic
    // adds are faster than atomic compare exchanges.
    if (!length_.boolean_cas(index, index + 1)) continue;

    // We can't let the actual store to the buffer be reordered ahead
    // of the length_ increment -- another thread might end up writing
    // to the same location.
    memory_fence();

    buffer_[index].nobarrier_store(value);
    return static_cast<std::size_t>(index);
  }
}
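// A minimal, self-contained sketch (not part of FixedVector) of the same
// "reserve the slot, then publish the element" pattern that push_back above
// relies on, written with std::atomic so the required ordering is explicit.
// 'TinyVector' and its members are hypothetical names introduced here.
#include <atomic>
#include <cstddef>

template <typename T, std::size_t Size>
struct TinyVector {
  std::atomic<std::size_t> length{0};
  std::atomic<T *> buffer[Size];

  // Returns the claimed slot index, or Size if the vector is full.
  std::size_t push_back(T *value) {
    std::size_t index = length.load(std::memory_order_relaxed);
    while (true) {
      if (index >= Size) return Size;
      // Step 1: reserve the slot.  The CAS fails (and reloads 'index') if
      // another thread claimed it first.
      if (length.compare_exchange_weak(index, index + 1,
                                       std::memory_order_acquire,
                                       std::memory_order_relaxed)) {
        // Step 2: publish the element.  The acquire CAS keeps this store from
        // being reordered before the reservation, which is the same ordering
        // the memory_fence() in FixedVector::push_back enforces.
        buffer[index].store(value, std::memory_order_release);
        return index;
      }
    }
  }
};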
void ThreadsExec::initialize( unsigned thread_count
                            , unsigned use_numa_count
                            , unsigned use_cores_per_numa
                            , bool allow_asynchronous_threadpool )
{
  static const Sentinel sentinel ;

  const bool is_initialized = 0 != s_thread_pool_size[0] ;

  unsigned thread_spawn_failed = 0 ;

  for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
    s_threads_exec[i] = NULL;

  if ( ! is_initialized ) {
    // If thread_count, use_numa_count, or use_cores_per_numa are zero
    // then they will be given default values based upon hwloc detection
    // and allowed asynchronous execution.

    const bool hwloc_avail    = Kokkos::hwloc::available();
    const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();

    if ( thread_count == 0 ) {
      thread_count = hwloc_avail
                   ? Kokkos::hwloc::get_available_numa_count() *
                     Kokkos::hwloc::get_available_cores_per_numa() *
                     Kokkos::hwloc::get_available_threads_per_core()
                   : 1 ;
    }

    const unsigned thread_spawn_begin =
      hwloc::thread_mapping( "Kokkos::Threads::initialize"
                           , allow_asynchronous_threadpool
                           , thread_count
                           , use_numa_count
                           , use_cores_per_numa
                           , s_threads_coord );

    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;

    if ( thread_spawn_begin ) {
      // Synchronous with s_threads_coord[0] as the process core.
      // Claim entry #0 for binding the process core.
      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
    }

    s_thread_pool_size[0] = thread_count ;
    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;

    s_current_function = & execute_function_noop ; // Initialization work function

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {

      s_threads_process.m_pool_state = ThreadsExec::Inactive ;

      // If hwloc available then spawned thread will
      // choose its own entry in 's_threads_coord',
      // otherwise specify the entry.
      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );

      // Make sure all outstanding memory writes are complete
      // before spawning the new thread.
      memory_fence();

      // Spawn thread executing the 'driver()' function.
      // Wait until spawned thread has attempted to initialize.
      // If spawning and initialization is successful then
      // an entry in 's_threads_exec' will be assigned.
      if ( ThreadsExec::spawn() ) {
        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
      }

      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
    }

    // Wait for all spawned threads to deactivate before zeroing the function.
    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
      // Try to protect against cache coherency failure by casting to volatile.
      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
      if ( th ) {
        wait_yield( th->m_pool_state , ThreadsExec::Active );
      }
      else {
        ++thread_spawn_failed ;
      }
    }

    s_current_function     = 0 ;
    s_current_function_arg = 0 ;
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;

    memory_fence();

    if ( ! thread_spawn_failed ) {

      // Bind process to the core on which it was located before spawning occurred.
      if ( hwloc_can_bind ) {
        Kokkos::hwloc::bind_this_thread( proc_coord );
      }

      if ( thread_spawn_begin ) { // Include process in pool.
        const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();

        s_threads_exec[0]                  = & s_threads_process ;
        s_threads_process.m_numa_rank      = coord.first ;
        s_threads_process.m_numa_core_rank = coord.second ;
        s_threads_process.m_pool_base      = s_threads_exec ;
        s_threads_process.m_pool_rank      = thread_count - 1 ; // Reversed for scan-compatible reductions
        s_threads_process.m_pool_size      = thread_count ;
        s_threads_process.m_pool_fan_size  = fan_size( s_threads_process.m_pool_rank
                                                     , s_threads_process.m_pool_size );
        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
      }
      else {
        s_threads_process.m_pool_base     = 0 ;
        s_threads_process.m_pool_rank     = 0 ;
        s_threads_process.m_pool_size     = 0 ;
        s_threads_process.m_pool_fan_size = 0 ;
      }

      // Initial allocations:
      ThreadsExec::resize_scratch( 1024 , 1024 );
    }
    else {
      s_thread_pool_size[0] = 0 ;
      s_thread_pool_size[1] = 0 ;
      s_thread_pool_size[2] = 0 ;
    }
  }

  if ( is_initialized || thread_spawn_failed ) {
    std::ostringstream msg ;

    msg << "Kokkos::Threads::initialize ERROR" ;

    if ( is_initialized ) { msg << " : already initialized" ; }
    if ( thread_spawn_failed ) { msg << " : failed to spawn " << thread_spawn_failed << " threads" ; }

    Kokkos::Impl::throw_runtime_exception( msg.str() );
  }

  // Check for over-subscription
  if ( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
    std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
    std::cout << "  Detected: " << Impl::processors_per_node()  << " cores per node." << std::endl;
    std::cout << "  Detected: " << Impl::mpi_ranks_per_node()   << " MPI_ranks per node." << std::endl;
    std::cout << "  Requested: " << thread_count                << " threads per process." << std::endl;
  }

  // Init the array used for arbitrarily sized atomics
  Impl::init_lock_array_host_space();

#if (KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::initialize();
#endif
}
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
                                   , size_t team_reduce_bytes
                                   , size_t team_shared_bytes
                                   , size_t thread_local_bytes )
{
  const size_t member_bytes =
    sizeof(int64_t) *
    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );

  HostThreadTeamData * root = m_pool[0] ;

  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0 ;
  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0 ;
  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0 ;
  const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
  const size_t old_alloc_bytes  = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;

  // Allocate if any of the old allocations are too small:

  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
                        ( old_team_reduce  < team_reduce_bytes ) ||
                        ( old_team_shared  < team_shared_bytes ) ||
                        ( old_thread_local < thread_local_bytes );

  if ( allocate ) {

    if ( pool_reduce_bytes  < old_pool_reduce  ) { pool_reduce_bytes  = old_pool_reduce ; }
    if ( team_reduce_bytes  < old_team_reduce  ) { team_reduce_bytes  = old_team_reduce ; }
    if ( team_shared_bytes  < old_team_shared  ) { team_shared_bytes  = old_team_shared ; }
    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }

    const size_t alloc_bytes =
      member_bytes +
      HostThreadTeamData::scratch_size( pool_reduce_bytes
                                      , team_reduce_bytes
                                      , team_shared_bytes
                                      , thread_local_bytes );

    OpenMP::memory_space space ;

    memory_fence();

    #pragma omp parallel num_threads(m_pool_size)
    {
      const int rank = omp_get_thread_num();

      if ( 0 != m_pool[rank] ) {
        m_pool[rank]->disband_pool();
        space.deallocate( m_pool[rank] , old_alloc_bytes );
      }

      void * const ptr = space.allocate( alloc_bytes );

      m_pool[ rank ] = new( ptr ) HostThreadTeamData();

      m_pool[ rank ]->
        scratch_assign( ((char *)ptr) + member_bytes
                      , alloc_bytes
                      , pool_reduce_bytes
                      , team_reduce_bytes
                      , team_shared_bytes
                      , thread_local_bytes );

      memory_fence();
    }
    /* END #pragma omp parallel */

    HostThreadTeamData::organize_pool( m_pool , m_pool_size );
  }
}
inline
void atomic_write( volatile void * ptr , typename size_to_int<size>::type value )
{
  memory_fence();
  *static_cast<volatile typename size_to_int<size>::type*>(ptr) = value;
}
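// Hedged sketch (not from the original source) of the flag/data handshake these
// two helpers are built for.  It assumes atomic_read/atomic_write are templated
// on the access size, as the size_to_int<size> usage above suggests; 'payload'
// and 'ready' are illustrative globals.  atomic_write fences *before* its store
// (release-like) and atomic_read fences *after* its load (acquire-like), so data
// written before the flag is set is visible once the flag is observed.
static int payload = 0 ;
static int ready   = 0 ;

void producer()
{
  payload = 42 ;                               // plain data write
  atomic_write<sizeof(int)>( &ready , 1 );     // fence, then store the flag
}

void consumer()
{
  while ( 0 == atomic_read<sizeof(int)>( &ready ) ) { /* load, then fence */ }
  const int observed = payload ;               // ordered after the flag load
  (void) observed ;
}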