/** \brief  Attempt to pop the work item at the head of the queue.
 *
 *  Find entry 'i' such that
 *    ( m_queue[i] != BEGIN_TOKEN ) AND
 *    ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
 *  if found then
 *    increment begin hint
 *    return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
 *  else if i < total work
 *    return END_TOKEN
 *  else
 *    return COMPLETED_TOKEN
 */
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const noexcept
{
  const std::int32_t N = m_graph.numRows();

  std::int32_t volatile * const ready_queue = & m_queue[0] ;
  std::int32_t volatile * const begin_hint  = & m_queue[2*N] ;

  // begin hint is guaranteed to be less than or equal to
  // actual begin location in the queue.

  for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {

    const std::int32_t w = ready_queue[i] ;

    if ( w == END_TOKEN ) { return END_TOKEN ; }

    if ( ( w != BEGIN_TOKEN ) &&
         ( w == atomic_compare_exchange(ready_queue+i,w,(std::int32_t)BEGIN_TOKEN) ) ) {
      // Attempt to claim ready work index succeeded,
      // update the hint and return work index
      atomic_increment( begin_hint );
      return w ;
    }

    // arrive here when ready_queue[i] == BEGIN_TOKEN
  }

  return COMPLETED_TOKEN ;
}
/*
 *  Routine: hw_lock_lock
 *
 *  Acquire lock, spinning until it becomes available,
 *  return with preemption disabled.
 */
void
hw_lock_lock(hw_lock_t lock)
{
    thread_t  thread;
    uintptr_t state;

    thread = current_thread();
    disable_preemption_for_thread(thread);
    state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
#if __SMP__
#if LOCK_PRETEST
    if (ordered_load_hw(lock))
        goto contended;
#endif  // LOCK_PRETEST
    if (atomic_compare_exchange(&lock->lock_data, 0, state,
                    memory_order_acquire_smp, TRUE)) {
        goto end;
    }
#if LOCK_PRETEST
contended:
#endif  // LOCK_PRETEST
    hw_lock_lock_contended(lock, state, 0, TRUE);
end:
#else   // __SMP__
    if (lock->lock_data)
        panic("Spinlock held %p", lock);
    lock->lock_data = state;
#endif  // __SMP__
#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
    return;
}
ThreadsExec::~ThreadsExec()
{
  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );

  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;

  if ( m_scratch ) {
    Record * const r = Record::get_record( m_scratch );

    m_scratch = 0 ;

    Record::decrement( r );
  }

  m_pool_base          = 0 ;
  m_scratch_reduce_end = 0 ;
  m_scratch_thread_end = 0 ;
  m_numa_rank          = 0 ;
  m_numa_core_rank     = 0 ;
  m_pool_rank          = 0 ;
  m_pool_size          = 0 ;
  m_pool_fan_size      = 0 ;

  m_pool_state = ThreadsExec::Terminating ;

  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
    ThreadsExec * const nil = 0 ;

    atomic_compare_exchange( s_threads_exec + entry , this , nil );

    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
  }
}
ThreadsExec::ThreadsExec()
  : m_pool_base(0)
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
  , m_scratch()
#else
  , m_scratch(0)
#endif
  , m_scratch_reduce_end(0)
  , m_scratch_thread_end(0)
  , m_numa_rank(0)
  , m_numa_core_rank(0)
  , m_pool_rank(0)
  , m_pool_size(0)
  , m_pool_fan_size(0)
  , m_pool_state( ThreadsExec::Terminating )
{
  if ( & s_threads_process != this ) {

    // A spawned thread

    ThreadsExec * const nil = 0 ;

    // Which entry in 's_threads_exec', possibly determined from hwloc binding

    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
                    ? ((size_t)s_current_function_arg)
                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));

    // Given a good entry set this thread in the 's_threads_exec' array
    if ( entry < s_thread_pool_size[0] &&
         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {

      const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();

      m_numa_rank      = coord.first ;
      m_numa_core_rank = coord.second ;
      m_pool_base      = s_threads_exec ;
      m_pool_rank      = s_thread_pool_size[0] - ( entry + 1 );
      m_pool_size      = s_thread_pool_size[0] ;
      m_pool_fan_size  = fan_size( m_pool_rank , m_pool_size );
      m_pool_state     = ThreadsExec::Active ;

      s_threads_pid[ m_pool_rank ] = pthread_self();

      // Inform spawning process that the threads_exec entry has been set.
      s_threads_process.m_pool_state = ThreadsExec::Active ;
    }
    else {
      // Inform spawning process that the threads_exec entry could not be set.
      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
    }
  }
  else {
    // Enables 'parallel_for' to execute on uninitialized Threads device
    m_pool_rank  = 0 ;
    m_pool_size  = 1 ;
    m_pool_state = ThreadsExec::Inactive ;

    s_threads_pid[ m_pool_rank ] = pthread_self();
  }
}
T atomic_decrement(volatile T * const dest) {
  T oldval = *dest;
  T assume;
  do {
    assume = oldval;
    T newval = assume - 1;   // compute the decremented value without modifying 'assume'
    oldval = atomic_compare_exchange(dest, assume, newval);
  } while (assume != oldval);
  return oldval;             // value observed before the decrement, as in the other routines
}
T atomic_exchange(volatile T * const dest, const T val) {
  T oldval = *dest;
  T assume;
  do {
    assume = oldval;
    oldval = atomic_compare_exchange(dest, assume, val);
  } while (assume != oldval);
  return oldval;
}
T atomic_fetch_sub(volatile T * const dest, const T val) {
  T oldval = *dest;
  T assume;
  do {
    assume = oldval;
    T newval = assume - val;   // subtract 'val' from the current value
    oldval = atomic_compare_exchange(dest, assume, newval);
  } while (assume != oldval);
  return oldval;
}
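The three routines above share one read / compare-and-swap retry loop, and any read-modify-write operation can be built the same way. Below is a minimal sketch applying the pattern to a hypothetical atomic_fetch_max; it assumes the same T-returning atomic_compare_exchange(dest, compare, val) primitive used above, and the function name is illustrative, not part of the original code.

// Sketch only: a hypothetical fetch-max built on the same CAS retry loop.
// Assumes atomic_compare_exchange(dest, compare, val) returns the value
// observed at *dest, as in the routines above.
template< typename T >
T atomic_fetch_max(volatile T * const dest, const T val) {
  T oldval = *dest;
  T assume;
  do {
    assume = oldval;
    T newval = ( val > assume ) ? val : assume;     // candidate new maximum
    oldval = atomic_compare_exchange(dest, assume, newval);
  } while (assume != oldval);                       // retry if another thread intervened
  return oldval;                                    // value observed before the update
}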
void EnterSpinLockCriticalSection(SPIN_LOCK_FLAG& spinLockFlag)
{
    unsigned int cnt = 0;
    while (1)
    {
        long orig = atomic_compare_exchange(&spinLockFlag, 1, 0);
        if (orig == 0 && spinLockFlag == 1)
            break;
        ++cnt;
        if (cnt > 0x00000fff)
            SleepForMS(20);
    }
}
void EnterSpinLockCriticalSection_Share(SPIN_LOCK_FLAG& spinLockFlag)
{
    unsigned int cnt = 0;
    while (1)
    {
        long orig = atomic_compare_exchange(&spinLockFlag, 2, 0);
        if ((orig == 0 && spinLockFlag == 2) || orig > 2)
            break;
        ++cnt;
        if (cnt > 0x00000fff)
            SleepForMS(20);
    }
    atomic_increment(&spinLockFlag);
}
static unsigned int NOINLINE
hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic)
{
    uint64_t  end = 0;
    uintptr_t holder = lock->lock_data;
    int       i;

    if (timeout == 0)
        timeout = LOCK_PANIC_TIMEOUT;
#if CONFIG_DTRACE
    uint64_t begin;
    boolean_t dtrace_enabled = lockstat_probemap[LS_LCK_SPIN_LOCK_SPIN] != 0;
    if (__improbable(dtrace_enabled))
        begin = mach_absolute_time();
#endif
    for ( ; ; ) {
        for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
            cpu_pause();
#if (!__ARM_ENABLE_WFE_) || (LOCK_PRETEST)
            holder = ordered_load_hw(lock);
            if (holder != 0)
                continue;
#endif
            if (atomic_compare_exchange(&lock->lock_data, 0, data,
                        memory_order_acquire_smp, TRUE)) {
#if CONFIG_DTRACE
                if (__improbable(dtrace_enabled)) {
                    uint64_t spintime = mach_absolute_time() - begin;
                    if (spintime > dtrace_spin_threshold)
                        LOCKSTAT_RECORD2(LS_LCK_SPIN_LOCK_SPIN, lock, spintime, dtrace_spin_threshold);
                }
#endif
                return 1;
            }
        }
        if (end == 0) {
            end = ml_get_timebase() + timeout;
        }
        else if (ml_get_timebase() >= end)
            break;
    }
    if (do_panic) {
        // Capture the actual time spent blocked, which may be higher than the timeout
        // if a misbehaving interrupt stole this thread's CPU time.
        panic("Spinlock timeout after %llu ticks, %p = %lx",
            (ml_get_timebase() - end + timeout), lock, holder);
    }
    return 0;
}
/*
 *  Routine: hw_lock_try
 *
 *  returns with preemption disabled on success.
 */
unsigned int
hw_lock_try(hw_lock_t lock)
{
    thread_t thread = current_thread();
    int success = 0;
#if LOCK_TRY_DISABLE_INT
    long intmask;

    intmask = disable_interrupts();
#else
    disable_preemption_for_thread(thread);
#endif  // LOCK_TRY_DISABLE_INT

#if __SMP__
#if LOCK_PRETEST
    if (ordered_load_hw(lock))
        goto failed;
#endif  // LOCK_PRETEST
    success = atomic_compare_exchange(&lock->lock_data, 0,
                    LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK,
                    memory_order_acquire_smp, FALSE);
#else
    if (lock->lock_data == 0) {
        lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
        success = 1;
    }
#endif  // __SMP__

#if LOCK_TRY_DISABLE_INT
    if (success)
        disable_preemption_for_thread(thread);
#if LOCK_PRETEST
failed:
#endif  // LOCK_PRETEST
    restore_interrupts(intmask);
#else
#if LOCK_PRETEST
failed:
#endif  // LOCK_PRETEST
    if (!success)
        enable_preemption();
#endif  // LOCK_TRY_DISABLE_INT
#if CONFIG_DTRACE
    if (success)
        LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
    return success;
}
/*
 *  Routine: hw_lock_to
 *
 *  Acquire lock, spinning until it becomes available or timeout.
 *  Timeout is in mach_absolute_time ticks, return with
 *  preemption disabled.
 */
unsigned int
hw_lock_to(hw_lock_t lock, uint64_t timeout)
{
    thread_t     thread;
    uintptr_t    state;
    unsigned int success = 0;

    thread = current_thread();
    disable_preemption_for_thread(thread);
    state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
#if __SMP__
#if LOCK_PRETEST
    if (ordered_load_hw(lock))
        goto contended;
#endif  // LOCK_PRETEST
    if (atomic_compare_exchange(&lock->lock_data, 0, state,
                    memory_order_acquire_smp, TRUE)) {
        success = 1;
        goto end;
    }
#if LOCK_PRETEST
contended:
#endif  // LOCK_PRETEST
    success = hw_lock_lock_contended(lock, state, timeout, FALSE);
end:
#else   // __SMP__
    (void)timeout;
    if (ordered_load_hw(lock) == 0) {
        ordered_store_hw(lock, state);
        success = 1;
    }
#endif  // __SMP__
#if CONFIG_DTRACE
    if (success)
        LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
    return success;
}
ThreadsExec::~ThreadsExec()
{
  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );

  m_pool_base          = 0 ;
  m_scratch            = 0 ;
  m_scratch_reduce_end = 0 ;
  m_scratch_thread_end = 0 ;
  m_pool_rank          = 0 ;
  m_pool_size          = 0 ;
  m_pool_fan_size      = 0 ;

  m_pool_state = ThreadsExec::Terminating ;

  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
    ThreadsExec * const nil = 0 ;

    atomic_compare_exchange( s_threads_exec + entry , this , nil );

    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
  }
}
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange_strong(volatile T * const dest, const T & compare, const T & val)
{
  return atomic_compare_exchange(dest,compare,val);
}
KOKKOS_INLINE_FUNCTION
bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
{
  return compare == atomic_compare_exchange(dest, compare, val);
}
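The bool-returning overload is convenient for one-shot claims where only success or failure matters. A minimal usage sketch, assuming the bool-returning atomic_compare_exchange_strong shown above; the flag name, its initial value, and the wrapper function are illustrative only.

// Sketch: single-winner initialization guarded by compare-and-swap.
// 'claimed' is a hypothetical shared flag, assumed to start at 0.
volatile int claimed = 0;

void try_initialize_once() {
  // Exactly one caller observes 0 and installs 1; every other caller fails.
  if (atomic_compare_exchange_strong(&claimed, 0, 1)) {
    // ... perform one-time setup here ...
  }
}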
void AllocationTracker::enable_tracking()
{
  if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled,
                                                     TRACKING_DISABLED,
                                                     TRACKING_ENABLED ) ) {
    Impl::throw_runtime_exception("Error: Tracking already enabled");
  }
}
void LeaveSpinLockCriticalSection_Share(SPIN_LOCK_FLAG& spinLockFlag)
{
    atomic_decrement(&spinLockFlag);
    atomic_compare_exchange(&spinLockFlag, 0, 2);
}
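The shared enter/leave pair above encodes the reader count in the flag itself: the first reader moves the flag from 0 to 2 and every reader then increments it, so when the last reader's decrement brings it back to 2 the final compare-and-swap resets it to 0. A minimal usage sketch under that reading, assuming the SPIN_LOCK_FLAG type and the routines shown above; the protected data and variable names are hypothetical.

// Sketch: guarding a shared read with the shared enter/leave pair above.
// 'g_tableLock' and 'g_table' are illustrative names, not from the original code.
SPIN_LOCK_FLAG g_tableLock = 0;
int            g_table[64];

int read_entry(int idx)
{
    EnterSpinLockCriticalSection_Share(g_tableLock);   // multiple readers may hold the lock
    int value = g_table[idx];                          // read while exclusive writers are excluded
    LeaveSpinLockCriticalSection_Share(g_tableLock);   // last reader out resets the flag to 0
    return value;
}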