/** \brief  Attempt to pop the work item at the head of the queue.
   *
   *  Find entry 'i' such that
   *    ( m_queue[i] != BEGIN_TOKEN ) AND
   *    ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
   *  if found then
   *    increment begin hint
   *    return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
   *  else if i < total work
   *    return END_TOKEN
   *  else
   *    return COMPLETED_TOKEN
   *  
   */
  KOKKOS_INLINE_FUNCTION
  std::int32_t pop_work() const noexcept
    {
      const std::int32_t N = m_graph.numRows();

      std::int32_t volatile * const ready_queue = & m_queue[0] ;
      std::int32_t volatile * const begin_hint  = & m_queue[2*N] ;

      // begin hint is guaranteed to be less than or equal to
      // actual begin location in the queue.

      for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {

        const std::int32_t w = ready_queue[i] ;

        if ( w == END_TOKEN ) { return END_TOKEN ; }

        if ( ( w != BEGIN_TOKEN ) &&
             ( w == atomic_compare_exchange(ready_queue+i,w,(std::int32_t)BEGIN_TOKEN) ) ) {
          // Attempt to claim ready work index succeeded,
          // update the hint and return work index
          atomic_increment( begin_hint );
          return w ;
        }
        // arrive here when ready_queue[i] == BEGIN_TOKEN
      }

      return COMPLETED_TOKEN ;
    }
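For context, here is a minimal sketch (not part of the original source) of how a worker loop might consume the three return values of a pop_work()-style call. WorkQueueSketch, worker_loop, and the concrete token values are hypothetical stand-ins; only the token semantics (ready index, END_TOKEN, COMPLETED_TOKEN) are taken from the comment above.

// Hypothetical sketch: drive a pop_work()-style interface that yields either a
// ready work index, END_TOKEN (nothing ready yet), or COMPLETED_TOKEN (done).
#include <atomic>
#include <cstdint>
#include <thread>

namespace sketch {

enum : std::int32_t { END_TOKEN = -2, COMPLETED_TOKEN = -1 };  // placeholder values

struct WorkQueueSketch {
  std::atomic<std::int32_t> next{0};
  std::int32_t              count{0};

  // Stand-in for pop_work(): claim the next index or report completion.
  std::int32_t pop_work() noexcept {
    const std::int32_t i = next.fetch_add(1, std::memory_order_relaxed);
    return i < count ? i : COMPLETED_TOKEN;
  }
};

template <class Body>
void worker_loop(WorkQueueSketch& q, Body&& body) {
  for (;;) {
    const std::int32_t w = q.pop_work();
    if (w == COMPLETED_TOKEN) break;                              // all work claimed and done
    if (w == END_TOKEN) { std::this_thread::yield(); continue; }  // nothing ready yet, retry
    body(w);                                                      // execute the claimed work item
  }
}

} // namespace sketch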
Example #2
/*
 *	Routine: hw_lock_lock
 *
 *	Acquire lock, spinning until it becomes available,
 *	return with preemption disabled.
 */
void
hw_lock_lock(hw_lock_t lock)
{
	thread_t	thread;
	uintptr_t	state;

	thread = current_thread();
	disable_preemption_for_thread(thread);
	state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
#if	__SMP__

#if	LOCK_PRETEST
	if (ordered_load_hw(lock))
		goto contended;
#endif	// LOCK_PRETEST
	if (atomic_compare_exchange(&lock->lock_data, 0, state,
					memory_order_acquire_smp, TRUE)) {
		goto end;
	}
#if	LOCK_PRETEST
contended:
#endif	// LOCK_PRETEST
	hw_lock_lock_contended(lock, state, 0, TRUE);
end:
#else	// __SMP__
	if (lock->lock_data)
		panic("Spinlock held %p", lock);
	lock->lock_data = state;
#endif	// __SMP__
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
	return;
}
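The routine above wraps a fast-path CAS and a contended slow path around XNU-specific primitives (preemption control, memory_order_acquire_smp, LOCK_PRETEST). As a rough, portable illustration only, the same fast-path/slow-path shape can be written with C++ std::atomic; hw_lock_sketch and hw_lock_lock_sketch are hypothetical names, and preemption handling is omitted.

// Portable sketch (not the XNU implementation): acquire by CAS from 0 (free)
// to an owner "state" word, spinning in a slow path when contended.
#include <atomic>
#include <cstdint>

struct hw_lock_sketch {
  std::atomic<std::uintptr_t> lock_data{0};
};

inline void hw_lock_lock_sketch(hw_lock_sketch& lock, std::uintptr_t state) {
  std::uintptr_t expected = 0;
  // Fast path: one acquire CAS from 0 to the owner state.
  if (lock.lock_data.compare_exchange_strong(expected, state,
                                             std::memory_order_acquire,
                                             std::memory_order_relaxed)) {
    return;
  }
  // Slow path: wait until the word reads 0, then retry the CAS.
  for (;;) {
    while (lock.lock_data.load(std::memory_order_relaxed) != 0) { /* spin */ }
    expected = 0;
    if (lock.lock_data.compare_exchange_weak(expected, state,
                                             std::memory_order_acquire,
                                             std::memory_order_relaxed)) {
      return;
    }
  }
}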
Example #3
ThreadsExec::~ThreadsExec()
{
  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );

  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;

  if ( m_scratch ) {
    Record * const r = Record::get_record( m_scratch );

    m_scratch = 0 ;

    Record::decrement( r );
  }

  m_pool_base   = 0 ;
  m_scratch_reduce_end = 0 ;
  m_scratch_thread_end = 0 ;
  m_numa_rank      = 0 ;
  m_numa_core_rank = 0 ;
  m_pool_rank      = 0 ;
  m_pool_size      = 0 ;
  m_pool_fan_size  = 0 ;

  m_pool_state  = ThreadsExec::Terminating ;

  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
    ThreadsExec * const nil = 0 ;

    atomic_compare_exchange( s_threads_exec + entry , this , nil );

    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
  }
}
ThreadsExec::ThreadsExec()
  : m_pool_base(0)
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
  , m_scratch()
#else
  , m_scratch(0)
#endif
  , m_scratch_reduce_end(0)
  , m_scratch_thread_end(0)
  , m_numa_rank(0)
  , m_numa_core_rank(0)
  , m_pool_rank(0)
  , m_pool_size(0)
  , m_pool_fan_size(0)
  , m_pool_state( ThreadsExec::Terminating )
{
  if ( & s_threads_process != this ) {

    // A spawned thread

    ThreadsExec * const nil = 0 ;

    // Which entry in 's_threads_exec', possibly determined from hwloc binding
    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
                    ? ((size_t)s_current_function_arg)
                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));

    // Given a good entry set this thread in the 's_threads_exec' array
    if ( entry < s_thread_pool_size[0] &&
         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {

      const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();

      m_numa_rank       = coord.first ;
      m_numa_core_rank  = coord.second ;
      m_pool_base       = s_threads_exec ;
      m_pool_rank       = s_thread_pool_size[0] - ( entry + 1 );
      m_pool_size       = s_thread_pool_size[0] ;
      m_pool_fan_size   = fan_size( m_pool_rank , m_pool_size );
      m_pool_state      = ThreadsExec::Active ;

      s_threads_pid[ m_pool_rank ] = pthread_self();

      // Inform spawning process that the threads_exec entry has been set.
      s_threads_process.m_pool_state = ThreadsExec::Active ;
    }
    else {
      // Inform spawning process that the threads_exec entry could not be set.
      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
    }
  }
  else {
    // Enables 'parallel_for' to execute on uninitialized Threads device
    m_pool_rank  = 0 ;
    m_pool_size  = 1 ;
    m_pool_state = ThreadsExec::Inactive ;

    s_threads_pid[ m_pool_rank ] = pthread_self();
  }
}
 T atomic_decrement(volatile T * const dest) {
   T oldval = *dest;
   T assume;
   do {
     assume = oldval;
     T newval = assume - 1;
     oldval = atomic_compare_exchange(dest, assume, newval);
   } while (assume != oldval);
   return oldval;
 }
  T atomic_exchange(volatile T * const dest, const T val) {
    T oldval = *dest;
    T assume;
    do {
      assume = oldval;
      oldval = atomic_compare_exchange(dest, assume, val);
    } while (assume != oldval);

    return oldval;
  }
  T atomic_fetch_sub(volatile T * const dest, const T val) {
    T oldval = *dest;
    T assume;
    do {
      assume = oldval;
      T newval = assume - val;
      oldval = atomic_compare_exchange(dest, assume, newval);
    } while (assume != oldval);

    return oldval;
  }
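The helpers above all follow the same compare-exchange retry idiom: snapshot the value, compute the update from the snapshot, and retry if another thread changed the destination in between. Here is a generic sketch of that idiom using std::atomic; atomic_fetch_op_sketch and atomic_fetch_sub_sketch are illustrative names, not a Kokkos API.

// Sketch of the compare-exchange retry pattern: apply an arbitrary update and
// return the value observed just before the successful exchange.
#include <atomic>

template <class T, class Update>
T atomic_fetch_op_sketch(std::atomic<T>& dest, Update update) {
  T assume = dest.load(std::memory_order_relaxed);
  // On failure, compare_exchange_weak reloads the current value into 'assume',
  // so the update is always recomputed from a fresh snapshot.
  while (!dest.compare_exchange_weak(assume, update(assume),
                                     std::memory_order_seq_cst,
                                     std::memory_order_relaxed)) {
  }
  return assume;
}

// Example: fetch-and-subtract expressed through the generic helper.
template <class T>
T atomic_fetch_sub_sketch(std::atomic<T>& dest, T val) {
  return atomic_fetch_op_sketch(dest, [val](T old) { return old - val; });
}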
Example #8
void EnterSpinLockCriticalSection(SPIN_LOCK_FLAG& spinLockFlag)
{
	unsigned int cnt = 0;
	while (1) {
		long orig = atomic_compare_exchange(&spinLockFlag, 1, 0);
		if (orig == 0 && spinLockFlag == 1)
			break;
		++cnt;
		if (cnt > 0x00000fff)
			SleepForMS(20);
	}
}
Example #9
void EnterSpinLockCriticalSection_Share(SPIN_LOCK_FLAG& spinLockFlag)
{
	unsigned int cnt = 0;
	while (1) {
		long orig = atomic_compare_exchange(&spinLockFlag, 2, 0);
		if ((orig == 0 && spinLockFlag == 2) ||
			orig > 2)
			break;
		++cnt;
		if (cnt > 0x00000fff)
			SleepForMS(20);
	}
	atomic_increment(&spinLockFlag);
}
Example #10
static unsigned int NOINLINE
hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic)
{
	uint64_t	end = 0;
	uintptr_t	holder = lock->lock_data;
	int		i;

	if (timeout == 0)
		timeout = LOCK_PANIC_TIMEOUT;
#if CONFIG_DTRACE
	uint64_t begin;
	boolean_t dtrace_enabled = lockstat_probemap[LS_LCK_SPIN_LOCK_SPIN] != 0;
	if (__improbable(dtrace_enabled))
		begin = mach_absolute_time();
#endif
	for ( ; ; ) {	
		for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
			cpu_pause();
#if (!__ARM_ENABLE_WFE_) || (LOCK_PRETEST)
			holder = ordered_load_hw(lock);
			if (holder != 0)
				continue;
#endif
			if (atomic_compare_exchange(&lock->lock_data, 0, data,
			    memory_order_acquire_smp, TRUE)) {
#if CONFIG_DTRACE
				if (__improbable(dtrace_enabled)) {
					uint64_t spintime = mach_absolute_time() - begin;
					if (spintime > dtrace_spin_threshold)
						LOCKSTAT_RECORD2(LS_LCK_SPIN_LOCK_SPIN, lock, spintime, dtrace_spin_threshold);
				}
#endif
				return 1;
			}
		}
		if (end == 0) {
			end = ml_get_timebase() + timeout;
		}
		else if (ml_get_timebase() >= end)
			break;
	}
	if (do_panic) {
		// Capture the actual time spent blocked, which may be higher than the timeout
		// if a misbehaving interrupt stole this thread's CPU time.
		panic("Spinlock timeout after %llu ticks, %p = %lx",
			(ml_get_timebase() - end + timeout), lock, holder);
	}
	return 0;
}
Example #11
/*
 *	Routine: hw_lock_try
 *
 *	returns with preemption disabled on success.
 */
unsigned int
hw_lock_try(hw_lock_t lock)
{
	thread_t	thread = current_thread();
	int		success = 0;
#if	LOCK_TRY_DISABLE_INT
	long		intmask;

	intmask = disable_interrupts();
#else
	disable_preemption_for_thread(thread);
#endif	// LOCK_TRY_DISABLE_INT

#if	__SMP__
#if	LOCK_PRETEST
	if (ordered_load_hw(lock))
		goto failed;
#endif	// LOCK_PRETEST
	success = atomic_compare_exchange(&lock->lock_data, 0, LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK,
					memory_order_acquire_smp, FALSE);
#else
	if (lock->lock_data == 0) {
		lock->lock_data = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
		success = 1;
	}
#endif	// __SMP__

#if	LOCK_TRY_DISABLE_INT
	if (success)
		disable_preemption_for_thread(thread);
#if	LOCK_PRETEST
failed:
#endif	// LOCK_PRETEST
	restore_interrupts(intmask);
#else
#if	LOCK_PRETEST
failed:
#endif	// LOCK_PRETEST
	if (!success)
		enable_preemption();
#endif	// LOCK_TRY_DISABLE_INT
#if CONFIG_DTRACE
	if (success)
		LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
	return success;
}
Example #12
/*
 *	Routine: hw_lock_to
 *
 *	Acquire lock, spinning until it becomes available or timeout.
 *	Timeout is in mach_absolute_time ticks, return with
 *	preemption disabled.
 */
unsigned int
hw_lock_to(hw_lock_t lock, uint64_t timeout)
{
	thread_t	thread;
	uintptr_t	state;
	unsigned int success = 0;

	thread = current_thread();
	disable_preemption_for_thread(thread);
	state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
#if	__SMP__

#if	LOCK_PRETEST
	if (ordered_load_hw(lock))
		goto contended;
#endif	// LOCK_PRETEST
	if (atomic_compare_exchange(&lock->lock_data, 0, state,
					memory_order_acquire_smp, TRUE)) {
		success = 1;
		goto end;
	}
#if	LOCK_PRETEST
contended:
#endif	// LOCK_PRETEST
	success = hw_lock_lock_contended(lock, state, timeout, FALSE);
end:
#else	// __SMP__
	(void)timeout;
	if (ordered_load_hw(lock) == 0) {
		ordered_store_hw(lock, state);
		success = 1;
	}
#endif	// __SMP__
#if CONFIG_DTRACE
	if (success)
		LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
#endif
	return success;
}
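As a rough portable analogue of the timed acquisition in hw_lock_to() (illustration only, using std::chrono instead of mach_absolute_time ticks and omitting preemption control, DTrace probes, and the pretest):

// Sketch: spin on an acquire CAS until success or until a deadline passes;
// returns 1 on success, 0 on timeout. Not the XNU implementation.
#include <atomic>
#include <chrono>
#include <cstdint>

inline unsigned int hw_lock_to_sketch(std::atomic<std::uintptr_t>& lock_data,
                                      std::uintptr_t state,
                                      std::chrono::nanoseconds timeout) {
  const auto deadline = std::chrono::steady_clock::now() + timeout;
  do {
    std::uintptr_t expected = 0;
    if (lock_data.compare_exchange_weak(expected, state,
                                        std::memory_order_acquire,
                                        std::memory_order_relaxed)) {
      return 1;  // acquired
    }
  } while (std::chrono::steady_clock::now() < deadline);
  return 0;  // timed out
}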
ThreadsExec::~ThreadsExec()
{
  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );

  m_pool_base   = 0 ;
  m_scratch     = 0 ;
  m_scratch_reduce_end = 0 ;
  m_scratch_thread_end = 0 ;
  m_pool_rank     = 0 ;
  m_pool_size     = 0 ;
  m_pool_fan_size = 0 ;

  m_pool_state  = ThreadsExec::Terminating ;

  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
    ThreadsExec * const nil = 0 ;

    atomic_compare_exchange( s_threads_exec + entry , this , nil );

    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
  }
}
 KOKKOS_INLINE_FUNCTION
   T atomic_compare_exchange_strong(volatile T * const dest, const T & compare, const T & val)
 {
   return atomic_compare_exchange(dest,compare,val);
 }
KOKKOS_INLINE_FUNCTION
bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
{
  return compare == atomic_compare_exchange(dest, compare, val);
}
void AllocationTracker::enable_tracking()
{
  if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
    Impl::throw_runtime_exception("Error: Tracking already enabled");
  }
}
Example #17
void LeaveSpinLockCriticalSection_Share(SPIN_LOCK_FLAG& spinLockFlag)
{
	atomic_decrement(&spinLockFlag);
	atomic_compare_exchange(&spinLockFlag, 0, 2);
}