void task_group_context::init () {
    __TBB_STATIC_ASSERT ( sizeof(my_version_and_traits) >= 4, "Layout of my_version_and_traits must be reconsidered on this platform" );
    __TBB_STATIC_ASSERT ( sizeof(task_group_context) == 2 * NFS_MaxLineSize, "Context class has wrong size - check padding and members alignment" );
    __TBB_ASSERT ( (uintptr_t(this) & (sizeof(my_cancellation_requested) - 1)) == 0, "Context is improperly aligned" );
    __TBB_ASSERT ( __TBB_load_relaxed(my_kind) == isolated || __TBB_load_relaxed(my_kind) == bound, "Context can be created only as isolated or bound" );
    my_parent = NULL;
    my_cancellation_requested = 0;
    my_exception = NULL;
    my_owner = NULL;
    my_state = 0;
    itt_caller = ITT_CALLER_NULL;
#if __TBB_TASK_PRIORITY
    my_priority = normalized_normal_priority;
#endif /* __TBB_TASK_PRIORITY */
#if __TBB_FP_CONTEXT
    __TBB_STATIC_ASSERT( sizeof(my_cpu_ctl_env) == sizeof(internal::uint64_t), "The reserved space for FPU settings is not equal to sizeof(uint64_t)" );
    __TBB_STATIC_ASSERT( sizeof(cpu_ctl_env) <= sizeof(my_cpu_ctl_env), "FPU settings storage does not fit into uint64_t" );
    suppress_unused_warning( my_cpu_ctl_env.space );

    cpu_ctl_env &ctl = *internal::punned_cast<cpu_ctl_env*>(&my_cpu_ctl_env);
    new ( &ctl ) cpu_ctl_env;
    if ( my_version_and_traits & fp_settings )
        ctl.get_env();
#endif
}
void task_group_context::bind_to ( generic_scheduler *local_sched ) {
    __TBB_ASSERT ( __TBB_load_relaxed(my_kind) == binding_required, "Already bound or isolated?" );
    __TBB_ASSERT ( !my_parent, "Parent is set before initial binding" );
    my_parent = local_sched->my_innermost_running_task->prefix().context;
#if __TBB_FP_CONTEXT
    // Inherit FPU settings only if the context has not captured FPU settings yet.
    if ( !(my_version_and_traits & fp_settings) )
        copy_fp_settings(*my_parent);
#endif

    // The condition below prevents unnecessary thrashing of the parent context's cache line.
    if ( !(my_parent->my_state & may_have_children) )
        my_parent->my_state |= may_have_children; // full fence is below
    if ( my_parent->my_parent ) {
        // Even if this context were made accessible for state change propagation
        // (by placing __TBB_store_with_release(s->my_context_list_head.my_next, &my_node)
        // above), it still could be missed if state propagation from a grand-ancestor
        // was underway concurrently with binding.
        // Speculative propagation from the parent, together with epoch counters
        // detecting the possibility of such a race, makes it possible to avoid
        // taking locks when there is no contention.

        // Acquire fence is necessary to prevent reordering subsequent speculative
        // loads of parent state data out of the scope where the epoch counters
        // comparison can reliably validate them.
        uintptr_t local_count_snapshot = __TBB_load_with_acquire( my_parent->my_owner->my_context_state_propagation_epoch );
        // Speculative propagation of the parent's state. The speculation will be
        // validated by the epoch counters check further on.
        my_cancellation_requested = my_parent->my_cancellation_requested;
#if __TBB_TASK_PRIORITY
        my_priority = my_parent->my_priority;
#endif /* __TBB_TASK_PRIORITY */
        register_with( local_sched ); // Issues full fence

        // If no state propagation was detected by the following condition, the above
        // full fence guarantees that the parent had correct state during speculative
        // propagation before the fence. Otherwise the propagation from the parent is
        // repeated under the lock.
        if ( local_count_snapshot != the_context_state_propagation_epoch ) {
            // Another thread may be propagating a state change right now. So resort to the lock.
            context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
            my_cancellation_requested = my_parent->my_cancellation_requested;
#if __TBB_TASK_PRIORITY
            my_priority = my_parent->my_priority;
#endif /* __TBB_TASK_PRIORITY */
        }
    }
    else {
        register_with( local_sched ); // Issues full fence
        // As we do not have grand-ancestors, concurrent state propagation (if any)
        // may originate only from the parent context, and thus it is safe to directly
        // copy the state from it.
        my_cancellation_requested = my_parent->my_cancellation_requested;
#if __TBB_TASK_PRIORITY
        my_priority = my_parent->my_priority;
#endif /* __TBB_TASK_PRIORITY */
    }
    __TBB_store_relaxed(my_kind, binding_completed);
}
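// The epoch-validated speculative copy used above generalizes to a small pattern:
// read a global epoch counter with acquire semantics, copy the parent's state
// without a lock, publish yourself with a full fence, then re-check the epoch.
// If the epoch moved, a propagator may have run concurrently, so the copy is
// repeated under the lock. A minimal standalone sketch of that pattern follows;
// the names and types are illustrative only and are not part of the TBB sources.
#include <atomic>
#include <mutex>

struct parent_state_example {
    std::atomic<unsigned> cancellation{0};
};

std::atomic<unsigned long> propagation_epoch_example{0}; // bumped by every propagator
std::mutex propagation_mutex_example;                    // held by every propagator

unsigned speculative_copy_example( parent_state_example &parent ) {
    // 1. Snapshot the epoch before reading the parent's state (acquire ordering).
    unsigned long snapshot = propagation_epoch_example.load( std::memory_order_acquire );
    // 2. Speculative, lock-free copy of the parent's state.
    unsigned local = parent.cancellation.load( std::memory_order_relaxed );
    // 3. Publish this context (a full fence stands in for register_with()).
    std::atomic_thread_fence( std::memory_order_seq_cst );
    // 4. Validate: if a propagation happened meanwhile, redo the copy under the lock.
    if ( snapshot != propagation_epoch_example.load( std::memory_order_relaxed ) ) {
        std::lock_guard<std::mutex> lock( propagation_mutex_example );
        local = parent.cancellation.load( std::memory_order_relaxed );
    }
    return local;
}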
//------------------------------------------------------------------------
// Methods of allocate_root_with_context_proxy
//------------------------------------------------------------------------

task& allocate_root_with_context_proxy::allocate( size_t size ) const {
    internal::generic_scheduler* s = governor::local_scheduler();
    __TBB_ASSERT( s, "Scheduler auto-initialization failed?" );
    __TBB_ASSERT( &my_context, "allocate_root(context) argument is a dereferenced NULL pointer" );
    task& t = s->allocate_task( size, NULL, &my_context );
    // The supported usage model prohibits concurrent initial binding. Thus we do not
    // need interlocked operations or fences to manipulate my_context.my_kind.
    if ( __TBB_load_relaxed(my_context.my_kind) == task_group_context::binding_required ) {
        // If we are in the outermost task dispatch loop of a master thread, then
        // there is nothing to bind this context to, and we skip the binding part,
        // treating the context as isolated.
        if ( s->master_outermost_level() )
            __TBB_store_relaxed(my_context.my_kind, task_group_context::isolated);
        else
            my_context.bind_to( s );
    }
#if __TBB_FP_CONTEXT
    if ( __TBB_load_relaxed(my_context.my_kind) == task_group_context::isolated &&
         !(my_context.my_version_and_traits & task_group_context::fp_settings) )
        my_context.copy_fp_settings( *s->my_arena->my_default_ctx );
#endif
    ITT_STACK_CREATE(my_context.itt_caller);
    return t;
}
void concurrent_monitor::notify_one_relaxed() {
    if( waitset_ec.empty() )
        return;
    waitset_node_t* n;
    const waitset_node_t* end = waitset_ec.end();
    {
        tbb::spin_mutex::scoped_lock l( mutex_ec );
        __TBB_store_relaxed( epoch, __TBB_load_relaxed(epoch) + 1 );
        n = waitset_ec.front();
        if( n!=end ) {
            waitset_ec.remove( *n );
            to_thread_context(n)->in_waitset = false;
        }
    }
    if( n!=end )
        to_thread_context(n)->semaphore().V();
}
void concurrent_monitor::prepare_wait( thread_context& thr, uintptr_t ctx ) {
    if( !thr.ready )
        thr.init();
    // This is a good place to absorb a previous spurious wakeup.
    else if( thr.spurious ) {
        thr.spurious = false;
        thr.semaphore().P();
    }
    thr.context = ctx;
    thr.in_waitset = true;
    {
        tbb::spin_mutex::scoped_lock l( mutex_ec );
        __TBB_store_relaxed( thr.epoch, __TBB_load_relaxed(epoch) );
        waitset_ec.add( (waitset_t::node_t*)&thr );
    }
    atomic_fence();
}
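// prepare_wait() is the first half of a two-phase wait: the waiter publishes
// itself in the wait set, then re-checks its wakeup condition, and only then
// blocks (or cancels). Because notifiers bump the epoch and remove waiters under
// the same lock, a notification issued between the re-check and the block cannot
// be lost. A minimal sketch of the waiter side is given below; it assumes the
// monitor also provides commit_wait()/cancel_wait() companions to prepare_wait()
// (as concurrent_monitor does in the TBB sources), and the predicate is purely
// illustrative.
template<typename Monitor, typename ThreadContext, typename Predicate>
void wait_until_example( Monitor &mon, ThreadContext &thr, Predicate ready ) {
    for (;;) {
        if ( ready() )
            return;                  // fast path: nothing to wait for
        mon.prepare_wait( thr, 0 );  // publish ourselves in the wait set
        if ( ready() ) {             // re-check after publication
            mon.cancel_wait( thr );  // the condition became true; do not block
            return;
        }
        mon.commit_wait( thr );      // block until notified (may wake spuriously)
    }
}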
void concurrent_monitor::notify_all_relaxed() {
    if( waitset_ec.empty() )
        return;
    waitset_t temp;
    const waitset_node_t* end;
    {
        tbb::spin_mutex::scoped_lock l( mutex_ec );
        __TBB_store_relaxed( epoch, __TBB_load_relaxed(epoch) + 1 );
        waitset_ec.flush_to( temp );
        end = temp.end();
        for( waitset_node_t* n=temp.front(); n!=end; n=n->next )
            to_thread_context(n)->in_waitset = false;
    }
    waitset_node_t* nxt;
    for( waitset_node_t* n=temp.front(); n!=end; n=nxt ) {
        nxt = n->next;
        to_thread_context(n)->semaphore().V();
    }
#if TBB_USE_ASSERT
    temp.clear();
#endif
}
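// notify_all_relaxed() drains the wait set into a local list while holding the
// spin lock, but performs the actual semaphore V() calls after releasing it, so
// the notifier never holds the lock across potentially expensive kernel calls.
// A standalone sketch of that "drain under lock, signal outside lock" shape,
// using illustrative stand-in types rather than the TBB waitset classes:
#include <mutex>
#include <vector>

struct waiter_example { virtual void wake() = 0; virtual ~waiter_example() {} };

struct simple_monitor_example {
    std::mutex m;
    std::vector<waiter_example*> waitset;

    void notify_all() {
        std::vector<waiter_example*> drained;
        {
            std::lock_guard<std::mutex> l( m );
            drained.swap( waitset );          // drain the wait set under the lock
        }
        for ( waiter_example *w : drained )   // wake everyone outside the lock
            w->wake();
    }
};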
bool arena::is_out_of_work() {
    // TODO: rework it to return at least a hint about where a task was found; better if the task itself.
    for(;;) {
        pool_state_t snapshot = my_pool_state;
        switch( snapshot ) {
            case SNAPSHOT_EMPTY:
                return true;
            case SNAPSHOT_FULL: {
                // Use a unique id for "busy" in order to avoid ABA problems.
                const pool_state_t busy = pool_state_t(&busy);
                // Request permission to take the snapshot.
                if( my_pool_state.compare_and_swap( busy, SNAPSHOT_FULL )==SNAPSHOT_FULL ) {
                    // Got permission. Take the snapshot.
                    // NOTE: This is not a lock, as the state can be set to FULL at
                    //       any moment by a thread that spawns/enqueues a new task.
                    size_t n = my_limit;
                    // Make local copies of volatile parameters. A change in them during
                    // the snapshot-taking procedure invalidates the attempt and returns
                    // this thread to the dispatch loop.
#if __TBB_TASK_PRIORITY
                    intptr_t top_priority = my_top_priority;
                    uintptr_t reload_epoch = my_reload_epoch;
                    // Inspect primary task pools first.
#endif /* __TBB_TASK_PRIORITY */
                    size_t k;
                    for( k=0; k<n; ++k ) {
                        if( my_slots[k].task_pool != EmptyTaskPool &&
                            __TBB_load_relaxed(my_slots[k].head) < __TBB_load_relaxed(my_slots[k].tail) )
                        {
                            // The k-th primary task pool is nonempty and does contain tasks.
                            break;
                        }
                    }
                    __TBB_ASSERT( k <= n, NULL );
                    bool work_absent = k == n;
#if __TBB_TASK_PRIORITY
                    // Variable tasks_present indicates the presence of tasks at any priority
                    // level, while work_absent refers only to the current priority.
                    bool tasks_present = !work_absent || my_orphaned_tasks;
                    bool dequeuing_possible = false;
                    if ( work_absent ) {
                        // Check for the possibility that recent priority changes
                        // brought some tasks to the current priority level.

                        uintptr_t abandonment_epoch = my_abandonment_epoch;
                        // The master thread's scheduler needs special handling as it
                        // may be destroyed at any moment (workers' schedulers are
                        // guaranteed to be alive while at least one thread is in the arena).
                        // Concurrency with task group state change propagation must be excluded too.
                        my_market->my_arenas_list_mutex.lock();
                        generic_scheduler *s = my_slots[0].my_scheduler;
                        if ( s && __TBB_CompareAndSwapW(&my_slots[0].my_scheduler, (intptr_t)LockedMaster, (intptr_t)s) == (intptr_t)s )
                        {
                            __TBB_ASSERT( my_slots[0].my_scheduler == LockedMaster && s != LockedMaster, NULL );
                            work_absent = !may_have_tasks( s, my_slots[0], tasks_present, dequeuing_possible );
                            __TBB_store_with_release( my_slots[0].my_scheduler, s );
                        }
                        my_market->my_arenas_list_mutex.unlock();
                        // The following loop is subject to data races. While the k-th slot's
                        // scheduler is being examined, the corresponding worker can either
                        // leave to RML or migrate to another arena.
                        // The races are not prevented because all of them are benign.
                        // First, the code relies on the fact that a worker thread's scheduler
                        // object persists until the whole library is deinitialized.
                        // Second, in the worst case the races can only cause another
                        // round of stealing attempts to be undertaken. Introducing complex
                        // synchronization into this coldest part of the scheduler's control
                        // flow does not seem to make sense because it is both unlikely to
                        // ever have any observable performance effect, and would require
                        // additional synchronization code on the hotter paths.
                        for( k = 1; work_absent && k < n; ++k )
                            work_absent = !may_have_tasks( my_slots[k].my_scheduler, my_slots[k], tasks_present, dequeuing_possible );
                        // Preclude prematurely switching the arena off because of a race in the previous loop.
                        work_absent = work_absent
                                      && !__TBB_load_with_acquire(my_orphaned_tasks)
                                      && abandonment_epoch == my_abandonment_epoch;
                    }
#endif /* __TBB_TASK_PRIORITY */
                    // Test and test-and-set.
                    if( my_pool_state==busy ) {
#if __TBB_TASK_PRIORITY
                        bool no_fifo_tasks = my_task_stream[top_priority].empty();
                        work_absent = work_absent && (!dequeuing_possible || no_fifo_tasks)
                                      && top_priority == my_top_priority && reload_epoch == my_reload_epoch;
#else
                        bool no_fifo_tasks = my_task_stream.empty();
                        work_absent = work_absent && no_fifo_tasks;
#endif /* __TBB_TASK_PRIORITY */
                        if( work_absent ) {
#if __TBB_TASK_PRIORITY
                            if ( top_priority > my_bottom_priority ) {
                                if ( my_market->lower_arena_priority(*this, top_priority - 1, top_priority)
                                     && !my_task_stream[top_priority].empty() )
                                {
                                    atomic_update( my_skipped_fifo_priority, top_priority, std::less<intptr_t>());
                                }
                            }
                            else if ( !tasks_present && !my_orphaned_tasks && no_fifo_tasks ) {
#endif /* __TBB_TASK_PRIORITY */
                                // Save the current demand value before setting SNAPSHOT_EMPTY
                                // to avoid a race with advertise_new_work.
                                int current_demand = (int)my_max_num_workers;
                                if( my_pool_state.compare_and_swap( SNAPSHOT_EMPTY, busy )==busy ) {
                                    // This thread transitioned the pool to the empty state, and thus is
                                    // responsible for telling RML that there is no other work to do.
                                    my_market->adjust_demand( *this, -current_demand );
#if __TBB_TASK_PRIORITY
                                    // Check for the presence of enqueued tasks "lost" on some of the
                                    // priority levels because updating the arena priority and switching
                                    // the arena into the "populated" (FULL) state happen non-atomically.
                                    // Imposing atomicity would require task::enqueue() to use a lock,
                                    // which is unacceptable.
                                    bool switch_back = false;
                                    for ( int p = 0; p < num_priority_levels; ++p ) {
                                        if ( !my_task_stream[p].empty() ) {
                                            switch_back = true;
                                            if ( p < my_bottom_priority || p > my_top_priority )
                                                my_market->update_arena_priority(*this, p);
                                        }
                                    }
                                    if ( switch_back )
                                        advertise_new_work</*Spawned*/false>();
#endif /* __TBB_TASK_PRIORITY */
                                    return true;
                                }
                                return false;
#if __TBB_TASK_PRIORITY
                            }
#endif /* __TBB_TASK_PRIORITY */
                        }
                        // Undo the previous transition SNAPSHOT_FULL-->busy, unless another thread undid it.
                        my_pool_state.compare_and_swap( SNAPSHOT_FULL, busy );
                    }
                }
                return false;
            }
            default:
                // Another thread is taking a snapshot.
                return false;
        }
    }
}
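// The state machine above is a test and test-and-set over three pool states:
// FULL -> (unique) busy -> EMPTY, or back to FULL. Using the address of a local
// variable as the transient "busy" value gives every snapshot attempt a unique
// token, so a concurrent FULL -> busy -> FULL cycle performed by another thread
// cannot be mistaken for our own transition (the ABA problem). A standalone
// sketch of the same shape; the names and the trivial work_absent parameter are
// illustrative only, not the arena implementation.
#include <atomic>
#include <cstdint>

typedef std::uintptr_t pool_state_example_t;
static const pool_state_example_t EXAMPLE_SNAPSHOT_EMPTY = 0;
static const pool_state_example_t EXAMPLE_SNAPSHOT_FULL  = pool_state_example_t(-1);

bool try_switch_to_empty_example( std::atomic<pool_state_example_t> &pool_state, bool work_absent ) {
    pool_state_example_t expected = EXAMPLE_SNAPSHOT_FULL;
    const pool_state_example_t busy = pool_state_example_t(&expected); // unique per call: avoids ABA
    // Request permission to take the snapshot: FULL -> busy.
    if ( !pool_state.compare_exchange_strong( expected, busy ) )
        return false;                                      // someone else owns the snapshot
    // ... examine task pools here; any task spawn resets the state to FULL ...
    if ( work_absent && pool_state.load() == busy ) {      // test before test-and-set
        expected = busy;
        if ( pool_state.compare_exchange_strong( expected, EXAMPLE_SNAPSHOT_EMPTY ) )
            return true;                                   // we switched the pool off
    }
    // Undo busy -> FULL, unless a spawner already overwrote our token.
    expected = busy;
    pool_state.compare_exchange_strong( expected, EXAMPLE_SNAPSHOT_FULL );
    return false;
}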
task_group_context::~task_group_context () {
    if ( __TBB_load_relaxed(my_kind) == binding_completed ) {
        if ( governor::is_set(my_owner) ) {
            // Local update of the context list
            uintptr_t local_count_snapshot = my_owner->my_context_state_propagation_epoch;
            my_owner->my_local_ctx_list_update.store<relaxed>(1);
            // Prevent the load of the nonlocal update flag from being hoisted before the
            // store to the local update flag.
            atomic_fence();
            if ( my_owner->my_nonlocal_ctx_list_update.load<relaxed>() ) {
                spin_mutex::scoped_lock lock(my_owner->my_context_list_mutex);
                my_node.my_prev->my_next = my_node.my_next;
                my_node.my_next->my_prev = my_node.my_prev;
                my_owner->my_local_ctx_list_update.store<relaxed>(0);
            }
            else {
                my_node.my_prev->my_next = my_node.my_next;
                my_node.my_next->my_prev = my_node.my_prev;
                // A release fence is necessary so that the update of our neighbors in
                // the context list is committed by the time a possible concurrent destroyer
                // proceeds after the local update flag is reset by the following store.
                my_owner->my_local_ctx_list_update.store<release>(0);
                if ( local_count_snapshot != the_context_state_propagation_epoch ) {
                    // Another thread was propagating a cancellation request when we removed
                    // ourselves from the list. We must ensure that it is not accessing us
                    // when this destructor finishes. We will be able to acquire the lock
                    // below only after the other thread finishes with us.
                    spin_mutex::scoped_lock lock(my_owner->my_context_list_mutex);
                }
            }
        }
        else {
            // Nonlocal update of the context list
            // Synchronizes with generic_scheduler::cleanup_local_context_list()
            // TODO: evaluate and perhaps relax, or add some lock instead
            if ( internal::as_atomic(my_kind).fetch_and_store(dying) == detached ) {
                my_node.my_prev->my_next = my_node.my_next;
                my_node.my_next->my_prev = my_node.my_prev;
            }
            else {
                //TODO: evaluate and perhaps relax
                my_owner->my_nonlocal_ctx_list_update.fetch_and_increment<full_fence>();
                //TODO: evaluate and perhaps remove
                spin_wait_until_eq( my_owner->my_local_ctx_list_update, 0u );
                my_owner->my_context_list_mutex.lock();
                my_node.my_prev->my_next = my_node.my_next;
                my_node.my_next->my_prev = my_node.my_prev;
                my_owner->my_context_list_mutex.unlock();
                //TODO: evaluate and perhaps relax
                my_owner->my_nonlocal_ctx_list_update.fetch_and_decrement<full_fence>();
            }
        }
    }
#if __TBB_FP_CONTEXT
    internal::punned_cast<cpu_ctl_env*>(&my_cpu_ctl_env)->~cpu_ctl_env();
#endif
    poison_value(my_version_and_traits);
    if ( my_exception )
        my_exception->destroy();
    ITT_STACK(itt_caller != ITT_CALLER_NULL, caller_destroy, itt_caller);
}
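// The destructor's fast path above is a Dekker-style flag protocol: the owner
// thread raises its "local update" flag, issues a full fence, and then checks
// whether a non-owner is concurrently updating the list; each side falls back to
// the mutex only when it sees the other side's flag or counter raised. A minimal
// two-party sketch of that handshake follows; the names are illustrative and are
// not the TBB data members.
#include <atomic>
#include <mutex>

struct list_guard_example {
    std::atomic<int> local_update{0};     // raised by the owner thread
    std::atomic<int> nonlocal_update{0};  // raised by other threads
    std::mutex list_mutex;

    template<typename Update>
    void owner_update( Update update ) {
        local_update.store( 1, std::memory_order_relaxed );
        std::atomic_thread_fence( std::memory_order_seq_cst ); // order flag store before flag load
        if ( nonlocal_update.load( std::memory_order_relaxed ) ) {
            std::lock_guard<std::mutex> lock( list_mutex );     // contended path
            update();
            local_update.store( 0, std::memory_order_relaxed );
        } else {
            update();                                           // uncontended, lock-free path
            local_update.store( 0, std::memory_order_release );
        }
    }

    template<typename Update>
    void nonowner_update( Update update ) {
        nonlocal_update.fetch_add( 1 );                         // announce intent (full fence)
        while ( local_update.load( std::memory_order_acquire ) )
            ;                                                   // wait for the owner's in-flight lock-free update
        {
            std::lock_guard<std::mutex> lock( list_mutex );
            update();
        }
        nonlocal_update.fetch_sub( 1 );
    }
};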