int __pmp_thread_acquire (int nthreads)
{
  int count = 1;                /* count from 1 to ignore master thread */

  /* NOTE: in the typical case this while construct does not loop */
  while (count < nthreads) {
    int required = nthreads - count;
    int waiting = __pmp_atomic_xadd32(&__pmp_manager.waiting_threads,
                                      -required);
    if (waiting >= required) {
      count += required;
      break;
    }
    else {
      count += waiting;
      required -= waiting;
      __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, required);
      if (__pmp_manager_create_more_threads(required) == 0) {
        break;
      }
    }
  }

  __pmp_atomic_add32(&__ompc_cur_numthreads, count - 1);
  __pmp_debug(PMP_DEBUG_THREAD, "acquired %d out of %d threads\n",
              count, nthreads);
  return count;
}
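/* Worked example (illustrative, not part of the runtime): suppose
 * nthreads == 4 and waiting_threads == 2 on entry. Then required == 3
 * and the xadd32 returns 2, leaving waiting_threads at -1. Since
 * waiting (2) < required (3), the caller claims the 2 waiters (count
 * becomes 3), returns the over-subtracted slot (waiting_threads is
 * back to 0), and asks for the 1 missing thread to be created.
 * Assuming __pmp_manager_create_more_threads ultimately reaches
 * __pmp_thread_create, which increments waiting_threads (see below),
 * the next pass of the loop can claim the new thread and count
 * reaches 4. */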
static inline void __pmp_thread_barrier (pmp_thread_t *thread)
{
  /* NOTE: the compiler optimizes away OMP barriers in the serial code,
   * so there is no need to optimize that case here. The case of a team
   * with just one thread is not so common, so ideally don't optimize
   * that path either. However, it is currently necessary to check
   * (team != NULL), so one might as well check for the 1-thread team too.
   * The most important case is the n-way barrier where n > 1. */
  pmp_team_t *team = thread->team;
  int team_size = __pmp_get_team_size(team);
  if (team_size > 1) {
    int32_t count = __pmp_atomic_xadd32(&team->barrier_count, -1);
    assert(count > 0);
    __pmp_debug(PMP_DEBUG_THREAD, "thread hits barrier with count of %d\n",
                (int) count);
    if (count > 1) {
      __pmp_thread_wait(thread);
    }
    else {
      pmp_local_id_t local_id = thread->local_id;
      int i;
      team->barrier_count = team_size;
      for (i = 0; i < team_size; i++) {
        pmp_thread_t *t = team->members[i];
        if (i != local_id) {
          __pmp_thread_wake(t);
        }
      }
    }
  }
}
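/* Illustrative trace for a 3-thread team: barrier_count starts at 3
 * (it is reset to team_size by the last arriver, above). The first
 * two arrivals see xadd32 return 3 and 2 and block in
 * __pmp_thread_wait. The third sees 1, restores barrier_count to 3
 * for the next barrier, and wakes every member except itself. The
 * reset is performed before any waiter is woken, so a thread racing
 * ahead into the next barrier still finds a full count. */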
static void __pmp_thread_release (pmp_team_t *team, pmp_thread_t *master)
{
  pmp_local_id_t old_local_id;
  pmp_idstack_t *idstack = &__pmp_manager.idstack;
  int nworkers = team->team_size - 1;
  int i;

  __pmp_lock(master->global_id, &__pmp_manager.idlock);
  for (i = nworkers; i >= 1; i--) {
    pmp_thread_t *thread = team->members[i];
    assert(thread != master);
    old_local_id = thread->local_id;
    thread->local_id = -1;
    thread->team = NULL;
    __pmp_idstack_push(idstack, thread->global_id);
    __pmp_debug(PMP_DEBUG_THREAD,
                "released thread global_id %d from local_id %d "
                "of team at %p\n",
                thread->global_id, old_local_id, team);
  }
  __pmp_unlock(master->global_id, &__pmp_manager.idlock);

  __pmp_atomic_add32(&__ompc_cur_numthreads, -nworkers);
  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, nworkers);
}
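/* Illustrative trace: for a team of size 4, workers members[3],
 * members[2], members[1] are detached in that order and their
 * global_ids pushed onto the id stack, so a subsequent pop would
 * return the last-pushed id (that of members[1]) first, assuming the
 * idstack is LIFO. The counter updates mirror __pmp_thread_acquire:
 * __ompc_cur_numthreads drops by 3 and waiting_threads rises by 3,
 * making the released workers claimable by the next team. */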
static inline void __pmp_thread_master_join (pmp_thread_t *master)
{
  pmp_team_t *team = master->team;
  int32_t count;
  int thread_spin = __pmp_get_param()->thread_spin;
  int i;

  /* NOTE: insert a small spin loop here to try to arrange for the master
   * to arrive just after the last worker thread. If this happens
   * then we avoid a much more expensive thread synchronization. */
  for (i = 0; i < thread_spin; i++) {
    /* USER LEVEL SPIN LOOP */
    if (team->working_threads == 1) {
      team->working_threads = 0;
      return;
    }
    __pmp_yield();
  }

  count = __pmp_atomic_xadd32(&team->working_threads, -1);
  __pmp_debug(PMP_DEBUG_THREAD, "master thread joins with count of %d\n",
              (int) count);
  assert(count >= 1);
  if (count > 1) {
    __pmp_thread_wait(master);
  }
}
static inline void __pmp_thread_worker_join (pmp_team_t *team)
{
  int32_t count = __pmp_atomic_xadd32(&team->working_threads, -1);
  __pmp_debug(PMP_DEBUG_THREAD, "worker thread joins with count of %d\n",
              (int) count);
  assert(count >= 1);
  if (count == 1) {
    __pmp_thread_wake(team->members[0]);
  }
}
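/* Illustrative trace of the join protocol (assuming
 * team->working_threads is initialized to team_size when the team is
 * dispatched): with 4 threads, the three workers each xadd32 -1 in
 * __pmp_thread_worker_join. If the master's spin loop observes
 * working_threads == 1 (all workers finished), it zeroes the counter
 * and returns without sleeping -- the cheap path the NOTE above aims
 * for. Otherwise the master also decrements; a returned count > 1
 * means workers remain, so it blocks, and the last worker (whose
 * xadd32 returns 1) wakes members[0], the master. */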
static inline int __pmp_schedule_next (int global_id, int64_t *lowerp,
                                       int64_t *upperp, int64_t *incp)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t iteration = thread->iteration;
  pmp_local_id_t local_id = thread->local_id;
  pmp_loop_t *loop = thread->loop;

  assert(loop != NULL);
  assert(local_id < team_size);

  if (team_size == 1) {
    if (iteration == 0) {
      *lowerp = loop->lower;
      *upperp = loop->upper;
      *incp = loop->inc;
      thread->ticket_number = loop->lower;
      thread->iteration = 1;
      __pmp_loop_analyser(thread, loop->sched, global_id, local_id,
                          loop->lower, loop->upper,
                          *lowerp, *upperp, *incp, 0, 0);
      return 1;
    }
    else {
      assert(iteration == 1);
      __pmp_loop_free(thread);
      return 0;
    }
  }
  else {
    int sched = loop->sched;
    int64_t lower = loop->lower;
    int64_t upper = loop->upper;
    int64_t inc = loop->inc;
    int64_t chunk = loop->chunk;

    switch (sched) {
      case PMP_SCHED_STATIC:
      case PMP_SCHED_ORDERED_STATIC: {
        /* NOTE: setting a small value of chunk causes (unnecessary)
         * iteration through this code. If the chunk is ignored, the code
         * degenerates into the static even case (which is the default). */
        int64_t size = (upper - lower) / inc + 1;
        int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
        int64_t thread_lower = lower + (local_id * size_per_thread);
        int64_t thread_upper = thread_lower + size_per_thread - inc;
        int64_t this_lower = thread_lower + (iteration * chunk * inc);
        int64_t this_upper = this_lower + (chunk - 1) * inc;
        thread_upper = LOOPMIN(inc, thread_upper, upper);
        this_upper = LOOPMIN(inc, this_upper, thread_upper);
        if ((inc >= 0) ? (this_lower > thread_upper)
                       : (this_lower < thread_upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          *incp = inc;
          *lowerp = this_lower;
          *upperp = this_upper;
          thread->ticket_number = this_lower;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id,
                              lower, upper, *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }

      case PMP_SCHED_STATIC_EVEN:
      case PMP_SCHED_ORDERED_STATIC_EVEN: {
        if (iteration == 0) {
          int64_t size = (upper - lower) / inc + 1;
          int64_t thread_lower;
          int64_t thread_upper;
          if (!__pmp_get_param()->static_fair) {
            int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
            thread_lower = lower + (local_id * size_per_thread);
            thread_upper = thread_lower + size_per_thread - inc;
          }
          else {
            int64_t chunk = size / team_size;
            int64_t remainder = size - (chunk * team_size);
            int64_t index = MIN(local_id, remainder) * (chunk + 1);
            if (local_id > remainder) {
              index += (local_id - remainder) * chunk;
            }
            thread_lower = lower + (index * inc);
            chunk += (local_id < remainder);
            thread_upper = thread_lower + (chunk - 1) * inc;
          }
          thread_upper = LOOPMIN(inc, thread_upper, upper);
          if ((inc >= 0) ? (thread_lower > thread_upper)
                         : (thread_lower < thread_upper)) {
            __pmp_loop_free(thread);
            return 0;
          }
          else {
            *incp = inc;
            *lowerp = thread_lower;
            *upperp = thread_upper;
            thread->ticket_number = thread_lower;
            thread->iteration++;
            __pmp_loop_analyser(thread, sched, global_id, local_id,
                                lower, upper, *lowerp, *upperp, *incp, 0, 0);
            return 1;
          }
        }
        else {
          assert(iteration == 1);
          __pmp_loop_free(thread);
          return 0;
        }
        /* NOT REACHED */
        break;
      }

      case PMP_SCHED_DYNAMIC:
      case PMP_SCHED_ORDERED_DYNAMIC: {
        int64_t stride = inc * chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation.
         * The workaround below is just to do a 32-bit atomic add. */
        int64_t current;
        current = (int64_t) __pmp_atomic_xadd32((int32_t *) &loop->current,
                                                (int32_t) stride);
#endif
        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          *incp = inc;
          *lowerp = current;
          *upperp = *lowerp + stride - inc;
          *upperp = LOOPMIN(inc, upper, *upperp);
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id,
                              lower, upper, *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }

      case PMP_SCHED_GUIDED:
      case PMP_SCHED_ORDERED_GUIDED: {
        /* NOTE: guided scheduling uses a heuristic to choose a good
         * chunk size to divide up the remaining iterations amongst
         * the team (subject to a minimum). An exact implementation of
         * this would require a lock on the loop data. However, the
         * heuristic can be approximated using (possibly) stale values
         * and this should be good enough. The value of "remaining"
         * is monotonically decreasing. The worst that could happen
         * is that an update to loop->chunk is lost, slightly unbalancing
         * the distribution. The most important point is that loop->current
         * is maintained atomically.
         *
         * UPDATE: if cmpxchg64 is available then it is used to protect
         * the update of loop->chunk. This is fairly cunning, and makes
         * the chunk update more accurate in this case! */
        int64_t min_chunk = loop->min_chunk;
        int64_t remaining = upper - loop->current + 1;            /* estimate */
        int64_t my_chunk = MAX(min_chunk, MIN(chunk, remaining)); /* estimate */
        int64_t stride = inc * my_chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation.
         * The workaround below is just to do a 32-bit atomic add. */
        int64_t current = __pmp_atomic_xadd32((int32_t *) &loop->current,
                                              (int32_t) stride);
#endif
        assert(stride != 0);
#ifdef SUPER_DEBUG
        if (Enabled_Libomp_Loop_Debug)
          __pmp_debug("LOOPS_DEBUG", "__pmp_schedule_next: global_id=%d, "
                      "remaining=%lld, my_chunk=%lld, stride=%lld, "
                      "current=%lld\n",
                      global_id, (long long) remaining, (long long) my_chunk,
                      (long long) stride, (long long) current);
#endif
        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          pmp_param_t *param = __pmp_get_param();
          int64_t my_upper = LOOPMIN(inc, upper, current + stride - inc);
          int64_t new_chunk;
          int64_t divisor;
          remaining = upper - my_upper;                           /* estimate */
          divisor = team_size * param->guided_chunk_divisor;
          new_chunk = (remaining + divisor - 1) / divisor;
          new_chunk = MIN(param->guided_chunk_max, new_chunk);
          new_chunk = MAX(min_chunk, new_chunk);
#if __WORDSIZE == 64
          (void) __pmp_atomic_cmpxchg64(&loop->chunk, chunk, new_chunk);
#else
          loop->chunk = new_chunk;                                /* estimate */
#endif
          *incp = inc;
          *lowerp = current;
          *upperp = my_upper;
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id,
                              lower, upper, *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }

      default: {
        __pmp_fatal("unknown dynamic scheduling type %d\n", sched);
        break;
      }
    }
    /* NOT REACHED */
    assert(0);
    __pmp_loop_free(thread);
    return 0;
  }
  /* NOT REACHED */
}
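/* Worked examples (illustrative) for the arithmetic above, taking
 * lower=0, upper=9, inc=1 and team_size=4 (size = 10 iterations):
 *
 * - STATIC_EVEN, default path: size_per_thread = ((10-1)/4 + 1) = 3,
 *   so the threads are assigned [0,2], [3,5], [6,8], [9,11], and
 *   LOOPMIN clips the last range to [9,9]: thread 3 gets 1 iteration.
 *
 * - STATIC_EVEN with static_fair: chunk = 10/4 = 2, remainder = 2,
 *   so threads 0 and 1 get chunk+1 = 3 iterations and threads 2 and
 *   3 get 2: [0,2], [3,5], [6,7], [8,9] -- a more even split.
 *
 * - GUIDED: with remaining = 1000, team_size = 4 and (a made-up value
 *   for this example) guided_chunk_divisor = 2, the next chunk is
 *   ceil(1000 / 8) = 125, clamped to [min_chunk, guided_chunk_max];
 *   the real defaults live in pmp_param_t. */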
void __pmp_thread_create (pmp_thread_t *thread)
{
  pmp_thread_t *creator = __pmp_get_current_thread();
  pthread_t pthread_id;
  int result;
  pmp_param_t *param = __pmp_get_param();

  thread->creator = creator;

  if (param->thread_guard_size > 0) {
    void *guard;
    /* NOTE: this lock is to give a better chance of the guard page
     * allocation immediately following the pthread stack allocation. */
    __pmp_lock(thread->global_id, &__pmp_manager.pthread_create_lock);
    /* NOTE: it seems that mmap tends to allocate in an upwards direction,
     * so allocate the guard page first. */
    guard = mmap(0, param->thread_guard_size, PROT_NONE,
#if defined(BUILD_OS_DARWIN)
                 MAP_PRIVATE | MAP_ANON,
#else /* defined(BUILD_OS_DARWIN) */
                 MAP_PRIVATE | MAP_ANONYMOUS,
#endif /* defined(BUILD_OS_DARWIN) */
                 0, 0);
    if (guard == MAP_FAILED) {
      __pmp_warning("unable to allocate a guard page of %ld bytes\n",
                    (long) param->thread_guard_size);
    }
    else {
      __pmp_debug(PMP_DEBUG_THREAD, "guard page allocated at address %p\n",
                  guard);
      thread->guard_page = guard;
    }
  }

  if ((result = pthread_create(&pthread_id, &__pmp_manager.pthread_attr,
                               __pmp_thread_run, thread)) != 0) {
    if (__pmp_manager.allocated_threads > param->initial_team_size) {
      __pmp_warning(
        "pthread_create failed when trying to allocate thread %d\n",
        __pmp_manager.allocated_threads);
      __pmp_warning(
        "note this is more than the initial number of threads (%d)\n",
        param->initial_team_size);
#if defined(BUILD_OS_DARWIN)
      if (sizeof(long) == 4)
#else /* defined(BUILD_OS_DARWIN) */
      if (__WORDSIZE == 32)
#endif /* defined(BUILD_OS_DARWIN) */
      {
        int64_t total_stack = ((int64_t) param->thread_stack_size) *
                              ((int64_t) __pmp_manager.allocated_threads);
        if (total_stack > 0x40000000LL) {
          __pmp_warning(
            "the failure may be due to excessive thread stack size\n");
          __pmp_warning(
            "try using a smaller setting for PSC_OMP_STACK_SIZE\n");
        }
      }
    }
    __pmp_fatal("unable to create thread (result code %d)\n", result);
  }

  if (param->thread_guard_size > 0) {
    __pmp_unlock(thread->global_id, &__pmp_manager.pthread_create_lock);
  }

  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, 1);
  __pmp_debug(PMP_DEBUG_THREAD, "created thread global_id %d\n",
              thread->global_id);
}
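/* A minimal standalone sketch of the guard-page idea used above, kept
 * out of the build. Note that portable anonymous mappings
 * conventionally pass fd = -1 rather than the 0 used above (Linux
 * ignores the fd when MAP_ANONYMOUS is set, but some systems require
 * -1). */
#if 0
#include <sys/mman.h>
#include <stdio.h>

int main (void)
{
  /* Reserve one inaccessible page; any stack overrun that touches it
   * faults immediately instead of silently corrupting a neighbour. */
  void *guard = mmap(NULL, 4096, PROT_NONE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (guard == MAP_FAILED) {
    perror("mmap");
    return 1;
  }
  printf("guard page at %p\n", guard);
  munmap(guard, 4096);
  return 0;
}
#endif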