void __ompc_static_init_4 (int global_id, int sched, int *lower,
                           int *upper, int *stride, int inc, int chunk)
{
  int64_t l = (int64_t) *lower;
  int64_t u = (int64_t) *upper;
  int64_t s;
#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Call_Debug)
    __pmp_debug("CALLS_DEBUG", "__ompc_static_init_4 global_id=%d with "
                "sched=%d, lower=%d, upper=%d, inc=%d, chunk=%d\n",
                global_id, sched, *lower, *upper, inc, chunk);
#endif
  __pmp_sample(PMP_PROFILE_OMPC_STATIC_INIT_4);
  __pmp_static_init(global_id, sched, &l, &u, &s, inc, chunk);
  *lower = (int) l;
  *upper = (int) u;
  *stride = (int) s;
#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Call_Debug)
    __pmp_debug("CALLS_DEBUG", "__ompc_static_init_4 global_id=%d returns "
                "lower=%d, upper=%d, stride=%d\n",
                global_id, *lower, *upper, *stride);
#endif
}

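/* Usage sketch (illustrative, not compiler output): a statically scheduled
 * loop such as "#pragma omp for schedule(static)" could be lowered into a
 * call that narrows this thread's iteration bounds, followed by a direct
 * loop over them. "body" is a hypothetical function; the exact use of
 * *stride to step between chunks depends on the schedule kind.
 *
 *   int lo = 0, hi = n - 1, stride = 0;
 *   __ompc_static_init_4(global_id, sched, &lo, &hi, &stride, 1, chunk);
 *   for (int i = lo; i <= hi; i++) {
 *     body(i);
 *   }
 */
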
static void __pmp_shared_catch_segv (pmp_thread_t *thread)
{
  static int32_t installing_segv = 0;
  static int32_t installed_segv = 0;

  /* For Linuxthreads this only needs to be done once, since sigactions are
   * shared across all of the pthreads. I arrange for it to be set up by the
   * first worker thread that is woken up. This transfers SEGV catching
   * responsibility from the serial code in libfoobar to libopenmp as
   * soon as parallelism is employed. */

  if (__pmp_atomic_cmpxchg32(&installing_segv, 0, 1) == 0) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d "
                "is installing the SEGV handler\n", thread->global_id);
    __pmp_catch_segv();
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d "
                "has installed the SEGV handler\n", thread->global_id);
    installed_segv = 1;
  }

  while (installed_segv == 0) {
    /* USER LEVEL SPIN LOCK */
    __pmp_yield();
  }
}

int __ompc_schedule_next_4 (int global_id, int *lower, int *upper, int *inc)
{
  int result;
  int64_t l;
  int64_t u;
  int64_t i;
  __pmp_sample(PMP_PROFILE_OMPC_SCHEDULE_NEXT_4);
  result = __pmp_schedule_next(global_id, &l, &u, &i);
  *lower = (int) l;
  *upper = (int) u;
  *inc = (int) i;
#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Call_Debug) {
    if (result == 0)
      __pmp_debug("CALLS_DEBUG", "__ompc_schedule_next_4 global_id=%d "
                  "returns result=0\n", global_id);
    else
      __pmp_debug("CALLS_DEBUG", "__ompc_schedule_next_4 global_id=%d "
                  "returns lower=%d, upper=%d, inc=%d, result=%d\n",
                  global_id, *lower, *upper, *inc, result);
  }
#endif
  return result;
}

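/* Usage sketch (illustrative): for dynamic and guided schedules the
 * compiler would initialize the scheduler once per loop and then pull
 * chunks until __ompc_schedule_next_4 returns 0. __ompc_scheduler_init_4
 * is defined later in this file; "body" is a hypothetical function.
 *
 *   int l, u, inc;
 *   __ompc_scheduler_init_4(global_id, sched, lo, hi, 1, chunk);
 *   while (__ompc_schedule_next_4(global_id, &l, &u, &inc)) {
 *     for (int i = l; i <= u; i += inc) {
 *       body(i);
 *     }
 *   }
 */
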
void __ompc_copyin_thdprv (int n, ...)
{
  pmp_global_id_t global_id;
  va_list ap;

  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_copyin_thdprv: n=%d\n", n);
  __pmp_sample(PMP_PROFILE_OMPC_COPYIN_THDPRV);

  if (__pmp_get_param()->disabled) {
    return;
  }

  global_id = __pmp_get_current_global_id();
  va_start(ap, n);
  while (n > 0) {
    void *dst = va_arg(ap, void*);
    void *src = va_arg(ap, void*);
    int size = va_arg(ap, int);
    if (dst != src) {
      __pmp_debug(PMP_DEBUG_THREAD, "__ompc_copyin_thdprv: global_id=%d "
                  "dst: %p, src: %p, size: %d\n",
                  global_id, dst, src, size);
      memcpy(dst, src, size);
    }
    n -= 3;
  }
  va_end(ap);
}

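/* Usage sketch (illustrative): the varargs are consumed as (dst, src, size)
 * triples and n counts individual arguments, so copying k threadprivate
 * variables passes n = 3 * k. For one variable:
 *
 *   __ompc_copyin_thdprv(3, &my_copy, &master_copy, (int) sizeof(my_copy));
 */
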
static inline void __pmp_thread_wake (pmp_thread_t *thread)
{
  int32_t sync = __pmp_atomic_cmpxchg32(&thread->sync, PMP_SYNC_IDLE,
                                        PMP_SYNC_UNBLOCKED);
  assert(sync != PMP_SYNC_UNBLOCKED);
  if (sync == PMP_SYNC_BLOCKED) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is being signaled\n",
                thread->global_id);
    thread->sync = PMP_SYNC_IDLE;
    assert(thread->tid != -1);
    __pmp_sample(PMP_PROFILE_THREAD_RESCHEDULE);
#if (defined PMP_USE_PTHREAD_SIGNALS)
    if (pthread_kill(thread->pthread_id, SIGPMP) != 0) {
      __pmp_fatal("unable to wake thread using pthread_kill\n");
    }
#elif (defined PMP_NO_NPTL)
    if (kill(thread->tid, SIGPMP) != 0) {
      __pmp_fatal("unable to wake thread using kill\n");
    }
#else
    if (tkill(thread->tid, SIGPMP) != 0) {
      __pmp_fatal("unable to wake thread using tkill\n");
    }
#endif
  }
  else {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is woken\n",
                thread->global_id);
  }
}

static inline void __pmp_thread_wait (pmp_thread_t *thread)
{
  int32_t sync;
  int thread_spin;
  int i;

  if (thread->sync == PMP_SYNC_UNBLOCKED) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d does not block (1)\n",
                thread->global_id);
    thread->sync = PMP_SYNC_IDLE;
    return;
  }

  thread_spin = __pmp_get_param()->thread_spin;
  for (i = 0; i < thread_spin; i++) {
    /* USER LEVEL SPIN LOOP */
    if (thread->sync == PMP_SYNC_UNBLOCKED) {
      __pmp_debug(PMP_DEBUG_THREAD,
                  "thread global_id %d does not block (2)\n",
                  thread->global_id);
      thread->sync = PMP_SYNC_IDLE;
      return;
    }
    __pmp_yield();
  }

  sync = __pmp_atomic_cmpxchg32(&thread->sync, PMP_SYNC_IDLE,
                                PMP_SYNC_BLOCKED);
  if (sync == PMP_SYNC_IDLE) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is waiting\n",
                thread->global_id);
    __pmp_sample(PMP_PROFILE_THREAD_DESCHEDULE);
#ifdef PMP_USE_PTHREAD_SIGNALS
    {
      int sig;
      do {
        sigwait(&__pmp_manager.mask_block_sigpmp, &sig);
      } while (sig != SIGPMP);
    }
#else
    sigsuspend(&__pmp_manager.mask_unblock_sigpmp);
    /* NOTE: it is unfortunate that sigsuspend does not tell us which
     * signal has been raised. */
#endif
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is awake\n",
                thread->global_id);
  }
  else {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d does not block (3)\n",
                thread->global_id);
    thread->sync = PMP_SYNC_IDLE;
  }
}

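/* Summary of the wake/wait handshake implemented by the two functions
 * above: thread->sync is a three-state flag. A waker moves IDLE ->
 * UNBLOCKED; if it instead finds BLOCKED it resets the flag to IDLE and
 * delivers SIGPMP to the sleeper. A waiter first spins looking for
 * UNBLOCKED (consuming it back to IDLE), then attempts IDLE -> BLOCKED
 * and only sleeps in sigwait/sigsuspend if that compare-and-swap wins.
 * A wake that races ahead of the corresponding wait is therefore never
 * lost: it leaves the UNBLOCKED token behind for the waiter to consume. */
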
void __pmp_thread_create_main (void)
{
  pmp_thread_t *thread = __pmp_get_main_thread();
  int global_id;

  global_id = __pmp_idstack_pop(&__pmp_manager.idstack);
  assert(global_id == 0);

  thread->pthread_id = pthread_self();
#ifdef PMP_NO_NPTL
  thread->tid = getpid();
#else
  thread->tid = gettid();
#endif
  thread->local_id = 0;

#ifndef PMP_NO_TLS
#ifdef PMP_TLS_THREAD
  __pmp_tls_current_thread = thread;
#endif
#ifdef PMP_TLS_LOCAL_ID
  __pmp_tls_current_local_id = 0;
#endif
#ifdef PMP_TLS_GLOBAL_ID
  __pmp_tls_current_global_id = thread->global_id;
#endif
#endif

#ifdef PMP_USE_PTHREAD_SIGNALS
  if (pthread_sigmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp,
                      NULL) != 0) {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }
#else
  if (sigprocmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp, NULL) != 0) {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }
#endif

  if (pthread_setspecific(__pmp_manager.thread_key, (void *) thread) != 0) {
    __pmp_fatal("unable to set thread-specific data\n");
  }

  __pmp_thread_bind(thread);        /* early master bind */

  __pmp_debug(PMP_DEBUG_THREAD, "created main thread global_id %d\n",
              thread->global_id);
  __pmp_debug(PMP_DEBUG_THREAD,
              "__pmp_thread_create_main: tid=%d, pthread_id=0x%08x "
              "global_id=%d, local_id=%d\n",
              (int) thread->tid, (int) thread->pthread_id,
              (int) thread->global_id, (int) thread->local_id);
}

void __ompc_fork (int nthreads, workfunc_t work, void *fp)
{
  pmp_thread_t *master = __pmp_get_current_thread();
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_fork nthreads=%d, work=%p, fp=%p "
              "(nesting depth = %d)\n",
              nthreads, work, fp, master->nesting_depth);
  __pmp_sample(PMP_PROFILE_OMPC_FORK);
  __pmp_memory_fence();
  __pmp_thread_fork(master, nthreads, work, fp);
  __pmp_memory_fence();
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_fork completed "
              "(nesting depth = %d)\n", master->nesting_depth);
}

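/* Usage sketch (illustrative, not compiler output): "#pragma omp parallel"
 * is lowered into an outlined function plus a call to __ompc_fork. Judging
 * from __pmp_thread_work below, the work function receives the worker's
 * global_id and the frame pointer passed to __ompc_fork; the names here
 * are hypothetical.
 *
 *   static void outlined_region(int global_id, void *fp)
 *   {
 *     struct frame *f = (struct frame *) fp;
 *     ... body of the parallel region ...
 *   }
 *
 *   __ompc_fork(nthreads, (workfunc_t) outlined_region, &frame);
 */
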
static inline int __pmp_thread_single (pmp_thread_t *thread)
{
  /* NOTE: the compiler optimizes away OMP singles in the serial code
   * so there is no need to optimize that case here. The case of a team
   * with just one thread is not so common, so don't optimize that path
   * either. The most important case is the n-way single where n > 1.
   * However, it is possible for a thread with no team to call this
   * (e.g. orphaned directive) so in that case one has to check to see
   * if there is a team.
   * TODO: allocate a team of 1 thread in the orphaned case to reduce the
   * amount of special case code throughout the library looking for NULL
   * team pointers. */
  pmp_team_t *team = thread->team;
  if (__pmp_get_team_size(team) == 1) {
    return 1;
  }
  else {
    int32_t thread_count = thread->single_count;
    int32_t team_count = __pmp_atomic_cmpxchg32(&team->single_count,
                                                thread_count,
                                                thread_count + 1);
    thread->single_count++;
    __pmp_debug(PMP_DEBUG_THREAD, "__pmp_thread_single, local_id %d "
                "thread_count=%d team_count=%d %s\n",
                thread->local_id, thread_count, team_count,
                (thread_count == team_count) ? "win" : "lose");
    return (thread_count == team_count);
  }
}

static inline void __pmp_thread_barrier (pmp_thread_t *thread)
{
  /* NOTE: the compiler optimizes away OMP barriers in the serial code
   * so there is no need to optimize that case here. The case of a team
   * with just one thread is not so common, so ideally don't optimize
   * that path either. However, it is currently necessary to check
   * (team != NULL) so one might as well check for the 1-thread team too.
   * The most important case is the n-way barrier where n > 1. */
  pmp_team_t *team = thread->team;
  int team_size = __pmp_get_team_size(team);
  if (team_size > 1) {
    int32_t count = __pmp_atomic_xadd32(&team->barrier_count, -1);
    assert(count > 0);
    __pmp_debug(PMP_DEBUG_THREAD, "thread hits barrier with count of %d\n",
                (int) count);
    if (count > 1) {
      __pmp_thread_wait(thread);
    }
    else {
      pmp_local_id_t local_id = thread->local_id;
      int i;
      team->barrier_count = team_size;
      for (i = 0; i < team_size; i++) {
        pmp_thread_t *t = team->members[i];
        if (i != local_id) {
          __pmp_thread_wake(t);
        }
      }
    }
  }
}

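/* Note on the barrier protocol above: each arriving thread decrements
 * team->barrier_count. All but the last arriver block in __pmp_thread_wait;
 * the last one (count == 1) reinstates barrier_count to team_size for the
 * next barrier and then wakes every other team member. */
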
static void __pmp_thread_release (pmp_team_t *team, pmp_thread_t *master)
{
  pmp_local_id_t old_local_id;
  pmp_idstack_t *idstack = &__pmp_manager.idstack;
  int nworkers = team->team_size - 1;
  int i;

  __pmp_lock(master->global_id, &__pmp_manager.idlock);
  for (i = nworkers; i >= 1; i--) {
    pmp_thread_t *thread = team->members[i];
    assert(thread != master);
    old_local_id = thread->local_id;
    thread->local_id = -1;
    thread->team = NULL;
    __pmp_idstack_push(idstack, thread->global_id);
    __pmp_debug(PMP_DEBUG_THREAD,
                "released thread global_id %d from local_id %d "
                "of team at %p\n", thread->global_id, old_local_id, team);
  }
  __pmp_unlock(master->global_id, &__pmp_manager.idlock);

  __pmp_atomic_add32(&__ompc_cur_numthreads, -nworkers);
  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, nworkers);
}

int __pmp_thread_acquire (int nthreads)
{
  int count = 1;        /* count from 1 to ignore master thread */

  /* NOTE - in the typical case this while construct does not loop */
  while (count < nthreads) {
    int required = nthreads - count;
    int waiting = __pmp_atomic_xadd32(&__pmp_manager.waiting_threads,
                                      -required);
    if (waiting >= required) {
      count += required;
      break;
    }
    else {
      count += waiting;
      required -= waiting;
      __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, required);
      if (__pmp_manager_create_more_threads(required) == 0) {
        break;
      }
    }
  }

  __pmp_atomic_add32(&__ompc_cur_numthreads, count - 1);
  __pmp_debug(PMP_DEBUG_THREAD, "acquired %d out of %d threads\n",
              count, nthreads);
  return count;
}

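/* Worked example (illustrative): with nthreads = 4 and 2 idle threads,
 * required = 3 and the xadd returns 2, driving waiting_threads negative.
 * The code takes the 2 available threads (count = 3), adds the unmet
 * required = 1 back to waiting_threads to undo the over-subtraction, and
 * asks the manager to create 1 more thread. Assuming newly created threads
 * register themselves in waiting_threads, the next loop iteration claims
 * the new thread and count reaches 4. */
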
static inline void __pmp_thread_master_join (pmp_thread_t *master)
{
  pmp_team_t *team = master->team;
  int32_t count;
  int thread_spin = __pmp_get_param()->thread_spin;
  int i;

  /* NOTE: insert a small spin loop here to try to arrange for the master
   * to arrive just after the last worker thread. If this happens
   * then we avoid a much more expensive thread synchronization. */
  for (i = 0; i < thread_spin; i++) {
    /* USER LEVEL SPIN LOOP */
    if (team->working_threads == 1) {
      team->working_threads = 0;
      return;
    }
    __pmp_yield();
  }

  count = __pmp_atomic_xadd32(&team->working_threads, -1);
  __pmp_debug(PMP_DEBUG_THREAD, "master thread joins with count of %d\n",
              (int) count);
  assert(count >= 1);
  if (count > 1) {
    __pmp_thread_wait(master);
  }
}

void __ompc_serialized_parallel (void)
{
  pmp_thread_t *thread = __pmp_get_current_thread();
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_serialized_parallel\n");
  __pmp_sample(PMP_PROFILE_OMPC_SERIALIZED_PARALLEL);
  thread->serialized_parallel++;
}

int __ompc_get_thread_num (void)
{
  int global_id = __pmp_get_current_global_id();
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_get_thread_num returns %d\n",
              global_id);
  __pmp_sample(PMP_PROFILE_OMPC_GET_THREAD_NUM);
  return global_id;
}

void __ompc_ordered (int global_id)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);

  if (__pmp_get_team_size(thread->team) > 1) {
    pmp_loop_t *loop = thread->loop;
    int64_t ticket_number = thread->ticket_number;
    int64_t now_serving;

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Call_Debug)
      __pmp_debug("CALLS_DEBUG", "__ompc_ordered: global_id=%d\n", global_id);
#endif
    __pmp_sample(PMP_PROFILE_OMPC_ORDERED);

    if (loop == NULL || loop->sched <= PMP_SCHED_ORDERED_OFFSET) {
      __pmp_warning("ordered directives must be used inside ordered "
                    "OpenMP loops\n");
      return;
    }

    assert(loop != NULL);
    now_serving = loop->now_serving;
    if (now_serving != ticket_number) {
      if ((loop->inc >= 0) ? (now_serving > ticket_number)
                           : (now_serving < ticket_number)) {
        __pmp_warning("ordered OpenMP loop may result in program deadlock\n");
        __pmp_warning("maybe due to multiple ordered directives "
                      "in a loop iteration\n");
      }
      while (loop->now_serving != ticket_number) {
        /* USER LEVEL SPIN LOOP */
        __pmp_yield();
      }
    }

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Loop_Debug)
      __pmp_debug("LOOPS_DEBUG", "__ompc_ordered: now serving global_id=%d "
                  " ticket_number=%" PRId64 "\n", global_id, ticket_number);
#endif
  }

  __pmp_memory_fence();
}

int __ompc_master (int global_id)
{
  int master = (__pmp_get_thread(global_id)->local_id == 0);
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_master global_id=%d returns %d\n",
              global_id, master);
  __pmp_sample(PMP_PROFILE_OMPC_MASTER);
  return master;
}

int __ompc_single (int global_id)
{
  int result = __pmp_thread_single(__pmp_get_thread(global_id));
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_single global_id=%d returns %d\n",
              global_id, result);
  __pmp_sample(PMP_PROFILE_OMPC_SINGLE);
  return result;
}

void __ompc_barrier (void)
{
  pmp_thread_t *thread = __pmp_get_current_thread();
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_barrier\n");
  __pmp_sample(PMP_PROFILE_OMPC_BARRIER);
  __pmp_memory_fence();
  __pmp_thread_barrier(thread);
}

static inline void __pmp_thread_work (pmp_thread_t *thread)
{
  pmp_team_t *team = thread->team;
  assert(team != NULL);

  __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d, local_id %d, "
              "team at %p has lots of work to do now\n",
              thread->global_id, thread->local_id, team);

  /* NOTE: the id passed to the work function is the global_id. This is
   * passed back to certain library routines as the global_id parameter.
   * It can be used to find the thread structure very quickly. */
  team->work(thread->global_id, team->fp);

  __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d, local_id %d, "
              "team at %p has no more work to do\n",
              thread->global_id, thread->local_id, team);
}

void __ompc_static_fini (void)
{
#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Call_Debug)
    __pmp_debug("CALLS_DEBUG", "__ompc_static_fini\n");
#endif
  __pmp_sample(PMP_PROFILE_OMPC_STATIC_FINI);
  /* no work here, NOTE: does not appear to be called by compiler anyway */
}

int __ompc_get_local_thread_num (void)
{
  int local_id = __pmp_get_current_local_id();
  assert(local_id != -1);
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_get_local_thread_num returns %d\n",
              local_id);
  __pmp_sample(PMP_PROFILE_OMPC_GET_LOCAL_THREAD_NUM);
  return local_id;
}

int __ompc_in_parallel (void)
{
  pmp_thread_t *thread = __pmp_get_current_thread();
  int in_parallel = (thread->team != NULL) ||
                    (thread->serialized_parallel > 0);
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_in_parallel returns %d\n",
              in_parallel);
  __pmp_sample(PMP_PROFILE_OMPC_IN_PARALLEL);
  return in_parallel;
}

void __ompc_static_init_8 (int global_id, int sched, int64_t *lower,
                           int64_t *upper, int64_t *stride, int64_t inc,
                           int64_t chunk)
{
#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Call_Debug)
    __pmp_debug("CALLS_DEBUG", "__ompc_static_init_8 global_id=%d with "
                "sched=%d, lower=%" PRId64 ", upper=%" PRId64
                ", inc=%" PRId64 ", chunk=%" PRId64 "\n",
                global_id, sched, *lower, *upper, inc, chunk);
#endif
  __pmp_sample(PMP_PROFILE_OMPC_STATIC_INIT_8);
  __pmp_static_init(global_id, sched, lower, upper, stride, inc, chunk);
#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Call_Debug)
    __pmp_debug("CALLS_DEBUG", "__ompc_static_init_8 global_id=%d returns "
                "lower=%" PRId64 ", upper=%" PRId64 ", stride=%" PRId64 "\n",
                global_id, *lower, *upper, *stride);
#endif
}

static inline void __pmp_thread_worker_join (pmp_team_t *team)
{
  int32_t count = __pmp_atomic_xadd32(&team->working_threads, -1);
  __pmp_debug(PMP_DEBUG_THREAD, "worker thread joins with count of %d\n",
              (int) count);
  assert(count >= 1);
  if (count == 1) {
    __pmp_thread_wake(team->members[0]);
  }
}

int __ompc_can_fork (void)
{
  int team_size = __pmp_get_new_team_size();
  int has_forked = (__pmp_get_main_thread()->nesting_depth > 0);
  pmp_param_t *param = __pmp_get_param();
  int serial_outline = param->serial_outline;
  int disabled = param->disabled;
  int can_fork = (team_size > 1 || has_forked || serial_outline) &&
                 !disabled;
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_can_fork returns %d\n", can_fork);
  __pmp_sample(PMP_PROFILE_OMPC_CAN_FORK);
  return can_fork;
}

void __pmp_thread_assign (pmp_team_t *team, pmp_thread_t *master,
                          int nthreads)
{
  pmp_idstack_t *idstack = &__pmp_manager.idstack;
  int i;

  __pmp_lock(master->global_id, &__pmp_manager.idlock);
  for (i = 1; i < nthreads; i++) {
    int global_id = __pmp_idstack_pop(idstack);
    pmp_thread_t *thread = __pmp_manager.threads + global_id;
    __pmp_debug(PMP_DEBUG_THREAD,
                "assigning thread global_id %d to local_id %d "
                "of team at %p\n", thread->global_id, i, team);
    assert(thread->global_id == global_id);
    assert(thread->local_id == -1);
    assert(thread->team == NULL);
    thread->local_id = i;
    thread->team = team;
    team->members[i] = thread;
    __pmp_debug(PMP_DEBUG_THREAD,
                "assigned thread global_id %d to local_id %d "
                "of team at %p\n", thread->global_id, thread->local_id,
                team);
  }
  __pmp_unlock(master->global_id, &__pmp_manager.idlock);

  for (i = 1; i < nthreads; i++) {
    /* This is pulled outside the idlock because waking a thread typically
     * involves an inter-CPU synchronization which is relatively
     * expensive. */
    __pmp_thread_wake(team->members[i]);
  }
}

void __ompc_scheduler_init_8 (int global_id, int sched, int64_t lower,
                              int64_t upper, int64_t inc, int64_t chunk)
{
#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Call_Debug)
    __pmp_debug("CALLS_DEBUG", "__ompc_scheduler_init_8 global_id=%d with "
                "sched=%d, lower=%" PRId64 ", upper=%" PRId64
                ", inc=%" PRId64 ", chunk=%" PRId64 "\n",
                global_id, sched, lower, upper, inc, chunk);
#endif
  __pmp_sample(PMP_PROFILE_OMPC_SCHEDULER_INIT_8);
  __pmp_scheduler_init(global_id, sched, lower, upper, inc, chunk);
}

void __ompc_end_ordered (int global_id)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);

  __pmp_memory_fence();

  if (__pmp_get_team_size(thread->team) > 1) {
    pmp_loop_t *loop = thread->loop;
    int64_t ticket_number = thread->ticket_number;

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Call_Debug)
      __pmp_debug("CALLS_DEBUG", "__ompc_end_ordered: global_id=%d\n",
                  global_id);
#endif
    __pmp_sample(PMP_PROFILE_OMPC_END_ORDERED);

    if (loop == NULL || loop->sched <= PMP_SCHED_ORDERED_OFFSET) {
      if (thread->global_id == 0)
        __pmp_warning("ordered directives must be used inside ordered "
                      "OpenMP loops\n");
      return;
    }

    assert(loop != NULL);
    assert(loop->now_serving == ticket_number);

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Loop_Debug)
      __pmp_debug("LOOPS_DEBUG", "__ompc_end_ordered: stop serving "
                  "global_id=%d ticket_number=%" PRId64 "\n",
                  global_id, ticket_number);
#endif

    loop->now_serving += loop->inc;
    thread->ticket_number = ticket_number + loop->inc;
  }
}

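/* Usage sketch (illustrative): inside an ordered loop each iteration's
 * ordered region is bracketed by __ompc_ordered and __ompc_end_ordered.
 * The ticket_number/now_serving pair releases iterations in sequence.
 *
 *   while (__ompc_schedule_next_4(global_id, &l, &u, &inc)) {
 *     for (int i = l; i <= u; i += inc) {
 *       __ompc_ordered(global_id);
 *       ... ordered section ...
 *       __ompc_end_ordered(global_id);
 *     }
 *   }
 */
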
void __ompc_scheduler_init_4 (int global_id, int sched, int lower,
                              int upper, int inc, int chunk)
{
#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Call_Debug)
    __pmp_debug("CALLS_DEBUG", "__ompc_scheduler_init_4 global_id=%d with "
                "sched=%d, lower=%d, upper=%d, inc=%d, chunk=%d\n",
                global_id, sched, lower, upper, inc, chunk);
#endif
  __pmp_sample(PMP_PROFILE_OMPC_SCHEDULER_INIT_4);
  __pmp_scheduler_init(global_id, sched, (int64_t) lower, (int64_t) upper,
                       (int64_t) inc, (int64_t) chunk);
}