void __ompc_ordered (int global_id)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);

  if (__pmp_get_team_size(thread->team) > 1) {
    pmp_loop_t *loop = thread->loop;
    int64_t ticket_number = thread->ticket_number;
    int64_t now_serving;

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Call_Debug)
      __pmp_debug("CALLS_DEBUG", "__ompc_ordered: global_id=%d\n", global_id);
#endif

    __pmp_sample(PMP_PROFILE_OMPC_ORDERED);

    if (loop == NULL || loop->sched <= PMP_SCHED_ORDERED_OFFSET) {
      __pmp_warning("ordered directives must be used inside ordered "
                    "OpenMP loops\n");
      return;
    }

    now_serving = loop->now_serving;

    if (now_serving != ticket_number) {
      if ((loop->inc >= 0) ? (now_serving > ticket_number)
                           : (now_serving < ticket_number)) {
        __pmp_warning("ordered OpenMP loop may result in program deadlock\n");
        __pmp_warning("maybe due to multiple ordered directives "
                      "in a loop iteration\n");
      }
      while (loop->now_serving != ticket_number) {
        /* USER LEVEL SPIN LOOP */
        __pmp_yield();
      }
    }

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Loop_Debug)
      __pmp_debug("LOOPS_DEBUG", "__ompc_ordered: now serving global_id=%d "
                  "ticket_number=%" PRId64 "\n", global_id, ticket_number);
#endif
  }

  __pmp_memory_fence();
}
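/* A self-contained sketch of the ticket protocol implemented by
 * __ompc_ordered / __ompc_end_ordered: each thread spins until the loop's
 * now_serving counter reaches its own ticket, and the exit path advances
 * now_serving by the loop increment to hand off to the next iteration's
 * owner.  Names here are illustrative, not part of the runtime; guarded
 * out so it does not affect the build. */
#if 0
typedef struct {
  volatile int64_t now_serving;  /* ticket currently allowed to proceed */
  int64_t inc;                   /* loop increment */
} example_ordered_t;

static void example_ordered_enter (example_ordered_t *o, int64_t ticket)
{
  while (o->now_serving != ticket) {
    /* user-level spin; the real runtime yields the CPU here */
  }
}

static void example_ordered_exit (example_ordered_t *o, int64_t ticket)
{
  o->now_serving = ticket + o->inc;  /* release the next iteration */
}
#endif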
static void __pmp_thread_bind (pmp_thread_t *thread)
{
  /* TODO: use dynamic information to bind threads appropriately */
  pmp_param_t *param = __pmp_get_param();

  if (param->enable_affinity) {
    int cpu;
    int index = param->global_affinity ? thread->global_id : thread->local_id;
    assert(index < PMP_MAX_THREADS);
    cpu = param->thread_to_cpu_map[index];
    assert(cpu < param->machine_num_cpus);

    if (thread->cpu != cpu) {
      static bool __pmp_enable_affinity_warning = true;
      int e;

      if (__pmp_manager.params != NULL) {
        thread->param = &__pmp_manager.params[cpu];
      } else {
        thread->param = &__pmp_param;
      }

      e = __pmp_set_affinity(cpu);

      __pmp_debug(PMP_DEBUG_THREAD, "__pmp_thread_bind: global_id=%d, "
                  "local_id=%d, CPU=%d, param=%p\n", thread->global_id,
                  thread->local_id, cpu, thread->param);

      if (e != 0 && __pmp_enable_affinity_warning) {
        __pmp_warning("failed to set affinity\n");
        __pmp_warning("maybe the kernel does not support "
                      "affinity system calls\n");
        __pmp_enable_affinity_warning = false;
      }

      thread->cpu = cpu;
    }

    /* TODO: give the thread an opportunity to move to its bound CPU
     * before continuing? Currently just do a __pmp_yield(). It is not
     * clear if this is necessary or sufficient. */
    __pmp_yield();
  }
}
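/* A minimal sketch of what __pmp_set_affinity might wrap on Linux, assuming
 * the sched_setaffinity(2) interface; the actual implementation is
 * platform-specific.  Guarded out so it does not affect the build. */
#if 0
#define _GNU_SOURCE
#include <sched.h>

/* Pin the calling thread to a single CPU; returns 0 on success. */
static int example_set_affinity (int cpu)
{
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  /* pid 0 means "the calling thread" for sched_setaffinity */
  return sched_setaffinity(0, sizeof(set), &set);
}
#endif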
static inline void __pmp_scheduler_init (int global_id, int sched,
                                         int64_t lower, int64_t upper,
                                         int64_t inc, int64_t chunk)
{
  /* NOTE: the chunk parameter is undefined/unused for static even
   * scheduling */
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  pmp_param_t *param = __pmp_get_param();
  int64_t min_chunk = MAX(1, chunk);

  if (sched == PMP_SCHED_RUNTIME || sched == PMP_SCHED_ORDERED_RUNTIME) {
    int old = sched;
    sched = param->runtime_schedule;
    chunk = param->runtime_chunk;
    if (old == PMP_SCHED_ORDERED_RUNTIME) {
      sched += PMP_SCHED_ORDERED_OFFSET;
    }
  }

  if (sched == PMP_SCHED_GUIDED || sched == PMP_SCHED_ORDERED_GUIDED) {
    /* The initial chunk is the loop trip count spread over the number
     * of threads (the division is rounded up). */
    int team_size = __pmp_get_team_size(thread->team);
    int64_t divisor = team_size * param->guided_chunk_divisor;
    chunk = (upper - lower + divisor) / divisor;
    chunk = MIN(param->guided_chunk_max, chunk);
    chunk = MAX(min_chunk, chunk);
  }

  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("chunk size is non-positive; using default chunk size "
                    "of 1\n");
    chunk = 1;
  }

  __pmp_scheduler_sample(sched);

  assert(inc != 0 && chunk != 0 && min_chunk != 0);
  __pmp_loop_alloc(thread, sched, lower, upper, inc, chunk, min_chunk);
  thread->iteration = 0;
}
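/* Worked example of the guided-schedule initial chunk computation above,
 * with illustrative values (not taken from the runtime): 1000 iterations,
 * 4 threads, guided_chunk_divisor = 2.  The rounded-up division is then
 * clamped to [min_chunk, guided_chunk_max]. */
#if 0
  int64_t lower = 0, upper = 999;                      /* 1000 iterations, inc = 1 */
  int     team_size = 4;
  int64_t guided_chunk_divisor = 2;                    /* hypothetical value */
  int64_t guided_chunk_max = 512, min_chunk = 1;
  int64_t divisor = team_size * guided_chunk_divisor;  /* 8 */
  int64_t chunk = (upper - lower + divisor) / divisor; /* ceil(1000/8) = 125 */
  chunk = MIN(guided_chunk_max, chunk);                /* still 125 */
  chunk = MAX(min_chunk, chunk);                       /* still 125 */
#endif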
void __ompc_end_ordered (int global_id)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);

  __pmp_memory_fence();

  if (__pmp_get_team_size(thread->team) > 1) {
    pmp_loop_t *loop = thread->loop;
    int64_t ticket_number = thread->ticket_number;

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Call_Debug)
      __pmp_debug("CALLS_DEBUG", "__ompc_end_ordered: global_id=%d\n",
                  global_id);
#endif

    __pmp_sample(PMP_PROFILE_OMPC_END_ORDERED);

    if (loop == NULL || loop->sched <= PMP_SCHED_ORDERED_OFFSET) {
      if (thread->global_id == 0)
        __pmp_warning("ordered directives must be used inside ordered "
                      "OpenMP loops\n");
      return;
    }

    assert(loop->now_serving == ticket_number);

#ifdef SUPER_DEBUG
    if (Enabled_Libomp_Loop_Debug)
      __pmp_debug("LOOPS_DEBUG", "__ompc_end_ordered: stop serving "
                  "global_id=%d ticket_number=%" PRId64 "\n",
                  global_id, ticket_number);
#endif

    loop->now_serving += loop->inc;
    thread->ticket_number = ticket_number + loop->inc;
  }
}
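/* Hedged sketch of how a compiler might lower "#pragma omp ordered" inside
 * an ordered loop body: one matched __ompc_ordered / __ompc_end_ordered
 * pair per iteration.  A second pair in the same iteration is exactly what
 * the deadlock warning in __ompc_ordered guards against. */
#if 0
  for (i = lower; i <= upper; i += inc) {  /* chunk assigned by the scheduler */
    /* ... unordered part of the iteration ... */
    __ompc_ordered(global_id);
    /* ... ordered part, executed in original iteration order ... */
    __ompc_end_ordered(global_id);
  }
#endif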
void __pmp_catch_segv (void)
{
#define ALT_STACK_SIZE 8192
  void *new_stack = malloc(ALT_STACK_SIZE);
  stack_t ss = {
    .ss_sp = new_stack,
    .ss_flags = 0,
    .ss_size = ALT_STACK_SIZE
  };
  struct sigaction sa;

  if (new_stack == NULL) {
    perror("malloc");
    exit(1);
  }

  memset(&sa, 0, sizeof(sa));
  sa.sa_sigaction = __pmp_segv;
  sa.sa_flags = SA_RESETHAND | SA_ONSTACK | SA_SIGINFO | SA_NODEFER;

  if (sigaltstack(&ss, NULL) == -1) {
    perror("sigaltstack");
    exit(1);
  }
  if (sigaction(SIGSEGV, &sa, NULL) == -1) {
    perror("sigaction");
    exit(1);
  }
}

#define STACK_MIN 65536

#define dprint(...)                   \
  do {                                \
    if (verbose) {                    \
      fprintf(stderr, __VA_ARGS__);   \
    }                                 \
  } while (0)

int __pmp_get_stack_size_limit (const char *new_limit, int64_t *max_stack_ptr,
                                int nthreads)
{
  struct rlimit rl;
  int64_t ncpus;
  int64_t phys_mem;
  int64_t max_stack;
  int verbose = getenv("PSC_STACK_VERBOSE") != NULL;

  *max_stack_ptr = 0;

  if (getrlimit(RLIMIT_STACK, &rl) == -1) {
    __pmp_warning("could not calculate your stack size limit\n");
    return -1;
  }

  if (rl.rlim_cur == RLIM_INFINITY) {
    dprint("No stack size limits currently in place\n");
  } else {
    dprint("Stack size limits: %ld current, %ld maximum\n",
           (long) rl.rlim_cur, (long) rl.rlim_max);
  }

#if defined(BUILD_OS_DARWIN)
  phys_mem = get_sysctl_int("hw.memsize");
  ncpus = get_sysctl_int(SYSCTL_NPROCESSORS_ONLN);
#elif defined(__NetBSD__)
  phys_mem = get_sysctl_int("hw.physmem64");
  ncpus = sysconf(_SC_NPROCESSORS_ONLN);
#else /* neither BUILD_OS_DARWIN nor __NetBSD__ */
  phys_mem = (int64_t) sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE);
  ncpus = sysconf(_SC_NPROCESSORS_ONLN);
#endif /* defined(BUILD_OS_DARWIN) */

  nthreads = (nthreads > 0) ? nthreads : ncpus;

  dprint("Physical memory: %" PRId64 " bytes\n", phys_mem);
  dprint("Number of CPUs: %" PRId64 "\n", ncpus);
  dprint("Default number of threads per team: %d\n", nthreads);

  max_stack = phys_mem - 64LL * 1048576;
  if (max_stack > ULONG_MAX) {
    max_stack = ULONG_MAX;
  }
  if (phys_mem > 512LL * 1048576) {
    max_stack -= 128LL * 1048576 * ncpus;
  } else {
    max_stack -= (phys_mem >> 3) * ncpus;
  }
  if (nthreads > 1) {
    /* share max_stack over nthreads for OpenMP programs */
    max_stack /= nthreads;
  }

  dprint("Automatic maximum stack size limit: %" PRId64
         " (%" PRId64 "%% of RAM)\n",
         max_stack, (max_stack * 100) / phys_mem);

  if (new_limit && *new_limit) {
    double max;
    char *end;

    errno = 0;
    max = strtod(new_limit, &end);
    if (errno == ERANGE) {
      __pmp_warning("your requested stack size limit of "
                    "\"%s\" is not well-formed\n", new_limit);
      return -1;
    }

    switch (tolower((unsigned char) *end)) {
    case 'k': max *= 1024; break;
    case 'm': max *= 1048576; break;
    case 'g': max *= 1073741824; break;
    case '%': max = phys_mem * max / 100; break;
    default: break;
    }
    if (*end && strcasecmp(end + 1, "/cpu") == 0) {
      max *= ncpus;
    }

    if (isinf(max)) {
      max_stack = RLIM_INFINITY;
    } else if (max < 0) {
      max_stack = phys_mem + max;
    } else {
      max_stack = max;
    }

    if (max_stack != RLIM_INFINITY && max_stack < STACK_MIN) {
      __pmp_warning("bad maximum stack size "
                    "limit of %" PRId64 " (specified as \"%s\")\n",
                    max_stack, new_limit);
      return -1;
    }
    if (max_stack > ULONG_MAX) {
      __pmp_warning("treating requested stack "
                    "size limit of %" PRId64 " as no limit\n", max_stack);
      max_stack = RLIM_INFINITY;
    }

    if (max_stack == RLIM_INFINITY) {
      dprint("You have asked for no stack size limit\n");
    } else {
      dprint("You have asked for a stack size limit of "
             "%" PRId64 " (%" PRId64 "%% of RAM)\n",
             max_stack, (max_stack * 100) / phys_mem);
    }
    if (max_stack > phys_mem) {
      __pmp_warning("your requested stack "
                    "size limit is %" PRId64 "%% of physical memory\n",
                    (max_stack * 100) / phys_mem);
    }
  } else if (rl.rlim_cur == RLIM_INFINITY || rl.rlim_cur > max_stack) {
    dprint("Will not automatically reduce stack size limit\n");
  }

  if (max_stack > rl.rlim_max) {
    max_stack = rl.rlim_max;
  }

  *max_stack_ptr = max_stack;
  return 0;
}
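/* Hypothetical callers of __pmp_get_stack_size_limit, illustrating the
 * limit syntax parsed above: a plain byte count optionally scaled by a
 * k/m/g suffix, a percentage of physical memory, or either form scaled
 * by the online CPU count with a "/cpu" suffix. */
#if 0
  int64_t max_stack;
  __pmp_get_stack_size_limit("256m", &max_stack, 4);     /* 256 MB */
  __pmp_get_stack_size_limit("25%", &max_stack, 0);      /* 25% of RAM */
  __pmp_get_stack_size_limit("64m/cpu", &max_stack, 0);  /* 64 MB * ncpus */
#endif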
static inline void __pmp_static_init (int global_id, int sched,
                                      int64_t *lowerp, int64_t *upperp,
                                      int64_t *stridep, int64_t inc,
                                      int64_t chunk)
{
  /* NOTE: the chunk parameter is undefined/unused for static even
   * scheduling */
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t loop_lower = *lowerp;
  int64_t loop_upper = *upperp;
  int64_t lower;
  int64_t upper;

  assert(team_size > 0);

  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("chunk size is non-positive; using default chunk size "
                    "of 1\n");
    chunk = 1;
  }

  if (team_size == 1) {
    *stridep = (inc > 0) ? (loop_upper - loop_lower + 1)
                         : (loop_upper - loop_lower - 1);
  } else {
    pmp_local_id_t local_id = thread->local_id;
    int64_t stride;

    switch (sched) {
    case PMP_SCHED_STATIC_EVEN: {
      int64_t size = (loop_upper - loop_lower) / inc + 1;
      assert(size >= 0);
      if (!__pmp_get_param()->static_fair) {
        /* The size is divided by the team_size and rounded up to give
         * the chunk size. Chunks of this size are assigned to threads
         * in increasing local_id order. If the division was not exact
         * then the last thread will have fewer iterations, and possibly
         * none at all. */
        chunk = (size + team_size - 1) / team_size;
        lower = loop_lower + (local_id * chunk * inc);
      } else {
        /* The size is divided by the team_size and rounded down to
         * give the chunk. Each thread will have at least this many
         * iterations. If the division was not exact then the remainder
         * iterations are scheduled across the threads in increasing
         * thread order. Note that the difference between the minimum
         * and maximum number of iterations assigned to the threads
         * across the team is at most 1. The maximum number of iterations
         * assigned to a thread (the worst case path through the schedule)
         * is the same as for the default behavior. */
        int64_t remainder;
        int64_t index;
        chunk = size / team_size;
        remainder = size - (chunk * team_size);
        index = MIN(local_id, remainder) * (chunk + 1);
        if (local_id > remainder) {
          index += (local_id - remainder) * chunk;
        }
        lower = loop_lower + (index * inc);
        chunk += (local_id < remainder);
      }
      if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
        upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
        stride = size * inc;
      } else {
        /* If the entire set of iterations falls outside the loop bounds
         * then arrange for a non-iterating loop which will not trigger
         * the LASTPRIVATE check made by the compiler. This means that
         * the final value of the loop induction variable must not exceed
         * the loop upper bound. */
        lower = loop_lower - inc;
        upper = lower - inc;
        stride = inc;
      }
      __pmp_loop_analyser(thread, sched, global_id, local_id,
                          loop_lower, loop_upper, lower, upper,
                          inc, chunk, stride);
      break;
    }

    case PMP_SCHED_STATIC: {
      stride = chunk * inc;
      lower = loop_lower + (local_id * stride);
      if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
        upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
        stride *= team_size;
      } else {
        /* If the entire set of iterations falls outside the loop bounds
         * then arrange for a non-iterating loop which will not trigger
         * the LASTPRIVATE check made by the compiler. This means that
         * the final value of the loop induction variable must not exceed
         * the loop upper bound. */
        lower = loop_lower - inc;
        upper = lower - inc;
        stride = inc;
      }
      __pmp_loop_analyser(thread, sched, global_id, local_id,
                          loop_lower, loop_upper, lower, upper,
                          inc, chunk, stride);
      break;
    }

    default: {
      __pmp_fatal("unknown static scheduling type %d\n", sched);
      stride = 0;
      lower = loop_lower;
      upper = loop_upper;
      break;
    }
    }

    *lowerp = lower;
    *upperp = upper;
    *stridep = stride;
  }

  __pmp_scheduler_sample(sched);
}
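/* Worked example of the "static_fair" split above: 10 iterations over 4
 * threads gives chunk = 10/4 = 2 with remainder 2, so threads 0 and 1 get
 * 3 iterations and threads 2 and 3 get 2.  Illustrative values only. */
#if 0
  int64_t size = 10, team_size = 4;
  int64_t chunk = size / team_size;              /* 2 */
  int64_t remainder = size - chunk * team_size;  /* 2 */
  for (int64_t local_id = 0; local_id < team_size; local_id++) {
    int64_t index = MIN(local_id, remainder) * (chunk + 1);
    if (local_id > remainder) {
      index += (local_id - remainder) * chunk;
    }
    int64_t my_chunk = chunk + (local_id < remainder);
    /* thread 0: index 0, 3 iterations; thread 1: index 3, 3 iterations;
     * thread 2: index 6, 2 iterations; thread 3: index 8, 2 iterations */
  }
#endif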
void __pmp_thread_create (pmp_thread_t *thread)
{
  pmp_thread_t *creator = __pmp_get_current_thread();
  pthread_t pthread_id;
  int result;
  pmp_param_t *param = __pmp_get_param();

  thread->creator = creator;

  if (param->thread_guard_size > 0) {
    void *guard;

    /* NOTE: this lock is to give a better chance of the guard page
     * allocation immediately following the pthread stack allocation. */
    __pmp_lock(thread->global_id, &__pmp_manager.pthread_create_lock);

    /* NOTE: it seems that mmap tends to allocate in an upwards direction,
     * so allocate the guard page first. */
    guard = mmap(0, param->thread_guard_size, PROT_NONE,
#if defined(BUILD_OS_DARWIN)
                 MAP_PRIVATE | MAP_ANON,
#else /* defined(BUILD_OS_DARWIN) */
                 MAP_PRIVATE | MAP_ANONYMOUS,
#endif /* defined(BUILD_OS_DARWIN) */
                 -1, 0);

    if (guard == MAP_FAILED) {
      __pmp_warning("unable to allocate a guard page of %ld bytes\n",
                    (long) param->thread_guard_size);
    } else {
      __pmp_debug(PMP_DEBUG_THREAD, "guard page allocated at address %p\n",
                  guard);
      thread->guard_page = guard;
    }
  }

  if ((result = pthread_create(&pthread_id, &__pmp_manager.pthread_attr,
                               __pmp_thread_run, thread)) != 0) {
    if (__pmp_manager.allocated_threads > param->initial_team_size) {
      __pmp_warning(
        "pthread_create failed when trying to allocate thread %d\n",
        __pmp_manager.allocated_threads);
      __pmp_warning(
        "note this is more than the initial number of threads (%d)\n",
        param->initial_team_size);
#if defined(BUILD_OS_DARWIN)
      if (sizeof(long) == 4)
#else /* defined(BUILD_OS_DARWIN) */
      if (__WORDSIZE == 32)
#endif /* defined(BUILD_OS_DARWIN) */
      {
        int64_t total_stack = ((int64_t) param->thread_stack_size) *
                              ((int64_t) __pmp_manager.allocated_threads);
        if (total_stack > 0x40000000LL) {
          __pmp_warning(
            "the failure may be due to excessive thread stack size\n");
          __pmp_warning(
            "try using a smaller setting for PSC_OMP_STACK_SIZE\n");
        }
      }
    }
    __pmp_fatal("unable to create thread (result code %d)\n", result);
  }

  if (param->thread_guard_size > 0) {
    __pmp_unlock(thread->global_id, &__pmp_manager.pthread_create_lock);
  }

  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, 1);

  __pmp_debug(PMP_DEBUG_THREAD, "created thread global_id %d\n",
              thread->global_id);
}
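/* A minimal sketch of the guard-page technique used above: reserve an
 * inaccessible region so that a stack overrun faults immediately instead
 * of silently corrupting adjacent memory.  Assumes POSIX mmap; names are
 * illustrative and error handling is trimmed for brevity. */
#if 0
#include <sys/mman.h>
#include <unistd.h>

static void *example_alloc_guard_page (void)
{
  long page = sysconf(_SC_PAGESIZE);
  /* PROT_NONE: any read, write, or execute on this page raises SIGSEGV */
  void *guard = mmap(NULL, (size_t) page, PROT_NONE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  return (guard == MAP_FAILED) ? NULL : guard;
}
#endif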