/*
 * Join the master thread with its team.
 *
 * The master first polls team->working_threads for at most
 * param->thread_spin rounds, yielding between polls.  If it ever sees the
 * counter at exactly 1 it clears the counter and returns immediately,
 * which avoids the more expensive blocking synchronization below.
 *
 * If the spin budget runs out, the counter is atomically decremented and
 * the master blocks in __pmp_thread_wait() whenever the value obtained from
 * the atomic add is greater than 1.
 * NOTE(review): this presumes __pmp_atomic_xadd32 returns the value held
 * before the add -- confirm against its definition.
 */
static inline void __pmp_thread_master_join (pmp_thread_t *master)
{
  pmp_team_t *team = master->team;
  const int spin_limit = __pmp_get_param()->thread_spin;
  int32_t previous;
  int spins = 0;

  /* USER LEVEL SPIN LOOP: try to arrive just after the last worker. */
  while (spins < spin_limit) {
    if (team->working_threads == 1) {
      team->working_threads = 0;
      return;
    }
    __pmp_yield();
    spins++;
  }

  previous = __pmp_atomic_xadd32(&team->working_threads, -1);
  __pmp_debug(PMP_DEBUG_THREAD, "master thread joins with count of %d\n",
              (int) previous);
  assert(previous >= 1);

  if (previous <= 1) {
    /* Nobody else was still counted as working: no need to block. */
    return;
  }
  __pmp_thread_wait(master);
}
/*
 * Copy blocks of data for the calling thread (the "copyin" entry point).
 *
 * n    : number of variadic arguments that follow.  They are consumed in
 *        groups of three: (void *dst, void *src, int size).
 *
 * For every group whose dst differs from src, size bytes are copied from
 * src to dst.  Nothing is done when the runtime is disabled
 * (param->disabled).
 *
 * Only complete triples are consumed: if n is not a multiple of three the
 * trailing arguments are ignored rather than read past the end of the
 * argument list.  A non-positive size copies nothing.
 */
void __ompc_copyin_thdprv (int n, ...)
{
  pmp_global_id_t global_id;
  va_list ap;

  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_copyin_thdprv: n=%d\n", n);
  __pmp_sample(PMP_PROFILE_OMPC_COPYIN_THDPRV);

  if (__pmp_get_param()->disabled) {
    return;
  }

  global_id = __pmp_get_current_global_id();

  va_start(ap, n);
  /* Calling va_arg with no remaining argument is undefined behaviour, so
   * stop as soon as fewer than three arguments are left. */
  while (n >= 3) {
    void *dst = va_arg(ap, void*);
    void *src = va_arg(ap, void*);
    int size = va_arg(ap, int);

    if (dst != src) {
      __pmp_debug(PMP_DEBUG_THREAD, "__ompc_copyin_thdprv: global_id=%d "
                  "dst: %p, src: %p, size: %d\n", global_id, dst, src, size);
      /* A negative int would be converted to an enormous size_t. */
      if (size > 0) {
        memcpy(dst, src, (size_t) size);
      }
    }
    n -= 3;
  }
  va_end(ap);
}
/*
 * Wait until the given thread has been unblocked, using thread->sync as
 * the handshake word.
 *
 * The function proceeds in three stages:
 *   1. If thread->sync is already PMP_SYNC_UNBLOCKED, reset it to
 *      PMP_SYNC_IDLE and return without blocking.
 *   2. Poll thread->sync for up to param->thread_spin rounds, yielding
 *      between polls; return (after resetting to PMP_SYNC_IDLE) as soon as
 *      PMP_SYNC_UNBLOCKED is seen.
 *   3. Atomically compare-and-exchange thread->sync from PMP_SYNC_IDLE to
 *      PMP_SYNC_BLOCKED.  If the value obtained is PMP_SYNC_IDLE the thread
 *      blocks waiting for a signal; otherwise it resets thread->sync to
 *      PMP_SYNC_IDLE and returns without blocking.
 *
 * NOTE(review): thread->sync is not written here after waking from the
 * signal wait; presumably the waker restores it -- confirm.
 */
static inline void __pmp_thread_wait (pmp_thread_t *thread)
{
  int32_t sync;
  int thread_spin;
  int i;

  /* Stage 1: already unblocked before we even started waiting. */
  if (thread->sync == PMP_SYNC_UNBLOCKED) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d does not block (1)\n",
                thread->global_id);
    thread->sync = PMP_SYNC_IDLE;
    return;
  }

  thread_spin = __pmp_get_param()->thread_spin;

  /* Stage 2: bounded user-level spin loop. */
  for (i = 0; i < thread_spin; i++) {
    /* USER LEVEL SPIN LOOP */
    if (thread->sync == PMP_SYNC_UNBLOCKED) {
      __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d does not block (2)\n",
                  thread->global_id);
      thread->sync = PMP_SYNC_IDLE;
      return;
    }
    __pmp_yield();
  }

  /* Stage 3: announce that we are about to block. */
  sync = __pmp_atomic_cmpxchg32(&thread->sync, PMP_SYNC_IDLE,
                                PMP_SYNC_BLOCKED);

  if (sync == PMP_SYNC_IDLE) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is waiting\n",
                thread->global_id);

    __pmp_sample(PMP_PROFILE_THREAD_DESCHEDULE);

#ifdef PMP_USE_PTHREAD_SIGNALS
    {
      /* Keep waiting until the signal delivered is SIGPMP.
       * NOTE(review): the return value of sigwait is not checked, so sig
       * is compared even if the call failed -- confirm this is intended. */
      int sig;
      do {
        sigwait(&__pmp_manager.mask_block_sigpmp, &sig);
      } while (sig != SIGPMP);
    }
#else
    sigsuspend(&__pmp_manager.mask_unblock_sigpmp);
    /* NOTE: it is unfortunate that sigsuspend does not tell us which
     *       signal has been raised. */
#endif
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is awake\n",
                thread->global_id);
  }
  else {
    /* The sync word was no longer PMP_SYNC_IDLE when we tried to block. */
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d does not block (3)\n",
                thread->global_id);
    thread->sync = PMP_SYNC_IDLE;
  }
}
int __ompc_can_fork (void) { int team_size = __pmp_get_new_team_size(); int has_forked = (__pmp_get_main_thread()->nesting_depth > 0); pmp_param_t *param = __pmp_get_param(); int serial_outline = param->serial_outline; int disabled = param->disabled; int can_fork = (team_size > 1 || has_forked || serial_outline) && !disabled; __pmp_debug(PMP_DEBUG_CALLS, "__ompc_can_fork returns %d\n", can_fork); __pmp_sample(PMP_PROFILE_OMPC_CAN_FORK); return can_fork; }
static inline void __pmp_scheduler_init (int global_id, int sched, int64_t lower, int64_t upper, int64_t inc, int64_t chunk) { /* NOTE: chunk parameter is undefined/unused for static even scheduling */ pmp_thread_t *thread = __pmp_get_thread(global_id); pmp_param_t *param = __pmp_get_param(); int64_t min_chunk = MAX(1, chunk); if (sched == PMP_SCHED_RUNTIME || sched == PMP_SCHED_ORDERED_RUNTIME) { int old = sched; sched = param->runtime_schedule; chunk = param->runtime_chunk; if (old == PMP_SCHED_ORDERED_RUNTIME) { sched += PMP_SCHED_ORDERED_OFFSET; } } if (sched == PMP_SCHED_GUIDED || sched == PMP_SCHED_ORDERED_GUIDED) { /* The initial chunk is loop trip count spread over the number of * threads (the division is rounded up) */ int team_size = __pmp_get_team_size(thread->team); int64_t divisor = team_size * param->guided_chunk_divisor; chunk = (upper - lower + divisor) / divisor; chunk = MIN(param->guided_chunk_max, chunk); chunk = MAX(min_chunk, chunk); } if (chunk <= 0) { if (thread->global_id == 0) __pmp_warning("Chunk size is non-positive, set to default '1'\n"); chunk = 1; } __pmp_scheduler_sample(sched); assert(inc != 0 && chunk != 0 && min_chunk != 0); __pmp_loop_alloc(thread, sched, lower, upper, inc, chunk, min_chunk); thread->iteration = 0; }
/*
 * Bind the calling thread to the CPU chosen for it by the affinity map.
 *
 * Does nothing unless param->enable_affinity is set.  The map index is the
 * thread's global_id when param->global_affinity is set, otherwise its
 * local_id.  When the mapped CPU differs from thread->cpu, thread->param is
 * pointed at __pmp_manager.params[cpu] (or at __pmp_param when no such
 * array exists), the affinity is set, and thread->cpu is updated.  A
 * failure to set affinity is reported once only.
 */
static void __pmp_thread_bind (pmp_thread_t *thread)
{
  /* TODO : use dynamic information to bind threads appropriately */
  static bool warn_on_failure = true;
  pmp_param_t *cfg = __pmp_get_param();
  int slot;
  int target_cpu;

  if (!cfg->enable_affinity) {
    return;
  }

  slot = cfg->global_affinity ? thread->global_id : thread->local_id;
  assert(slot < PMP_MAX_THREADS);

  target_cpu = cfg->thread_to_cpu_map[slot];
  assert(target_cpu < cfg->machine_num_cpus);

  if (thread->cpu != target_cpu) {
    int status;

    if (__pmp_manager.params == NULL) {
      thread->param = &__pmp_param;
    }
    else {
      thread->param = &__pmp_manager.params[target_cpu];
    }

    status = __pmp_set_affinity(target_cpu);
    __pmp_debug(PMP_DEBUG_THREAD, "__pmp_thread_bind: global_id=%d, "
                "local_id=%d, CPU=%d, param=%p\n",
                thread->global_id, thread->local_id, target_cpu,
                thread->param);

    if (status != 0 && warn_on_failure) {
      __pmp_warning("failed to set affinity\n");
      __pmp_warning("maybe the kernel does not support "
                    "affinity system calls\n");
      warn_on_failure = false;
    }
    thread->cpu = target_cpu;
  }

  /* TODO: give the thread an opportunity to move to its bound CPU
   * before continuing? Currently just do a __pmp_yield(). It is not
   * clear if this is necessary or sufficient. */
  __pmp_yield();
}
/*
 * Profiling / checking hook: walk the iterations that a scheduling decision
 * has assigned to a thread, pass each one to __pmp_loop_check() and report
 * the total to __pmp_profile_iterations().
 *
 * thread                 : thread the iterations were assigned to.
 * sched                  : schedule kind (used for debug output and when a
 *                          temporary loop is allocated under PMP_CHECK).
 * global_id, local_id    : identifiers of the thread.
 * loop_lower, loop_upper : bounds of the whole loop.
 * my_lower, my_upper     : bounds (inclusive) of this thread's first block.
 * inc                    : loop increment; must be non-zero.
 * chunk, stride          : block length and distance between successive
 *                          blocks; pass 0 and 0 for a non-strided loop.
 *
 * Returns immediately unless __pmp_profile.enabled or param->check is set.
 * The iteration walk itself is compiled only when PMP_PROFILE or PMP_CHECK
 * is defined.
 */
static inline void __pmp_loop_analyser (pmp_thread_t *thread, int sched,
                                        pmp_global_id_t global_id,
                                        pmp_local_id_t local_id,
                                        int64_t loop_lower, int64_t loop_upper,
                                        int64_t my_lower, int64_t my_upper,
                                        int64_t inc, int64_t chunk,
                                        int64_t stride)
{
  pmp_loop_t *loop;
  bool allocated;

#ifdef SUPER_DEBUG
  if (Enabled_Libomp_Loop_Debug)
    __pmp_debug("LOOPS_DEBUG", "__pmp_loop_analyser: sched=%d, global_id=%d, "
                "local_id=%d, loop_lower=%" PRId64 ", loop_upper=%" PRId64 ", "
                "my_lower=%" PRId64 ", my_upper=%" PRId64 ", inc=%" PRId64 ", chunk=%" PRId64 ", "
                "stride=%" PRId64 "\n",
                sched, global_id, local_id, loop_lower, loop_upper,
                my_lower, my_upper, inc, chunk, stride);
#endif

  /* Nothing to do unless profiling or checking has been requested. */
  if (!__pmp_profile.enabled && !__pmp_get_param()->check) {
    return;
  }

  /* NOTE: set chunk=0 and stride=0 for a non-strided loop. They will
   * then be auto-sized to use the inner loop for the required iterations
   * from my_lower to my_upper (inclusive). The outer loop will run
   * only once. */
  if (chunk == 0 && stride == 0) {
    chunk = (my_upper - my_lower) / inc + 1;
  }

  assert(inc != 0 && chunk != 0);
  /* The stride must point in the same direction as the increment. */
  assert((inc > 0 && stride >= 0) || (inc < 0 && stride <= 0));

  allocated = false;

#ifdef PMP_CHECK
  if (thread->loop == NULL) {
    /* For statically scheduled loops, allocate a loop to hold check data */
    __pmp_loop_alloc(thread, sched, loop_lower, loop_upper, inc, chunk, chunk);
    allocated = true;
  }
#endif

  /* NOTE: filter out cases where the loop contains no iterations */
  if ((inc >= 0) ? (my_lower <= my_upper) : (my_lower >= my_upper)) {
    /* This thread's block must lie inside the bounds of the whole loop. */
    assert((inc >= 0) ?
           (loop_lower <= my_lower && my_lower <= my_upper && my_upper <= loop_upper) :
           (loop_upper <= my_upper && my_upper <= my_lower && my_lower <= loop_lower));

    loop = thread->loop;

#if (defined PMP_PROFILE) || (defined PMP_CHECK)
    if (inc >= 0) {
      /* Upward-counting loop: outer steps between blocks by stride, inner
       * walks at most chunk iterations of the current block. */
      int64_t count = 0;
      int64_t outer = my_lower;
      while (outer <= loop_upper) {
        int64_t inner = outer;
        int64_t i;
        for (i = 0; i < chunk && inner <= my_upper; i++) {
          __pmp_loop_check(loop, inner);
          count++;
          inner += inc;
        }
        /* The walk stepped past the end of the whole loop. */
        if (inner > loop_upper) {
          __pmp_last_check(loop);
        }
        if (stride == 0) {
          break;
        }
        else {
          /* Advance to the next block, clamping its end to the loop. */
          outer += stride;
          my_upper = MIN(my_upper + stride, loop_upper);
        }
      }
      __pmp_profile_iterations(global_id, count);
    }
    else {
      /* Downward-counting loop: same walk with the comparisons reversed. */
      int64_t count = 0;
      int64_t outer = my_lower;
      while (outer >= loop_upper) {
        int64_t inner = outer;
        int64_t i;
        for (i = 0; i < chunk && inner >= my_upper; i++) {
          __pmp_loop_check(loop, inner);
          count++;
          inner += inc;
        }
        if (inner < loop_upper) {
          __pmp_last_check(loop);
        }
        if (stride == 0) {
          break;
        }
        else {
          outer += stride;
          my_upper = MAX(my_upper + stride, loop_upper);
        }
      }
      __pmp_profile_iterations(global_id, count);
    }
#endif
  }

#ifdef PMP_CHECK
  if (allocated) {
    /* For statically scheduled loops, deallocate the loop */
    __pmp_loop_free(thread);
  }
#endif
}
static inline int __pmp_schedule_next (int global_id, int64_t *lowerp, int64_t *upperp, int64_t *incp) { pmp_thread_t *thread = __pmp_get_thread(global_id); int team_size = __pmp_get_team_size(thread->team); int64_t iteration = thread->iteration; pmp_local_id_t local_id = thread->local_id; pmp_loop_t *loop = thread->loop; assert(loop != NULL); assert(local_id < team_size); if (team_size == 1) { if (iteration == 0) { *lowerp = loop->lower; *upperp = loop->upper; *incp = loop->inc; thread->ticket_number = loop->lower; thread->iteration = 1; __pmp_loop_analyser(thread, loop->sched, global_id, local_id, loop->lower, loop->upper, *lowerp, *upperp, *incp, 0, 0); return 1; } else { assert(iteration == 1); __pmp_loop_free(thread); return 0; } } else { int sched = loop->sched; int64_t lower = loop->lower; int64_t upper = loop->upper; int64_t inc = loop->inc; int64_t chunk = loop->chunk; switch (sched) { case PMP_SCHED_STATIC: case PMP_SCHED_ORDERED_STATIC: { /* NOTE: setting a small value of chunk causes (unnecessary) iteration * through this code. If the chunk is ignored, the code degenerates * into the static even case (which is the default). */ int64_t size = (upper - lower) / inc + 1; int64_t size_per_thread = ((size - 1) / team_size + 1) * inc; int64_t thread_lower = lower + (local_id * size_per_thread); int64_t thread_upper = thread_lower + size_per_thread - inc; int64_t this_lower = thread_lower + (iteration * chunk * inc); int64_t this_upper = this_lower + (chunk - 1) * inc; thread_upper = LOOPMIN(inc, thread_upper, upper); this_upper = LOOPMIN(inc, this_upper, thread_upper); if ((inc >= 0) ? 
(this_lower > thread_upper) : (this_lower < thread_upper)) { __pmp_loop_free(thread); return 0; } else { *incp = inc; *lowerp = this_lower; *upperp = this_upper; thread->ticket_number = this_lower; thread->iteration++; __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper, *lowerp, *upperp, *incp, 0, 0); return 1; } /* NOT REACHED */ break; } case PMP_SCHED_STATIC_EVEN: case PMP_SCHED_ORDERED_STATIC_EVEN: { if (iteration == 0) { int64_t size = (upper - lower) / inc + 1; int64_t thread_lower; int64_t thread_upper; if (!__pmp_get_param()->static_fair) { int64_t size_per_thread = ((size - 1) / team_size + 1) * inc; thread_lower = lower + (local_id * size_per_thread); thread_upper = thread_lower + size_per_thread - inc; } else { int64_t chunk = size / team_size; int64_t remainder = size - (chunk * team_size); int64_t index = MIN(local_id, remainder) * (chunk + 1); if (local_id > remainder) { index += (local_id - remainder) * chunk; } thread_lower = lower + (index * inc); chunk += (local_id < remainder); thread_upper = thread_lower + (chunk - 1) * inc; } thread_upper = LOOPMIN(inc, thread_upper, upper); if ((inc >= 0) ? 
(thread_lower > thread_upper) : (thread_lower < thread_upper)) { __pmp_loop_free(thread); return 0; } else { *incp = inc; *lowerp = thread_lower; *upperp = thread_upper; thread->ticket_number = thread_lower; thread->iteration++; __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper, *lowerp, *upperp, *incp, 0, 0); return 1; } } else { assert(iteration == 1); __pmp_loop_free(thread); return 0; } /* NOT REACHED */ break; } case PMP_SCHED_DYNAMIC: case PMP_SCHED_ORDERED_DYNAMIC: { int64_t stride = inc * chunk; #if __WORDSIZE == 64 int64_t current = __pmp_atomic_xadd64(&loop->current, stride); #else /* TODO: the atomic xadd64 is a problem for 32-bit compilation */ /* the workaround below is just to do a 32-bit atomic add */ int64_t current; current = (int64_t) __pmp_atomic_xadd32((int32_t *) &loop->current, (int32_t) stride); #endif if ((inc >= 0) ? (current > upper) : (current < upper)) { __pmp_loop_free(thread); return 0; } else { *incp = inc; *lowerp = current; *upperp = *lowerp + stride - inc; *upperp = LOOPMIN(inc, upper, *upperp); thread->ticket_number = current; thread->iteration++; __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper, *lowerp, *upperp, *incp, 0, 0); return 1; } /* NOT REACHED */ break; } case PMP_SCHED_GUIDED: case PMP_SCHED_ORDERED_GUIDED: { /* NOTE: guided scheduling uses a heuristic to choose a good * chunk size to divide up the remaining iterations amongst * the team (subject to a minimum). An exact implementation of * this would require a lock on the loop data. However, the * heuristic can be approximated using (possibly) stale values * and this should be good enough. The value of "remaining" * is monotonically decreasing. The worst that could happen * is that an update to loop->chunk is lost slightly unbalancing * the distribution. The most important point is that loop->current * is maintained atomically. */ /* UPDATE: if cmpxchg64 is available then this is used to protect * the update of loop->chunk. 
This is fairly cunning, and makes * the chunk update more accurate in this case! */ int64_t min_chunk = loop->min_chunk; int64_t remaining = upper - loop->current + 1; /* estimate */ int64_t my_chunk = MAX(min_chunk, MIN(chunk, remaining));/* estimate */ int64_t stride = inc * my_chunk; #if __WORDSIZE == 64 int64_t current = __pmp_atomic_xadd64(&loop->current, stride); #else /* TODO: the atomic xadd64 is a problem for 32-bit compilation */ /* the workaround below is just to do a 32-bit atomic add */ int64_t current = __pmp_atomic_xadd32((int32_t *) &loop->current, (int32_t) stride); #endif assert(stride != 0); #ifdef SUPER_DEBUG if (Enabled_Libomp_Loop_Debug) __pmp_debug("LOOPS_DEBUG", "__pmp_schedule_next: global_id=%d, " "remaining=%d, my_chunk=%d, stride=%d, current=%d\n", global_id, remaining, my_chunk, stride, current); #endif if ((inc >= 0) ? (current > upper) : (current < upper)) { __pmp_loop_free(thread); return 0; } else { pmp_param_t *param = __pmp_get_param(); int64_t my_upper = LOOPMIN(inc, upper, current + stride - inc); int64_t new_chunk; int64_t divisor; remaining = upper - my_upper; /* estimate */ divisor = team_size * param->guided_chunk_divisor; new_chunk = (remaining + divisor - 1) / divisor; new_chunk = MIN(param->guided_chunk_max, new_chunk); new_chunk = MAX(min_chunk, new_chunk); #if __WORDSIZE == 64 (void) __pmp_atomic_cmpxchg64(&loop->chunk, chunk, new_chunk); #else loop->chunk = new_chunk; /* estimate */ #endif *incp = inc; *lowerp = current; *upperp = my_upper; thread->ticket_number = current; thread->iteration++; __pmp_loop_analyser(thread, sched, global_id, local_id, lower, upper, *lowerp, *upperp, *incp, 0, 0); return 1; } /* NOT REACHED */ break; } default: { __pmp_fatal("unknown dynamic scheduling type %d\n", sched); break; } } /* NOT REACHED */ assert(0); __pmp_loop_free(thread); return 0; } /* NOT REACHED */ }
/*
 * Compute the bounds and stride of the iterations that a statically
 * scheduled loop assigns to the thread identified by global_id.
 *
 * global_id : global identifier of the thread.
 * sched     : PMP_SCHED_STATIC_EVEN or PMP_SCHED_STATIC; any other value
 *             is reported through __pmp_fatal().
 * lowerp    : in: lower bound of the whole loop; out: this thread's lower
 *             bound.
 * upperp    : in: upper bound of the whole loop; out: this thread's upper
 *             bound.
 * stridep   : out: stride stored for the caller.
 * inc       : loop increment.
 * chunk     : chunk size; a non-positive value is replaced by 1 (with a
 *             warning from global thread 0).
 *
 * For a team of one only *stridep is written; *lowerp and *upperp keep the
 * values the caller passed in.
 */
static inline void __pmp_static_init (int global_id, int sched,
                                      int64_t *lowerp, int64_t *upperp,
                                      int64_t *stridep,
                                      int64_t inc, int64_t chunk)
{
  /* NOTE: chunk parameter is undefined/unused for static even scheduling */
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t loop_lower = *lowerp;
  int64_t loop_upper = *upperp;
  int64_t lower;
  int64_t upper;

  assert(team_size > 0);

  /* NOTE(review): this check also runs for static even scheduling, where
   * chunk is described above as undefined -- confirm the warning cannot
   * fire spuriously in that case. */
  if (chunk <= 0) {
    if (thread->global_id == 0)
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    chunk = 1;
  }

  if (team_size == 1) {
    /* Single thread: the bounds are left alone, only the stride is set. */
    *stridep = (inc > 0) ? (loop_upper - loop_lower + 1) :
                           (loop_upper - loop_lower - 1);
  }
  else {
    pmp_local_id_t local_id = thread->local_id;
    int64_t stride;
    switch (sched) {
      case PMP_SCHED_STATIC_EVEN: {
        int64_t size = (loop_upper - loop_lower) / inc + 1;
        assert(size >= 0);
        if (!__pmp_get_param()->static_fair) {
          /* The size is divided by the team_size and rounded up to give
           * the chunk size. Chunks of this size are assigned to threads
           * in increased local_id order. If the division was not exact
           * then the last thread will have fewer iterations, and possibly
           * none at all. */
          chunk = (size + team_size - 1) / team_size;
          lower = loop_lower + (local_id * chunk * inc);
        }
        else {
          /* The size is divided by the team_size and rounded down to
           * give the chunk. Each thread will have at least this many
           * iterations. If the division was not exact then the remainder
           * iterations are scheduled across the threads in increasing
           * thread order. Note that the difference between the minimum
           * and maximum number of iterations assigned to the threads
           * across the team is at most 1. The maximum number of iterations
           * assigned to a thread (the worst case path through the schedule)
           * is the same as for default behavior. */
          int64_t remainder;
          int64_t index;
          chunk = size / team_size;
          remainder = size - (chunk * team_size);
          /* index = number of iterations assigned to lower-numbered
           * threads. */
          index = MIN(local_id, remainder) * (chunk + 1);
          if (local_id > remainder) {
            index += (local_id - remainder) * chunk;
          }
          lower = loop_lower + (index * inc);
          chunk += (local_id < remainder);
        }

        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride = size * inc;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      case PMP_SCHED_STATIC: {
        /* Blocks of chunk iterations; a thread's first block starts
         * local_id blocks into the loop. */
        stride = chunk * inc;
        lower = loop_lower + (local_id * stride);
        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          /* Successive blocks of one thread are team_size blocks apart. */
          stride *= team_size;
        }
        else {
          /* If the entire set of iterations falls out of the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound. */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper,
                            lower, upper, inc, chunk, stride);
        break;
      }
      default: {
        __pmp_fatal("unknown static scheduling type %d\n", sched);
        /* Fallback values in case __pmp_fatal returns. */
        stride = 0;
        lower = loop_lower;
        upper = loop_upper;
      }
    }
    *lowerp = lower;
    *upperp = upper;
    *stridep = stride;
  }
  __pmp_scheduler_sample(sched);
}
void __ompc_get_thdprv (void ***thdprv, int64_t size, void *data, int global_id) { __pmp_debug(PMP_DEBUG_CALLS, "__ompc_get_thdprv: thdprv=%p, size=%ld, " "data=%p, global_id=%d\n", thdprv, (long) size, data, global_id); __pmp_sample(PMP_PROFILE_OMPC_GET_THDPRV); if (__pmp_get_param()->disabled) { void **t = (void **) calloc (1, sizeof(void *)); if (t == NULL) { __pmp_fatal("failed to allocate thread private data\n"); } t[0] = data; *thdprv = t; } else { void **t = *thdprv; if (t == NULL) { /* TODO: can I reduce the size of this array? Note that it is indexed * by global_id and global_id's can be arbitrarily assigned to threads * in general, so this may be difficult. */ void *t_new; void *t_cur; t = (void **) calloc(PMP_MAX_THREADS, sizeof(void *)); if (t == NULL) { __pmp_fatal("failed to allocate thread private data\n"); } t_new = (void *) t; t_cur = __pmp_atomic_cmpxchgptr((volatile voidptr_t *) thdprv, NULL, t_new); if (t_cur != NULL) { /* This thread lost the race and another thread has already * installed a thdprv array. Simply back out this allocation * and use *thdprv. */ free(t); t = (void **) t_cur; } } if (t[global_id] == NULL) { /* The OpenMP 2.5 standard says: * * "Each copy of a threadprivate object is initialized once, in the manner * specified by the program, but at an unspecified point in the program * prior to the first reference to that copy." * * Since the initial values live in the statically allocated block of * memory passed to our "data" argument, the master thread needs to use * a dynamically allocated block, just as the additional threads do, so * that it if it changes its copies of the variables before the program * enters the first parallel region, those changes have no effect on the * copies in the additional threads. Observation shows that the code * generator calls __ompc_get_thdprv from the serial portion of the * program, for the master thread, before it changes any values. 
* * Note the copying is done without synchronization, which is safe only * because we're copying statically initialized and subsequently * unchanged values: copying from the main thread would require a * barrier. */ t[global_id] = (void *) malloc(size); if (t[global_id] == NULL) { __pmp_fatal("failed to allocate thread private data"); } memcpy(t[global_id], data, size); } } }
/*
 * Create the pthread that will run the given runtime thread.
 *
 * thread : descriptor of the thread to start; passed as the argument to
 *          __pmp_thread_run.  Its creator field is set to the calling
 *          thread.
 *
 * When param->thread_guard_size is positive, an inaccessible (PROT_NONE)
 * anonymous mapping of that size is allocated just before the pthread is
 * created and its address is stored in thread->guard_page; failure to map
 * it only produces a warning.  Failure of pthread_create is reported via
 * __pmp_fatal(), preceded by hints when more threads than the initial team
 * size had been allocated.  On success __pmp_manager.waiting_threads is
 * atomically incremented.
 */
void __pmp_thread_create (pmp_thread_t *thread)
{
  pmp_thread_t *creator = __pmp_get_current_thread();
  pthread_t pthread_id;
  int result;
  pmp_param_t *param = __pmp_get_param();

  thread->creator = creator;

  if (param->thread_guard_size > 0) {
    void *guard;
    /* NOTE: this lock is to give a better chance of the guard page
     * allocation to immediately follow the pthread stack allocation. */
    __pmp_lock(thread->global_id, &__pmp_manager.pthread_create_lock);

    /* NOTE: it seems that mmap tends to allocate in an upwards direction
     * so allocate the guard page first. */
    /* Anonymous mappings take fd == -1: some systems require it. */
    guard = mmap(NULL, param->thread_guard_size, PROT_NONE,
#if defined(BUILD_OS_DARWIN)
                 MAP_PRIVATE | MAP_ANON,
#else /* defined(BUILD_OS_DARWIN) */
                 MAP_PRIVATE | MAP_ANONYMOUS,
#endif /* defined(BUILD_OS_DARWIN) */
                 -1, 0);
    if (guard == MAP_FAILED) {
      __pmp_warning("unable to allocate a guard page of %ld bytes\n",
                    (long) param->thread_guard_size);
    }
    else {
      __pmp_debug(PMP_DEBUG_THREAD, "guard page allocated at address %p\n",
                  guard);
      thread->guard_page = guard;
    }
  }

  if ((result = pthread_create(&pthread_id, &__pmp_manager.pthread_attr,
                               __pmp_thread_run, thread)) != 0) {
    if (__pmp_manager.allocated_threads > param->initial_team_size) {
      __pmp_warning(
        "pthread_create failed when trying to allocate thread %d\n",
        __pmp_manager.allocated_threads);
      __pmp_warning(
        "note this is more than the initial number of threads (%d)\n",
        param->initial_team_size);
#if defined(BUILD_OS_DARWIN)
      if (sizeof(long) == 4)
#else /* defined(BUILD_OS_DARWIN) */
      if (__WORDSIZE == 32)
#endif /* defined(BUILD_OS_DARWIN) */
      {
        /* On 32-bit targets, hint at stack size when the combined thread
         * stacks exceed 1GB. */
        int64_t total_stack = ((int64_t) param->thread_stack_size) *
                              ((int64_t) __pmp_manager.allocated_threads);
        if (total_stack > 0x40000000LL) {
          __pmp_warning(
            "the failure may be due to excessive thread stack size\n");
          __pmp_warning(
            "try using a smaller setting for PSC_OMP_STACK_SIZE\n");
        }
      }
    }
    __pmp_fatal("unable to create thread (result code %d)\n", result);
  }

  if (param->thread_guard_size > 0) {
    __pmp_unlock(thread->global_id, &__pmp_manager.pthread_create_lock);
  }

  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, 1);

  __pmp_debug(PMP_DEBUG_THREAD, "created thread global_id %d\n",
              thread->global_id);
}