static inline void __pmp_thread_wake (pmp_thread_t *thread)
{
  int32_t sync = __pmp_atomic_cmpxchg32(&thread->sync, PMP_SYNC_IDLE,
                                        PMP_SYNC_UNBLOCKED);
  assert(sync != PMP_SYNC_UNBLOCKED);
  if (sync == PMP_SYNC_BLOCKED) {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is being signaled\n",
                thread->global_id);
    thread->sync = PMP_SYNC_IDLE;
    assert(thread->tid != -1);
    __pmp_sample(PMP_PROFILE_THREAD_RESCHEDULE);
#if (defined PMP_USE_PTHREAD_SIGNALS)
    if (pthread_kill(thread->pthread_id, SIGPMP) != 0) {
      __pmp_fatal("unable to wake thread using pthread_kill\n");
    }
#elif (defined PMP_NO_NPTL)
    if (kill(thread->tid, SIGPMP) != 0) {
      __pmp_fatal("unable to wake thread using kill\n");
    }
#else
    if (tkill(thread->tid, SIGPMP) != 0) {
      __pmp_fatal("unable to wake thread using tkill\n");
    }
#endif
  }
  else {
    __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is woken\n",
                thread->global_id);
  }
}
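/* Illustrative sketch, not part of the library: a C11 <stdatomic.h>
 * rendering of the three-state handshake that __pmp_thread_wake relies on.
 * The ex_* names and the waiter side are assumptions for illustration only;
 * the library's actual __pmp_thread_wait may differ in detail. Compiled out
 * with #if 0 so it does not affect the build.
 */
#if 0
#include <stdatomic.h>

enum { EX_SYNC_IDLE, EX_SYNC_BLOCKED, EX_SYNC_UNBLOCKED };

typedef struct {
  atomic_int sync;                  /* plays the role of thread->sync */
} ex_thread_t;

/* Waker side: returns 1 if the waiter is already parked and needs an
 * explicit kick (the library sends SIGPMP at that point), 0 otherwise. */
static int ex_wake(ex_thread_t *t)
{
  int expected = EX_SYNC_IDLE;
  if (atomic_compare_exchange_strong(&t->sync, &expected,
                                     EX_SYNC_UNBLOCKED)) {
    return 0;                       /* waiter not parked; it will see UNBLOCKED */
  }
  if (expected == EX_SYNC_BLOCKED) {
    atomic_store(&t->sync, EX_SYNC_IDLE);
    return 1;                       /* waiter is parked; signal it */
  }
  return 0;                         /* UNBLOCKED: not expected with one waker
                                     * (the library asserts this case away) */
}

/* Waiter side: try to park by moving IDLE -> BLOCKED. If the waker got
 * there first, the slot holds UNBLOCKED; consume it and skip sleeping. */
static int ex_wait_should_sleep(ex_thread_t *t)
{
  int expected = EX_SYNC_IDLE;
  if (atomic_compare_exchange_strong(&t->sync, &expected, EX_SYNC_BLOCKED)) {
    return 1;                       /* sleep until the kick arrives */
  }
  atomic_store(&t->sync, EX_SYNC_IDLE);  /* consume the UNBLOCKED token */
  return 0;
}

int main(void)
{
  ex_thread_t t = { EX_SYNC_IDLE };
  int need_kick = ex_wake(&t);               /* waiter not parked: no kick */
  int sleep_now = ex_wait_should_sleep(&t);  /* sees UNBLOCKED: no sleep  */
  return (need_kick == 0 && sleep_now == 0) ? 0 : 1;
}
#endif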
void __pmp_thread_create_main (void)
{
  pmp_thread_t *thread = __pmp_get_main_thread();
  int global_id;

  global_id = __pmp_idstack_pop(&__pmp_manager.idstack);
  assert(global_id == 0);

  thread->pthread_id = pthread_self();
#ifdef PMP_NO_NPTL
  thread->tid = getpid();
#else
  thread->tid = gettid();
#endif
  thread->local_id = 0;

#ifndef PMP_NO_TLS
#ifdef PMP_TLS_THREAD
  __pmp_tls_current_thread = thread;
#endif
#ifdef PMP_TLS_LOCAL_ID
  __pmp_tls_current_local_id = 0;
#endif
#ifdef PMP_TLS_GLOBAL_ID
  __pmp_tls_current_global_id = thread->global_id;
#endif
#endif

#ifdef PMP_USE_PTHREAD_SIGNALS
  if (pthread_sigmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp,
                      NULL) != 0) {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }
#else
  if (sigprocmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp, NULL) != 0) {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }
#endif

  if (pthread_setspecific(__pmp_manager.thread_key, (void *) thread) != 0) {
    __pmp_fatal("unable to set thread-specific data\n");
  }

  __pmp_thread_bind(thread);            /* early master bind */

  __pmp_debug(PMP_DEBUG_THREAD, "created main thread global_id %d\n",
              thread->global_id);
  __pmp_debug(PMP_DEBUG_THREAD,
              "__pmp_thread_create_main: tid=%d, pthread_id=0x%08x "
              "global_id=%d, local_id=%d\n",
              (int) thread->tid, (int) thread->pthread_id,
              (int) thread->global_id, (int) thread->local_id);
}
static inline void __pmp_scheduler_sample (int sched)
{
#ifdef PMP_PROFILE
  switch (sched) {
    case PMP_SCHED_STATIC: {
      __pmp_sample(PMP_PROFILE_SCHED_STATIC);
      break;
    }
    case PMP_SCHED_STATIC_EVEN: {
      __pmp_sample(PMP_PROFILE_SCHED_STATIC_EVEN);
      break;
    }
    case PMP_SCHED_DYNAMIC: {
      __pmp_sample(PMP_PROFILE_SCHED_DYNAMIC);
      break;
    }
    case PMP_SCHED_GUIDED: {
      __pmp_sample(PMP_PROFILE_SCHED_GUIDED);
      break;
    }
    case PMP_SCHED_ORDERED_STATIC: {
      __pmp_sample(PMP_PROFILE_SCHED_ORDERED_STATIC);
      break;
    }
    case PMP_SCHED_ORDERED_STATIC_EVEN: {
      __pmp_sample(PMP_PROFILE_SCHED_ORDERED_STATIC_EVEN);
      break;
    }
    case PMP_SCHED_ORDERED_DYNAMIC: {
      __pmp_sample(PMP_PROFILE_SCHED_ORDERED_DYNAMIC);
      break;
    }
    case PMP_SCHED_ORDERED_GUIDED: {
      __pmp_sample(PMP_PROFILE_SCHED_ORDERED_GUIDED);
      break;
    }
    default: {
      __pmp_fatal("unknown dynamic scheduling type %d\n", sched);
      break;
    }
  }
#endif
}
static inline int __pmp_schedule_next (int global_id, int64_t *lowerp,
                                       int64_t *upperp, int64_t *incp)
{
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t iteration = thread->iteration;
  pmp_local_id_t local_id = thread->local_id;
  pmp_loop_t *loop = thread->loop;

  assert(loop != NULL);
  assert(local_id < team_size);

  if (team_size == 1) {
    if (iteration == 0) {
      *lowerp = loop->lower;
      *upperp = loop->upper;
      *incp = loop->inc;
      thread->ticket_number = loop->lower;
      thread->iteration = 1;
      __pmp_loop_analyser(thread, loop->sched, global_id, local_id,
                          loop->lower, loop->upper, *lowerp, *upperp, *incp,
                          0, 0);
      return 1;
    }
    else {
      assert(iteration == 1);
      __pmp_loop_free(thread);
      return 0;
    }
  }
  else {
    int sched = loop->sched;
    int64_t lower = loop->lower;
    int64_t upper = loop->upper;
    int64_t inc = loop->inc;
    int64_t chunk = loop->chunk;

    switch (sched) {
      case PMP_SCHED_STATIC:
      case PMP_SCHED_ORDERED_STATIC: {
        /* NOTE: setting a small value of chunk causes (unnecessary) iteration
         * through this code. If the chunk is ignored, the code degenerates
         * into the static even case (which is the default).
         */
        int64_t size = (upper - lower) / inc + 1;
        int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
        int64_t thread_lower = lower + (local_id * size_per_thread);
        int64_t thread_upper = thread_lower + size_per_thread - inc;
        int64_t this_lower = thread_lower + (iteration * chunk * inc);
        int64_t this_upper = this_lower + (chunk - 1) * inc;
        thread_upper = LOOPMIN(inc, thread_upper, upper);
        this_upper = LOOPMIN(inc, this_upper, thread_upper);
        if ((inc >= 0) ? (this_lower > thread_upper)
                       : (this_lower < thread_upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          *incp = inc;
          *lowerp = this_lower;
          *upperp = this_upper;
          thread->ticket_number = this_lower;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id,
                              lower, upper, *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }

      case PMP_SCHED_STATIC_EVEN:
      case PMP_SCHED_ORDERED_STATIC_EVEN: {
        if (iteration == 0) {
          int64_t size = (upper - lower) / inc + 1;
          int64_t thread_lower;
          int64_t thread_upper;
          if (!__pmp_get_param()->static_fair) {
            int64_t size_per_thread = ((size - 1) / team_size + 1) * inc;
            thread_lower = lower + (local_id * size_per_thread);
            thread_upper = thread_lower + size_per_thread - inc;
          }
          else {
            int64_t chunk = size / team_size;
            int64_t remainder = size - (chunk * team_size);
            int64_t index = MIN(local_id, remainder) * (chunk + 1);
            if (local_id > remainder) {
              index += (local_id - remainder) * chunk;
            }
            thread_lower = lower + (index * inc);
            chunk += (local_id < remainder);
            thread_upper = thread_lower + (chunk - 1) * inc;
          }
          thread_upper = LOOPMIN(inc, thread_upper, upper);
          if ((inc >= 0) ? (thread_lower > thread_upper)
                         : (thread_lower < thread_upper)) {
            __pmp_loop_free(thread);
            return 0;
          }
          else {
            *incp = inc;
            *lowerp = thread_lower;
            *upperp = thread_upper;
            thread->ticket_number = thread_lower;
            thread->iteration++;
            __pmp_loop_analyser(thread, sched, global_id, local_id,
                                lower, upper, *lowerp, *upperp, *incp, 0, 0);
            return 1;
          }
        }
        else {
          assert(iteration == 1);
          __pmp_loop_free(thread);
          return 0;
        }
        /* NOT REACHED */
        break;
      }

      case PMP_SCHED_DYNAMIC:
      case PMP_SCHED_ORDERED_DYNAMIC: {
        int64_t stride = inc * chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /* the workaround below is just to do a 32-bit atomic add */
        int64_t current;
        current = (int64_t) __pmp_atomic_xadd32((int32_t *) &loop->current,
                                                (int32_t) stride);
#endif
        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          *incp = inc;
          *lowerp = current;
          *upperp = *lowerp + stride - inc;
          *upperp = LOOPMIN(inc, upper, *upperp);
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id,
                              lower, upper, *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }

      case PMP_SCHED_GUIDED:
      case PMP_SCHED_ORDERED_GUIDED: {
        /* NOTE: guided scheduling uses a heuristic to choose a good
         * chunk size to divide up the remaining iterations amongst
         * the team (subject to a minimum). An exact implementation of
         * this would require a lock on the loop data. However, the
         * heuristic can be approximated using (possibly) stale values
         * and this should be good enough. The value of "remaining"
         * is monotonically decreasing. The worst that could happen
         * is that an update to loop->chunk is lost, slightly unbalancing
         * the distribution. The most important point is that loop->current
         * is maintained atomically.
         *
         * UPDATE: if cmpxchg64 is available then it is used to protect
         * the update of loop->chunk. This is fairly cunning, and makes
         * the chunk update more accurate in this case!
         */
        int64_t min_chunk = loop->min_chunk;
        int64_t remaining = upper - loop->current + 1;            /* estimate */
        int64_t my_chunk = MAX(min_chunk, MIN(chunk, remaining)); /* estimate */
        int64_t stride = inc * my_chunk;
#if __WORDSIZE == 64
        int64_t current = __pmp_atomic_xadd64(&loop->current, stride);
#else
        /* TODO: the atomic xadd64 is a problem for 32-bit compilation */
        /* the workaround below is just to do a 32-bit atomic add */
        int64_t current = __pmp_atomic_xadd32((int32_t *) &loop->current,
                                              (int32_t) stride);
#endif
        assert(stride != 0);
#ifdef SUPER_DEBUG
        if (Enabled_Libomp_Loop_Debug) {
          __pmp_debug("LOOPS_DEBUG",
                      "__pmp_schedule_next: global_id=%d, "
                      "remaining=%ld, my_chunk=%ld, stride=%ld, current=%ld\n",
                      global_id, (long) remaining, (long) my_chunk,
                      (long) stride, (long) current);
        }
#endif
        if ((inc >= 0) ? (current > upper) : (current < upper)) {
          __pmp_loop_free(thread);
          return 0;
        }
        else {
          pmp_param_t *param = __pmp_get_param();
          int64_t my_upper = LOOPMIN(inc, upper, current + stride - inc);
          int64_t new_chunk;
          int64_t divisor;
          remaining = upper - my_upper;                           /* estimate */
          divisor = team_size * param->guided_chunk_divisor;
          new_chunk = (remaining + divisor - 1) / divisor;
          new_chunk = MIN(param->guided_chunk_max, new_chunk);
          new_chunk = MAX(min_chunk, new_chunk);
#if __WORDSIZE == 64
          (void) __pmp_atomic_cmpxchg64(&loop->chunk, chunk, new_chunk);
#else
          loop->chunk = new_chunk;                                /* estimate */
#endif
          *incp = inc;
          *lowerp = current;
          *upperp = my_upper;
          thread->ticket_number = current;
          thread->iteration++;
          __pmp_loop_analyser(thread, sched, global_id, local_id,
                              lower, upper, *lowerp, *upperp, *incp, 0, 0);
          return 1;
        }
        /* NOT REACHED */
        break;
      }

      default: {
        __pmp_fatal("unknown dynamic scheduling type %d\n", sched);
        break;
      }
    }
    /* NOT REACHED */
    assert(0);
    __pmp_loop_free(thread);
    return 0;
  }
  /* NOT REACHED */
}
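/* Illustrative sketch, not part of the library: a standalone, single-thread
 * simulation of the guided chunk-size heuristic used above, showing how
 * chunks shrink as the remaining iteration count drops. The concrete values
 * (team size 4, divisor 2, clamp 64, minimum 1) stand in for team_size,
 * param->guided_chunk_divisor, param->guided_chunk_max and loop->min_chunk;
 * they are assumptions chosen for illustration only. Compiled out with #if 0.
 */
#if 0
#include <stdio.h>

static long ex_guided_next_chunk(long remaining, long team_size,
                                 long divisor, long chunk_max, long min_chunk)
{
  long d = team_size * divisor;
  long chunk = (remaining + d - 1) / d;     /* ceil(remaining / d) */
  if (chunk > chunk_max) chunk = chunk_max;
  if (chunk < min_chunk) chunk = min_chunk;
  return chunk;
}

int main(void)
{
  long upper = 999;                         /* iterations 0..999, inc 1 */
  long current = 0;
  while (current <= upper) {
    long chunk = ex_guided_next_chunk(upper - current + 1, 4, 2, 64, 1);
    long last = current + chunk - 1;
    if (last > upper) last = upper;         /* clamp, as LOOPMIN does */
    printf("iterations %ld..%ld (chunk %ld)\n", current, last, chunk);
    current += chunk;
  }
  return 0;
}
#endif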
static inline void __pmp_static_init (int global_id, int sched,
                                      int64_t *lowerp, int64_t *upperp,
                                      int64_t *stridep, int64_t inc,
                                      int64_t chunk)
{
  /* NOTE: chunk parameter is undefined/unused for static even scheduling */
  pmp_thread_t *thread = __pmp_get_thread(global_id);
  int team_size = __pmp_get_team_size(thread->team);
  int64_t loop_lower = *lowerp;
  int64_t loop_upper = *upperp;
  int64_t lower;
  int64_t upper;

  assert(team_size > 0);

  if (chunk <= 0) {
    if (thread->global_id == 0) {
      __pmp_warning("Chunk size is non-positive, set to default '1'\n");
    }
    chunk = 1;
  }

  if (team_size == 1) {
    *stridep = (inc > 0) ? (loop_upper - loop_lower + 1)
                         : (loop_upper - loop_lower - 1);
  }
  else {
    pmp_local_id_t local_id = thread->local_id;
    int64_t stride;

    switch (sched) {
      case PMP_SCHED_STATIC_EVEN: {
        int64_t size = (loop_upper - loop_lower) / inc + 1;
        assert(size >= 0);
        if (!__pmp_get_param()->static_fair) {
          /* The size is divided by the team_size and rounded up to give
           * the chunk size. Chunks of this size are assigned to threads
           * in increasing local_id order. If the division was not exact
           * then the last thread will have fewer iterations, and possibly
           * none at all.
           */
          chunk = (size + team_size - 1) / team_size;
          lower = loop_lower + (local_id * chunk * inc);
        }
        else {
          /* The size is divided by the team_size and rounded down to
           * give the chunk. Each thread will have at least this many
           * iterations. If the division was not exact then the remainder
           * iterations are scheduled across the threads in increasing
           * thread order. Note that the difference between the minimum
           * and maximum number of iterations assigned to the threads
           * across the team is at most 1. The maximum number of iterations
           * assigned to a thread (the worst case path through the schedule)
           * is the same as for the default behavior.
           */
          int64_t remainder;
          int64_t index;
          chunk = size / team_size;
          remainder = size - (chunk * team_size);
          index = MIN(local_id, remainder) * (chunk + 1);
          if (local_id > remainder) {
            index += (local_id - remainder) * chunk;
          }
          lower = loop_lower + (index * inc);
          chunk += (local_id < remainder);
        }
        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride = size * inc;
        }
        else {
          /* If the entire set of iterations falls outside the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound.
           */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper, lower, upper, inc,
                            chunk, stride);
        break;
      }

      case PMP_SCHED_STATIC: {
        stride = chunk * inc;
        lower = loop_lower + (local_id * stride);
        if ((inc >= 0) ? (lower <= loop_upper) : (lower >= loop_upper)) {
          upper = LOOPMIN(inc, lower + (chunk - 1) * inc, loop_upper);
          stride *= team_size;
        }
        else {
          /* If the entire set of iterations falls outside the loop bounds
           * then arrange for a non-iterating loop which will not trigger
           * the LASTPRIVATE check made by the compiler. This means that
           * the final value of the loop induction variable must not exceed
           * the loop upper bound.
           */
          lower = loop_lower - inc;
          upper = lower - inc;
          stride = inc;
        }
        __pmp_loop_analyser(thread, sched, global_id, local_id,
                            loop_lower, loop_upper, lower, upper, inc,
                            chunk, stride);
        break;
      }

      default: {
        __pmp_fatal("unknown static scheduling type %d\n", sched);
        stride = 0;
        lower = loop_lower;
        upper = loop_upper;
      }
    }

    *lowerp = lower;
    *upperp = upper;
    *stridep = stride;
  }

  __pmp_scheduler_sample(sched);
}
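/* Illustrative sketch, not part of the library: a standalone walk-through of
 * the static_fair split used by __pmp_static_init and __pmp_schedule_next,
 * printing each thread's first iteration and chunk. The ex_* names are
 * hypothetical; the index/chunk arithmetic mirrors the code above (size/team
 * iterations each, remainder handed to the lowest-numbered threads, so the
 * per-thread counts differ by at most 1). Compiled out with #if 0.
 */
#if 0
#include <stdio.h>

static void ex_static_fair(long size, long team_size)
{
  long chunk = size / team_size;
  long remainder = size - (chunk * team_size);
  long local_id;
  for (local_id = 0; local_id < team_size; local_id++) {
    long index = ((local_id < remainder) ? local_id : remainder) * (chunk + 1);
    long my_chunk = chunk + (local_id < remainder);
    if (local_id > remainder) {
      index += (local_id - remainder) * chunk;
    }
    if (my_chunk == 0) {
      printf("thread %ld: no iterations\n", local_id);
    }
    else {
      printf("thread %ld: iterations [%ld, %ld] (%ld of them)\n",
             local_id, index, index + my_chunk - 1, my_chunk);
    }
  }
}

int main(void)
{
  /* 10 iterations over 4 threads: chunks of 3, 3, 2, 2 */
  ex_static_fair(10, 4);
  return 0;
}
#endif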
void __ompc_get_thdprv (void ***thdprv, int64_t size, void *data,
                        int global_id)
{
  __pmp_debug(PMP_DEBUG_CALLS, "__ompc_get_thdprv: thdprv=%p, size=%ld, "
              "data=%p, global_id=%d\n", thdprv, (long) size, data,
              global_id);

  __pmp_sample(PMP_PROFILE_OMPC_GET_THDPRV);

  if (__pmp_get_param()->disabled) {
    void **t = (void **) calloc(1, sizeof(void *));
    if (t == NULL) {
      __pmp_fatal("failed to allocate thread private data\n");
    }
    t[0] = data;
    *thdprv = t;
  }
  else {
    void **t = *thdprv;
    if (t == NULL) {
      /* TODO: can I reduce the size of this array? Note that it is indexed
       * by global_id and global_id's can be arbitrarily assigned to threads
       * in general, so this may be difficult.
       */
      void *t_new;
      void *t_cur;
      t = (void **) calloc(PMP_MAX_THREADS, sizeof(void *));
      if (t == NULL) {
        __pmp_fatal("failed to allocate thread private data\n");
      }
      t_new = (void *) t;
      t_cur = __pmp_atomic_cmpxchgptr((volatile voidptr_t *) thdprv,
                                      NULL, t_new);
      if (t_cur != NULL) {
        /* This thread lost the race and another thread has already
         * installed a thdprv array. Simply back out this allocation
         * and use *thdprv.
         */
        free(t);
        t = (void **) t_cur;
      }
    }

    if (t[global_id] == NULL) {
      /* The OpenMP 2.5 standard says:
       *
       *   "Each copy of a threadprivate object is initialized once, in the
       *   manner specified by the program, but at an unspecified point in
       *   the program prior to the first reference to that copy."
       *
       * Since the initial values live in the statically allocated block of
       * memory passed to our "data" argument, the master thread needs to use
       * a dynamically allocated block, just as the additional threads do, so
       * that if it changes its copies of the variables before the program
       * enters the first parallel region, those changes have no effect on
       * the copies in the additional threads. Observation shows that the
       * code generator calls __ompc_get_thdprv from the serial portion of
       * the program, for the master thread, before it changes any values.
       *
       * Note that the copying is done without synchronization, which is safe
       * only because we are copying statically initialized and subsequently
       * unchanged values: copying from the main thread would require a
       * barrier.
       */
      t[global_id] = (void *) malloc(size);
      if (t[global_id] == NULL) {
        __pmp_fatal("failed to allocate thread private data\n");
      }
      memcpy(t[global_id], data, size);
    }
  }
}
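/* Illustrative sketch, not part of the library: the lock-free "allocate, try
 * to install, back out on loss" pattern that __ompc_get_thdprv performs with
 * __pmp_atomic_cmpxchgptr, rendered with C11 <stdatomic.h>. The ex_* names
 * and the slot type are assumptions for illustration only. Compiled out with
 * #if 0 so it does not affect the build.
 */
#if 0
#include <stdatomic.h>
#include <stdlib.h>

/* Lazily install a shared table exactly once without a lock: every racer
 * allocates a candidate, one compare-exchange wins, and losers free their
 * copy and adopt the winner's. */
static void **ex_install_table(_Atomic(void **) *slot, size_t nslots)
{
  void **t = atomic_load(slot);
  if (t == NULL) {
    void **fresh = (void **) calloc(nslots, sizeof(void *));
    void **expected = NULL;
    if (fresh == NULL) {
      abort();
    }
    if (atomic_compare_exchange_strong(slot, &expected, fresh)) {
      t = fresh;            /* this caller won the race and installed its table */
    }
    else {
      free(fresh);          /* lost the race: back out this allocation */
      t = expected;         /* expected now holds the winner's pointer */
    }
  }
  return t;
}

int main(void)
{
  static _Atomic(void **) shared_table = NULL;
  void **t = ex_install_table(&shared_table, 128);
  t[0] = NULL;              /* use the table as usual */
  return 0;
}
#endif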
void __pmp_thread_create (pmp_thread_t *thread)
{
  pmp_thread_t *creator = __pmp_get_current_thread();
  pthread_t pthread_id;
  int result;
  pmp_param_t *param = __pmp_get_param();

  thread->creator = creator;

  if (param->thread_guard_size > 0) {
    void *guard;
    /* NOTE: this lock is to give a better chance of the guard page
     * allocation immediately following the pthread stack allocation.
     */
    __pmp_lock(thread->global_id, &__pmp_manager.pthread_create_lock);
    /* NOTE: it seems that mmap tends to allocate in an upwards direction,
     * so allocate the guard page first.
     */
    guard = mmap(0, param->thread_guard_size, PROT_NONE,
#if defined(BUILD_OS_DARWIN)
                 MAP_PRIVATE | MAP_ANON,
#else /* defined(BUILD_OS_DARWIN) */
                 MAP_PRIVATE | MAP_ANONYMOUS,
#endif /* defined(BUILD_OS_DARWIN) */
                 0, 0);
    if (guard == MAP_FAILED) {
      __pmp_warning("unable to allocate a guard page of %ld bytes\n",
                    (long) param->thread_guard_size);
    }
    else {
      __pmp_debug(PMP_DEBUG_THREAD, "guard page allocated at address %p\n",
                  guard);
      thread->guard_page = guard;
    }
  }

  if ((result = pthread_create(&pthread_id, &__pmp_manager.pthread_attr,
                               __pmp_thread_run, thread)) != 0) {
    if (__pmp_manager.allocated_threads > param->initial_team_size) {
      __pmp_warning(
        "pthread_create failed when trying to allocate thread %d\n",
        __pmp_manager.allocated_threads);
      __pmp_warning(
        "note this is more than the initial number of threads (%d)\n",
        param->initial_team_size);
#if defined(BUILD_OS_DARWIN)
      if (sizeof(long) == 4)
#else /* defined(BUILD_OS_DARWIN) */
      if (__WORDSIZE == 32)
#endif /* defined(BUILD_OS_DARWIN) */
      {
        int64_t total_stack = ((int64_t) param->thread_stack_size) *
                              ((int64_t) __pmp_manager.allocated_threads);
        if (total_stack > 0x40000000LL) {
          __pmp_warning(
            "the failure may be due to excessive thread stack size\n");
          __pmp_warning(
            "try using a smaller setting for PSC_OMP_STACK_SIZE\n");
        }
      }
    }
    __pmp_fatal("unable to create thread (result code %d)\n", result);
  }

  if (param->thread_guard_size > 0) {
    __pmp_unlock(thread->global_id, &__pmp_manager.pthread_create_lock);
  }

  __pmp_atomic_xadd32(&__pmp_manager.waiting_threads, 1);

  __pmp_debug(PMP_DEBUG_THREAD, "created thread global_id %d\n",
              thread->global_id);
}
static void *__pmp_thread_run (void *arg)
{
  pmp_thread_t *thread = (pmp_thread_t *) arg;
  pmp_team_t *team;

  thread->pthread_id = pthread_self();
#ifdef PMP_NO_NPTL
  thread->tid = getpid();
#else
  thread->tid = gettid();
#endif

#ifndef PMP_NO_TLS
#ifdef PMP_TLS_THREAD
  __pmp_tls_current_thread = thread;
#endif
#ifdef PMP_TLS_LOCAL_ID
  __pmp_tls_current_local_id = thread->local_id;
#endif
#ifdef PMP_TLS_GLOBAL_ID
  __pmp_tls_current_global_id = thread->global_id;
#endif
#endif

#ifdef PMP_USE_PTHREAD_SIGNALS
  if (pthread_sigmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp,
                      NULL) != 0) {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }
#else
  if (sigprocmask(SIG_BLOCK, &__pmp_manager.mask_block_sigpmp, NULL) != 0) {
    __pmp_fatal("unable to set thread-specific sigmask\n");
  }
#endif

  if (pthread_setspecific(__pmp_manager.thread_key, (void *) thread) != 0) {
    __pmp_fatal("unable to set thread-specific data\n");
  }

  __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is running\n",
              thread->global_id);

  /* Note that there is no synchronization between the creating thread and
   * the created thread until here. This is the point where the created
   * thread is assigned to do some work. The reason that this is sufficient
   * is that the __pmp_thread_wait/wake mechanism is "protected" by the
   * thread->sync value, which has been pre-initialized. If the creator
   * gets to the wake first, then it will just swap in PMP_SYNC_UNBLOCKED
   * and its work is done. If it gets to the wake second, then the created
   * thread must have got there first, and this guarantees that the other
   * thread fields will already be initialized by the created thread.
   *
   * With nested forking, there is the possibility that the creator thread
   * will be usurped by another forking thread (there is no lock between
   * creation of a thread and that thread being assigned to do work). This
   * works for the same reason as described above.
   */

  __pmp_thread_wait(thread);            /* wait for first assignment */

#ifdef PMP_NO_NPTL
  __pmp_shared_catch_segv(thread);      /* set up shared segv handler */
#else
  __pmp_catch_segv();                   /* set up thread's segv handler */
#endif

  __pmp_thread_bind(thread);            /* bind to the assigned local_id */

  while (1) {
    __pmp_debug(PMP_DEBUG_THREAD,
                "__pmp_thread_run: thread tid=%d, pthread_id=0x%08x "
                "global_id=%d, local_id=%d\n",
                (int) thread->tid, (int) thread->pthread_id,
                (int) thread->global_id, (int) thread->local_id);

    team = thread->team;
    assert(team != NULL);

#ifndef PMP_NO_TLS
#ifdef PMP_TLS_LOCAL_ID
    __pmp_tls_current_local_id = thread->local_id;
#endif
#ifdef PMP_TLS_TEAM
    __pmp_tls_current_team = team;
#endif
#endif

    __pmp_memory_fence();

    __pmp_thread_work(thread);          /* do the work */
    __pmp_thread_worker_join(team);     /* wake up team master */

    __pmp_memory_fence();

    __pmp_thread_idle(thread);          /* thread is now idle */
    __pmp_thread_wait(thread);          /* wait for more work */
    __pmp_thread_bind(thread);          /* update binding */
  }

  /* Currently unreachable */
  __pmp_debug(PMP_DEBUG_THREAD, "thread global_id %d is exiting\n",
              thread->global_id);
  return NULL;
}