bool gomp_affinity_init_level (int level, unsigned long count, bool quiet) { unsigned long i, max = 8 * gomp_cpuset_size; if (gomp_cpusetp) { unsigned long maxcount = gomp_cpuset_popcount (gomp_cpuset_size, gomp_cpusetp); if (count > maxcount) count = maxcount; } gomp_places_list = gomp_affinity_alloc (count, quiet); gomp_places_list_len = 0; if (gomp_places_list == NULL) return false; /* SMT (threads). */ if (level == 1) { for (i = 0; i < max && gomp_places_list_len < count; i++) if (CPU_ISSET_S (i, gomp_cpuset_size, gomp_cpusetp)) { gomp_affinity_init_place (gomp_places_list[gomp_places_list_len]); gomp_affinity_add_cpus (gomp_places_list[gomp_places_list_len], i, 1, 0, true); ++gomp_places_list_len; } return true; } else { char name[sizeof ("/sys/devices/system/cpu/cpu/topology/" "thread_siblings_list") + 3 * sizeof (unsigned long)]; size_t prefix_len = sizeof ("/sys/devices/system/cpu/cpu") - 1; cpu_set_t *copy = gomp_alloca (gomp_cpuset_size); FILE *f; char *line = NULL; size_t linelen = 0; memcpy (name, "/sys/devices/system/cpu/cpu", prefix_len); memcpy (copy, gomp_cpusetp, gomp_cpuset_size); for (i = 0; i < max && gomp_places_list_len < count; i++) if (CPU_ISSET_S (i, gomp_cpuset_size, copy)) { sprintf (name + prefix_len, "%lu/topology/%s_siblings_list", i, level == 2 ? "thread" : "core"); f = fopen (name, "r"); if (f != NULL) { if (getline (&line, &linelen, f) > 0) { char *p = line; bool seen_i = false; void *pl = gomp_places_list[gomp_places_list_len]; gomp_affinity_init_place (pl); while (*p && *p != '\n') { unsigned long first, last; errno = 0; first = strtoul (p, &p, 10); if (errno) break; last = first; if (*p == '-') { errno = 0; last = strtoul (p + 1, &p, 10); if (errno || last < first) break; } for (; first <= last; first++) if (CPU_ISSET_S (first, gomp_cpuset_size, copy) && gomp_affinity_add_cpus (pl, first, 1, 0, true)) { CPU_CLR_S (first, gomp_cpuset_size, copy); if (first == i) seen_i = true; } if (*p == ',') ++p; } if (seen_i) gomp_places_list_len++; } fclose (f); } } if (gomp_places_list_len == 0) { if (!quiet) gomp_error ("Error reading %s topology", level == 2 ? "core" : "socket"); free (gomp_places_list); gomp_places_list = NULL; return false; } return true; } return false; }
void gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, struct gomp_work_share *work_share) { struct gomp_thread_start_data *start_data; struct gomp_thread *thr, *nthr; struct gomp_team *team; bool nested; unsigned i, n, old_threads_used = 0; pthread_attr_t thread_attr, *attr; thr = gomp_thread (); nested = thr->ts.team != NULL; team = new_team (nthreads, work_share); /* Always save the previous state, even if this isn't a nested team. In particular, we should save any work share state from an outer orphaned work share construct. */ team->prev_ts = thr->ts; thr->ts.team = team; thr->ts.work_share = work_share; thr->ts.team_id = 0; thr->ts.work_share_generation = 0; thr->ts.static_trip = 0; if (nthreads == 1) return; i = 1; /* We only allow the reuse of idle threads for non-nested PARALLEL regions. This appears to be implied by the semantics of threadprivate variables, but perhaps that's reading too much into things. Certainly it does prevent any locking problems, since only the initial program thread will modify gomp_threads. */ if (!nested) { old_threads_used = gomp_threads_used; if (nthreads <= old_threads_used) n = nthreads; else if (old_threads_used == 0) { n = 0; gomp_barrier_init (&gomp_threads_dock, nthreads); } else { n = old_threads_used; /* Increase the barrier threshold to make sure all new threads arrive before the team is released. */ gomp_barrier_reinit (&gomp_threads_dock, nthreads); } /* Not true yet, but soon will be. We're going to release all threads from the dock, and those that aren't part of the team will exit. */ gomp_threads_used = nthreads; /* Release existing idle threads. */ for (; i < n; ++i) { nthr = gomp_threads[i]; nthr->ts.team = team; nthr->ts.work_share = work_share; nthr->ts.team_id = i; nthr->ts.work_share_generation = 0; nthr->ts.static_trip = 0; nthr->fn = fn; nthr->data = data; team->ordered_release[i] = &nthr->release; } if (i == nthreads) goto do_release; /* If necessary, expand the size of the gomp_threads array. It is expected that changes in the number of threads is rare, thus we make no effort to expand gomp_threads_size geometrically. */ if (nthreads >= gomp_threads_size) { gomp_threads_size = nthreads + 1; gomp_threads = gomp_realloc (gomp_threads, gomp_threads_size * sizeof (struct gomp_thread_data *)); } } attr = &gomp_thread_attr; if (gomp_cpu_affinity != NULL) { size_t stacksize; pthread_attr_init (&thread_attr); pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED); if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize)) pthread_attr_setstacksize (&thread_attr, stacksize); attr = &thread_attr; } start_data = gomp_alloca (sizeof (struct gomp_thread_start_data) * (nthreads-i)); /* Launch new threads. */ for (; i < nthreads; ++i, ++start_data) { pthread_t pt; int err; start_data->ts.team = team; start_data->ts.work_share = work_share; start_data->ts.team_id = i; start_data->ts.work_share_generation = 0; start_data->ts.static_trip = 0; start_data->fn = fn; start_data->fn_data = data; start_data->nested = nested; if (gomp_cpu_affinity != NULL) gomp_init_thread_affinity (attr); err = pthread_create (&pt, attr, gomp_thread_start, start_data); if (err != 0) gomp_fatal ("Thread creation failed: %s", strerror (err)); } if (gomp_cpu_affinity != NULL) pthread_attr_destroy (&thread_attr); do_release: gomp_barrier_wait (nested ? &team->barrier : &gomp_threads_dock); /* Decrease the barrier threshold to match the number of threads that should arrive back at the end of this team. The extra threads should be exiting. Note that we arrange for this test to never be true for nested teams. */ if (nthreads < old_threads_used) gomp_barrier_reinit (&gomp_threads_dock, nthreads); }
void GOACC_parallel_keyed (int device, void (*fn) (void *), size_t mapnum, void **hostaddrs, size_t *sizes, unsigned short *kinds, ...) { bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; va_list ap; struct goacc_thread *thr; struct gomp_device_descr *acc_dev; struct target_mem_desc *tgt; void **devaddrs; unsigned int i; struct splay_tree_key_s k; splay_tree_key tgt_fn_key; void (*tgt_fn); int async = GOMP_ASYNC_SYNC; unsigned dims[GOMP_DIM_MAX]; unsigned tag; #ifdef HAVE_INTTYPES_H gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n", __FUNCTION__, (uint64_t) mapnum, hostaddrs, sizes, kinds); #else gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p\n", __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds); #endif goacc_lazy_initialize (); thr = goacc_thread (); acc_dev = thr->dev; /* Host fallback if "if" clause is false or if the current device is set to the host. */ if (host_fallback) { goacc_save_and_set_bind (acc_device_host); fn (hostaddrs); goacc_restore_bind (); return; } else if (acc_device_type (acc_dev->type) == acc_device_host) { fn (hostaddrs); return; } va_start (ap, kinds); /* TODO: This will need amending when device_type is implemented. */ while ((tag = va_arg (ap, unsigned)) != 0) { if (GOMP_LAUNCH_DEVICE (tag)) gomp_fatal ("device_type '%d' offload parameters, libgomp is too old", GOMP_LAUNCH_DEVICE (tag)); switch (GOMP_LAUNCH_CODE (tag)) { case GOMP_LAUNCH_DIM: { unsigned mask = GOMP_LAUNCH_OP (tag); for (i = 0; i != GOMP_DIM_MAX; i++) if (mask & GOMP_DIM_MASK (i)) dims[i] = va_arg (ap, unsigned); } break; case GOMP_LAUNCH_ASYNC: { /* Small constant values are encoded in the operand. */ async = GOMP_LAUNCH_OP (tag); if (async == GOMP_LAUNCH_OP_MAX) async = va_arg (ap, unsigned); break; } case GOMP_LAUNCH_WAIT: { unsigned num_waits = GOMP_LAUNCH_OP (tag); if (num_waits) goacc_wait (async, num_waits, &ap); break; } default: gomp_fatal ("unrecognized offload code '%d'," " libgomp is too old", GOMP_LAUNCH_CODE (tag)); } } va_end (ap); acc_dev->openacc.async_set_async_func (async); if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC)) { k.host_start = (uintptr_t) fn; k.host_end = k.host_start + 1; gomp_mutex_lock (&acc_dev->lock); tgt_fn_key = splay_tree_lookup (&acc_dev->mem_map, &k); gomp_mutex_unlock (&acc_dev->lock); if (tgt_fn_key == NULL) gomp_fatal ("target function wasn't mapped"); tgt_fn = (void (*)) tgt_fn_key->tgt_offset; } else tgt_fn = (void (*)) fn; tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); devaddrs = gomp_alloca (sizeof (void *) * mapnum); for (i = 0; i < mapnum; i++) devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start + tgt->list[i].key->tgt_offset); acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, async, dims, tgt); /* If running synchronously, unmap immediately. */ if (async < acc_async_noval) gomp_unmap_vars (tgt, true); else { gomp_copy_from_async (tgt); acc_dev->openacc.register_async_cleanup_func (tgt); } acc_dev->openacc.async_set_async_func (acc_async_sync); }