int
acc_is_present (void *h, size_t s)
{
  splay_tree_key n;

  if (!s || !h)
    return 0;

  goacc_lazy_initialize ();

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return h != NULL;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_host (acc_dev, h, s);

  if (n && ((uintptr_t) h < n->host_start
            || (uintptr_t) h + s > n->host_end
            || s > n->host_end - n->host_start))
    n = NULL;

  gomp_mutex_unlock (&acc_dev->lock);

  return n != NULL;
}
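/* Usage sketch (not part of libgomp): how acc_is_present behaves around a
   copyin/delete pair.  acc_copyin and acc_delete are the standard OpenACC
   entry points from <openacc.h>; the assertions assume a discrete
   (non-shared-memory) device, since on shared-memory devices every non-NULL
   pointer counts as present.  */
#include <openacc.h>
#include <assert.h>

static void
present_example (void)
{
  static float a[1024];

  assert (!acc_is_present (a, sizeof a));
  acc_copyin (a, sizeof a);       /* Map A and copy it to the device.  */
  assert (acc_is_present (a, sizeof a));
  acc_delete (a, sizeof a);       /* Unmap; the device copy is freed.  */
  assert (!acc_is_present (a, sizeof a));
}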
void
GOMP_critical_start (void)
{
  /* There is an implicit flush on entry to a critical region.  */
  __atomic_thread_fence (MEMMODEL_RELEASE);
  gomp_mutex_lock (&default_lock);
}
void *
acc_hostptr (void *d)
{
  splay_tree_key n;
  void *h;
  void *offset;

  goacc_lazy_initialize ();

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return d;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_dev (acc_dev->openacc.data_environ, d, 1);

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      return NULL;
    }

  offset = d - n->tgt->tgt_start + n->tgt_offset;

  h = n->host_start + offset;

  gomp_mutex_unlock (&acc_dev->lock);

  return h;
}
void *
acc_deviceptr (void *h)
{
  splay_tree_key n;
  void *d;
  void *offset;

  goacc_lazy_initialize ();

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *dev = thr->dev;

  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return h;

  gomp_mutex_lock (&dev->lock);

  n = lookup_host (dev, h, 1);

  if (!n)
    {
      gomp_mutex_unlock (&dev->lock);
      return NULL;
    }

  offset = h - n->host_start;

  d = n->tgt->tgt_start + n->tgt_offset + offset;

  gomp_mutex_unlock (&dev->lock);

  return d;
}
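/* Illustrative sketch (not libgomp code): for an address inside a mapped
   block, acc_deviceptr and acc_hostptr are inverses of one another, and
   interior addresses translate with their offset preserved.  Assumes a
   discrete (non-shared-memory) device and data mapped via acc_copyin.  */
#include <openacc.h>
#include <assert.h>

static void
translate_example (void)
{
  static int buf[256];

  acc_copyin (buf, sizeof buf);
  void *d = acc_deviceptr (buf);     /* Host-to-device translation.  */
  assert (acc_hostptr (d) == buf);   /* ...and the round trip back.  */
  /* Interior addresses translate too, preserving the offset.  */
  assert (acc_deviceptr (buf + 10) == (char *) d + 10 * sizeof (int));
  acc_delete (buf, sizeof buf);
}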
unsigned
GOMP_sections_start (unsigned count)
{
  struct gomp_thread *thr = gomp_thread ();
  long s, e, ret;

  if (gomp_work_share_start (false))
    {
      gomp_sections_init (thr->ts.work_share, count);
      gomp_work_share_init_done ();
    }

#ifdef HAVE_SYNC_BUILTINS
  if (gomp_iter_dynamic_next (&s, &e))
    ret = s;
  else
    ret = 0;
#else
  gomp_mutex_lock (&thr->ts.work_share->lock);
  if (gomp_iter_dynamic_next_locked (&s, &e))
    ret = s;
  else
    ret = 0;
  gomp_mutex_unlock (&thr->ts.work_share->lock);
#endif

  return ret;
}
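/* For orientation (a sketch, not actual compiler output): GCC lowers a
   sections construct into a dispatch loop over GOMP_sections_start and
   GOMP_sections_next, roughly as below.  work0/work1 are placeholder
   functions standing in for the section bodies.  */
extern void work0 (void), work1 (void);

static void
sections_lowering_sketch (void)
{
  unsigned i;

  for (i = GOMP_sections_start (2); i != 0; i = GOMP_sections_next ())
    switch (i)
      {
      case 1:
        work0 ();  /* First `#pragma omp section' body.  */
        break;
      case 2:
        work1 ();  /* Second `#pragma omp section' body.  */
        break;
      }
  GOMP_sections_end ();
}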
static struct gomp_device_descr *
acc_init_1 (acc_device_t d)
{
  struct gomp_device_descr *base_dev, *acc_dev;
  int ndevs;

  base_dev = resolve_device (d, true);

  ndevs = base_dev->get_num_devices_func ();
  if (ndevs <= 0 || goacc_device_num >= ndevs)
    acc_dev_num_out_of_range (d, goacc_device_num, ndevs);

  acc_dev = &base_dev[goacc_device_num];

  gomp_mutex_lock (&acc_dev->lock);
  if (acc_dev->is_initialized)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("device already active");
    }

  gomp_init_device (acc_dev);
  gomp_mutex_unlock (&acc_dev->lock);

  return base_dev;
}
static void
update_dev_host (int is_dev, void *h, size_t s)
{
  splay_tree_key n;
  void *d;
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_host (acc_dev, h, s);

  /* No need to call lazy open, as the data must already have been
     mapped.  */

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] is not mapped", h, (int) s);
    }

  /* Translate H to the corresponding device address, including H's offset
     within the mapped block, not merely the block's start.  */
  d = (void *) (n->tgt->tgt_start + n->tgt_offset
                + (uintptr_t) h - n->host_start);

  gomp_mutex_unlock (&acc_dev->lock);

  if (is_dev)
    acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
  else
    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
   the device address is mapped.  We choose to check if it is mapped,
   and if it is, to unmap it.  */
void
acc_free (void *d)
{
  splay_tree_key k;

  if (!d)
    return;

  struct goacc_thread *thr = goacc_thread ();

  assert (thr && thr->dev);

  struct gomp_device_descr *acc_dev = thr->dev;

  gomp_mutex_lock (&acc_dev->lock);

  /* We don't have to call lazy open here, as the ptr value must have
     been returned by acc_malloc.  It's not permitted to pass NULL in
     (unless you got that NULL from acc_malloc).  */
  if ((k = lookup_dev (acc_dev->openacc.data_environ, d, 1)))
    {
      void *offset;

      offset = d - k->tgt->tgt_start + k->tgt_offset;

      gomp_mutex_unlock (&acc_dev->lock);

      acc_unmap_data ((void *) (k->host_start + offset));
    }
  else
    gomp_mutex_unlock (&acc_dev->lock);

  acc_dev->free_func (acc_dev->target_id, d);
}
static bool
gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
                                     gomp_ull chunk_size, gomp_ull *istart,
                                     gomp_ull *iend)
{
  struct gomp_thread *thr = gomp_thread ();
  bool ret;

  if (gomp_work_share_start (false))
    {
      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
                          GFS_GUIDED, chunk_size);
      gomp_doacross_ull_init (ncounts, counts, chunk_size);
      gomp_work_share_init_done ();
    }

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
  ret = gomp_iter_ull_guided_next (istart, iend);
#else
  gomp_mutex_lock (&thr->ts.work_share->lock);
  ret = gomp_iter_ull_guided_next_locked (istart, iend);
  gomp_mutex_unlock (&thr->ts.work_share->lock);
#endif

  return ret;
}
static void
update_dev_host (int is_dev, void *h, size_t s)
{
  splay_tree_key n;
  void *d;

  goacc_lazy_initialize ();

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_host (acc_dev, h, s);

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] is not mapped", h, (int) s);
    }

  d = (void *) (n->tgt->tgt_start + n->tgt_offset
                + (uintptr_t) h - n->host_start);

  if (is_dev)
    acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
  else
    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);

  gomp_mutex_unlock (&acc_dev->lock);
}
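/* In libgomp the public OpenACC update entry points are thin wrappers
   around update_dev_host; they look essentially like this.  */
void
acc_update_device (void *h, size_t s)
{
  update_dev_host (1, h, s);
}

void
acc_update_self (void *h, size_t s)
{
  update_dev_host (0, h, s);
}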
static bool
gomp_loop_ull_guided_start (bool up, gomp_ull start, gomp_ull end,
                            gomp_ull incr, gomp_ull chunk_size,
                            gomp_ull *istart, gomp_ull *iend)
{
  struct gomp_thread *thr = gomp_thread ();
  bool ret;

  if (gomp_work_share_start (false))
    {
      gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
                          GFS_GUIDED, chunk_size);
      gomp_work_share_init_done ();
    }

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
  ret = gomp_iter_ull_guided_next (istart, iend);
#else
  gomp_mutex_lock (&thr->ts.work_share->lock);
  ret = gomp_iter_ull_guided_next_locked (istart, iend);
  gomp_mutex_unlock (&thr->ts.work_share->lock);
#endif

  return ret;
}
bool
GOMP_cancel (int which, bool do_cancel)
{
  if (!gomp_cancel_var)
    return false;

  if (!do_cancel)
    return ialias_call (GOMP_cancellation_point) (which);

  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  if (which & (GOMP_CANCEL_LOOP | GOMP_CANCEL_SECTIONS))
    {
      /* In an orphaned worksharing region, all we want to cancel is the
         current thread.  */
      if (team != NULL)
        team->work_share_cancelled = 1;
      return true;
    }
  else if (which & GOMP_CANCEL_TASKGROUP)
    {
      if (thr->task->taskgroup && !thr->task->taskgroup->cancelled)
        {
          gomp_mutex_lock (&team->task_lock);
          thr->task->taskgroup->cancelled = true;
          gomp_mutex_unlock (&team->task_lock);
        }
      return true;
    }
  team->team_cancelled = 1;
  gomp_team_barrier_cancel (team);
  return true;
}
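/* Source-level view (illustrative, not actual compiler output): the
   compiler expands `#pragma omp cancel' into a call to GOMP_cancel with
   the matching GOMP_CANCEL_* kind and the value of the if clause (true
   when absent), then branches out of the region when it returns true.
   As the gomp_cancel_var check above shows, cancellation only takes
   effect when OMP_CANCELLATION is enabled at run time.  */
static void
cancel_example (int *data, int n)
{
#pragma omp parallel for
  for (int i = 0; i < n; i++)
    {
      if (data[i] < 0)
        {
          /* -> GOMP_cancel (GOMP_CANCEL_LOOP, true)  */
#pragma omp cancel for
        }
      /* -> GOMP_cancellation_point (GOMP_CANCEL_LOOP)  */
#pragma omp cancellation point for
      data[i] *= 2;
    }
}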
void
GOMP_parallel_end (void)
{
  struct gomp_task_icv *icv = gomp_icv (false);
  if (__builtin_expect (icv->thread_limit_var != UINT_MAX, 0))
    {
      struct gomp_thread *thr = gomp_thread ();
      struct gomp_team *team = thr->ts.team;
      unsigned int nthreads = team ? team->nthreads : 1;
      gomp_team_end ();
      if (nthreads > 1)
        {
          /* If not nested, there is just one thread in the
             contention group left, no need for atomicity.  */
          if (thr->ts.team == NULL)
            thr->thread_pool->threads_busy = 1;
          else
            {
#ifdef HAVE_SYNC_BUILTINS
              __sync_fetch_and_add (&thr->thread_pool->threads_busy,
                                    1UL - nthreads);
#else
              gomp_mutex_lock (&gomp_managed_threads_lock);
              thr->thread_pool->threads_busy -= nthreads - 1;
              gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
            }
        }
    }
  else
    gomp_team_end ();
}
void
gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
{
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;
  splay_tree_key n;
  struct target_mem_desc *t;
  int minrefs = (mapnum == 1) ? 2 : 3;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_host (acc_dev, h, 1);

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("%p is not a mapped block", (void *) h);
    }

  gomp_debug (0, " %s: restore mappings\n", __FUNCTION__);

  t = n->tgt;

  struct target_mem_desc *tp;

  if (t->refcount == minrefs)
    {
      /* This is the last reference, so pull the descriptor off the
         chain.  This avoids gomp_unmap_vars via gomp_unmap_tgt from
         freeing the device memory.  */
      t->tgt_end = 0;
      t->to_free = 0;

      for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
           tp = t, t = t->prev)
        {
          if (n->tgt == t)
            {
              if (tp)
                tp->prev = t->prev;
              else
                acc_dev->openacc.data_environ = t->prev;
              break;
            }
        }
    }

  if (force_copyfrom)
    t->list[0].copy_from = 1;

  gomp_mutex_unlock (&acc_dev->lock);

  /* If running synchronously, unmap immediately.  */
  if (async < acc_async_noval)
    gomp_unmap_vars (t, true);
  else
    t->device_descr->openacc.register_async_cleanup_func (t, async);

  gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
}
void
acc_unmap_data (void *h)
{
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  /* No need to call lazy open, as the address must have been mapped.  */

  size_t host_size;

  gomp_mutex_lock (&acc_dev->lock);

  splay_tree_key n = lookup_host (acc_dev, h, 1);
  struct target_mem_desc *t;

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("%p is not a mapped block", (void *) h);
    }

  host_size = n->host_end - n->host_start;

  if (n->host_start != (uintptr_t) h)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] surrounds %p",
                  (void *) n->host_start, (int) host_size, (void *) h);
    }

  t = n->tgt;

  if (t->refcount == 2)
    {
      struct target_mem_desc *tp;

      /* This is the last reference, so pull the descriptor off the
         chain.  This avoids gomp_unmap_vars via gomp_unmap_tgt from
         freeing the device memory.  */
      t->tgt_end = 0;
      t->to_free = 0;

      for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
           tp = t, t = t->prev)
        if (n->tgt == t)
          {
            if (tp)
              tp->prev = t->prev;
            else
              acc_dev->openacc.data_environ = t->prev;
            break;
          }
    }

  gomp_mutex_unlock (&acc_dev->lock);

  gomp_unmap_vars (t, true);
}
void
GOMP_critical_start (void)
{
  gomp_mutex_lock (&default_lock);

  /* OMP v3.1, 2.8.6 p81,l16 - "At entry to critical regions".  */
  gomp_flush0 ();
}
void
acc_shutdown (acc_device_t d)
{
  gomp_mutex_lock (&acc_device_lock);

  acc_shutdown_1 (d);

  gomp_mutex_unlock (&acc_device_lock);
}
void
GOMP_atomic_start (void)
{
  gomp_mutex_lock (&atomic_lock);

  /* OMP v3.1, 2.8.6 p82,l1 - "At entry to atomic operation".
     TODO: Replace gomp_flush0 with a targeted flush containing just
     the storage locations involved in the atomic construct.  */
  gomp_flush0 ();
}
void
acc_set_device_num (int ord, acc_device_t d)
{
  struct gomp_device_descr *base_dev, *acc_dev;
  int num_devices;

  if (!cached_base_dev)
    gomp_init_targets_once ();

  if (ord < 0)
    ord = goacc_device_num;

  if ((int) d == 0)
    /* Set whatever device is being used by the current host thread to use
       device instance ORD.  It's unclear if this is supposed to affect other
       host threads too (OpenACC 2.0 (3.2.4) acc_set_device_num).  */
    goacc_attach_host_thread_to_device (ord);
  else
    {
      gomp_mutex_lock (&acc_device_lock);

      cached_base_dev = base_dev = resolve_device (d, true);

      num_devices = base_dev->get_num_devices_func ();
      if (ord >= num_devices)
        gomp_fatal ("device %u out of range", ord);
      acc_dev = &base_dev[ord];

      gomp_mutex_lock (&acc_dev->lock);
      if (!acc_dev->is_initialized)
        gomp_init_device (acc_dev);
      gomp_mutex_unlock (&acc_dev->lock);

      gomp_mutex_unlock (&acc_device_lock);

      goacc_attach_host_thread_to_device (ord);
    }

  goacc_device_num = ord;
}
void
acc_shutdown (acc_device_t d)
{
  gomp_init_targets_once ();

  gomp_mutex_lock (&acc_device_lock);

  acc_shutdown_1 (d);

  gomp_mutex_unlock (&acc_device_lock);
}
void
gomp_team_barrier_cancel (struct gomp_team *team)
{
  gomp_mutex_lock (&team->task_lock);
  if (team->barrier.generation & BAR_CANCELLED)
    {
      gomp_mutex_unlock (&team->task_lock);
      return;
    }
  team->barrier.generation |= BAR_CANCELLED;
  gomp_mutex_unlock (&team->task_lock);
  futex_wake ((int *) &team->barrier.generation, INT_MAX);
}
void
gomp_set_nest_lock_30 (omp_nest_lock_t *lock)
{
  void *me = gomp_icv (true);

  if (lock->owner != me)
    {
      gomp_mutex_lock (&lock->lock);
      lock->owner = me;
    }

  lock->count++;
}
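/* Behavioural sketch (illustrative only): gomp_set_nest_lock_30 backs
   omp_set_nest_lock, so the owning thread may re-acquire the lock; each
   set bumps `count' and only the matching number of unsets releases the
   underlying mutex.  */
#include <omp.h>

static void
nest_lock_example (void)
{
  omp_nest_lock_t nlock;

  omp_init_nest_lock (&nlock);
  omp_set_nest_lock (&nlock);    /* count 0 -> 1; takes the mutex.  */
  omp_set_nest_lock (&nlock);    /* Same owner: count 1 -> 2, no blocking.  */
  omp_unset_nest_lock (&nlock);  /* count 2 -> 1; still held.  */
  omp_unset_nest_lock (&nlock);  /* count 1 -> 0; mutex released.  */
  omp_destroy_nest_lock (&nlock);
}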
static void
lazy_init_and_open (acc_device_t d)
{
  if (!base_dev)
    gomp_init_targets_once ();

  gomp_mutex_lock (&acc_device_lock);

  base_dev = lazy_init (d);

  lazy_open (-1);

  gomp_mutex_unlock (&acc_device_lock);
}
void
acc_init (acc_device_t d)
{
  if (!cached_base_dev)
    gomp_init_targets_once ();

  gomp_mutex_lock (&acc_device_lock);

  cached_base_dev = acc_init_1 (d);

  gomp_mutex_unlock (&acc_device_lock);

  goacc_attach_host_thread_to_device (-1);
}
void
acc_set_device_num (int n, acc_device_t d)
{
  const struct gomp_device_descr *dev;
  int num_devices;

  if (!base_dev)
    gomp_init_targets_once ();

  if ((int) d == 0)
    {
      int i;

      /* A device setting of zero sets all device types on the system to use
         the Nth instance of that device type.  Only attempt it for
         initialized devices though.  */
      for (i = acc_device_not_host + 1; i < _ACC_device_hwm; i++)
        {
          dev = resolve_device ((acc_device_t) i);
          if (dev && dev->is_initialized)
            dev->openacc.set_device_num_func (n);
        }

      /* ...and for future calls to acc_init/acc_set_device_type, etc.  */
      goacc_device_num = n;
    }
  else
    {
      struct goacc_thread *thr = goacc_thread ();

      gomp_mutex_lock (&acc_device_lock);

      base_dev = lazy_init (d);

      num_devices = base_dev->get_num_devices_func ();
      if (n >= num_devices)
        gomp_fatal ("device %u out of range", n);

      /* If we're changing the device number, de-associate this thread
         with the device (but don't close the device, since it may be
         in use by other threads).  */
      if (thr && thr->dev && n != thr->dev->target_id)
        thr->dev = NULL;

      lazy_open (n);

      gomp_mutex_unlock (&acc_device_lock);
    }
}
void
acc_init (acc_device_t d)
{
  if (!base_dev)
    gomp_init_targets_once ();

  gomp_mutex_lock (&acc_device_lock);

  base_dev = acc_init_1 (d);

  lazy_open (-1);

  gomp_mutex_unlock (&acc_device_lock);
}
static bool
gomp_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend)
{
  struct gomp_thread *thr = gomp_thread ();
  int test;

  gomp_ordered_sync ();
  gomp_mutex_lock (&thr->ts.work_share->lock);
  test = gomp_iter_ull_static_next (istart, iend);
  if (test >= 0)
    gomp_ordered_static_next ();
  gomp_mutex_unlock (&thr->ts.work_share->lock);

  return test == 0;
}
static bool
gomp_loop_ull_guided_next (gomp_ull *istart, gomp_ull *iend)
{
  bool ret;

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
  ret = gomp_iter_ull_guided_next (istart, iend);
#else
  struct gomp_thread *thr = gomp_thread ();

  gomp_mutex_lock (&thr->ts.work_share->lock);
  ret = gomp_iter_ull_guided_next_locked (istart, iend);
  gomp_mutex_unlock (&thr->ts.work_share->lock);
#endif

  return ret;
}
static void
lazy_open (int ord)
{
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev;

  if (thr && thr->dev)
    {
      assert (ord < 0 || ord == thr->dev->target_id);
      return;
    }

  assert (base_dev);

  if (ord < 0)
    ord = goacc_device_num;

  /* The OpenACC 2.0 spec leaves the runtime's behaviour when an
     out-of-range device is requested as implementation-defined
     (4.2 ACC_DEVICE_NUM).  We choose to raise an error in such a case.  */
  if (ord >= base_dev->get_num_devices_func ())
    gomp_fatal ("device %u does not exist", ord);

  if (!thr)
    thr = goacc_new_thread ();

  acc_dev = thr->dev = &base_dev[ord];

  assert (acc_dev->target_id == ord);

  thr->saved_bound_dev = NULL;
  thr->mapped_data = NULL;

  if (!acc_dev->openacc.target_data)
    acc_dev->openacc.target_data = acc_dev->openacc.open_device_func (ord);

  thr->target_tls
    = acc_dev->openacc.create_thread_data_func (acc_dev->openacc.target_data);

  acc_dev->openacc.async_set_async_func (acc_async_sync);

  struct gomp_memory_mapping *mem_map = &acc_dev->mem_map;
  gomp_mutex_lock (&mem_map->lock);
  if (!mem_map->is_initialized)
    gomp_init_tables (acc_dev, mem_map);
  gomp_mutex_unlock (&mem_map->lock);
}
static void
delete_copyout (unsigned f, void *h, size_t s, const char *libfnname)
{
  size_t host_size;
  splay_tree_key n;
  void *d;
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_host (acc_dev, h, s);

  /* No need to call lazy open, as the data must already have been
     mapped.  */

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] is not mapped", (void *) h, (int) s);
    }

  d = (void *) (n->tgt->tgt_start + n->tgt_offset
                + (uintptr_t) h - n->host_start);

  host_size = n->host_end - n->host_start;

  if (n->host_start != (uintptr_t) h || host_size != s)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] surrounds [%p,+%d]",
                  (void *) n->host_start, (int) host_size,
                  (void *) h, (int) s);
    }

  gomp_mutex_unlock (&acc_dev->lock);

  if (f & FLAG_COPYOUT)
    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);

  acc_unmap_data (h);

  if (!acc_dev->free_func (acc_dev->target_id, d))
    gomp_fatal ("error in freeing device memory in %s", libfnname);
}
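/* In libgomp the public entry points are thin wrappers around
   delete_copyout, passing FLAG_COPYOUT only for the copying variant;
   they look essentially like this.  */
void
acc_delete (void *h, size_t s)
{
  delete_copyout (0, h, s, __FUNCTION__);
}

void
acc_copyout (void *h, size_t s)
{
  delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__);
}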