void acc_map_data (void *h, void *d, size_t s) { struct target_mem_desc *tgt; size_t mapnum = 1; void *hostaddrs = h; void *devaddrs = d; size_t sizes = s; unsigned short kinds = GOMP_MAP_ALLOC; goacc_lazy_initialize (); struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) { if (d != h) gomp_fatal ("cannot map data on shared-memory system"); tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, GOMP_MAP_VARS_OPENACC); } else { struct goacc_thread *thr = goacc_thread (); if (!d || !h || !s) gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map", (void *)h, (int)s, (void *)d, (int)s); gomp_mutex_lock (&acc_dev->lock); if (lookup_host (acc_dev, h, s)) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("host address [%p, +%d] is already mapped", (void *)h, (int)s); } if (lookup_dev (thr->dev->openacc.data_environ, d, s)) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d, (int)s); } gomp_mutex_unlock (&acc_dev->lock); tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes, &kinds, true, GOMP_MAP_VARS_OPENACC); } gomp_mutex_lock (&acc_dev->lock); tgt->prev = acc_dev->openacc.data_environ; acc_dev->openacc.data_environ = tgt; gomp_mutex_unlock (&acc_dev->lock); }
static void update_dev_host (int is_dev, void *h, size_t s) { splay_tree_key n; void *d; struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; gomp_mutex_lock (&acc_dev->lock); n = lookup_host (acc_dev, h, s); /* No need to call lazy open, as the data must already have been mapped. */ if (!n) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("[%p,%d] is not mapped", h, (int)s); } d = (void *) (n->tgt->tgt_start + n->tgt_offset); gomp_mutex_unlock (&acc_dev->lock); if (is_dev) acc_dev->host2dev_func (acc_dev->target_id, d, h, s); else acc_dev->dev2host_func (acc_dev->target_id, h, d, s); }
/* Return the device pointer corresponding to host address H, or NULL if
   H is not currently mapped.  On shared-memory devices the host pointer
   itself is returned.  */

void *
acc_deviceptr (void *h)
{
  splay_tree_key n;
  void *d;
  void *offset;

  goacc_lazy_initialize ();

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *dev = thr->dev;

  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return h;

  gomp_mutex_lock (&dev->lock);

  /* A 1-byte extent suffices to find the block containing H.  */
  n = lookup_host (dev, h, 1);

  if (!n)
    {
      gomp_mutex_unlock (&dev->lock);
      return NULL;
    }

  /* Translate H's offset within the host block into the device block.  */
  offset = h - n->host_start;

  d = n->tgt->tgt_start + n->tgt_offset + offset;

  gomp_mutex_unlock (&dev->lock);

  return d;
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event the
   device address is mapped.  We choose to check if it is mapped, and if
   it is, to unmap it before freeing.  */

void
acc_free (void *d)
{
  splay_tree_key k;

  if (!d)
    return;

  struct goacc_thread *thr = goacc_thread ();

  assert (thr && thr->dev);

  struct gomp_device_descr *acc_dev = thr->dev;

  gomp_mutex_lock (&acc_dev->lock);

  /* We don't have to call lazy open here, as the ptr value must have
     been returned by acc_malloc.  It's not permitted to pass NULL in
     (unless you got that null from acc_malloc).  */
  if ((k = lookup_dev (acc_dev->openacc.data_environ, d, 1)))
    {
      void *offset;

      /* Translate D back to the corresponding host address so the whole
	 mapping can be torn down via acc_unmap_data.  */
      offset = d - k->tgt->tgt_start + k->tgt_offset;

      gomp_mutex_unlock (&acc_dev->lock);

      acc_unmap_data ((void *)(k->host_start + offset));
    }
  else
    gomp_mutex_unlock (&acc_dev->lock);

  acc_dev->free_func (acc_dev->target_id, d);
}
/* Switch the current host thread to device type D, initializing the
   selected device instance if necessary.  Takes acc_device_lock, and the
   per-device lock nested inside it, for the init check.  */

void
acc_set_device_type (acc_device_t d)
{
  struct gomp_device_descr *base_dev, *acc_dev;
  struct goacc_thread *thr = goacc_thread ();

  gomp_mutex_lock (&acc_device_lock);

  if (!cached_base_dev)
    gomp_init_targets_once ();

  cached_base_dev = base_dev = resolve_device (d);
  /* Select the instance chosen by a prior acc_set_device_num (or 0).  */
  acc_dev = &base_dev[goacc_device_num];

  gomp_mutex_lock (&acc_dev->lock);
  if (!acc_dev->is_initialized)
    gomp_init_device (acc_dev);
  gomp_mutex_unlock (&acc_dev->lock);

  gomp_mutex_unlock (&acc_device_lock);

  /* We're changing device type: invalidate the current thread's dev and
     base_dev pointers.  */
  if (thr && thr->base_dev != base_dev)
    {
      thr->base_dev = thr->dev = NULL;
      /* Cannot change device type while mappings from an enclosing
	 'acc data' region are live on the old device.  */
      if (thr->mapped_data)
	gomp_fatal ("acc_set_device_type in 'acc data' region");
    }

  goacc_attach_host_thread_to_device (-1);
}
/* Remove the pointer mapping rooted at host address H, optionally forcing
   a device-to-host copy of the data, synchronously or on asynchronous
   queue ASYNC.  MAPNUM is the number of map entries the original pointer
   mapping used; it determines the reference count at which the descriptor
   is considered "last" (pointer mappings carry extra references).  */

void
gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
{
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;
  splay_tree_key n;
  struct target_mem_desc *t;
  int minrefs = (mapnum == 1) ? 2 : 3;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_host (acc_dev, h, 1);

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("%p is not a mapped block", (void *)h);
    }

  gomp_debug (0, " %s: restore mappings\n", __FUNCTION__);

  t = n->tgt;

  struct target_mem_desc *tp;

  if (t->refcount == minrefs)
    {
      /* This is the last reference, so pull the descriptor off the
	 chain.  This avoids gomp_unmap_vars via gomp_unmap_tgt from
	 freeing the device memory.  */
      t->tgt_end = 0;
      t->to_free = 0;

      /* Unlink N's descriptor from the singly-linked data_environ chain.
	 NOTE: T is reused as the loop cursor here; after the break it
	 aliases n->tgt again.  */
      for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
	   tp = t, t = t->prev)
	{
	  if (n->tgt == t)
	    {
	      if (tp)
		tp->prev = t->prev;
	      else
		acc_dev->openacc.data_environ = t->prev;
	      break;
	    }
	}
    }

  if (force_copyfrom)
    t->list[0].copy_from = 1;

  gomp_mutex_unlock (&acc_dev->lock);

  /* If running synchronously, unmap immediately.  */
  if (async < acc_async_noval)
    gomp_unmap_vars (t, true);
  else
    /* Defer the unmap until the async queue drains.  */
    t->device_descr->openacc.register_async_cleanup_func (t, async);

  gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
}
/* Entry point for the OpenACC 'data' construct: map the given variables
   on DEVICE and push the resulting descriptor onto the current thread's
   mapped_data chain (popped again by GOACC_data_end).  */

void
GOACC_data_start (int device, const void *offload_table, size_t mapnum,
		  void **hostaddrs, size_t *sizes, unsigned short *kinds)
{
  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
  struct target_mem_desc *tgt;

  gomp_debug (0, "%s: mapnum=%zd, hostaddrs=%p, sizes=%p, kinds=%p\n",
	      __FUNCTION__, mapnum, hostaddrs, sizes, kinds);

  select_acc_device (device);

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  /* Host fallback or 'do nothing'.  */
  if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
      || host_fallback)
    {
      /* Still push an (empty) descriptor so GOACC_data_end has something
	 to pop.  */
      tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, false);
      tgt->prev = thr->mapped_data;
      thr->mapped_data = tgt;
      return;
    }

  gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
		       false);
  gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
  tgt->prev = thr->mapped_data;
  thr->mapped_data = tgt;
}
/* Return the host pointer corresponding to device address D, or NULL if
   D is not currently mapped.  On shared-memory devices the device pointer
   itself is returned.  Inverse of acc_deviceptr.  */

void *
acc_hostptr (void *d)
{
  splay_tree_key n;
  void *h;
  void *offset;

  goacc_lazy_initialize ();

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return d;

  gomp_mutex_lock (&acc_dev->lock);

  /* A 1-byte extent suffices to find the block containing D.  */
  n = lookup_dev (acc_dev->openacc.data_environ, d, 1);

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      return NULL;
    }

  /* Translate D's offset within the device block into the host block.  */
  offset = d - n->tgt->tgt_start + n->tgt_offset;

  h = n->host_start + offset;

  gomp_mutex_unlock (&acc_dev->lock);

  return h;
}
/* Return nonzero iff the S-byte host range starting at H lies entirely
   within a single current device mapping.  */

int
acc_is_present (void *h, size_t s)
{
  if (h == NULL || s == 0)
    return 0;

  goacc_lazy_initialize ();

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  /* With shared memory, any non-NULL address is trivially present.  */
  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return h != NULL;

  gomp_mutex_lock (&acc_dev->lock);

  splay_tree_key n = lookup_host (acc_dev, h, s);

  /* The lookup may return a block that only partially covers [H, H+S);
     treat that as not present.  */
  int present = (n != NULL);
  if (present
      && ((uintptr_t) h < n->host_start
	  || (uintptr_t) h + s > n->host_end
	  || s > n->host_end - n->host_start))
    present = 0;

  gomp_mutex_unlock (&acc_dev->lock);

  return present;
}
/* Copy S bytes for the mapped object at host address H: host-to-device
   when IS_DEV is nonzero, device-to-host otherwise.  The device lock is
   held across the transfer so the mapping cannot be torn down mid-copy.
   Shared-memory devices need no copy at all.  */

static void
update_dev_host (int is_dev, void *h, size_t s)
{
  splay_tree_key n;
  void *d;

  goacc_lazy_initialize ();

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_host (acc_dev, h, s);

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] is not mapped", h, (int)s);
    }

  /* Device address corresponding to H, accounting for H's offset within
     the mapped block.  */
  d = (void *) (n->tgt->tgt_start + n->tgt_offset
		+ (uintptr_t) h - n->host_start);

  if (is_dev)
    acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
  else
    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);

  gomp_mutex_unlock (&acc_dev->lock);
}
/* Remove the mapping whose host range starts exactly at H (a partial or
   interior address is a fatal error), unlinking its descriptor from the
   device's data environment when this is the last reference so the device
   memory itself is not freed by gomp_unmap_vars.  */

void
acc_unmap_data (void *h)
{
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  /* No need to call lazy open, as the address must have been mapped.  */

  size_t host_size;

  gomp_mutex_lock (&acc_dev->lock);

  splay_tree_key n = lookup_host (acc_dev, h, 1);
  struct target_mem_desc *t;

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("%p is not a mapped block", (void *)h);
    }

  host_size = n->host_end - n->host_start;

  if (n->host_start != (uintptr_t) h)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] surrounds %p",
		  (void *) n->host_start, (int) host_size, (void *) h);
    }

  t = n->tgt;

  if (t->refcount == 2)
    {
      struct target_mem_desc *tp;

      /* This is the last reference, so pull the descriptor off the
	 chain.  This avoids gomp_unmap_vars via gomp_unmap_tgt from
	 freeing the device memory.  */
      t->tgt_end = 0;
      t->to_free = 0;

      /* Unlink N's descriptor from the singly-linked data_environ chain.
	 NOTE: T is reused as the loop cursor; after the break it aliases
	 n->tgt again.  */
      for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
	   tp = t, t = t->prev)
	if (n->tgt == t)
	  {
	    if (tp)
	      tp->prev = t->prev;
	    else
	      acc_dev->openacc.data_environ = t->prev;
	    break;
	  }
    }

  gomp_mutex_unlock (&acc_dev->lock);

  gomp_unmap_vars (t, true);
}
/* Undo goacc_save_and_set_bind: reattach the thread to the device it was
   bound to before, and clear the saved slot.  */

attribute_hidden void
goacc_restore_bind (void)
{
  struct goacc_thread *gothr = goacc_thread ();

  gothr->dev = gothr->saved_bound_dev;
  gothr->saved_bound_dev = NULL;
}
/* Entry point for the OpenACC 'update' directive: for each mapping in
   HOSTADDRS/SIZES/KINDS, copy data to the device (FORCE_TO) or back to
   the host (FORCE_FROM), after honoring any wait clauses in the trailing
   varargs.  ASYNC selects the asynchronous queue the transfers run on.  */

void
GOACC_update (int device, const void *offload_table, size_t mapnum,
	      void **hostaddrs, size_t *sizes, unsigned short *kinds,
	      int async, int num_waits, ...)
{
  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
  size_t i;

  select_acc_device (device);

  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  /* Nothing to transfer on shared memory or host fallback.  */
  if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
      || host_fallback)
    return;

  if (num_waits > 0)
    {
      va_list ap;

      va_start (ap, num_waits);

      goacc_wait (async, num_waits, ap);

      va_end (ap);
    }

  /* Route subsequent transfers through the requested async queue.  */
  acc_dev->openacc.async_set_async_func (async);

  for (i = 0; i < mapnum; ++i)
    {
      unsigned char kind = kinds[i] & 0xff;

      switch (kind)
	{
	case GOMP_MAP_POINTER:
	case GOMP_MAP_TO_PSET:
	  /* Pointer bookkeeping entries carry no user data to update.  */
	  break;

	case GOMP_MAP_FORCE_TO:
	  acc_update_device (hostaddrs[i], sizes[i]);
	  break;

	case GOMP_MAP_FORCE_FROM:
	  acc_update_self (hostaddrs[i], sizes[i]);
	  break;

	default:
	  gomp_fatal (">>>> GOACC_update UNHANDLED kind 0x%.2x", kind);
	  break;
	}
    }

  /* Restore synchronous behavior for later calls.  */
  acc_dev->openacc.async_set_async_func (acc_async_sync);
}
void acc_wait_all (void) { struct goacc_thread *thr = goacc_thread (); if (!thr || !thr->dev) gomp_fatal ("no device active"); thr->dev->openacc.async_wait_all_func (); }
void acc_wait_async (int async1, int async2) { struct goacc_thread *thr = goacc_thread (); if (!thr || !thr->dev) gomp_fatal ("no device active"); thr->dev->openacc.async_wait_async_func (async1, async2); }
/* Temporarily rebind the current thread to the dispatcher for device
   type D, stashing the previous binding for goacc_restore_bind.  Nested
   saves are a bug (asserted).  */

attribute_hidden void
goacc_save_and_set_bind (acc_device_t d)
{
  struct goacc_thread *gothr = goacc_thread ();

  assert (gothr->saved_bound_dev == NULL);

  gothr->saved_bound_dev = gothr->dev;
  gothr->dev = dispatchers[d];
}
/* Close the innermost OpenACC 'data' region: pop its descriptor off the
   per-thread chain and tear down the mappings it created.  */

void
GOACC_data_end (void)
{
  struct goacc_thread *gothr = goacc_thread ();
  struct target_mem_desc *popped = gothr->mapped_data;

  gomp_debug (0, " %s: restore mappings\n", __FUNCTION__);
  gothr->mapped_data = popped->prev;
  gomp_unmap_vars (popped, true);
  gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
}
/* Copy S bytes from device address D into host address H.  No lazy
   initialization: D must have come from a routine that already attached
   this thread to a device.  */

void
acc_memcpy_from_device (void *h, void *d, size_t s)
{
  struct goacc_thread *gothr = goacc_thread ();

  assert (gothr && gothr->dev);

  gothr->dev->dev2host_func (gothr->dev->target_id, h, d, s);
}
/* Answer the OpenACC acc_on_device query for DEV.  When this thread is
   attached to the "host_nonshm" pseudo-device, answer for that device;
   otherwise defer to the compiler builtin.  */

int
acc_on_device (acc_device_t dev)
{
  struct goacc_thread *gothr = goacc_thread ();

  int on_nonshm = (gothr != NULL && gothr->dev != NULL
		   && acc_device_type (gothr->dev->type)
		      == acc_device_host_nonshm);
  if (on_nonshm)
    return dev == acc_device_host_nonshm || dev == acc_device_not_host;

  /* Just rely on the compiler builtin.  */
  return __builtin_acc_on_device (dev);
}
void acc_wait_all_async (int async) { if (async < acc_async_sync) gomp_fatal ("invalid async argument: %d", async); struct goacc_thread *thr = goacc_thread (); if (!thr || !thr->dev) gomp_fatal ("no device active"); thr->dev->openacc.async_wait_all_async_func (async); }
attribute_hidden void goacc_lazy_initialize (void) { struct goacc_thread *thr = goacc_thread (); if (thr && thr->dev) return; if (!cached_base_dev) acc_init (acc_device_default); else goacc_attach_host_thread_to_device (-1); }
/* Allocate S bytes of device memory on the current device; returns NULL
   for a zero-sized request.  */

void *
acc_malloc (size_t s)
{
  if (s == 0)
    return NULL;

  goacc_lazy_initialize ();

  struct goacc_thread *gothr = goacc_thread ();

  assert (gothr->dev);

  return gothr->dev->alloc_func (gothr->dev->target_id, s);
}
/* Select device instance N of device type D for the calling thread.  A
   type of zero applies N to every initialized device type and records it
   for future initializations.

   Fix: the type-zero loop previously called resolve_device (d) — i.e.
   type zero — on every iteration, so the loop variable I was never used
   and the per-type update never happened.  It now resolves each type I
   in turn.  */

void
acc_set_device_num (int n, acc_device_t d)
{
  const struct gomp_device_descr *dev;
  int num_devices;

  if (!base_dev)
    gomp_init_targets_once ();

  if ((int) d == 0)
    {
      int i;

      /* A device setting of zero sets all device types on the system to
	 use the Nth instance of that device type.  Only attempt it for
	 initialized devices though.  */
      for (i = acc_device_not_host + 1; i < _ACC_device_hwm; i++)
	{
	  dev = resolve_device ((acc_device_t) i);
	  if (dev && dev->is_initialized)
	    dev->openacc.set_device_num_func (n);
	}

      /* ...and for future calls to acc_init/acc_set_device_type, etc.  */
      goacc_device_num = n;
    }
  else
    {
      struct goacc_thread *thr = goacc_thread ();

      gomp_mutex_lock (&acc_device_lock);

      base_dev = lazy_init (d);

      num_devices = base_dev->get_num_devices_func ();

      if (n >= num_devices)
	gomp_fatal ("device %u out of range", n);

      /* If we're changing the device number, de-associate this thread
	 with the device (but don't close the device, since it may be in
	 use by other threads).  */
      if (thr && thr->dev && n != thr->dev->target_id)
	thr->dev = NULL;

      lazy_open (n);

      gomp_mutex_unlock (&acc_device_lock);
    }
}
/* Implement the wait-clause semantics shared by GOACC_update etc.: make
   queue ASYNC (or the host, when ASYNC is acc_async_sync) wait for the
   NUM_WAITS queue ids supplied in AP.  NUM_WAITS of zero means "wait for
   everything".  */

static void
goacc_wait (int async, int num_waits, va_list ap)
{
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;
  int i;

  assert (num_waits >= 0);

  /* Synchronous wait with no explicit queues: drain everything.  */
  if (async == acc_async_sync && num_waits == 0)
    {
      acc_wait_all ();
      return;
    }

  /* Synchronous wait on specific queues: block on each in turn.  */
  if (async == acc_async_sync && num_waits)
    {
      for (i = 0; i < num_waits; i++)
	{
	  int qid = va_arg (ap, int);

	  /* Skip queues that have already completed.  */
	  if (acc_async_test (qid))
	    continue;

	  acc_wait (qid);
	}
      return;
    }

  /* Default async queue with no explicit list: chain it after all.  */
  if (async == acc_async_noval && num_waits == 0)
    {
      acc_dev->openacc.async_wait_all_async_func (acc_async_noval);
      return;
    }

  /* General case: make queue ASYNC depend on each listed queue.  */
  for (i = 0; i < num_waits; i++)
    {
      int qid = va_arg (ap, int);

      if (acc_async_test (qid))
	continue;

      /* If we're waiting on the same asynchronous queue as we're
	 launching on, the queue itself will order work as
	 required, so there's no need to wait explicitly.  */
      if (qid != async)
	acc_dev->openacc.async_wait_async_func (qid, async);
    }
}
/* Host-only implementation of the acc_on_device query.  */

int
acc_on_device (acc_device_t dev)
{
  struct goacc_thread *gothr = goacc_thread ();

  /* We only want to appear to be the "host_nonshm" plugin from
     "offloaded" code -- i.e. within a parallel region.  Test a flag set
     by the openacc_parallel hook of the host_nonshm plugin to determine
     that.  */
  int in_nonshm_exec
    = (acc_get_device_type () == acc_device_host_nonshm
       && gothr && gothr->target_tls
       && ((struct nonshm_thread *) gothr->target_tls)->nonshm_exec);
  if (in_nonshm_exec)
    return dev == acc_device_host_nonshm || dev == acc_device_not_host;

  /* For OpenACC, libgomp is only built for the host, so this is
     sufficient.  */
  return dev == acc_device_host || dev == acc_device_none;
}
/* Entry point for the OpenACC 'wait' directive: wait for the listed
   asynchronous queues, or for everything when no list is given.

   Fix: goacc_wait takes its va_list by value (see its definition and the
   matching call in GOACC_update); passing &ap here did not match that
   prototype.  Pass AP directly.  */

void
GOACC_wait (int async, int num_waits, ...)
{
  if (num_waits)
    {
      va_list ap;

      va_start (ap, num_waits);

      goacc_wait (async, num_waits, ap);

      va_end (ap);
    }
  else if (async == acc_async_sync)
    acc_wait_all ();
  else if (async == acc_async_noval)
    goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval);
}
/* Attach the calling thread to device instance ORD of the current base
   device, creating the per-thread state, opening the device, and
   initializing its mapping tables on first use.  ORD of -1 selects the
   globally configured device number.  Caller must hold acc_device_lock.  */

static void
lazy_open (int ord)
{
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev;

  /* Already attached: only sanity-check the requested ordinal.  */
  if (thr && thr->dev)
    {
      assert (ord < 0 || ord == thr->dev->target_id);
      return;
    }

  assert (base_dev);

  if (ord < 0)
    ord = goacc_device_num;

  /* The OpenACC 2.0 spec leaves the runtime's behaviour when an
     out-of-range device is requested as implementation-defined
     (4.2 ACC_DEVICE_NUM).  We choose to raise an error in such a
     case.  */
  if (ord >= base_dev->get_num_devices_func ())
    gomp_fatal ("device %u does not exist", ord);

  if (!thr)
    thr = goacc_new_thread ();

  acc_dev = thr->dev = &base_dev[ord];

  assert (acc_dev->target_id == ord);

  thr->saved_bound_dev = NULL;
  thr->mapped_data = NULL;

  /* Open the device itself only once; per-thread data is always
     created.  */
  if (!acc_dev->openacc.target_data)
    acc_dev->openacc.target_data = acc_dev->openacc.open_device_func (ord);

  thr->target_tls
    = acc_dev->openacc.create_thread_data_func (acc_dev->openacc.target_data);

  acc_dev->openacc.async_set_async_func (acc_async_sync);

  /* Initialize the device's mapping tables on first attach.  */
  struct gomp_memory_mapping *mem_map = &acc_dev->mem_map;
  gomp_mutex_lock (&mem_map->lock);
  if (!mem_map->is_initialized)
    gomp_init_tables (acc_dev, mem_map);
  gomp_mutex_unlock (&mem_map->lock);
}
/* Implement acc_delete / acc_copyout: the S-byte object at host address H
   must be mapped exactly (same start and size); optionally copy it back
   to the host (when F has FLAG_COPYOUT set), then unmap and free the
   device memory.  LIBFNNAME names the user-visible entry point for error
   messages.  */

static void
delete_copyout (unsigned f, void *h, size_t s, const char *libfnname)
{
  size_t host_size;
  splay_tree_key n;
  void *d;
  struct goacc_thread *thr = goacc_thread ();
  struct gomp_device_descr *acc_dev = thr->dev;

  /* Nothing to delete or copy on shared-memory devices.  */
  if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return;

  gomp_mutex_lock (&acc_dev->lock);

  n = lookup_host (acc_dev, h, s);

  /* No need to call lazy open, as the data must already have been
     mapped.  */

  if (!n)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s);
    }

  /* Device address corresponding to H within the mapped block.  */
  d = (void *) (n->tgt->tgt_start + n->tgt_offset
		+ (uintptr_t) h - n->host_start);

  host_size = n->host_end - n->host_start;

  /* Partial deletion is not supported: H must be the whole mapping.  */
  if (n->host_start != (uintptr_t) h || host_size != s)
    {
      gomp_mutex_unlock (&acc_dev->lock);
      gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]",
		  (void *) n->host_start, (int) host_size, (void *) h,
		  (int) s);
    }

  gomp_mutex_unlock (&acc_dev->lock);

  if (f & FLAG_COPYOUT)
    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);

  acc_unmap_data (h);

  if (!acc_dev->free_func (acc_dev->target_id, d))
    gomp_fatal ("error in freeing device memory in %s", libfnname);
}
attribute_hidden void goacc_lazy_initialize (void) { struct goacc_thread *thr = goacc_thread (); if (thr && thr->dev) return; if (!base_dev) lazy_init_and_open (acc_device_default); else { gomp_mutex_lock (&acc_device_lock); lazy_open (-1); gomp_mutex_unlock (&acc_device_lock); } }
/* Allocate S bytes of device memory on the current device; returns NULL
   for a zero-sized request.  Shared-memory devices allocate straight
   from the host heap.  */

void *
acc_malloc (size_t s)
{
  if (s == 0)
    return NULL;

  goacc_lazy_initialize ();

  struct goacc_thread *gothr = goacc_thread ();

  assert (gothr->dev);

  if (gothr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
    return malloc (s);

  return gothr->dev->alloc_func (gothr->dev->target_id, s);
}