/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event the device address is mapped. We choose to check if it mapped, and if it is, to unmap it. */ void acc_free (void *d) { splay_tree_key k; if (!d) return; struct goacc_thread *thr = goacc_thread (); assert (thr && thr->dev); struct gomp_device_descr *acc_dev = thr->dev; gomp_mutex_lock (&acc_dev->lock); /* We don't have to call lazy open here, as the ptr value must have been returned by acc_malloc. It's not permitted to pass NULL in (unless you got that null from acc_malloc). */ if ((k = lookup_dev (acc_dev->openacc.data_environ, d, 1))) { void *offset; offset = d - k->tgt->tgt_start + k->tgt_offset; gomp_mutex_unlock (&acc_dev->lock); acc_unmap_data ((void *)(k->host_start + offset)); } else gomp_mutex_unlock (&acc_dev->lock); acc_dev->free_func (acc_dev->target_id, d); }
static void delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) { size_t host_size; splay_tree_key n; void *d; struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return; gomp_mutex_lock (&acc_dev->lock); n = lookup_host (acc_dev, h, s); /* No need to call lazy open, as the data must already have been mapped. */ if (!n) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s); } d = (void *) (n->tgt->tgt_start + n->tgt_offset + (uintptr_t) h - n->host_start); host_size = n->host_end - n->host_start; if (n->host_start != (uintptr_t) h || host_size != s) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]", (void *) n->host_start, (int) host_size, (void *) h, (int) s); } gomp_mutex_unlock (&acc_dev->lock); if (f & FLAG_COPYOUT) acc_dev->dev2host_func (acc_dev->target_id, h, d, s); acc_unmap_data (h); if (!acc_dev->free_func (acc_dev->target_id, d)) gomp_fatal ("error in freeing device memory in %s", libfnname); }
static void delete_copyout (unsigned f, void *h, size_t s) { size_t host_size; splay_tree_key n; void *d; struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; gomp_mutex_lock (&acc_dev->lock); n = lookup_host (acc_dev, h, s); /* No need to call lazy open, as the data must already have been mapped. */ if (!n) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s); } d = (void *) (n->tgt->tgt_start + n->tgt_offset); host_size = n->host_end - n->host_start; if (n->host_start != (uintptr_t) h || host_size != s) { gomp_mutex_unlock (&acc_dev->lock); gomp_fatal ("[%p,%d] surrounds2 [%p,+%d]", (void *) n->host_start, (int) host_size, (void *) h, (int) s); } gomp_mutex_unlock (&acc_dev->lock); if (f & FLAG_COPYOUT) acc_dev->dev2host_func (acc_dev->target_id, h, d, s); acc_unmap_data (h); acc_dev->free_func (acc_dev->target_id, d); }
/* Allocate a host buffer and a device buffer of the same size, write a
   checkpoint marker to stderr, then call acc_map_data with a size of 0
   before unmapping and releasing both buffers.  */

int
main (int argc, char **argv)
{
  const int nbytes = 256;
  unsigned char *host = malloc (nbytes);
  void *dev = acc_malloc (nbytes);

  /* Marker emitted immediately before the zero-size mapping request.  */
  fprintf (stderr, "CheCKpOInT\n");
  acc_map_data (host, dev, 0);

  acc_unmap_data (host);
  acc_free (dev);
  free (host);

  return 0;
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event the device address is mapped. We choose to check if it mapped, and if it is, to unmap it. */ void acc_free (void *d) { splay_tree_key k; if (!d) return; struct goacc_thread *thr = goacc_thread (); assert (thr && thr->dev); struct gomp_device_descr *acc_dev = thr->dev; if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) return free (d); gomp_mutex_lock (&acc_dev->lock); /* We don't have to call lazy open here, as the ptr value must have been returned by acc_malloc. It's not permitted to pass NULL in (unless you got that null from acc_malloc). */ if ((k = lookup_dev (acc_dev->openacc.data_environ, d, 1))) { void *offset; offset = d - k->tgt->tgt_start + k->tgt_offset; gomp_mutex_unlock (&acc_dev->lock); acc_unmap_data ((void *)(k->host_start + offset)); } else gomp_mutex_unlock (&acc_dev->lock); if (!acc_dev->free_func (acc_dev->target_id, d)) gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); }
/* Map a host buffer onto a device buffer of the same size, abort unless
   acc_is_present reports 1 for the whole host range, then unmap and release
   both buffers.  */

int
main (int argc, char **argv)
{
  const int nbytes = 256;
  unsigned char *host = malloc (nbytes);
  void *dev = acc_malloc (nbytes);

  acc_map_data (host, dev, nbytes);

  const int present = acc_is_present (host, nbytes);
  if (present != 1)
    abort ();

  acc_unmap_data (host);
  acc_free (dev);
  free (host);

  return 0;
}
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float atime, dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } acc_set_cuda_stream (0, stream); init_timers (1); start_timer (0); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } acc_wait (1); atime = stop_timer (0); if (atime < dtime) { fprintf (stderr, "actual time < delay time\n"); abort (); } start_timer (0); acc_wait (1); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time < delay time\n"); abort (); } acc_unmap_data (a); fini_timers (); free (a); acc_free (d_a); 
acc_shutdown (acc_device_nvidia); return 0; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } if (acc_async_test_all () != 0) { fprintf (stderr, "asynchronous operation not running\n"); abort (); } sleep ((int) (dtime / 1000.f) + 1); if (acc_async_test_all () != 1) { fprintf (stderr, "found asynchronous operation still running\n"); abort (); } acc_unmap_data (a); free (a); acc_free (d_a); acc_shutdown (acc_device_nvidia); exit (0); }