static void goacc_wait (int async, int num_waits, va_list ap) { struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; int i; assert (num_waits >= 0); if (async == acc_async_sync && num_waits == 0) { acc_wait_all (); return; } if (async == acc_async_sync && num_waits) { for (i = 0; i < num_waits; i++) { int qid = va_arg (ap, int); if (acc_async_test (qid)) continue; acc_wait (qid); } return; } if (async == acc_async_noval && num_waits == 0) { acc_dev->openacc.async_wait_all_async_func (acc_async_noval); return; } for (i = 0; i < num_waits; i++) { int qid = va_arg (ap, int); if (acc_async_test (qid)) continue; /* If we're waiting on the same asynchronous queue as we're launching on, the queue itself will order work as required, so there's no need to wait explicitly. */ if (qid != async) acc_dev->openacc.async_wait_async_func (qid, async); } }
int main (int argc, char **argv) { float atime; CUstream stream; CUresult r; acc_init (acc_device_nvidia); (void) acc_get_device_num (acc_device_nvidia); init_timers (1); stream = (CUstream) acc_get_cuda_stream (0); if (stream != NULL) abort (); r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); start_timer (0); acc_wait_all_async (0); acc_wait (0); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time too long\n"); abort (); } fini_timers (); acc_shutdown (acc_device_nvidia); exit (0); }
int main (int argc, char **argv) { CUstream stream; CUresult r; struct timeval tv1, tv2; time_t t1; acc_init (acc_device_nvidia); stream = (CUstream) acc_get_cuda_stream (0); if (stream != NULL) abort (); r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); gettimeofday (&tv1, NULL); acc_wait_all_async (0); acc_wait (0); gettimeofday (&tv2, NULL); t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec); if (t1 > 1000) { fprintf (stderr, "too long\n"); abort (); } acc_shutdown (acc_device_nvidia); exit (0); }
static void goacc_wait (int async, int num_waits, va_list *ap) { struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; while (num_waits--) { int qid = va_arg (*ap, int); if (acc_async_test (qid)) continue; if (async == acc_async_sync) acc_wait (qid); else if (qid == async) ;/* If we're waiting on the same asynchronous queue as we're launching on, the queue itself will order work as required, so there's no need to wait explicitly. */ else acc_dev->openacc.async_wait_async_func (qid, async); } }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float atime, dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } acc_set_cuda_stream (0, stream); init_timers (1); start_timer (0); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } acc_wait (1); atime = stop_timer (0); if (atime < dtime) { fprintf (stderr, "actual time < delay time\n"); abort (); } start_timer (0); acc_wait (1); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time < delay time\n"); abort (); } acc_unmap_data (a); fini_timers (); free (a); acc_free (d_a); acc_shutdown (acc_device_nvidia); return 0; }