static void nvptx_wait_all (void) { CUresult r; struct ptx_stream *s; pthread_t self = pthread_self (); struct nvptx_thread *nvthd = nvptx_thread (); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); /* Wait for active streams initiated by this thread (or by multiple threads) to complete. */ for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) { if (s->multithreaded || pthread_equal (s->host_thread, self)) { r = cuStreamQuery (s->stream); if (r == CUDA_SUCCESS) continue; else if (r != CUDA_ERROR_NOT_READY) GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); r = cuStreamSynchronize (s->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); } } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); event_gc (true); }
/* Poll every active stream that the calling host thread may wait on (its
   own streams plus those marked multithreaded).  Returns 0 as soon as one
   of them reports CUDA_ERROR_NOT_READY; otherwise runs event_gc and
   returns 1.  Any other cuStreamQuery result is not treated as pending.  */
static int
nvptx_async_test_all (void)
{
  pthread_t me = pthread_self ();
  struct nvptx_thread *thr = nvptx_thread ();
  struct ptx_stream *cur;
  int all_idle = 1;

  pthread_mutex_lock (&thr->ptx_dev->stream_lock);

  for (cur = thr->ptx_dev->active_streams; cur != NULL; cur = cur->next)
    {
      /* Streams private to another host thread are ignored.  */
      if (!cur->multithreaded && !pthread_equal (cur->host_thread, me))
        continue;

      if (cuStreamQuery (cur->stream) == CUDA_ERROR_NOT_READY)
        {
          all_idle = 0;
          break;
        }
    }

  pthread_mutex_unlock (&thr->ptx_dev->stream_lock);

  if (!all_idle)
    return 0;

  event_gc (true);

  return 1;
}
static int nvptx_async_test (int async) { CUresult r; struct ptx_stream *s; s = select_stream_for_async (async, pthread_self (), false, NULL); if (!s) GOMP_PLUGIN_fatal ("unknown async %d", async); r = cuStreamQuery (s->stream); if (r == CUDA_SUCCESS) { /* The oacc-parallel.c:goacc_wait function calls this hook to determine whether all work has completed on this stream, and if so omits the call to the wait hook. If that happens, event_gc might not get called (which prevents variables from getting unmapped and their associated device storage freed), so call it here. */ event_gc (true); return 1; } else if (r == CUDA_ERROR_NOT_READY) return 0; GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); return 0; }
static void nvptx_wait_async (int async1, int async2) { CUevent *e; struct ptx_stream *s1, *s2; pthread_t self = pthread_self (); /* The stream that is waiting (rather than being waited for) doesn't necessarily have to exist already. */ s2 = select_stream_for_async (async2, self, true, NULL); s1 = select_stream_for_async (async1, self, false, NULL); if (!s1) GOMP_PLUGIN_fatal ("invalid async 1\n"); if (s1 == s2) GOMP_PLUGIN_fatal ("identical parameters"); e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); event_gc (true); CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream); event_add (PTX_EVT_SYNC, e, NULL, 0); CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0); }
static bool nvptx_dev2host (void *h, const void *d, size_t s) { CUdeviceptr pb; size_t ps; struct nvptx_thread *nvthd = nvptx_thread (); if (!s) return true; if (!d) { GOMP_PLUGIN_error ("invalid device address"); return false; } CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); if (!pb) { GOMP_PLUGIN_error ("invalid device address"); return false; } if (!h) { GOMP_PLUGIN_error ("invalid host address"); return false; } if (d == h) { GOMP_PLUGIN_error ("invalid host or device address"); return false; } if ((void *)(d + s) > (void *)(pb + ps)) { GOMP_PLUGIN_error ("invalid size"); return false; } #ifndef DISABLE_ASYNC if (nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); event_gc (false); CUDA_CALL (cuMemcpyDtoHAsync, h, (CUdeviceptr) d, s, nvthd->current_stream->stream); CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream); event_add (PTX_EVT_MEM, e, (void *)h, 0); } else #endif CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s); return true; }
/* Block until the stream associated with ASYNC has finished all of its
   queued work, then run event_gc.  An ASYNC value with no stream is
   fatal.  */
static void
nvptx_wait (int async)
{
  struct ptx_stream *stream
    = select_stream_for_async (async, pthread_self (), false, NULL);

  if (stream == NULL)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  CUDA_CALL_ASSERT (cuStreamSynchronize, stream->stream);

  event_gc (true);
}
static void nvptx_wait_all_async (int async) { CUresult r; struct ptx_stream *waiting_stream, *other_stream; CUevent *e; struct nvptx_thread *nvthd = nvptx_thread (); pthread_t self = pthread_self (); /* The stream doing the waiting. This could be the first mention of the stream, so create it if necessary. */ waiting_stream = select_stream_for_async (async, pthread_self (), true, NULL); /* Launches on the null stream already block on other streams in the context. */ if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream) return; event_gc (true); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); for (other_stream = nvthd->ptx_dev->active_streams; other_stream != NULL; other_stream = other_stream->next) { if (!other_stream->multithreaded && !pthread_equal (other_stream->host_thread, self)) continue; e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); /* Record an event on the waited-for stream. */ r = cuEventRecord (*e, other_stream->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_SYNC, e, NULL); r = cuStreamWaitEvent (waiting_stream->stream, *e, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r)); } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); }
static void nvptx_wait (int async) { CUresult r; struct ptx_stream *s; s = select_stream_for_async (async, pthread_self (), false, NULL); if (!s) GOMP_PLUGIN_fatal ("unknown async %d", async); r = cuStreamSynchronize (s->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); event_gc (true); }
void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int async, unsigned *dims, void *targ_mem_desc) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; CUresult r; int i; struct ptx_stream *dev_str; void *kargs[1]; void *hp, *dp; struct nvptx_thread *nvthd = nvptx_thread (); const char *maybe_abort_msg = "(perhaps abort was called)"; function = targ_fn->fn; dev_str = select_stream_for_async (async, pthread_self (), false, NULL); assert (dev_str == nvthd->current_stream); /* Initialize the launch dimensions. Typically this is constant, provided by the device compiler, but we must permit runtime values. */ for (i = 0; i != 3; i++) if (targ_fn->launch->dim[i]) dims[i] = targ_fn->launch->dim[i]; /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); /* Copy the array of arguments to the mapped page. */ for (i = 0; i < mapnum; i++) ((void **) hp)[i] = devaddrs[i]; /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). 
*/ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *)); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", __FUNCTION__, targ_fn->launch->fn, dims[0], dims[1], dims[2]); // OpenACC CUDA // // num_gangs nctaid.x // num_workers ntid.y // vector length ntid.x kargs[0] = &dp; r = cuLaunchKernel (function, dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); #ifndef DISABLE_ASYNC if (async < acc_async_noval) { r = cuStreamSynchronize (dev_str->stream); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); } else { CUevent *e; e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); event_gc (true); r = cuEventRecord (*e, dev_str->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_KNL, e, (void *)dev_str); } #else r = cuCtxSynchronize (); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); #endif GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, targ_fn->launch->fn); #ifndef DISABLE_ASYNC if (async < acc_async_noval) #endif map_pop (dev_str); }
static void * nvptx_dev2host (void *h, const void *d, size_t s) { CUresult r; CUdeviceptr pb; size_t ps; struct nvptx_thread *nvthd = nvptx_thread (); if (!s) return 0; if (!d) GOMP_PLUGIN_fatal ("invalid device address"); r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r)); if (!pb) GOMP_PLUGIN_fatal ("invalid device address"); if (!h) GOMP_PLUGIN_fatal ("invalid host address"); if (d == h) GOMP_PLUGIN_fatal ("invalid host or device address"); if ((void *)(d + s) > (void *)(pb + ps)) GOMP_PLUGIN_fatal ("invalid size"); #ifndef DISABLE_ASYNC if (nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e; e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r)); event_gc (false); r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, nvthd->current_stream->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r)); r = cuEventRecord (*e, nvthd->current_stream->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_MEM, e, (void *)h); } else #endif { r = cuMemcpyDtoH (h, (CUdeviceptr)d, s); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r)); } return 0; }