/* Report whether all asynchronous work relevant to the calling host thread
   has completed.  A stream is relevant when it is flagged multithreaded or
   when its recorded host thread is the caller.

   Returns 0 if any relevant stream's cuStreamQuery yields
   CUDA_ERROR_NOT_READY (every other query result counts as "not pending").
   Otherwise calls event_gc (true) and returns 1.  The active-stream list is
   walked with the device's stream_lock held.  */

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *cur;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();
  int all_done = 1;

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (cur = nvthd->ptx_dev->active_streams; cur != NULL; cur = cur->next)
    {
      /* Streams used only by some other host thread are not our concern.  */
      if (!(cur->multithreaded || pthread_equal (cur->host_thread, self)))
        continue;

      if (cuStreamQuery (cur->stream) == CUDA_ERROR_NOT_READY)
        {
          all_done = 0;
          break;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  /* Only collect finished events when nothing is still pending.  */
  if (!all_done)
    return 0;

  event_gc (true);

  return 1;
}
static void nvptx_wait_all (void) { CUresult r; struct ptx_stream *s; pthread_t self = pthread_self (); struct nvptx_thread *nvthd = nvptx_thread (); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); /* Wait for active streams initiated by this thread (or by multiple threads) to complete. */ for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) { if (s->multithreaded || pthread_equal (s->host_thread, self)) { r = cuStreamQuery (s->stream); if (r == CUDA_SUCCESS) continue; else if (r != CUDA_ERROR_NOT_READY) GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); r = cuStreamSynchronize (s->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); } } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); event_gc (true); }
/* Make the stream associated with ASYNC the calling thread's current
   stream.  The lookup is done with CREATE set, so the stream is created on
   demand by select_stream_for_async if it does not exist yet.  */

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  struct ptx_stream *chosen;

  chosen = select_stream_for_async (async, pthread_self (), true, NULL);
  nvthd->current_stream = chosen;
}
/* Return the CTX member of the device attached to the calling thread, or
   NULL when the thread has no plugin state or no device attached.  */

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (nvthd != NULL && nvthd->ptx_dev != NULL)
    return nvthd->ptx_dev->ctx;

  return NULL;
}
/* Return the address of the DEV member of the device attached to the
   calling thread, or NULL when the thread has no plugin state or no device
   attached.  Note that a pointer to the member is returned, not its
   value.  */

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (nvthd != NULL && nvthd->ptx_dev != NULL)
    return &nvthd->ptx_dev->dev;

  return NULL;
}
/* Record a CUDA event on the calling thread's current stream and register
   it as a PTX_EVT_ASYNC_CLEANUP event carrying TARG_MEM_DESC and ASYNC.

   The event storage is obtained from GOMP_PLUGIN_malloc and handed over to
   event_add.  CUDA failures are handled by the CUDA_CALL_ASSERT macro,
   which is defined outside this block.  */

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
{
  struct nvptx_thread *nvthd;
  CUevent *e;

  nvthd = nvptx_thread ();
  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

  /* The event is recorded behind whatever is already queued on the
     current stream.  */
  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
}
static bool nvptx_dev2host (void *h, const void *d, size_t s) { CUdeviceptr pb; size_t ps; struct nvptx_thread *nvthd = nvptx_thread (); if (!s) return true; if (!d) { GOMP_PLUGIN_error ("invalid device address"); return false; } CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); if (!pb) { GOMP_PLUGIN_error ("invalid device address"); return false; } if (!h) { GOMP_PLUGIN_error ("invalid host address"); return false; } if (d == h) { GOMP_PLUGIN_error ("invalid host or device address"); return false; } if ((void *)(d + s) > (void *)(pb + ps)) { GOMP_PLUGIN_error ("invalid size"); return false; } #ifndef DISABLE_ASYNC if (nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); event_gc (false); CUDA_CALL (cuMemcpyDtoHAsync, h, (CUdeviceptr) d, s, nvthd->current_stream->stream); CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream); event_add (PTX_EVT_MEM, e, (void *)h, 0); } else #endif CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s); return true; }
/* Return the CUDA stream handle currently associated with ASYNC for the
   calling thread.  The lookup does not create a stream.  Returns NULL when
   the thread has no plugin state, no device attached, or no stream exists
   for ASYNC.  */

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *found;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (nvthd == NULL || nvthd->ptx_dev == NULL)
    return NULL;

  found = select_stream_for_async (async, pthread_self (), false, NULL);
  if (found == NULL)
    return NULL;

  return found->stream;
}
static void nvptx_wait_all_async (int async) { CUresult r; struct ptx_stream *waiting_stream, *other_stream; CUevent *e; struct nvptx_thread *nvthd = nvptx_thread (); pthread_t self = pthread_self (); /* The stream doing the waiting. This could be the first mention of the stream, so create it if necessary. */ waiting_stream = select_stream_for_async (async, pthread_self (), true, NULL); /* Launches on the null stream already block on other streams in the context. */ if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream) return; event_gc (true); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); for (other_stream = nvthd->ptx_dev->active_streams; other_stream != NULL; other_stream = other_stream->next) { if (!other_stream->multithreaded && !pthread_equal (other_stream->host_thread, self)) continue; e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); /* Record an event on the waited-for stream. */ r = cuEventRecord (*e, other_stream->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_SYNC, e, NULL); r = cuStreamWaitEvent (waiting_stream->stream, *e, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r)); } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); }
static int nvptx_set_cuda_stream (int async, void *stream) { struct ptx_stream *oldstream; pthread_t self = pthread_self (); struct nvptx_thread *nvthd = nvptx_thread (); if (async < 0) GOMP_PLUGIN_fatal ("bad async %d", async); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); /* We have a list of active streams and an array mapping async values to entries of that list. We need to take "ownership" of the passed-in stream, and add it to our list, removing the previous entry also (if there was one) in order to prevent resource leaks. Note the potential for surprise here: maybe we should keep track of passed-in streams and leave it up to the user to tidy those up, but that doesn't work for stream handles returned from acc_get_cuda_stream above... */ oldstream = select_stream_for_async (async, self, false, NULL); if (oldstream) { if (nvthd->ptx_dev->active_streams == oldstream) nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next; else { struct ptx_stream *s = nvthd->ptx_dev->active_streams; while (s->next != oldstream) s = s->next; s->next = s->next->next; } CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream); if (!map_fini (oldstream)) GOMP_PLUGIN_fatal ("error when freeing host memory"); free (oldstream); } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); (void) select_stream_for_async (async, self, true, (CUstream) stream); return 1; }
void GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) { CUevent *e; CUresult r; struct nvptx_thread *nvthd = nvptx_thread (); e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); r = cuEventRecord (*e, nvthd->current_stream->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc); }
/* Push a new entry onto the front of the global ptx_events list.

   TYPE must be one of PTX_EVT_MEM, PTX_EVT_KNL, PTX_EVT_SYNC or
   PTX_EVT_ASYNC_CLEANUP (checked with assert).  E is stored as the entry's
   event pointer and H as its address payload; the entry is tagged with the
   ordinal of the calling thread's device.  The list itself is only touched
   with ptx_event_lock held.  */

static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *node;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC || type == PTX_EVT_ASYNC_CLEANUP);

  /* Fill the node in completely before publishing it.  */
  node = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  node->type = type;
  node->evt = e;
  node->addr = h;
  node->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  node->next = ptx_events;
  ptx_events = node;

  pthread_mutex_unlock (&ptx_event_lock);
}
void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int async, unsigned *dims, void *targ_mem_desc) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; CUresult r; int i; struct ptx_stream *dev_str; void *kargs[1]; void *hp, *dp; struct nvptx_thread *nvthd = nvptx_thread (); const char *maybe_abort_msg = "(perhaps abort was called)"; function = targ_fn->fn; dev_str = select_stream_for_async (async, pthread_self (), false, NULL); assert (dev_str == nvthd->current_stream); /* Initialize the launch dimensions. Typically this is constant, provided by the device compiler, but we must permit runtime values. */ for (i = 0; i != 3; i++) if (targ_fn->launch->dim[i]) dims[i] = targ_fn->launch->dim[i]; /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); /* Copy the array of arguments to the mapped page. */ for (i = 0; i < mapnum; i++) ((void **) hp)[i] = devaddrs[i]; /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). 
*/ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *)); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", __FUNCTION__, targ_fn->launch->fn, dims[0], dims[1], dims[2]); // OpenACC CUDA // // num_gangs nctaid.x // num_workers ntid.y // vector length ntid.x kargs[0] = &dp; r = cuLaunchKernel (function, dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); #ifndef DISABLE_ASYNC if (async < acc_async_noval) { r = cuStreamSynchronize (dev_str->stream); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); } else { CUevent *e; e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); event_gc (true); r = cuEventRecord (*e, dev_str->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_KNL, e, (void *)dev_str); } #else r = cuCtxSynchronize (); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); #endif GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, targ_fn->launch->fn); #ifndef DISABLE_ASYNC if (async < acc_async_noval) #endif map_pop (dev_str); }
static void event_gc (bool memmap_lockable) { struct ptx_event *ptx_event = ptx_events; struct nvptx_thread *nvthd = nvptx_thread (); pthread_mutex_lock (&ptx_event_lock); while (ptx_event != NULL) { CUresult r; struct ptx_event *e = ptx_event; ptx_event = ptx_event->next; if (e->ord != nvthd->ptx_dev->ord) continue; r = cuEventQuery (*e->evt); if (r == CUDA_SUCCESS) { CUevent *te; te = e->evt; switch (e->type) { case PTX_EVT_MEM: case PTX_EVT_SYNC: break; case PTX_EVT_KNL: map_pop (e->addr); break; case PTX_EVT_ASYNC_CLEANUP: { /* The function gomp_plugin_async_unmap_vars needs to claim the memory-map splay tree lock for the current device, so we can't call it when one of our callers has already claimed the lock. In that case, just delay the GC for this event until later. */ if (!memmap_lockable) continue; GOMP_PLUGIN_async_unmap_vars (e->addr); } break; } cuEventDestroy (*te); free ((void *)te); if (ptx_events == e) ptx_events = ptx_events->next; else { struct ptx_event *e_ = ptx_events; while (e_->next != e) e_ = e_->next; e_->next = e_->next->next; } free (e); } } pthread_mutex_unlock (&ptx_event_lock); }
/* Return the stream associated with the OpenACC async value ASYNC on the
   calling thread's device.

   THREAD is the host thread on whose behalf the lookup is made.  If CREATE
   is true the stream is created on demand, the device's stream_lock is
   taken for the duration of the lookup, and the result is never NULL
   (asserted).  If EXISTING is non-null (together with CREATE), a new
   ptx_stream wrapping that externally provided CUDA stream is installed
   for ASYNC instead of creating a fresh CUDA stream.

   If CREATE is false no lock is taken here, NULL is returned when no
   stream exists for ASYNC, and it is a fatal error for THREAD to look up a
   non-multithreaded stream that is recorded as belonging to another
   thread.  A negative ASYNC that does not map to the null stream is
   reported through GOMP_PLUGIN_fatal.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it
     doesn't (because the null stream enforces overly-strict synchronisation
     with respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      /* Grow the async -> stream table so that index ASYNC is valid; new
         slots start out empty.  */
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
                                   cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          /* Publish the stream on the active list and in the table.  */
          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed
         to only wait for asynchronous launches from the same host thread they
         are invoked on.  If multiple threads use the same async value, we
         make note of that here and fall back to testing/waiting for all
         threads in those functions.

         pthread_t is an opaque type, so thread identities are compared with
         pthread_equal rather than with the != operator.  */
      if (!pthread_equal (thread, stream->host_thread))
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
static void * nvptx_dev2host (void *h, const void *d, size_t s) { CUresult r; CUdeviceptr pb; size_t ps; struct nvptx_thread *nvthd = nvptx_thread (); if (!s) return 0; if (!d) GOMP_PLUGIN_fatal ("invalid device address"); r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r)); if (!pb) GOMP_PLUGIN_fatal ("invalid device address"); if (!h) GOMP_PLUGIN_fatal ("invalid host address"); if (d == h) GOMP_PLUGIN_fatal ("invalid host or device address"); if ((void *)(d + s) > (void *)(pb + ps)) GOMP_PLUGIN_fatal ("invalid size"); #ifndef DISABLE_ASYNC if (nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e; e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r)); event_gc (false); r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, nvthd->current_stream->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r)); r = cuEventRecord (*e, nvthd->current_stream->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_MEM, e, (void *)h); } else #endif { r = cuMemcpyDtoH (h, (CUdeviceptr)d, s); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r)); } return 0; }