static void nvptx_wait_async (int async1, int async2) { CUevent *e; struct ptx_stream *s1, *s2; pthread_t self = pthread_self (); /* The stream that is waiting (rather than being waited for) doesn't necessarily have to exist already. */ s2 = select_stream_for_async (async2, self, true, NULL); s1 = select_stream_for_async (async1, self, false, NULL); if (!s1) GOMP_PLUGIN_fatal ("invalid async 1\n"); if (s1 == s2) GOMP_PLUGIN_fatal ("identical parameters"); e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); event_gc (true); CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream); event_add (PTX_EVT_SYNC, e, NULL, 0); CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0); }
/* Associate the user-supplied CUDA stream handle STREAM with the async
   queue ASYNC for the calling thread.  Takes ownership of the stream: any
   stream previously mapped to ASYNC is unlinked from the device's
   active-stream list, destroyed, and its host-side mapping freed, so that
   resources are not leaked.  Returns 1 on success; all failures are
   fatal.  */
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in
     stream, and add it to our list, removing the previous entry also (if
     there was one) in order to prevent resource leaks.  Note the potential
     for surprise here: maybe we should keep track of passed-in streams and
     leave it up to the user to tidy those up, but that doesn't work for
     stream handles returned from acc_get_cuda_stream above...  */
  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      /* Unlink OLDSTREAM from the singly-linked active-stream list,
         handling both the head-of-list and interior cases.  */
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);

      /* Release the host-side mapping buffer attached to the stream.  */
      if (!map_fini (oldstream))
        GOMP_PLUGIN_fatal ("error when freeing host memory");

      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  /* Install the user-supplied stream as the stream for ASYNC, creating the
     bookkeeping entry for it.  NOTE(review): this runs after stream_lock is
     released — confirm select_stream_for_async performs its own locking.  */
  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
static int nvptx_async_test (int async) { CUresult r; struct ptx_stream *s; s = select_stream_for_async (async, pthread_self (), false, NULL); if (!s) GOMP_PLUGIN_fatal ("unknown async %d", async); r = cuStreamQuery (s->stream); if (r == CUDA_SUCCESS) { /* The oacc-parallel.c:goacc_wait function calls this hook to determine whether all work has completed on this stream, and if so omits the call to the wait hook. If that happens, event_gc might not get called (which prevents variables from getting unmapped and their associated device storage freed), so call it here. */ event_gc (true); return 1; } else if (r == CUDA_ERROR_NOT_READY) return 0; GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); return 0; }
/* Make the stream mapped to ASYNC (created on demand) the calling thread's
   current stream, so subsequent operations are queued on it.  */

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  struct ptx_stream *stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  nvthd->current_stream = stream;
}
/* Return the raw CUstream handle mapped to ASYNC for the calling thread,
   or NULL when no device/thread state exists or no stream is mapped.  */

static void *
nvptx_get_cuda_stream (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  struct ptx_stream *stream;

  /* No device has been initialized for this thread: nothing to return.  */
  if (nvthd == NULL || nvthd->ptx_dev == NULL)
    return NULL;

  stream = select_stream_for_async (async, pthread_self (), false, NULL);
  if (stream == NULL)
    return NULL;

  return stream->stream;
}
/* Block the host until every operation queued on the stream associated with
   ASYNC has completed, then reclaim any completed events.
   NOTE(review): this file contains a second, conflicting definition of
   nvptx_wait below (the variant that checks cuStreamSynchronize's result
   by hand); the two appear to come from different revisions and only one
   can be kept — confirm which is current.  */
static void
nvptx_wait (int async)
{
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);

  /* Free resources for any events that have now completed.  */
  event_gc (true);
}
static void nvptx_wait_all_async (int async) { CUresult r; struct ptx_stream *waiting_stream, *other_stream; CUevent *e; struct nvptx_thread *nvthd = nvptx_thread (); pthread_t self = pthread_self (); /* The stream doing the waiting. This could be the first mention of the stream, so create it if necessary. */ waiting_stream = select_stream_for_async (async, pthread_self (), true, NULL); /* Launches on the null stream already block on other streams in the context. */ if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream) return; event_gc (true); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); for (other_stream = nvthd->ptx_dev->active_streams; other_stream != NULL; other_stream = other_stream->next) { if (!other_stream->multithreaded && !pthread_equal (other_stream->host_thread, self)) continue; e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); /* Record an event on the waited-for stream. */ r = cuEventRecord (*e, other_stream->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_SYNC, e, NULL); r = cuStreamWaitEvent (waiting_stream->stream, *e, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r)); } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); }
/* Block the host until every operation queued on the stream associated with
   ASYNC has completed, then reclaim any completed events.
   NOTE(review): this duplicates the earlier CUDA_CALL_ASSERT-based
   definition of nvptx_wait above; the two variants appear to come from
   different revisions and only one can be kept — confirm which is
   current.  */
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  /* Free resources for any events that have now completed.  */
  event_gc (true);
}
void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int async, unsigned *dims, void *targ_mem_desc) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; CUresult r; int i; struct ptx_stream *dev_str; void *kargs[1]; void *hp, *dp; struct nvptx_thread *nvthd = nvptx_thread (); const char *maybe_abort_msg = "(perhaps abort was called)"; function = targ_fn->fn; dev_str = select_stream_for_async (async, pthread_self (), false, NULL); assert (dev_str == nvthd->current_stream); /* Initialize the launch dimensions. Typically this is constant, provided by the device compiler, but we must permit runtime values. */ for (i = 0; i != 3; i++) if (targ_fn->launch->dim[i]) dims[i] = targ_fn->launch->dim[i]; /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); /* Copy the array of arguments to the mapped page. */ for (i = 0; i < mapnum; i++) ((void **) hp)[i] = devaddrs[i]; /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). 
*/ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *)); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", __FUNCTION__, targ_fn->launch->fn, dims[0], dims[1], dims[2]); // OpenACC CUDA // // num_gangs nctaid.x // num_workers ntid.y // vector length ntid.x kargs[0] = &dp; r = cuLaunchKernel (function, dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); #ifndef DISABLE_ASYNC if (async < acc_async_noval) { r = cuStreamSynchronize (dev_str->stream); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); } else { CUevent *e; e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); event_gc (true); r = cuEventRecord (*e, dev_str->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_KNL, e, (void *)dev_str); } #else r = cuCtxSynchronize (); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); #endif GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, targ_fn->launch->fn); #ifndef DISABLE_ASYNC if (async < acc_async_noval) #endif map_pop (dev_str); }