static bool
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);

  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *) (d + s) > (void *) (pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      event_gc (false);
      CUDA_CALL (cuMemcpyDtoHAsync,
                 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
      event_add (PTX_EVT_MEM, e, (void *) h, 0);
    }
  else
#endif
    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);

  return true;
}
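
/* Tear down the stream state of PTX_DEV: free the async stream array,
   destroy each remaining active stream and its address map, then finalize
   and free the NULL stream.  Return TRUE on success, FALSE if any CUDA
   call fails.  */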
static bool
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  bool ret = true;
  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      ret &= map_fini (s);

      CUresult r = cuStreamDestroy (s->stream);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
          ret = false;
        }
      free (s);
    }

  ret &= map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
  return ret;
}
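
/* Unload the program described by TARGET_DATA from device number ORD,
   removing it from the device's image list and unloading its CUDA module.
   Return TRUE on success.  */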
bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        if (cuModuleUnload (image->module) != CUDA_SUCCESS)
          ret = false;
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}
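
/* Ensure the calling host thread has device N's CUDA context current,
   popping any context that is currently active for a different device.
   Return TRUE on success.  */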
static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
        {
          GOMP_PLUGIN_error ("device %d not found", n);
          return false;
        }

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}
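
/* Free the device memory at P, after verifying that P is the base address
   of an allocation known to the CUDA driver.  Return TRUE on success.  */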
static bool
nvptx_free (void *p)
{
  CUdeviceptr pb;
  size_t ps;

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  return true;
}
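
/* JIT-link the NUM_OBJS PTX objects in PTX_OBJS into a single image and
   load it as *MODULE.  Return TRUE on success, FALSE if linking or module
   loading fails.  */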
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkCreate error: %s", cuda_error (r));
      return false;
    }

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
         qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_objs->code,
                         ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
          GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
                             cuda_error (r));
          return false;
        }
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuModuleLoadData error: %s", cuda_error (r));
      return false;
    }

  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkDestroy error: %s", cuda_error (r));
      return false;
    }

  return true;
}
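
/* Open device N, creating a CUDA context for it (or sharing an existing
   one) and caching a number of device attributes.  Return the new
   ptx_device structure, or NULL on error.  */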
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
                  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  if (!init_streams_for_device (ptx_dev, async_engines))
    return NULL;

  return ptx_dev;
}
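
/* Load the (partial) program described by TARGET_DATA to device number ORD.
   Allocate and return TARGET_TABLE.  Return the number of entries in the
   table, or -1 on error.  */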
int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
                      fn_descs[i].fn);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal, &var, &bytes, module,
                      var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}