/*
 * Create a VampirTrace CUPTI Activity context.
 *
 * @param ctxID ID of the CUDA context
 * @param cuCtx CUDA context handle (may be NULL; the current context is used)
 * @param devID ID of the CUDA device
 *
 * @return pointer to created VampirTrace CUPTI Activity context
 */
static vt_cuptiact_ctx_t* vt_cuptiact_createContext(uint32_t ctxID,
                                                    CUcontext cuCtx,
                                                    uint32_t devID)
{
  vt_cuptiact_ctx_t* vtCtx = NULL;

  /* create new context, as it is not listed */
  vtCtx = (vt_cuptiact_ctx_t *)malloc(sizeof(vt_cuptiact_ctx_t));
  if(vtCtx == NULL)
    vt_error_msg("[CUPTI Activity] Could not allocate memory for context!");
  vtCtx->ctxID = ctxID;
  vtCtx->next = NULL;
  vtCtx->strmList = NULL;
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->buffer = NULL;
  vtCtx->vtLastGPUTime = vt_gpu_init_time;
  vtCtx->gpuIdleOn = 1;

  /*
   * Get the time synchronization factor between host and GPU time for the
   * measurement interval.
   */
  {
    VT_CUPTI_CALL(cuptiGetTimestamp(&(vtCtx->sync.gpuStart)),
                  "cuptiGetTimestamp");
    vtCtx->sync.hostStart = vt_pform_wtime();
  }

  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;

  if(cuCtx == NULL)
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), NULL);
  vtCtx->cuCtx = cuCtx;

  /* set default CUPTI stream ID (needed for memory usage and idle tracing) */
  VT_CUPTI_CALL(cuptiGetStreamId(vtCtx->cuCtx, NULL, &(vtCtx->defaultStrmID)),
                "cuptiGetStreamId");

  if(devID == (uint32_t)-1){
    CUdevice cuDev;

    /* driver API prog: correct cuDev, but result is 201 (invalid context) */
    if(CUDA_SUCCESS != cuCtxGetDevice(&cuDev)){
      devID = VT_NO_ID;
    }else{
      devID = (uint32_t)cuDev;
    }
  }

  vtCtx->devID = devID;
  vtCtx->cuDev = devID;

  /*vt_cntl_msg(1,"device id: %d", devID);*/

  return vtCtx;
}
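The sync.gpuStart / sync.hostStart pair captured above anchors the CUPTI GPU clock to the host timer. A minimal sketch of how such a pair can translate a raw GPU timestamp into host time, assuming a second synchronization point is taken later (e.g. at buffer flush); the sync_points_t type and the gpuStop/hostStop fields are illustrative, not the original VampirTrace definitions:

#include <stdint.h>

/* Hypothetical pair of synchronization points; only gpuStart/hostStart
   appear in the code above. */
typedef struct {
  uint64_t gpuStart, gpuStop;   /* cuptiGetTimestamp() values */
  uint64_t hostStart, hostStop; /* vt_pform_wtime() values */
} sync_points_t;

/* Linearly interpolate a GPU timestamp into the host time base. */
static uint64_t gpu_to_host_time(uint64_t gpuTime, const sync_points_t *s)
{
  double factor = (double)(s->hostStop - s->hostStart)
                / (double)(s->gpuStop - s->gpuStart);

  return s->hostStart + (uint64_t)((double)(gpuTime - s->gpuStart) * factor);
}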
SEXP
R_cuCtxGetDevice()
{
    SEXP r_ans = R_NilValue;
    CUdevice device;
    CUresult ans;

    ans = cuCtxGetDevice(&device);
    if(ans)
        return(R_cudaErrorInfo(ans));

    r_ans = ScalarInteger(device);

    return(r_ans);
}
static int detect_arch(const char *prefix, char *ret, CUresult *err) {
  CUdevice dev;
  int major, minor;
  int res;
  size_t sz = strlen(prefix) + 3; /* prefix + 2 digits + NUL */

  *err = cuCtxGetDevice(&dev);
  if (*err != CUDA_SUCCESS)
    return GA_IMPL_ERROR;
  *err = get_cc(dev, &major, &minor);
  if (*err != CUDA_SUCCESS)
    return GA_IMPL_ERROR;
  res = snprintf(ret, sz, "%s%d%d", prefix, major, minor);
  /* snprintf returns a negative value on error and a value >= sz when
     the output was truncated */
  if (res < 0 || (size_t)res >= sz)
    return GA_UNSUPPORTED_ERROR;
  return GA_NO_ERROR;
}
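A hypothetical usage sketch for detect_arch: building an "sm_XY" target string for the device behind the calling thread's current context. The buffer size mirrors the strlen(prefix) + 3 bound computed inside the function; the wrapper itself is illustrative:

#include <stdio.h>

static void print_target_arch(void)
{
  char arch[8]; /* "sm_" + 2 digits + NUL fits within strlen("sm_") + 3 */
  CUresult err;

  if (detect_arch("sm_", arch, &err) == GA_NO_ERROR)
    printf("compiling for %s\n", arch);
  else
    fprintf(stderr, "could not detect architecture (CUresult %d)\n", (int)err);
}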
CUresult AttachCuContext(ContextPtr* ppContext) {
	ContextPtr context(new CuContext(false));
	CUresult result = cuCtxGetCurrent(&context->_h);
	if(CUDA_SUCCESS != result || !context->_h)
		return CUDA_ERROR_INVALID_CONTEXT;

	int ordinal;
	cuCtxGetDevice(&ordinal);
	CreateCuDevice(ordinal, &context->_device);

	ppContext->swap(context);
	return CUDA_SUCCESS;
}
static CUdevice get_device_from_ctx(CUcontext ctx)
{
    // Strangely, there does not seem to be a way to get this from the
    // context without making it current. Feels hacky, possibly
    // subject to future change.
    CUcontext curCtx = 0;
    CUdevice device = 0;
    cuCtxGetCurrent(&curCtx);
    if (curCtx != ctx) {
        cuCtxPushCurrent(ctx);
    }
    cuCtxGetDevice(&device);
    if (curCtx != ctx) {
        cuCtxPopCurrent(NULL);
    }
    return device;
}
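A hypothetical usage sketch: resolving and printing the device behind a saved context handle without permanently disturbing the calling thread's current context. cuDeviceGetName is the regular driver-API call; the wrapper is illustrative:

#include <stdio.h>

static void report_ctx_device(CUcontext ctx)
{
    char name[256];
    CUdevice dev = get_device_from_ctx(ctx);

    if (cuDeviceGetName(name, sizeof(name), dev) == CUDA_SUCCESS)
        printf("context %p runs on device %d (%s)\n",
               (void *)ctx, (int)dev, name);
}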
static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
        {
          GOMP_PLUGIN_error ("device %d not found", n);
          return false;
        }

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}
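The CUDA_CALL macro used above is defined elsewhere in the plugin. A plausible sketch of its behavior, reconstructed from how the call sites use it (report via GOMP_PLUGIN_error, then make the enclosing bool-returning function fail); this is an assumption, not the actual libgomp definition:

#define CUDA_CALL(FN, ...)                                      \
  do                                                            \
    {                                                           \
      CUresult cuda_call_r_ = FN (__VA_ARGS__);                 \
      if (cuda_call_r_ != CUDA_SUCCESS)                         \
        {                                                       \
          GOMP_PLUGIN_error (#FN " error: %s",                  \
                             cuda_error (cuda_call_r_));        \
          return false;                                         \
        }                                                       \
    }                                                           \
  while (0)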
/*
 * Create a VampirTrace CUPTI context. If the CUDA context is not given, the
 * current context will be requested and used.
 *
 * @param cuCtx CUDA context
 * @param cuDev CUDA device
 * @param cuCtxID ID of the CUDA context
 * @param cuDevID ID of the CUDA device
 *
 * @return pointer to created VampirTrace CUPTI context
 */
vt_cupti_ctx_t* vt_cupti_createCtx(CUcontext cuCtx, CUdevice cuDev,
                                   uint32_t cuCtxID, uint32_t cuDevID)
{
  vt_cupti_ctx_t* vtCtx = NULL;

  /* create new context */
  vtCtx = (vt_cupti_ctx_t *)malloc(sizeof(vt_cupti_ctx_t));
  if(vtCtx == NULL)
    vt_error_msg("[CUPTI] Could not allocate memory for VT CUPTI context!");
  vtCtx->ctxID = cuCtxID;

#if (defined(VT_CUPTI_ACTIVITY) || defined(VT_CUPTI_CALLBACKS))
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->strmList = NULL;
#endif

  vtCtx->next = NULL;

  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;

  /* try to get the CUDA device (ID), if it is not given */
  if(cuDevID == VT_CUPTI_NO_DEVICE_ID){
    if(cuDev == VT_CUPTI_NO_CUDA_DEVICE){
      CUcontext cuCurrCtx;

      if(cuCtx != NULL){
        cuCtxGetCurrent(&cuCurrCtx);

        /* if the given context does not match the current one, get the
           device for the given one */
        if(cuCtx != cuCurrCtx)
          VT_CUDRV_CALL(cuCtxSetCurrent(cuCtx), NULL);
      }

      if(CUDA_SUCCESS == cuCtxGetDevice(&cuDev))
        cuDevID = (uint32_t)cuDev;

      /* reset the active context */
      if(cuCtx != NULL && cuCtx != cuCurrCtx)
        VT_CUDRV_CALL(cuCtxSetCurrent(cuCurrCtx), NULL);
    }else{
      /* no device ID, but the CUDA device is given */
      cuDevID = (uint32_t)cuDev;
    }
  }

  vtCtx->devID = cuDevID;
  vtCtx->cuDev = cuDev;

  /* get the current CUDA context, if it is not given */
  if(cuCtx == NULL)
    VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), NULL);

  /* set the CUDA context */
  vtCtx->cuCtx = cuCtx;

#if defined(VT_CUPTI_ACTIVITY)
  vtCtx->activity = NULL;
#endif

#if defined(VT_CUPTI_CALLBACKS)
  vtCtx->callbacks = NULL;
#endif

#if defined(VT_CUPTI_EVENTS)
  vtCtx->events = NULL;
#endif

  vt_cntl_msg(2, "[CUPTI] Created context for CUcontext %p, CUdevice %d",
              (void *)cuCtx, (int)cuDev);

  return vtCtx;
}
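A hypothetical usage sketch: registering the calling thread's current CUDA context under a caller-chosen context ID, letting vt_cupti_createCtx resolve both the context handle and the device via the sentinel values used above:

/* Hypothetical caller; ctx_id is an ID chosen by the tracing layer. */
static vt_cupti_ctx_t *register_current_ctx(uint32_t ctx_id)
{
  return vt_cupti_createCtx(NULL, VT_CUPTI_NO_CUDA_DEVICE,
                            ctx_id, VT_CUPTI_NO_DEVICE_ID);
}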
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id,
                         void *res) {
  cuda_context *ctx = NULL;
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }

  /* I know that 512 and 1024 are magic numbers.
     There is an indication in buffer.h, though. */
  if (prop_id < 512) {
    if (ctx == NULL)
      return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL)
      return GA_VALUE_ERROR;
  } else {
    if (k == NULL)
      return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                  k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
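A hypothetical usage sketch: querying the device name through the property interface above. Per the GA_CTX_PROP_DEVNAME case, the returned string is heap-allocated and owned by the caller:

#include <stdio.h>
#include <stdlib.h>

static void print_devname(cuda_context *ctx)
{
  char *name;

  if (cuda_property(ctx, NULL, NULL, GA_CTX_PROP_DEVNAME, &name)
      == GA_NO_ERROR) {
    printf("device: %s\n", name);
    free(name); /* the DEVNAME case malloc()s the string for the caller */
  }
}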
static gpukernel *cuda_newkernel(void *c, unsigned int count,
                                 const char **strings, const size_t *lengths,
                                 const char *fname, unsigned int argcount,
                                 const int *types, int flags, int *ret,
                                 char **err_str) {
  cuda_context *ctx = (cuda_context *)c;
  strb sb = STRB_STATIC_INIT;
  char *bin, *log = NULL;
  srckey k, *ak;
  binval *av;
  gpukernel *res;
  size_t bin_len = 0, log_len = 0;
  CUdevice dev;
  unsigned int i;
  int ptx_mode = 0;
  int binary_mode = 0;
  int major, minor;

  if (count == 0) FAIL(NULL, GA_VALUE_ERROR);

  if (flags & GA_USE_OPENCL)
    FAIL(NULL, GA_DEVSUP_ERROR);

  if (flags & GA_USE_BINARY) {
    // GA_USE_BINARY is exclusive
    if (flags & ~GA_USE_BINARY)
      FAIL(NULL, GA_INVALID_ERROR);
    // We need the length for binary data and there is only one blob.
    if (count != 1 || lengths == NULL || lengths[0] == 0)
      FAIL(NULL, GA_VALUE_ERROR);
  }

  cuda_enter(ctx);
  ctx->err = cuCtxGetDevice(&dev);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  ctx->err = cuDeviceComputeCapability(&major, &minor, dev);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  // GA_USE_CLUDA is done later
  // GA_USE_SMALL will always work
  if (flags & GA_USE_DOUBLE) {
    if (major < 1 || (major == 1 && minor < 3)) {
      cuda_exit(ctx);
      FAIL(NULL, GA_DEVSUP_ERROR);
    }
  }
  if (flags & GA_USE_COMPLEX) {
    // just for now since it is most likely broken
    cuda_exit(ctx);
    FAIL(NULL, GA_DEVSUP_ERROR);
  }
  // GA_USE_HALF should always work

  if (flags & GA_USE_PTX) {
    ptx_mode = 1;
  } else if (flags & GA_USE_BINARY) {
    binary_mode = 1;
  }

  if (binary_mode) {
    bin = memdup(strings[0], lengths[0]);
    bin_len = lengths[0];
    if (bin == NULL) {
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }
  } else {
    if (flags & GA_USE_CLUDA) {
      strb_appends(&sb, CUDA_PREAMBLE);
    }

    if (lengths == NULL) {
      for (i = 0; i < count; i++)
        strb_appends(&sb, strings[i]);
    } else {
      for (i = 0; i < count; i++) {
        if (lengths[i] == 0)
          strb_appends(&sb, strings[i]);
        else
          strb_appendn(&sb, strings[i], lengths[i]);
      }
    }

    strb_append0(&sb);

    if (strb_error(&sb)) {
      strb_clear(&sb);
      cuda_exit(ctx);
      return NULL;
    }

    if (ptx_mode) {
      bin = sb.s;
      bin_len = sb.l;
    } else {
      bin = NULL;
      if (compile_cache != NULL) {
        k.src = sb.s;
        k.len = sb.l;
        memcpy(k.arch, ctx->bin_id, BIN_ID_LEN);
        av = cache_get(compile_cache, &k);
        if (av != NULL) {
          bin = memdup(av->bin, av->len);
          bin_len = av->len;
        }
      }
      if (bin == NULL) {
        bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len,
                            &log, &log_len, ret);
      }
      if (bin == NULL) {
        if (err_str != NULL) {
          strb debug_msg = STRB_STATIC_INIT;

          // We're substituting debug_msg for a string with this first line:
          strb_appends(&debug_msg, "CUDA kernel build failure ::\n");

          /* Delete the final NUL */
          sb.l--;
          gpukernel_source_with_line_numbers(1, (const char **)&sb.s,
                                             &sb.l, &debug_msg);

          if (log != NULL) {
            strb_appends(&debug_msg, "\nCompiler log:\n");
            strb_appendn(&debug_msg, log, log_len);
            free(log);
          }
          *err_str = strb_cstr(&debug_msg);
          // *err_str will be free()d by the caller (see docs in kernel.h)
        }
        strb_clear(&sb);
        cuda_exit(ctx);
        return NULL;
      }

      if (compile_cache == NULL)
        compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash,
                                   src_free, bin_free);
      if (compile_cache != NULL) {
        ak = malloc(sizeof(*ak));
        av = malloc(sizeof(*av));
        if (ak == NULL || av == NULL) {
          free(ak);
          free(av);
          goto done;
        }
        ak->src = memdup(sb.s, sb.l);
        if (ak->src == NULL) {
          free(ak);
          free(av);
          goto done;
        }
        ak->len = sb.l;
        memmove(ak->arch, ctx->bin_id, BIN_ID_LEN);
        av->len = bin_len;
        av->bin = memdup(bin, bin_len);
        if (av->bin == NULL) {
          src_free(ak);
          free(av);
          goto done;
        }
        cache_add(compile_cache, ak, av);
      }
    done:
      strb_clear(&sb);
    }
  }

  res = calloc(1, sizeof(*res));
  if (res == NULL) {
    free(bin);
    cuda_exit(ctx);
    FAIL(NULL, GA_SYS_ERROR);
  }

  res->bin_sz = bin_len;
  res->bin = bin;
  res->refcnt = 1;

  res->argcount = argcount;
  res->types = calloc(argcount, sizeof(int));
  if (res->types == NULL) {
    _cuda_freekernel(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_MEMORY_ERROR);
  }
  memcpy(res->types, types, argcount*sizeof(int));
  res->args = calloc(argcount, sizeof(void *));
  if (res->args == NULL) {
    _cuda_freekernel(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_MEMORY_ERROR);
  }

  ctx->err = cuModuleLoadData(&res->m, bin);
  if (ctx->err != CUDA_SUCCESS) {
    _cuda_freekernel(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
  if (ctx->err != CUDA_SUCCESS) {
    _cuda_freekernel(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  res->ctx = ctx;
  ctx->refcnt++;
  cuda_exit(ctx);
  TAG_KER(res);
  return res;
}
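A hypothetical usage sketch: compiling a one-line CUDA kernel through cuda_newkernel from a NUL-terminated source string. GA_BUFFER is the libgpuarray type code assumed here for the two pointer arguments; the kernel and wrapper are illustrative:

static gpukernel *make_copy_kernel(cuda_context *ctx, int *ret)
{
  static const char *src =
    "extern \"C\" __global__ void copy(float *dst, const float *src) {\n"
    "  dst[threadIdx.x] = src[threadIdx.x];\n"
    "}\n";
  static const int types[2] = {GA_BUFFER, GA_BUFFER};

  /* one source string, lengths == NULL (NUL-terminated), no flags */
  return cuda_newkernel(ctx, 1, &src, NULL, "copy", 2, types, 0, ret, NULL);
}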
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
/*
 * Initializes a CUPTI host thread and creates the event group.
 *
 * @param ptid the VampirTrace thread id
 * @param cuCtx optionally given CUDA context
 *
 * @return the created VampirTrace CUPTI host thread structure
 */
static vt_cupti_ctx_t* vt_cupti_initCtx(uint32_t ptid, CUcontext cuCtx)
{
  vt_cupti_ctx_t *vtcuptiCtx = NULL;
  uint64_t time;

  vt_cntl_msg(2, "[CUPTI] Initializing VampirTrace CUPTI context (ptid=%d)",
              ptid);

  time = vt_pform_wtime();
  vt_enter(ptid, &time, rid_cupti_init);

  /* do not trace CUDA functions invoked here */
  VT_SUSPEND_CUDA_TRACING(ptid);

  /* initialize the CUDA driver API, if necessary, and get the context handle */
  if(cuCtx == NULL){
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif
  }

  /* get a pointer to eventIDArray */
  {
    CUresult cuErr = CUDA_SUCCESS;
    int dev_major, dev_minor;
    CUdevice cuDev = 0;
    vt_cupti_dev_t *cuptiDev;

    CHECK_CU_ERROR(cuCtxGetDevice(&cuDev), "cuCtxGetDevice");

    cuErr = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev);
    CHECK_CU_ERROR(cuErr, "cuDeviceComputeCapability");

    /* check if the device capability is already listed */
    CUPTI_LOCK();
    cuptiDev = vt_cupti_capList;
    CUPTI_UNLOCK();

    cuptiDev = vt_cupti_checkMetricList(cuptiDev, dev_major, dev_minor);
    if(cuptiDev){
      vtcuptiCtx = (vt_cupti_ctx_t*)malloc(sizeof(vt_cupti_ctx_t));
      if(vtcuptiCtx == NULL)
        vt_error_msg("malloc(sizeof(vt_cupti_ctx_t)) failed!");

      vtcuptiCtx->cuCtx = cuCtx;
      vtcuptiCtx->vtDevCap = cuptiDev;
      vtcuptiCtx->vtGrpList = NULL;
      vtcuptiCtx->counterData = NULL;
      vtcuptiCtx->cuptiEvtIDs = NULL;
      vtcuptiCtx->next = NULL;
    }else{
      time = vt_pform_wtime();
      vt_exit(ptid, &time);
      VT_RESUME_CUDA_TRACING(ptid);
      return NULL;
    }
  }

  VT_RESUME_CUDA_TRACING(ptid);

  /* create and add the VampirTrace CUPTI groups to the context */
  vt_cupti_addEvtGrpsToCtx(vtcuptiCtx);

  /* allocate memory for CUPTI counter reads */
  {
    size_t allocSize = vtcuptiCtx->vtGrpList->evtNum;

    vtcuptiCtx->counterData = (uint64_t *)malloc(allocSize*sizeof(uint64_t));
    vtcuptiCtx->cuptiEvtIDs =
        (CUpti_EventID *)malloc(allocSize*sizeof(CUpti_EventID));
  }

  /* add the VampirTrace CUPTI context entry to the list (as first element) */
  CUPTI_LOCK();
  vtcuptiCtx->next = vtcuptiCtxlist;
  vtcuptiCtxlist = vtcuptiCtx;
  CUPTI_UNLOCK();

  time = vt_pform_wtime();
  vt_exit(ptid, &time);

  return vtcuptiCtx;
}
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
	VirtQueueElement elem;

	while(virtqueue_pop(vq, &elem)) {
		struct param *p = elem.out_sg[0].iov_base;

		//for all library routines: get required arguments from the buffer,
		//execute, and push the results back into the virtqueue
		switch (p->syscall_type) {
		case CUINIT: {
			p->result = cuInit(p->flags);
			break;
		}
		case CUDRIVERGETVERSION: {
			p->result = cuDriverGetVersion(&p->val1);
			break;
		}
		case CUDEVICEGETCOUNT: {
			p->result = cuDeviceGetCount(&p->val1);
			break;
		}
		case CUDEVICEGET: {
			p->result = cuDeviceGet(&p->device, p->val1);
			break;
		}
		case CUDEVICECOMPUTECAPABILITY: {
			p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
			break;
		}
		case CUDEVICEGETNAME: {
			p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
			break;
		}
		case CUDEVICEGETATTRIBUTE: {
			p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
			break;
		}
		case CUCTXCREATE: {
			p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
			break;
		}
		case CUCTXDESTROY: {
			p->result = cuCtxDestroy(p->ctx);
			break;
		}
		case CUCTXGETCURRENT: {
			p->result = cuCtxGetCurrent(&p->ctx);
			break;
		}
		case CUCTXGETDEVICE: {
			p->result = cuCtxGetDevice(&p->device);
			break;
		}
		case CUCTXPOPCURRENT: {
			p->result = cuCtxPopCurrent(&p->ctx);
			break;
		}
		case CUCTXSETCURRENT: {
			p->result = cuCtxSetCurrent(p->ctx);
			break;
		}
		case CUCTXSYNCHRONIZE: {
			p->result = cuCtxSynchronize();
			break;
		}
		case CUMODULELOAD: {
			//hardcoded path - needs improvement
			//all .cubin files should be stored in $QEMU_NFS_PATH - currently
			//$QEMU_NFS_PATH is shared between host and guest with NFS
			char *binname = malloc(strlen((char *)elem.out_sg[1].iov_base) +
					       strlen(getenv("QEMU_NFS_PATH")) + 1);
			if (!binname) {
				p->result = 999;
				break;
			}
			strcpy(binname, getenv("QEMU_NFS_PATH"));
			strcat(binname, (char *)elem.out_sg[1].iov_base);

			//change the current CUDA context
			//each CUDA context has its own virtual memory space -
			//isolation is ensured by switching contexts
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}

			p->result = cuModuleLoad(&p->module, binname);
			free(binname);
			break;
		}
		case CUMODULEGETGLOBAL: {
			char *name = malloc(100*sizeof(char));
			if (!name) {
				p->result = 999;
				break;
			}
			strcpy(name, (char *)elem.out_sg[1].iov_base);
			p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module,
						      (const char *)name);
			free(name);
			break;
		}
		case CUMODULEUNLOAD: {
			p->result = cuModuleUnload(p->module);
			break;
		}
		case CUMEMALLOC: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuMemAlloc(&p->dptr, p->bytesize);
			break;
		}
		case CUMEMALLOCPITCH: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1,
						    p->size2, p->bytesize);
			break;
		}
		//large buffers are allocated in smaller chunks in guest kernel space
		//get each chunk separately and copy it to device memory
		case CUMEMCPYHTOD: {
			int i;
			size_t offset;
			unsigned long s, nr_pages = p->nr_pages;

			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}

			offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoD(p->dptr+offset,
							 elem.out_sg[1+2*i].iov_base, s);
				if (p->result != 0)
					break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYHTODASYNC: {
			int i;
			size_t offset;
			unsigned long s, nr_pages = p->nr_pages;

			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}

			offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoDAsync(p->dptr+offset,
							      elem.out_sg[1+2*i].iov_base,
							      s, p->stream);
				if (p->result != 0)
					break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTODASYNC: {
			p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1,
						      p->stream);
			break;
		}
		case CUMEMCPYDTOH: {
			int i;
			unsigned long s, nr_pages = p->nr_pages;

			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}

			size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base,
							 p->dptr+offset, s);
				if (p->result != 0)
					break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTOHASYNC: {
			int i;
			unsigned long s, nr_pages = p->nr_pages;

			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}

			size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base,
							      p->dptr+offset, s, p->stream);
				if (p->result != 0)
					break;
				offset += s;
			}
			break;
		}
		case CUMEMSETD32: {
			p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
			break;
		}
		case CUMEMFREE: {
			p->result = cuMemFree(p->dptr);
			break;
		}
		case CUMODULEGETFUNCTION: {
			char *name = (char *)elem.out_sg[1].iov_base;
			name[p->length] = '\0';

			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}

			p->result = cuModuleGetFunction(&p->function, p->module, name);
			break;
		}
		case CULAUNCHKERNEL: {
			void **args = malloc(p->val1*sizeof(void *));
			if (!args) {
				p->result = 9999;
				break;
			}

			int i;
			for (i=0; i<p->val1; i++) {
				args[i] = elem.out_sg[1+i].iov_base;
			}

			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}

			p->result = cuLaunchKernel(p->function,
						   p->gridDimX, p->gridDimY, p->gridDimZ,
						   p->blockDimX, p->blockDimY, p->blockDimZ,
						   p->bytecount, 0, args, 0);
			free(args);
			break;
		}
		case CUEVENTCREATE: {
			p->result = cuEventCreate(&p->event1, p->flags);
			break;
		}
		case CUEVENTDESTROY: {
			p->result = cuEventDestroy(p->event1);
			break;
		}
		case CUEVENTRECORD: {
			p->result = cuEventRecord(p->event1, p->stream);
			break;
		}
		case CUEVENTSYNCHRONIZE: {
			p->result = cuEventSynchronize(p->event1);
			break;
		}
		case CUEVENTELAPSEDTIME: {
			p->result = cuEventElapsedTime(&p->pMilliseconds,
						       p->event1, p->event2);
			break;
		}
		case CUSTREAMCREATE: {
			p->result = cuStreamCreate(&p->stream, 0);
			break;
		}
		case CUSTREAMSYNCHRONIZE: {
			p->result = cuStreamSynchronize(p->stream);
			break;
		}
		case CUSTREAMQUERY: {
			p->result = cuStreamQuery(p->stream);
			break;
		}
		case CUSTREAMDESTROY: {
			p->result = cuStreamDestroy(p->stream);
			break;
		}
		default:
			printf("Unknown syscall_type\n");
		}
		virtqueue_push(vq, &elem, 0);
	}

	//notify the frontend - trigger a virtual interrupt
	virtio_notify(vdev, vq);
	return;
}
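The dispatcher above indexes everything through a guest/host-shared struct param. A hypothetical reconstruction of that struct, inferred purely from the fields referenced in the switch; names, types, and ordering are assumptions, not the original header:

#include <cuda.h>
#include <stddef.h>

/* Hypothetical layout; it must match the guest-side definition exactly. */
struct param {
	int syscall_type;              /* which CU* call to dispatch */
	CUresult result;               /* status reported back to the guest */
	unsigned int flags;
	int val1, val2;                /* generic scalar in/out arguments */
	CUdevice device;
	CUdevice_attribute attrib;
	CUcontext ctx;
	CUmodule module;
	CUfunction function;
	CUdeviceptr dptr, dptr1;
	size_t size1, size2, size3, bytesize, length;
	unsigned int bytecount;
	unsigned long nr_pages;        /* chunk count for large copies */
	CUstream stream;
	CUevent event1, event2;
	float pMilliseconds;
	unsigned int gridDimX, gridDimY, gridDimZ;
	unsigned int blockDimX, blockDimY, blockDimZ;
};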