// Ensure a CUDA graphics resource is registered for GL texture `tex` of the
// given NV12 plane (must be 0 or 1, per the assert below) with dimensions
// w/h and padded height H. Creates the interop CUDA context (and per-plane
// streams, when USE_STREAM) on first use. Returns false on any CUDA failure
// (CUDA_ENSURE presumably returns `false` on error -- defined elsewhere).
bool GLInteropResource::ensureResource(int w, int h, int H, GLuint tex, int plane) {
    Q_ASSERT(plane < 2 && "plane number must be 0 or 1 for NV12");
    TexRes &r = res[plane];
    // Fast path: this texture/geometry combination is already registered.
    if (r.texture == tex && r.w == w && r.h == h && r.H == H && r.cuRes)
        return true;
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
        if (USE_STREAM) {
            CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
            CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
        }
        qDebug("cuda contex on gl thread: %p", ctx);
        // cuCtxCreate leaves the new context current; detach it again.
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    // Geometry or texture changed: drop the stale registration first.
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }
    // CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD works too for opengl, but not d3d
    CUDA_ENSURE(cuGraphicsGLRegisterImage(&r.cuRes, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    // Cache the registered state for the fast path above.
    r.texture = tex;
    r.w = w;
    r.h = h;
    r.H = H;
    return true;
}
// Return a host pointer holding this buffer's current contents, copying back
// from whichever device (OpenCL or CUDA) last wrote it if the host copy is
// stale. `read_byte_size` is the number of bytes transferred; the host
// allocation itself is always the full `byte_size`, 64-byte aligned.
void *get_read_ptr_host(ComputeEnv *env, size_t read_byte_size) {
    if (host_valid) {
        return host_ptr;
    }
    // Lazily allocate the aligned host backing store.
    if (host_ptr == nullptr) {
        host_ptr = _mm_malloc(byte_size, 64);
    }
    if (last_write.type == Processor::OpenCL) {
        OpenCLDev *dev = &env->cl_dev_list[last_write.devid];
        // Blocking read (CL_TRUE): data is in host_ptr when this returns.
        // NOTE(review): the return status is not checked here.
        clEnqueueReadBuffer(dev->queue, cl_ptr_list[last_write.devid], CL_TRUE, 0, read_byte_size, host_ptr, 0, nullptr, nullptr);
    } else if (last_write.type == Processor::CUDA) {
        CUDADev *dev = &env->cuda_dev_list[last_write.devid];
        // Make the owning device's context current for the synchronous copy.
        cuCtxPushCurrent(dev->context);
        //double t0 = getsec();
        cuMemcpyDtoH(host_ptr, cuda_ptr_list[last_write.devid], read_byte_size);
        //double t1 = getsec();
        //env->transfer_wait = t1-t0;
        CUcontext old;
        cuCtxPopCurrent(&old);
    } else {
        // Unknown writer: the buffer state is corrupt.
        abort();
    }
    host_valid = true;
    return host_ptr;
}
/* Leave a CUDA context scope previously entered on this thread.
 * Decrements the nesting counter; the context is only popped off the
 * thread's context stack when the outermost enter/exit pair completes. */
void cuda_exit(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  assert(ctx->enter > 0);
  if (--ctx->enter == 0)
    cuCtxPopCurrent(NULL);
}
// Return a device pointer on CUDA device `devid` holding this buffer's data,
// uploading `read_byte_size` bytes from the host copy if the device copy is
// stale. The device allocation is lazy and sized to the full `byte_size`.
CUdeviceptr get_read_ptr_cuda(ComputeEnv *env,int devid, size_t read_byte_size) {
    if (cuda_valid_list[devid]) {
        return cuda_ptr_list[devid];
    }
    // A stale device copy can only be refreshed from a valid host copy;
    // anything else is a state-machine bug.
    if (host_valid == false) {
        /* xx */
        abort();
        return 0;
    }
    CUDADev *dev = &env->cuda_dev_list[devid];
    cuCtxPushCurrent(dev->context);
    // Lazily allocate the device buffer on first use.
    if (cuda_ptr_list[devid] == 0) {
        CUresult err;
        err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
        if (err != CUDA_SUCCESS) {
            abort();
        }
    }
    //double t0 = getsec();
    cuMemcpyHtoD(cuda_ptr_list[devid], host_ptr, read_byte_size);
    //double t1 = getsec();
    //env->transfer_wait = t1-t0;
    cuda_valid_list[devid] = true;
    CUcontext old;
    cuCtxPopCurrent(&old);
    return cuda_ptr_list[devid];
}
/* Create a CUDA context on `dev` with scheduling flags derived from `flags`
 * and wrap it in a cuda_context. Requires unified addressing support.
 * CHKFAIL/FAIL return NULL on error (they presumably consult the hidden
 * `err` variable and store a code via `ret` -- defined elsewhere; confirm). */
static void *do_init(CUdevice dev, int flags, int *ret) {
  cuda_context *res;
  CUcontext ctx;
  unsigned int fl = CU_CTX_SCHED_AUTO;
  int i;
  CHKFAIL(NULL);
  /* Map context-level threading hints onto driver scheduling modes. */
  if (flags & GA_CTX_SINGLE_THREAD)
    fl = CU_CTX_SCHED_SPIN;
  if (flags & GA_CTX_MULTI_THREAD)
    fl = CU_CTX_SCHED_YIELD;
  /* This backend requires unified addressing. */
  err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
  CHKFAIL(NULL);
  if (i != 1)
    FAIL(NULL, GA_UNSUPPORTED_ERROR);
  err = cuCtxCreate(&ctx, fl, dev);
  CHKFAIL(NULL);
  res = cuda_make_ctx(ctx, 0);
  if (res == NULL) {
    /* We own the freshly created context; destroy it on wrap failure. */
    cuCtxDestroy(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }
  res->flags |= flags;
  /* Don't leave the context on the thread stack */
  cuCtxPopCurrent(NULL);
  return res;
}
// Pre-allocate backing storage for this buffer: always on the host, and
// additionally on the target device when the target processor is OpenCL or
// CUDA. Returns false if any allocation fails (the host pointer is kept).
bool prealloc(ComputeEnv *env) {
    int devid;
    // Host copy is always allocated, 64-byte aligned.
    if (host_ptr == nullptr) {
        host_ptr = _mm_malloc(byte_size, 64);
        if (host_ptr == nullptr) {
            return false;
        }
    }
    switch (env->target_processor.type) {
    case W2XCONV_PROC_HOST:
        // Host-only processing needs nothing beyond host_ptr.
        break;
    case W2XCONV_PROC_OPENCL:
        devid = env->target_processor.devid;
        if (cl_ptr_list[devid] == nullptr) {
            cl_int err;
            OpenCLDev *dev = &env->cl_dev_list[devid];
            cl_ptr_list[devid] = clCreateBuffer(dev->context, CL_MEM_READ_WRITE, byte_size, nullptr, &err);
            if (cl_ptr_list[devid] == nullptr) {
                return false;
            }
            /* touch memory to force allocation */
            char data = 0;
            err = clEnqueueWriteBuffer(dev->queue, cl_ptr_list[devid], CL_TRUE, 0, 1, &data, 0, nullptr, nullptr);
            if (err != CL_SUCCESS) {
                // The buffer is unusable; release it so a later retry can re-create it.
                clReleaseMemObject(cl_ptr_list[devid]);
                cl_ptr_list[devid] = nullptr;
                return false;
            }
        }
        break;
    case W2XCONV_PROC_CUDA:
        devid = env->target_processor.devid;
        if (cuda_ptr_list[devid] == 0) {
            CUresult err;
            CUDADev *dev = &env->cuda_dev_list[devid];
            // Allocation must run with the device's context current.
            cuCtxPushCurrent(dev->context);
            err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
            CUcontext old;
            cuCtxPopCurrent(&old);
            if (err != CUDA_SUCCESS) {
                return false;
            }
        }
        break;
    }
    return true;
}
void GPUInterface::ResizeStreamCount(int newStreamCount) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); SAFE_CUDA(cuCtxSynchronize()); if (cudaStreams != NULL) { for(int i=0; i<numStreams; i++) { if (cudaStreams[i] != NULL) SAFE_CUDA(cuStreamDestroy(cudaStreams[i])); } free(cudaStreams); } if (cudaEvents != NULL) { for(int i=0; i<numStreams; i++) { if (cudaEvents[i] != NULL) SAFE_CUDA(cuEventDestroy(cudaEvents[i])); } free(cudaEvents); } if (newStreamCount == 1) { numStreams = 1; cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); cudaStreams[0] = NULL; CUevent event; for(int i=0; i<2; i++) { SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } } else { numStreams = newStreamCount; if (numStreams > BEAGLE_STREAM_COUNT) { numStreams = BEAGLE_STREAM_COUNT; } cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); CUstream stream; cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); CUevent event; for(int i=0; i<numStreams; i++) { SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT)); cudaStreams[i] = stream; SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[numStreams] = event; } SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::ResizeStreamCount\n"); #endif }
void ImageGL::unregisterAsCudaResource(int field_num) { cuCtxPushCurrent(oContext_); checkCudaErrors(cuGLUnregisterBufferObject(gl_pbo_[field_num])); bIsCudaResource_ = false; cuCtxPopCurrent(NULL); }
/*
 * Driver-API SAXPY smoke test: allocates device and pinned host buffers,
 * runs the "saxpy" kernel from saxpy.ptx over N random floats, and verifies
 * out[i] == alpha * in[i] within 1e-5. CUDA_CHECK presumably records the
 * error in `status` and jumps to Error on failure (defined elsewhere --
 * confirm), which is why the cleanup labels rely on the zero-initialized
 * pointers below being safe to free.
 */
CUresult TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha ) {
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    // Pinned host memory so the async copies below can be truly async.
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );

    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }

    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        // Kernel arguments are passed by address, per cuLaunchKernel.
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };

        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );
        // Fixed 1500x512 launch; the kernel is assumed to guard i < N.
        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );
    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    // All prior work is on the NULL stream; this makes hostOut visible.
    CUDA_CHECK( cuCtxSynchronize() );

    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );
Error:
    // Shared cleanup path for success and failure alike.
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
/* Allocate a single pinned host buffer large enough for all n+m data and
 * coding shards of size buf_size, inside the library's CUDA context.
 * With GIB_USE_MMAP the buffer is also mapped into the device address space.
 * *ld receives the leading dimension (== buf_size). Returns GIB_SUC;
 * ERROR_CHECK_FAIL handles CUDA failures. */
int gib_alloc ( void **buffers, int buf_size, int *ld, gib_context c ) {
    gpu_context gpu = (gpu_context)(c->acc_context);

    ERROR_CHECK_FAIL(cuCtxPushCurrent(gpu->pCtx));
#if GIB_USE_MMAP
    ERROR_CHECK_FAIL(cuMemHostAlloc(buffers, (c->n+c->m)*buf_size,
                                    CU_MEMHOSTALLOC_DEVICEMAP));
#else
    ERROR_CHECK_FAIL(cuMemAllocHost(buffers, (c->n+c->m)*buf_size));
#endif
    *ld = buf_size;
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&gpu->pCtx));
    return GIB_SUC;
}
void GPUInterface::LaunchKernel(GPUFunction deviceFunction, Dim3Int block, Dim3Int grid, int parameterCountV, int totalParameterCount, ...) { // parameters #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernel\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); SAFE_CUDA(cuFuncSetBlockShape(deviceFunction, block.x, block.y, block.z)); int offset = 0; va_list parameters; va_start(parameters, totalParameterCount); for(int i = 0; i < parameterCountV; i++) { void* param = (void*)(size_t)va_arg(parameters, GPUPtr); // adjust offset alignment requirements offset = (offset + __alignof(param) - 1) & ~(__alignof(param) - 1); SAFE_CUDA(cuParamSetv(deviceFunction, offset, ¶m, sizeof(param))); offset += sizeof(void*); } for(int i = parameterCountV; i < totalParameterCount; i++) { unsigned int param = va_arg(parameters, unsigned int); // adjust offset alignment requirements offset = (offset + __alignof(param) - 1) & ~(__alignof(param) - 1); SAFE_CUDA(cuParamSeti(deviceFunction, offset, param)); offset += sizeof(param); } va_end(parameters); SAFE_CUDA(cuParamSetSize(deviceFunction, offset)); SAFE_CUDA(cuLaunchGrid(deviceFunction, grid.x, grid.y)); SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::LaunchKernel\n"); #endif }
/* R binding for cuCtxPopCurrent(): pop the current CUDA context from the
 * calling thread's stack. Returns an external reference to the popped
 * CUcontext, or an error object if the driver call fails. */
SEXP R_cuCtxPopCurrent() {
    CUcontext popped;
    CUresult status = cuCtxPopCurrent(&popped);

    if (status)
        return R_cudaErrorInfo(status);

    return R_createRef(popped, "CUcontext");
}
/*
 * Finalizes the CUPTI events state for the CUDA context bound to the
 * calling host thread: removes the VampirTrace CUPTI context from the
 * tracking list, optionally stops capturing and destroys the event groups,
 * then frees the event context.
 * (NOTE(review): the previous comment documented a `ptid` parameter this
 * function does not take.)
 *
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cuptievt_finalize_device(uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI Events] Finalize device ... ");

  {
    CUcontext cuCtx;

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    /* CUDA < 4.0 has no cuCtxGetCurrent(): pop and immediately re-push to
       observe the current context without changing the stack. */
    VT_CUDRV_CALL(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    VT_CUDRV_CALL(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif

    vtcuptiCtx = vt_cupti_removeCtx(&cuCtx);
    if(vtcuptiCtx == NULL)
      return;
  }

  /* Nothing to finalize if no events were attached to this context. */
  if(vtcuptiCtx->events == NULL)
    return;

  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();
    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cuptievt_stop(vtcuptiCtx->events);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_evtgrp_t *vtcuptiGrp = vtcuptiCtx->events->vtGrpList;

      while(vtcuptiGrp != NULL){
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI event context */
  vt_cuptievt_freeEventCtx(vtcuptiCtx->events);
}
/*
 * Finalizes the CUPTI state for the CUDA context bound to the calling host
 * thread: takes the VampirTrace CUPTI context off the tracking list,
 * optionally stops capturing and destroys its event groups, then frees it.
 * CUDA tracing is suspended around the context query so that the driver
 * calls used to read the current context are not themselves traced.
 *
 * @param ptid VampirTrace process/thread id
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cupti_finalize_device(uint32_t ptid, uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI] Finalize device ... ");

  {
    CUcontext cuCtx = NULL;

    VT_SUSPEND_CUDA_TRACING(ptid);

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    /* CUDA < 4.0 has no cuCtxGetCurrent(): pop and immediately re-push to
       observe the current context without changing the stack. */
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif

    VT_RESUME_CUDA_TRACING(ptid);

    vtcuptiCtx = vt_cupti_takeCtxFromList(cuCtx);
    if(vtcuptiCtx == NULL)
      return;
  }

  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();
    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cupti_stop(vtcuptiCtx);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_grp_t *vtcuptiGrp = vtcuptiCtx->vtGrpList;

      while(vtcuptiGrp != NULL){
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI context */
  vt_cupti_freeCtx(vtcuptiCtx);
}
void GPUInterface::LaunchKernel(GPUFunction deviceFunction, Dim3Int block, Dim3Int grid, int parameterCountV, int totalParameterCount, ...) { // parameters #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernel\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); void** params; GPUPtr* paramPtrs; unsigned int* paramInts; params = (void**)malloc(sizeof(void*) * totalParameterCount); paramPtrs = (GPUPtr*)malloc(sizeof(GPUPtr) * totalParameterCount); paramInts = (unsigned int*)malloc(sizeof(unsigned int) * totalParameterCount); va_list parameters; va_start(parameters, totalParameterCount); for(int i = 0; i < parameterCountV; i++) { paramPtrs[i] = (GPUPtr)(size_t)va_arg(parameters, GPUPtr); params[i] = (void*)¶mPtrs[i]; } for(int i = parameterCountV; i < totalParameterCount; i++) { paramInts[i-parameterCountV] = va_arg(parameters, unsigned int); params[i] = (void*)¶mInts[i-parameterCountV]; } va_end(parameters); SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z, block.x, block.y, block.z, 0, cudaStreams[0], params, NULL)); free(params); free(paramPtrs); free(paramInts); SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::LaunchKernel\n"); #endif }
// Release all previously initd objects HRESULT CudaVideoRender::cleanup(bool bDestroyContext) { // Attach the CUDA Context (so we may properly free memroy) cutilDrvSafeCallNoSync( cuCtxPushCurrent(m_cuContext) ); if (m_pInteropFrame[0]) { cutilDrvSafeCallNoSync( cuMemFree(m_pInteropFrame[0]) ); } if (m_pInteropFrame[1]) { cutilDrvSafeCallNoSync( cuMemFree(m_pInteropFrame[1]) ); } // Detach from the Current thread cutilDrvSafeCallNoSync( cuCtxPopCurrent(NULL) ); terminateCudaVideo(bDestroyContext); return S_OK; }
// Ensure the D3D9/CUDA interop chain exists for an NV12 frame of size w x h
// (W x H padded): creates the shared CUDA context from the D3D9 device on
// first use, (re)creates the intermediate D3D9 texture/surface, and
// registers the texture as a CUDA graphics resource. Returns false on any
// D3D or CUDA failure (via DX_ENSURE/CUDA_ENSURE).
bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H) {
    TexRes &r = res[0];// 1 NV12 texture
    // Fast path: geometry unchanged and resource already registered.
    if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes)
        return true;
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        if (!ensureD3DDevice())
            return false;
        // CUdevice is different from decoder's
        CUDA_ENSURE(cuD3D9CtxCreate(&ctx, &dev, CU_CTX_SCHED_BLOCKING_SYNC, device9), false);
#if USE_STREAM
        CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
        CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
#endif //USE_STREAM
        qDebug("cuda contex on gl thread: %p", ctx);
        // Context creation leaves it current; detach it from this thread.
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    // Drop a stale registration before re-creating/re-registering.
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }
    // create d3d resource for interop
    if (!surface9_nv12) {
        // TODO: need pitch from cuvid to ensure cuMemcpy2D can copy the whole pitch
        // L8 texture of height H*3/2 stands in for NV12 (Y plane + packed UV).
        DX_ENSURE(device9->CreateTexture(W
                                         //, H
                                         , H*3/2
                                         , 1
                                         , D3DUSAGE_DYNAMIC //D3DUSAGE_DYNAMIC is lockable // 0 is from NV example. cudaD3D9.h says The primary rendertarget may not be registered with CUDA. So can not be D3DUSAGE_RENDERTARGET?
                                         //, D3DUSAGE_RENDERTARGET
                                         , D3DFMT_L8
                                         //, (D3DFORMAT)MAKEFOURCC('N','V','1','2') // can not create nv12. use 2 textures L8+A8L8?
                                         , D3DPOOL_DEFAULT // must be D3DPOOL_DEFAULT for cuda?
                                         , &texture9_nv12
                                         , NULL) // - Resources allocated as shared may not be registered with CUDA.
                  , false);
        DX_ENSURE(device9->CreateOffscreenPlainSurface(W, H, (D3DFORMAT)MAKEFOURCC('N','V','1','2'), D3DPOOL_DEFAULT, &surface9_nv12, NULL), false);
    }
    // TODO: cudaD3D9.h says NV12 is not supported
    // CUDA_ERROR_INVALID_HANDLE if register D3D9 surface
    // TODO: why flag CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD is invalid while it's fine for opengl
    CUDA_ENSURE(cuGraphicsD3D9RegisterResource(&r.cuRes, texture9_nv12, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    return true;
}
// Select the CUDA device for this interface: create a context on it, pick
// and load the kernel module matching the padded state count and precision,
// and finally pop the context so it is not left current on this thread.
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount,
                             int paddedPatternCount, long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    // Dynamic scaling additionally needs host-mapped memory (CU_CTX_MAP_HOST).
    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, cudaDevice));
    } else {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice));
    }

    if (kernelMap == NULL) {
        // kernels have not yet been initialized; do so now. Hopefully, this only occurs once per library load.
        InitializeKernelMap();
    }

    // Kernel lookup key: padded state count, negated for double precision.
    int id = paddedStateCount;
    if (flags & BEAGLE_FLAG_PRECISION_DOUBLE) {
        id *= -1;
    }

    if (kernelMap->count(id) == 0) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }

    kernelResource = (*kernelMap)[id].copy();
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    // Detach the context; other methods push it on demand.
    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving GPUInterface::SetDevice\n");
#endif
}
/* Filter-graph callback: scale one CUDA hardware frame.
 * Pushes the device's CUDA context around the actual scaling, fixes up the
 * output's sample aspect ratio, and forwards it downstream.
 * Ownership: `in` is always consumed (freed here on both paths); `out` is
 * handed to ff_filter_frame on success and freed on failure. */
static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
{
    AVFilterContext *ctx = link->dst;
    CUDAScaleContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)s->frames_ctx->data;
    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;

    AVFrame *out = NULL;
    CUresult err;
    CUcontext dummy;
    int ret = 0;

    out = av_frame_alloc();
    if (!out) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    /* The device context must be current for the CUDA calls inside
     * cudascale_scale(); popped again immediately after. */
    err = cuCtxPushCurrent(device_hwctx->cuda_ctx);
    if (err != CUDA_SUCCESS) {
        ret = AVERROR_UNKNOWN;
        goto fail;
    }

    ret = cudascale_scale(ctx, out, in);

    cuCtxPopCurrent(&dummy);
    if (ret < 0)
        goto fail;

    /* Preserve the display aspect ratio across the resolution change. */
    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
              INT_MAX);

    av_frame_free(&in);
    return ff_filter_frame(outlink, out);
fail:
    av_frame_free(&in);
    av_frame_free(&out);
    return ret;
}
/* Return the CUdevice a context was created on.
 * There is no direct driver query for this, so the context is temporarily
 * made current (if it is not already) just to call cuCtxGetDevice() --
 * hacky, and possibly subject to future change. */
static CUdevice get_device_from_ctx(CUcontext ctx) {
    CUcontext current = 0;
    CUdevice dev = 0;
    int need_switch;

    cuCtxGetCurrent(&current);
    need_switch = (current != ctx);

    if (need_switch)
        cuCtxPushCurrent(ctx);
    cuCtxGetDevice(&dev);
    if (need_switch)
        cuCtxPopCurrent(NULL);

    return dev;
}
static void *cuda_init(int ord, int flags, int *ret) { CUdevice dev; CUcontext ctx; cuda_context *res; static int init_done = 0; unsigned int fl = CU_CTX_SCHED_AUTO; if (ord == -1) { /* Grab the ambient context */ err = cuCtxGetCurrent(&ctx); CHKFAIL(NULL); res = cuda_make_ctx(ctx, DONTFREE); if (res == NULL) { FAIL(NULL, GA_IMPL_ERROR); } res->flags |= flags; return res; } if (!init_done) { err = cuInit(0); CHKFAIL(NULL); init_done = 1; } err = cuDeviceGet(&dev, ord); CHKFAIL(NULL); if (flags & GA_CTX_SINGLE_THREAD) fl = CU_CTX_SCHED_SPIN; if (flags & GA_CTX_MULTI_THREAD) fl = CU_CTX_SCHED_YIELD; err = cuCtxCreate(&ctx, fl, dev); CHKFAIL(NULL); res = cuda_make_ctx(ctx, 0); res->flags |= flags; if (res == NULL) { cuCtxDestroy(ctx); FAIL(NULL, GA_IMPL_ERROR); } /* Don't leave the context on the thread stack */ cuCtxPopCurrent(NULL); return res; }
/*
 * Retrieves the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread. Initiates context creation, if it is not available
 * yet.
 *
 * @param ptid the VampirTrace thread id of the calling host thread
 *
 * @return VampirTrace CUPTI context, or NULL if the calling thread has no
 *         CUDA context bound
 */
vt_cupti_ctx_t* vt_cuptievt_getOrCreateCurrentCtx(uint32_t ptid)
{
  CUcontext cuCtx = NULL;

  if(!vt_cuptievt_initialized) vt_cupti_events_init();

# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  /* CUDA < 4.0 has no cuCtxGetCurrent(): pop and immediately re-push to
     observe the current context without changing the stack. */
  VT_CUDRV_CALL(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  VT_CUDRV_CALL(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif

  if(cuCtx == NULL){
    vt_cntl_msg(2, "[CUPTI Events] No context is bound to the calling CPU thread!");
    return NULL;
  }

  return vt_cuptievt_getOrCreateCtx(cuCtx, ptid);
}
/*
 * JIT-compiles a PTX dump into a loaded CUmodule (GPU machine code) with
 * explicit JIT options (info-log buffer and a 32-register cap), printing
 * the JIT log. If `gpu` is given, its context is made current for the
 * compilation; otherwise the caller's current (default) context is used.
 * TODO: make the gpu argument mandatory.
 *
 * Fix: the original pushed a context only when gpu != 0 but ALWAYS popped,
 * so the gpu == 0 path popped the caller's ambient context off the thread's
 * context stack. The pop is now balanced with the push.
 */
CUmodule * CudaCompiler::compilePTX(uchar * KernelPTXDump, GPU * gpu=0) {
    const bool pushedCtx = (gpu != 0);
    if (pushedCtx)
        CUDCHK( cuCtxPushCurrent(gpu->context) );
    //CUDCHK( cuCtxSynchronize());

    // JIT options: log buffer size, log buffer pointer, max register count.
    const unsigned int jitNumOptions = 3;
    int jitLogBufferSize = 1024;
    int jitRegCount = 32;
    CUjit_option * jitOptions = new CUjit_option[jitNumOptions];
    void ** jitOptVals = new void*[jitNumOptions];
    char * jitLogBuffer = new char[jitLogBufferSize];

    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;// set up size of compilation log buffer
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;// set up pointer to the compilation log buffer
    jitOptions[2] = CU_JIT_MAX_REGISTERS; // set up pointer to set the Maximum # of registers for a particular kernel
    // Scalar option values are passed cast into the void* slot.
    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
    jitOptVals[1] = jitLogBuffer;
    jitOptVals[2] = (void *)(size_t)jitRegCount;

    // compile with set parameters
    qDebug()<<"> Compiling PTX module";
    CUmodule * cuModule = new CUmodule();
    CUDCHK( cuModuleLoadDataEx( cuModule, KernelPTXDump, jitNumOptions, jitOptions, (void **)jitOptVals) );
    qDebug()<< "PTX JIT log: \n [" << jitLogBuffer <<"]" ;

    delete [] jitOptions;
    delete [] jitOptVals;
    delete [] jitLogBuffer;

    // Only pop what we pushed (see header comment).
    if (pushedCtx)
        CUDCHK( cuCtxPopCurrent(0));
    return cuModule;
}
// Release all previously initd objects HRESULT cleanup(bool bDestroyContext) { // Attach the CUDA Context (so we may properly free memroy) if (bDestroyContext) { checkCudaErrors(cuCtxPushCurrent(g_oContext)); if (g_pInteropFrame[0]) { checkCudaErrors(cuMemFree(g_pInteropFrame[0])); } if (g_pInteropFrame[1]) { checkCudaErrors(cuMemFree(g_pInteropFrame[1])); } // Detach from the Current thread checkCudaErrors(cuCtxPopCurrent(NULL)); } if (g_pImageDX) { delete g_pImageDX; g_pImageDX = NULL; } freeCudaResources(bDestroyContext); // destroy the D3D device if (g_pD3DDevice) { g_pD3DDevice->Release(); g_pD3DDevice = NULL; } return S_OK; }
/* Make the CUDA context of PTX device N current on the calling host thread.
 * If the thread's current context already belongs to device N this is a
 * no-op; otherwise any existing current context is popped before pushing
 * device N's context. Fatal on unexpected driver errors. */
static void nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  /* CUDA_ERROR_INVALID_CONTEXT just means no context is current yet.  */
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed.  Pop it if we do though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
// Unmap the CUDA graphics resource previously mapped for GL texture `tex`,
// synchronizing its stream first. Returns false if `tex` matches neither
// plane's registered texture; returns true immediately when the
// context-switch workaround is active (nothing was mapped in that mode).
bool GLInteropResource::unmap(GLuint tex) {
    Q_UNUSED(tex);
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH)
        return true;
    // Find which NV12 plane this texture belongs to.
    int plane = -1;
    if (res[0].texture == tex)
        plane = 0;
    else if (res[1].texture == tex)
        plane = 1;
    else
        return false;
    // FIXME: why cuCtxPushCurrent gives CUDA_ERROR_INVALID_CONTEXT if opengl viewport changed?
    CUDA_WARN(cuCtxPushCurrent(ctx));
    CUDA_WARN(cuStreamSynchronize(res[plane].stream));
    // FIXME: need a correct context. But why we have to push context even though map/unmap are called in the same thread
    // Because the decoder switch the context in another thread so we have to switch the context back?
    // to workaround the context issue, we must pop the context that valid in map() and push it here
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    CUDA_ENSURE(cuCtxPopCurrent(&ctx), false);
    return true;
}
bool VideoDecoderCUDAPrivate::initCuda() { CUresult result = cuInit(0); if (result != CUDA_SUCCESS) { available = false; qWarning("cuInit(0) faile (%d)", result); return false; } cudev = GetMaxGflopsGraphicsDeviceId(); int clockRate; cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev); int major, minor; cuDeviceComputeCapability(&major, &minor, cudev); char devname[256]; cuDeviceGetName(devname, 256, cudev); description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000); //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO? CUcontext cuCurrent = NULL; result = cuCtxPopCurrent(&cuCurrent); if (result != CUDA_SUCCESS) { qWarning("cuCtxPopCurrent: %d\n", result); return false; } checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx)); { AutoCtxLock lock(this, vid_ctx_lock); Q_UNUSED(lock); //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING) checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT //require compute capability >= 1.1 //flag: Reserved for future use, must be 0 //cuStreamAddCallback(stream, CUstreamCallback, this, 0); } return true; }
// Return a writable device pointer on CUDA device `devid` for this buffer.
// All other copies are invalidated first; the CUDA copy on `devid` is then
// marked as the sole valid one and recorded as the last writer.
CUdeviceptr get_write_ptr_cuda(ComputeEnv *env, int devid) {
    invalidate(env);

    CUDADev *dev = &env->cuda_dev_list[devid];
    cuCtxPushCurrent(dev->context);

    // Lazily allocate the device buffer on first use.
    if (cuda_ptr_list[devid] == 0) {
        if (cuMemAlloc(&cuda_ptr_list[devid], byte_size) != CUDA_SUCCESS) {
            abort();
        }
    }

    last_write.type = Processor::CUDA;
    last_write.devid = devid;
    cuda_valid_list[devid] = true;

    CUcontext prev;
    cuCtxPopCurrent(&prev);
    return cuda_ptr_list[devid];
}
/*
 * Returns the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread, or NULL if the thread has no CUDA context bound.
 * CUDA tracing is suspended around the driver calls used to read the
 * current context so they are not themselves traced.
 *
 * Fix: the "no context" message passed `cuCtx` as an extra vararg although
 * the format string contains no conversion specifier; the stray argument
 * was removed (matching the events variant of this function).
 *
 * @param ptid the VampirTrace thread id of the calling host thread
 */
vt_cupti_ctx_t* vt_cupti_getCurrentContext(uint32_t ptid)
{
  CUcontext cuCtx = NULL;

  if(!vt_cupti_initialized) vt_cupti_init();

  VT_SUSPEND_CUDA_TRACING(ptid);

# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  /* CUDA < 4.0 has no cuCtxGetCurrent(): pop and immediately re-push to
     observe the current context without changing the stack. */
  CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif

  VT_RESUME_CUDA_TRACING(ptid);

  if(cuCtx == NULL)
  {
    vt_cntl_msg(2, "[CUPTI] No context is bound to the calling CPU thread");
    return NULL;
  }

  return vt_cupti_getCtx(cuCtx, ptid);
}
// Scope-exit cleanup: detach the CUDA context from this thread, then
// release the context handle back to the Halide runtime.
~CudaContext() {
    CUcontext detached;
    cuCtxPopCurrent(&detached);
    halide_release_cuda_context(user_context);
}