// Releases the CUDA state tracked by this file's globals: timing events,
// the loaded module and, when owned, the context itself.
WEAK void halide_release(void *user_context) {
    // Nothing to do when no context pointer was ever set, e.g. when
    // halide_release runs without any CUDA call having been made.
    if (cuda_ctx_ptr == NULL) {
        return;
    }

    // This can be reached from the destructor of a static variable, at
    // which point the driver may already be shutting down; hence the
    // *_DEINIT_OK flavour of the check macro below.
    CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Timing events.
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Loaded module.
    if (__mod) {
        CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    // The context is destroyed last, and only when it is one we own.
    if (weak_cuda_ctx) {
        CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }

    cuda_ctx_ptr = NULL;
}
// Releases the CUDA state tracked by this file's globals.
//
// Teardown order matters: cuCtxDestroy cleans up every resource associated
// with the context, so the events and the module are released first.
// Destroying the owned context before them (as this function used to do)
// leaves cuEventDestroy / cuModuleUnload operating on stale handles.
WEAK void halide_release() {
    // TODO: this is for timing; bad for release-mode performance
    CHECK_CALL( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    // Only destroy the context if we own it; this must come last.
    if (weak_cuda_ctx) {
        CHECK_CALL( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }
}
bool VideoDecoderCUDAPrivate::releaseCuda() { available = false; if (!can_load) return true; if (dec) { cuvidDestroyDecoder(dec); dec = 0; } if (parser) { cuvidDestroyVideoParser(parser); parser = 0; } if (stream) { cuStreamDestroy(stream); stream = 0; } if (host_data) { cuMemFreeHost(host_data); host_data = 0; host_data_size = 0; } if (vid_ctx_lock) { cuvidCtxLockDestroy(vid_ctx_lock); vid_ctx_lock = 0; } if (cuctx) { checkCudaErrors(cuCtxDestroy(cuctx)); } // TODO: dllapi unload return true; }
/*
 * Creates a CUDA context on `dev` and wraps it with cuda_make_ctx().
 *
 * `flags` selects the scheduling policy (GA_CTX_MULTI_THREAD takes
 * precedence over GA_CTX_SINGLE_THREAD) and is OR-ed into the flags of the
 * returned context. Failures are reported through the CHKFAIL / FAIL
 * macros with NULL as the return value.
 */
static void *do_init(CUdevice dev, int flags, int *ret) {
  cuda_context *wrapped;
  CUcontext raw_ctx;
  unsigned int sched = CU_CTX_SCHED_AUTO;
  int unified;

  /* Checks the error state that is current on entry. */
  CHKFAIL(NULL);

  if (flags & GA_CTX_MULTI_THREAD) {
    sched = CU_CTX_SCHED_YIELD;
  } else if (flags & GA_CTX_SINGLE_THREAD) {
    sched = CU_CTX_SCHED_SPIN;
  }

  /* Devices without unified addressing are rejected. */
  err = cuDeviceGetAttribute(&unified, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
                             dev);
  CHKFAIL(NULL);
  if (unified != 1)
    FAIL(NULL, GA_UNSUPPORTED_ERROR);

  err = cuCtxCreate(&raw_ctx, sched, dev);
  CHKFAIL(NULL);

  wrapped = cuda_make_ctx(raw_ctx, 0);
  if (wrapped == NULL) {
    cuCtxDestroy(raw_ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }
  wrapped->flags |= flags;

  /* Don't leave the context on the thread stack */
  cuCtxPopCurrent(NULL);
  return wrapped;
}
// Destructor: calls DeallocateResources(), then unloads the CUDA module and
// destroys the CUDA context, in that order.
CUDARunner::~CUDARunner()
{
    DeallocateResources();

    // The return codes of these two driver calls are not inspected.
    cuModuleUnload(m_module);
    cuCtxDestroy(m_context);
}
// Destructor: closes the output file (if one was in use), tears down the
// NVENC session objects, releases the CUDA stream and buffers, destroys the
// CUDA context and finally frees the NVENC library handle.
~CNvidiaNvencCodec()
{
    // The output file is closed only when saving to a file was enabled.
    if (m_NvidiaNvencCodecContext.GetSaveOutputToFile())
    {
        fclose(m_pOutputFile);
    }

#ifdef ASYNCHRONOUS
    CHECK_NVENC_STATUS(m_FunctionList.nvEncUnregisterAsyncEvent(m_pEncoder, &m_EventParameters));
#endif

    // Encoder-side resources are released before the encoder itself.
    CHECK_NVENC_STATUS(m_FunctionList.nvEncUnmapInputResource(m_pEncoder, m_PictureParameters.inputBuffer));
    CHECK_NVENC_STATUS(m_FunctionList.nvEncUnregisterResource(m_pEncoder, m_pRegisteredResource));
    CHECK_NVENC_STATUS(m_FunctionList.nvEncDestroyBitstreamBuffer(m_pEncoder, m_pOutputBitstream));
    CHECK_NVENC_STATUS(m_FunctionList.nvEncDestroyEncoder(m_pEncoder));

    CHECK_CUDA_DRV_STATUS(cuStreamDestroy(m_Stream));

    // NOTE(review): the buffers below are named "PageLocked" but are released
    // with cudaFree; if they are pinned host allocations, cudaFreeHost would
    // be the matching call -- confirm against the allocation site.
    if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda())
    {
        cudaFree(m_pPageLockedNv12Buffer);
    }
    else
    {
        cudaFree(m_pPageLockedRgb32Buffer);
        CHECK_CUDA_DRV_STATUS(cuMemFree(m_pRgb32Buffer));
    }

    CHECK_CUDA_DRV_STATUS(cuMemFree(m_pNv12Buffer));

    // The context goes after all memory and streams; the library handle last.
    CHECK_CUDA_DRV_STATUS(cuCtxDestroy(m_Context));
    FreeModule(m_hNvEncodeAPI64);
}
// Destructor. When the encoder is alive, releases all input/output surfaces,
// the encoder session and the CUDA context; in every case it then clears the
// surface queues, frees the surface arrays, closes the frame mutex and
// frees pstart.
NVENCEncoder::~NVENCEncoder()
{
    if (alive)
    {
        for (int i = 0; i < maxSurfaceCount; ++i)
        {
            // An input buffer that is still locked is unlocked before it is destroyed.
            if (inputSurfaces[i].locked)
                pNvEnc->nvEncUnlockInputBuffer(encoder, inputSurfaces[i].inputSurface);

            pNvEnc->nvEncDestroyInputBuffer(encoder, inputSurfaces[i].inputSurface);
            pNvEnc->nvEncDestroyBitstreamBuffer(encoder, outputSurfaces[i].outputSurface);
        }

        // Encoder first, then the CUDA context.
        pNvEnc->nvEncDestroyEncoder(encoder);
        cuCtxDestroy(cuContext);

        NvLog(TEXT("Encoder closed"));
    }

    // Replace both queues with empty ones.
    outputSurfaceQueueReady = std::queue<NVENCEncoderOutputSurface*>();
    outputSurfaceQueue = std::queue<NVENCEncoderOutputSurface*>();

    delete[] inputSurfaces;
    delete[] outputSurfaces;

    // The reference count is only dropped for an encoder that was alive.
    if (alive)
        encoderRefDec();

    OSCloseMutex(frameMutex);

    delete[] pstart;
}
/*
 * Initialises the driver API, creates a context on device 0 (stored in
 * *pctx) and loads the module file `f` into *pmod.
 *
 * Returns CUDA_SUCCESS, or the CUresult of the first failing call after
 * printing a message. If only the module load fails, the freshly created
 * context is destroyed again before returning.
 */
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
    CUdevice device;
    CUresult status = cuInit(0);

    if (status != CUDA_SUCCESS) {
        printf("cuInit failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    status = cuDeviceGet(&device, 0);
    if (status != CUDA_SUCCESS) {
        printf("cuDeviceGet failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    status = cuCtxCreate(pctx, 0, device);
    if (status != CUDA_SUCCESS) {
        printf("cuCtxCreate failed: res = %lu\n", (unsigned long)status);
        return status;
    }

    status = cuModuleLoad(pmod, f);
    if (status == CUDA_SUCCESS) {
        return CUDA_SUCCESS;
    }

    /* Module load failed: undo the context creation. */
    printf("cuModuleLoad() failed\n");
    cuCtxDestroy(*pctx);
    return status;
}
void clean_cuda(void) { CUresult res; for (int i = 0; i < device_num; i++) { res = cuModuleUnload(module[i]); CUDA_CHECK(res, "cuModuleUnload()"); } for (int i = 0; i < device_num; i++) { res = cuCtxDestroy(ctx[i]); CUDA_CHECK(res, "cuCtxDestroy()"); } free(NR_MAXTHREADS_X); free(NR_MAXTHREADS_Y); free(ConvolutionKernel_func); free(DistanceTransformTwoDimensionalProblem_func); free(BilinearKernelTex32F_func); free(calculateHistogram_func); free(getFeatureMaps_func); free(calculateNorm_func); free(normalizeAndTruncate_func); free(PCAFeatureMapsAddNullableBorder_func); free(module); free(dev); free(ctx); }
// Releases the CUDA state tracked by this file's globals.
//
// Teardown order matters: cuCtxDestroy cleans up every resource associated
// with the context, so the events and the module are released first.
// Destroying the owned context before them (as this function used to do)
// leaves cuEventDestroy / cuModuleUnload operating on stale handles.
WEAK void halide_release() {
    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down. For this reason we allow the deinitialized
    // error.
    CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    // Only destroy the context if we own it; this must come last.
    if (weak_cuda_ctx) {
        CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }
}
int cleanup_accel (void) { #ifdef _ENABLE_CUDA_ CUresult curesult = CUDA_SUCCESS; #endif switch (options.accel) { #ifdef _ENABLE_CUDA_ case cuda: curesult = cuCtxDestroy(cuContext); if (curesult != CUDA_SUCCESS) { return 1; } break; #endif #ifdef _ENABLE_OPENACC_ case openacc: acc_shutdown(acc_device_not_host); break; #endif default: fprintf(stderr, "Invalid accel type, should be cuda or openacc\n"); return 1; } return 0; }
/*
 * Drops one reference on `ctx`. When the count reaches zero the context's
 * BLAS state, error buffer, stream, cached free blocks, CUDA context
 * (unless flagged DONTFREE) and extcopy cache are released and the
 * structure itself is freed.
 */
static void cuda_free_ctx(cuda_context *ctx) {
  /* Start from NULL so a failed property lookup can be detected below. */
  gpuarray_blas_ops *blas_ops = NULL;
  gpudata *next, *curr;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    assert(ctx->enter == 0 && "Context was active when freed!");
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
      /* Only tear down when the lookup actually produced an ops table;
         previously an unset pointer would have been dereferenced. */
      if (blas_ops != NULL)
        blas_ops->teardown(ctx);
    }
    cuMemFreeHost((void *)ctx->errbuf->ptr);
    deallocate(ctx->errbuf);
    cuStreamDestroy(ctx->s);

    /* Clear out the freelist */
    for (curr = ctx->freeblocks; curr != NULL; curr = next) {
      /* Grab the successor before the node is released. */
      next = curr->next;
      cuMemFree(curr->ptr);
      deallocate(curr);
    }

    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_destroy(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
// Destroys the CUDA context held by this device and deletes the device's
// backend data object.
void device_t<CUDA>::free(){
  // NOTE(review): `data_` is not declared in this function; presumably
  // OCCA_EXTRACT_DATA introduces it from `data` -- confirm in the macro.
  OCCA_EXTRACT_DATA(CUDA, Device);

  OCCA_CUDA_CHECK("Device: Freeing Context",
                  cuCtxDestroy(data_.context) );

  delete (CUDADeviceData_t*) data;
}
/* Destroys the CUDA context stored in `enc`. The result is handed to
 * CHK_CUDA; when control reaches the end the function returns 0. */
int fini_cuda(NvEncoder*enc)
{
    CUresult cuResult;

    cuResult=cuCtxDestroy(enc->cuContext);
    /* NOTE(review): what CHK_CUDA does on failure is not visible here. */
    CHK_CUDA(cuResult,"cuda Context Destroy");
    return 0;
}
/* Destroys the context returned by cuCtxGetCurrent for the calling thread.
 * `args` is unused. Returns CUDA_SUCCESS when both driver calls pass
 * CU_ERROR_CHECK. */
static CUresult destroyContext(const void * args) {
  (void)args;  /* silence the unused-parameter warning */

  CUcontext context;
  CU_ERROR_CHECK(cuCtxGetCurrent(&context));
  CU_ERROR_CHECK(cuCtxDestroy(context));

  return CUDA_SUCCESS;
}
/** * This measures the overhead in launching a kernel function on each GPU in the * system. * * It does this by executing a small kernel (copying 1 value in global memory) a * very large number of times and taking the average execution time. This * program uses the CUDA driver API. */ int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); float x = 5.0f; for (int d = 0; d < count; d++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); CUdeviceptr in, out; CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float))); CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float))); CUmodule module; CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes)); CUfunction function; CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel")); void * params[] = { &in, &out }; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, 0)); CU_ERROR_CHECK(cuEventCreate(&stop, 0)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (int i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuMemFree(in)); CU_ERROR_CHECK(cuMemFree(out)); fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS)); CU_ERROR_CHECK(cuModuleUnload(module)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
/*
 * NIF entry point: destroys the CUDA context held by the resource passed as
 * the single argument. Returns badarg for a wrong argument count or a term
 * that is not a context resource; otherwise returns ATOM_OK. A resource
 * already marked destroyed is left untouched.
 */
ERL_NIF_TERM pteracuda_nifs_destroy_context(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctx_ref;

    if (argc != 1) {
        return enif_make_badarg(env);
    }
    if (!enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctx_ref)) {
        return enif_make_badarg(env);
    }

    /* Already torn down: nothing left to do. */
    if (ctx_ref->destroyed) {
        return ATOM_OK;
    }

    cuCtxDestroy(ctx_ref->ctx);
    ctx_ref->destroyed = true;
    return ATOM_OK;
}
/* Releases the CUDA-specific state of `device`: destroys the CUDA context
 * stored in the device data, frees that data and the device's long name. */
void
pocl_cuda_uninit (cl_device_id device)
{
  pocl_cuda_device_data_t *data = device->data;

  /* The result of cuCtxDestroy is not checked. */
  cuCtxDestroy (data->context);

  POCL_MEM_FREE (data);
  /* `data` above is a local copy, so the field is cleared explicitly. */
  device->data = NULL;

  POCL_MEM_FREE (device->long_name);
}
int main(int argc, char *argv[]) { // Read command line files // Init cuda CUcontext cuContext; initCuda(cuContext); // Image buffer allocation unsigned int depth =4; unsigned int width=960; unsigned int height=1080; size_t bufferSize = sizeof(unsigned char)*width*height*depth; unsigned char *imgRight = (unsigned char*)malloc(bufferSize); unsigned char *imgLeft= (unsigned char*)malloc(bufferSize); // Read the list of test files std::vector<std::string> testsFiles; readTestsFiles(testsFiles, "./tests.txt"); // Launch tests for(int i=0; i<testsFiles.size(); i++) { std::stringstream testImage1; testImage1 << "./" << testsFiles[i] << "_1.dat"; std::stringstream testImage2; testImage2 << "./" << testsFiles[i] << "_2.dat"; readTestImage( testImage1.str(), imgRight, bufferSize); readTestImage( testImage2.str(), imgLeft, bufferSize); VertexBufferObject rightPoints; VertexBufferObject leftPoints; DescriptorData rightDescriptors; DescriptorData leftDescriptors; computeDescriptorsLane(imgRight, depth, width, height, rightPoints, rightDescriptors); computeDescriptorsLane(imgLeft, depth, width, height, leftPoints, leftDescriptors); UInt2 imgSize(1920,1080); vector<CvPoint2D32f> leftMatchedPts; vector<CvPoint2D32f> rightMatchedPts; leftMatchedPts.reserve(10000); rightMatchedPts.reserve(10000); computeMatching( leftDescriptors, rightDescriptors, leftMatchedPts, rightMatchedPts, imgSize); } // finalize cuda CUresult cerr = cuCtxDestroy(cuContext); checkError(cerr); return 0; }
/*
 * Destroys the global CUDA context when CUDA support is compiled in.
 * Returns 1 if cuCtxDestroy fails and 0 otherwise (including builds
 * without _ENABLE_CUDA_, where nothing is done).
 */
int destroy_cuda_context (void)
{
#ifdef _ENABLE_CUDA_
    if (cuCtxDestroy(cuContext) != CUDA_SUCCESS) {
        return 1;
    }
#endif

    return 0;
}
int main(int argc, char *argv[]) { char c; CUcontext ctx; CUdevice dev = 0; void *toSpace; int status, free, total; CUdeviceptr ptr = (CUdeviceptr)NULL; int size; if(argc != 2){ fprintf(stderr,"Usage: mem_alloc.exe [MEMORY TO ALLOCATE IN MB]\n"); exit(1); } printf("All status results should be 0, if not an error has occured.\nIf 2 is reported an out of memory error has occured for\nwhich you should decrease the memory input\n"); size = atoi(argv[1]); printf("\nTrying to allocate %iMB of memory on host and GPU\n",size); if(size <= 0){ fprintf(stderr,"\nERROR: Memory must be greater than 0\n"); exit(1); } status = cuInit(0); printf("Init status: %i\n",status); status = cuCtxCreate(&ctx, 0, dev); printf("Context creation status: %i\n",status); cuMemGetInfo(&free, &total); printf("Get memory info status: %i\n",status); printf("\n%.1f/%.1f (Free/Total) MB\n", free/1024.0/1024.0, total/1024.0/1024.0); status = cuMemHostAlloc(&toSpace, size*1024*1024, 0); printf("Host allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED"); status = cuMemAlloc(&ptr, size*1024*1024); printf("GPU allocation status: %i %s\n",status, (status==CUDA_SUCCESS) ? "SUCCESS" : "FAILED"); printf("\nPress any key to exit..."); scanf("%c", &c); status = cuCtxDestroy(ctx); printf("Context destroy status: %i\n",status); return 0; }
/*
 * Destroys the CUDA context stored in `device_info`.
 * Returns 0 on success; prints the CUresult and returns -1 on failure.
 */
int mcopy_gpu_close(struct device_info *device_info)
{
    CUresult status = cuCtxDestroy(device_info->context);

    if (status == CUDA_SUCCESS) {
        return 0;
    }

    printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)status);
    return -1;
}
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }
/*
 * Unloads the CUDA module and then destroys the CUDA context.
 *
 * Each driver result is stored in `res` (declared outside this function).
 * On any failure a message is printed and the process exits with status 1.
 */
void clean_cuda(void){

  /* The module is unloaded before its context is destroyed. */
  res = cuModuleUnload(module);
  if(res != CUDA_SUCCESS){
    printf("cuModuleUnload failed: res = %s\n", conv(res));
    exit(1);
  }

  res = cuCtxDestroy(ctx);
  if(res != CUDA_SUCCESS){
    printf("cuCtxDestroy failed: res = %s\n", conv(res));
    exit(1);
  }

}
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    reinit
 * Signature: (IIJ)V
 *
 * Frees every host and device buffer held in this file's globals, destroys
 * the CUDA context, and then calls initDevice() with the arguments it was
 * given. None of the driver return codes are checked.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_reinit
  (JNIEnv * env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block, jlong free_space)
{
  /* Buffers released with cuMemFreeHost are host-side, those released with
     cuMemFree are device-side. */
  cuMemFreeHost(toSpace);
  cuMemFree(gpuToSpace);
  cuMemFree(gpuClassMemory);
  cuMemFreeHost(handlesMemory);
  cuMemFree(gpuHandlesMemory);
  cuMemFreeHost(exceptionsMemory);
  cuMemFree(gpuExceptionsMemory);
  cuMemFree(gcInfoSpace);
  cuMemFree(gpuHeapEndPtr);
  cuMemFree(gpuBufferSize);

  /* All buffers are gone; drop the context and start over. */
  cuCtxDestroy(cuContext);
  initDevice(env, this_ref, max_blocks_per_proc, max_threads_per_block, free_space);
}
/*
 * Tears down the context `c`: runs the CPU-side destroy, then frees the
 * device buffers (unless GIB_USE_MMAP), unloads the module and destroys the
 * CUDA context. Returns GIB_SUC when control reaches the end; exits the
 * process if gib_cpu_destroy does not return GIB_SUC.
 */
int
gib_destroy ( gib_context c )
{
  /* TODO:  Make sure everything created in gib_init is destroyed
     here. */
  /* Push this context's CUDA context before the driver calls below. */
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));

  int rc_i = gib_cpu_destroy(c);
  if (rc_i != GIB_SUC) {
    printf("gib_cpu_destroy returned %i\n", rc_i);
    exit(EXIT_FAILURE);
  }

  /* NOTE(review): c->acc_context is read here, after gib_cpu_destroy(c);
     confirm that call leaves `c` and its acc_context valid. */
  gpu_context gpu_c = (gpu_context) c->acc_context;
#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemFree(gpu_c->buffers));
#endif
  ERROR_CHECK_FAIL(cuModuleUnload(gpu_c->module));
  ERROR_CHECK_FAIL(cuCtxDestroy(gpu_c->pCtx));
  return GIB_SUC;
}
void CudaVideoRender::terminateCudaVideo(bool bDestroyContext) { if (m_pVideoParser) delete m_pVideoParser; if (m_pVideoDecoder) delete m_pVideoDecoder; if (m_pVideoSource) delete m_pVideoSource; if (m_pFrameQueue) delete m_pFrameQueue; if (m_CtxLock) { cutilDrvSafeCallNoSync( cuvidCtxLockDestroy(m_CtxLock) ); } if (m_cuContext && bDestroyContext) { cutilDrvSafeCallNoSync( cuCtxDestroy(m_cuContext) ); m_cuContext = NULL; } if (m_ReadbackSID) cuStreamDestroy(m_ReadbackSID); if (m_KernelSID) cuStreamDestroy(m_KernelSID); }
/*
 * Drops one reference on `ctx`. When the count reaches zero the context's
 * BLAS state, stream, CUDA context (unless flagged DONTFREE) and extcopy
 * cache are released and the structure itself is freed.
 */
static void cuda_free_ctx(cuda_context *ctx) {
  /* Start from NULL so a failed property lookup can be detected below. */
  gpuarray_blas_ops *blas_ops = NULL;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
      /* Only tear down when the lookup actually produced an ops table;
         previously an unset pointer would have been dereferenced. */
      if (blas_ops != NULL)
        blas_ops->teardown(ctx);
    }
    cuStreamDestroy(ctx->s);
    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_free(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
/*
 * Unloads `mod` and destroys `ctx`.
 *
 * Both steps are always attempted: previously a failed module unload
 * returned early and left the context alive. Each failure is printed.
 * Returns CUDA_SUCCESS when both calls succeed, otherwise the CUresult of
 * the first call that failed.
 */
CUresult cuda_driver_api_exit(CUcontext ctx, CUmodule mod)
{
    CUresult mod_res;
    CUresult ctx_res;

    mod_res = cuModuleUnload(mod);
    if (mod_res != CUDA_SUCCESS) {
        printf("cuModuleUnload failed: res = %lu\n", (unsigned long)mod_res);
    }

    /* Destroy the context even if the unload failed, so it is not leaked. */
    ctx_res = cuCtxDestroy(ctx);
    if (ctx_res != CUDA_SUCCESS) {
        printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)ctx_res);
    }

    return (mod_res != CUDA_SUCCESS) ? mod_res : ctx_res;
}
/*
 * Unloads the module and destroys the context stored in `device_info`.
 *
 * Both steps are always attempted: previously a failed module unload
 * returned early and left the context alive. Each failure is printed.
 * Returns 0 when both calls succeed and -1 if either one fails.
 */
int mmult_gpu_close(struct device_info *device_info)
{
    CUresult res;
    int rc = 0;

    res = cuModuleUnload(device_info->module);
    if (res != CUDA_SUCCESS) {
        printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
        rc = -1;
    }

    /* Destroy the context even if the unload failed, so it is not leaked. */
    res = cuCtxDestroy(device_info->context);
    if (res != CUDA_SUCCESS) {
        printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
        rc = -1;
    }

    return rc;
}