/*
 * Wrap a CUDA driver context handle in a newly allocated cuda_context.
 *
 * ctx   - driver context handle, stored in the new object.
 * flags - stored verbatim in the new object's flags field.
 *
 * Returns the new object (as void *) or NULL on failure.  Each failure
 * path releases whatever was acquired before it, in reverse order of
 * acquisition, by falling through the labels at the end.
 *
 * NOTE(review): `err` is not declared in this function; presumably it is a
 * file-scope CUresult holding the last driver error -- confirm.
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  /* Byte size of the host allocation that backs the error buffer. */
  enum { ERRBUF_BYTES = 16 };
  cuda_context *self;
  void *hostbuf;

  self = malloc(sizeof(*self));
  if (!self)
    return NULL;

  self->ctx = ctx;
  self->err = CUDA_SUCCESS;
  self->blas_handle = NULL;
  self->refcnt = 1;
  self->flags = flags;
  self->enter = 0;
  self->freeblocks = NULL;

  if (detect_arch(ARCH_PREFIX, self->bin_id, &err) != 0)
    goto release_ctx;

  self->extcopy_cache = cache_lru(64, 32,
                                  (cache_eq_fn)extcopy_eq,
                                  (cache_hash_fn)extcopy_hash,
                                  (cache_freek_fn)extcopy_free,
                                  (cache_freev_fn)cuda_freekernel);
  if (!self->extcopy_cache)
    goto release_ctx;

  err = cuStreamCreate(&self->s, 0);
  if (err != CUDA_SUCCESS)
    goto release_cache;

  err = cuStreamCreate(&self->mem_s, CU_STREAM_NON_BLOCKING);
  if (err != CUDA_SUCCESS)
    goto release_stream;

  err = cuMemAllocHost(&hostbuf, ERRBUF_BYTES);
  if (err != CUDA_SUCCESS)
    goto release_mem_stream;
  memset(hostbuf, 0, ERRBUF_BYTES);

  /* The context must be tagged before it is handed to new_gpudata. */
  TAG_CTX(self);
  self->errbuf = new_gpudata(self, (CUdeviceptr)hostbuf, ERRBUF_BYTES);
  if (!self->errbuf) {
    err = self->err;
    goto release_hostbuf;
  }
  self->errbuf->flags |= CUDA_MAPPED_PTR;
  return self;

  /* Unwind: each label undoes one acquisition and falls into the next. */
release_hostbuf:
  cuMemFreeHost(hostbuf);
release_mem_stream:
  cuStreamDestroy(self->mem_s);
release_stream:
  cuStreamDestroy(self->s);
release_cache:
  cache_destroy(self->extcopy_cache);
release_ctx:
  free(self);
  return NULL;
}
// Releases the interop resources held in both res[] slots: first the
// registered graphics resources, then the streams. Failures are only
// reported through CUDA_WARN.
InteropResource::~InteropResource()
{
    // Pushing the context here was disabled by the original author, who
    // noted that it failed with "invalid value".
    //CUDA_WARN(cuCtxPushCurrent(ctx));

    // Graphics resources first (slot 0, then slot 1).
    if (res[0].cuRes) {
        CUDA_WARN(cuGraphicsUnregisterResource(res[0].cuRes));
    }
    if (res[1].cuRes) {
        CUDA_WARN(cuGraphicsUnregisterResource(res[1].cuRes));
    }

    // Then the per-slot streams.
    if (res[0].stream) {
        CUDA_WARN(cuStreamDestroy(res[0].stream));
    }
    if (res[1].stream) {
        CUDA_WARN(cuStreamDestroy(res[1].stream));
    }

    // FIXME (from the original author): this object owns the context, but
    // destroying it crashed with CUDA_ERROR_INVALID_VALUE, so the call
    // stays disabled and the context is not destroyed here.
    //CUDA_ENSURE(cuCtxDestroy(ctx));
}
bool VideoDecoderCUDAPrivate::releaseCuda() { available = false; if (!can_load) return true; if (dec) { cuvidDestroyDecoder(dec); dec = 0; } if (parser) { cuvidDestroyVideoParser(parser); parser = 0; } if (stream) { cuStreamDestroy(stream); stream = 0; } if (host_data) { cuMemFreeHost(host_data); host_data = 0; host_data_size = 0; } if (vid_ctx_lock) { cuvidCtxLockDestroy(vid_ctx_lock); vid_ctx_lock = 0; } if (cuctx) { checkCudaErrors(cuCtxDestroy(cuctx)); } // TODO: dllapi unload return true; }
static bool fini_streams_for_device (struct ptx_device *ptx_dev) { free (ptx_dev->async_streams.arr); bool ret = true; while (ptx_dev->active_streams != NULL) { struct ptx_stream *s = ptx_dev->active_streams; ptx_dev->active_streams = ptx_dev->active_streams->next; ret &= map_fini (s); CUresult r = cuStreamDestroy (s->stream); if (r != CUDA_SUCCESS) { GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r)); ret = false; } free (s); } ret &= map_fini (ptx_dev->null_stream); free (ptx_dev->null_stream); return ret; }
// Tears down the encoder session and every resource this object holds:
// output file, NVENC registrations/buffers, the encoder itself, the CUDA
// stream, the staging buffers and finally the CUDA context and the NVENC
// library module.
~CNvidiaNvencCodec()
{
    // fclose() on a null FILE* is undefined behaviour, so only close a file
    // that is actually open, and clear the handle afterwards.
    if (m_NvidiaNvencCodecContext.GetSaveOutputToFile() && m_pOutputFile)
    {
        fclose(m_pOutputFile);
        m_pOutputFile = NULL;
    }

#ifdef ASYNCHRONOUS
    CHECK_NVENC_STATUS(m_FunctionList.nvEncUnregisterAsyncEvent(m_pEncoder, &m_EventParameters));
#endif

    // NVENC objects must be released before the encoder is destroyed.
    CHECK_NVENC_STATUS(m_FunctionList.nvEncUnmapInputResource(m_pEncoder, m_PictureParameters.inputBuffer));
    CHECK_NVENC_STATUS(m_FunctionList.nvEncUnregisterResource(m_pEncoder, m_pRegisteredResource));
    CHECK_NVENC_STATUS(m_FunctionList.nvEncDestroyBitstreamBuffer(m_pEncoder, m_pOutputBitstream));
    CHECK_NVENC_STATUS(m_FunctionList.nvEncDestroyEncoder(m_pEncoder));

    CHECK_CUDA_DRV_STATUS(cuStreamDestroy(m_Stream));

    // NOTE(review): the names suggest these are page-locked host buffers; if
    // they were allocated with cudaMallocHost/cudaHostAlloc they should be
    // released with cudaFreeHost instead of cudaFree -- confirm against the
    // allocation site, which is not visible here.
    if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda())
    {
        cudaFree(m_pPageLockedNv12Buffer);
    }
    else
    {
        cudaFree(m_pPageLockedRgb32Buffer);
        CHECK_CUDA_DRV_STATUS(cuMemFree(m_pRgb32Buffer));
    }

    CHECK_CUDA_DRV_STATUS(cuMemFree(m_pNv12Buffer));
    CHECK_CUDA_DRV_STATUS(cuCtxDestroy(m_Context));
    FreeModule(m_hNvEncodeAPI64);
}
/*
 * Drop one reference to ctx and destroy it once the count reaches zero.
 *
 * Destruction tears down the BLAS state (if any), frees the error buffer,
 * destroys both streams, releases every block on the free list, destroys
 * the driver context unless the DONTFREE flag is set, and finally frees
 * the cache and the object itself.
 */
static void cuda_free_ctx(cuda_context *ctx) {
  /* Start from NULL so a failed property lookup cannot leave us with an
     uninitialised pointer to call through. */
  gpuarray_blas_ops *blas_ops = NULL;
  gpudata *next, *curr;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt != 0)
    return;

  assert(ctx->enter == 0 && "Context was active when freed!");

  if (ctx->blas_handle != NULL) {
    ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
    /* Only tear down when the lookup actually produced an ops table. */
    if (blas_ops != NULL)
      blas_ops->teardown(ctx);
  }

  /* The error buffer wraps host memory from cuMemAllocHost. */
  cuMemFreeHost((void *)ctx->errbuf->ptr);
  deallocate(ctx->errbuf);

  /* cuda_make_ctx creates mem_s alongside s; it was never destroyed here. */
  cuStreamDestroy(ctx->mem_s);
  cuStreamDestroy(ctx->s);

  /* Clear out the freelist */
  for (curr = ctx->freeblocks; curr != NULL; curr = next) {
    next = curr->next;
    cuMemFree(curr->ptr);
    deallocate(curr);
  }

  if (!(ctx->flags & DONTFREE))
    cuCtxDestroy(ctx->ctx);
  cache_destroy(ctx->extcopy_cache);
  CLEAR(ctx);
  free(ctx);
}
void GPUInterface::ResizeStreamCount(int newStreamCount) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); SAFE_CUDA(cuCtxSynchronize()); if (cudaStreams != NULL) { for(int i=0; i<numStreams; i++) { if (cudaStreams[i] != NULL) SAFE_CUDA(cuStreamDestroy(cudaStreams[i])); } free(cudaStreams); } if (cudaEvents != NULL) { for(int i=0; i<numStreams; i++) { if (cudaEvents[i] != NULL) SAFE_CUDA(cuEventDestroy(cudaEvents[i])); } free(cudaEvents); } if (newStreamCount == 1) { numStreams = 1; cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); cudaStreams[0] = NULL; CUevent event; for(int i=0; i<2; i++) { SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } } else { numStreams = newStreamCount; if (numStreams > BEAGLE_STREAM_COUNT) { numStreams = BEAGLE_STREAM_COUNT; } cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); CUstream stream; cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); CUevent event; for(int i=0; i<numStreams; i++) { SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT)); cudaStreams[i] = stream; SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[numStreams] = event; } SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::ResizeStreamCount\n"); #endif }
void CudaVideoRender::terminateCudaVideo(bool bDestroyContext) { if (m_pVideoParser) delete m_pVideoParser; if (m_pVideoDecoder) delete m_pVideoDecoder; if (m_pVideoSource) delete m_pVideoSource; if (m_pFrameQueue) delete m_pFrameQueue; if (m_CtxLock) { cutilDrvSafeCallNoSync( cuvidCtxLockDestroy(m_CtxLock) ); } if (m_cuContext && bDestroyContext) { cutilDrvSafeCallNoSync( cuCtxDestroy(m_cuContext) ); m_cuContext = NULL; } if (m_ReadbackSID) cuStreamDestroy(m_ReadbackSID); if (m_KernelSID) cuStreamDestroy(m_KernelSID); }
void freeCudaResources(bool bDestroyContext) { if (g_pVideoParser) { delete g_pVideoParser; } if (g_pVideoDecoder) { delete g_pVideoDecoder; } if (g_pVideoSource) { delete g_pVideoSource; } if (g_pFrameQueue) { delete g_pFrameQueue; } if (g_CtxLock) { checkCudaErrors(cuvidCtxLockDestroy(g_CtxLock)); } if (g_oContext && bDestroyContext) { checkCudaErrors(cuCtxDestroy(g_oContext)); g_oContext = NULL; } if (g_ReadbackSID) { cuStreamDestroy(g_ReadbackSID); } if (g_KernelSID) { cuStreamDestroy(g_KernelSID); } }
static int nvptx_set_cuda_stream (int async, void *stream) { struct ptx_stream *oldstream; pthread_t self = pthread_self (); struct nvptx_thread *nvthd = nvptx_thread (); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); if (async < 0) GOMP_PLUGIN_fatal ("bad async %d", async); /* We have a list of active streams and an array mapping async values to entries of that list. We need to take "ownership" of the passed-in stream, and add it to our list, removing the previous entry also (if there was one) in order to prevent resource leaks. Note the potential for surprise here: maybe we should keep track of passed-in streams and leave it up to the user to tidy those up, but that doesn't work for stream handles returned from acc_get_cuda_stream above... */ oldstream = select_stream_for_async (async, self, false, NULL); if (oldstream) { if (nvthd->ptx_dev->active_streams == oldstream) nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next; else { struct ptx_stream *s = nvthd->ptx_dev->active_streams; while (s->next != oldstream) s = s->next; s->next = s->next->next; } cuStreamDestroy (oldstream->stream); map_fini (oldstream); free (oldstream); } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); (void) select_stream_for_async (async, self, true, (CUstream) stream); return 1; }
/* Tear down all stream bookkeeping of PTX_DEV: free the async-stream
   array, then pop every entry off the active-stream list (unmapping it,
   destroying its CUDA stream and freeing the node), and finally unmap and
   free the null stream.  Errors from the individual calls are ignored.  */
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  struct ptx_stream *node;

  free (ptx_dev->async_streams.arr);

  /* Always take the current list head and unlink it before touching it.  */
  while ((node = ptx_dev->active_streams) != NULL)
    {
      ptx_dev->active_streams = node->next;

      map_fini (node);
      cuStreamDestroy (node->stream);
      free (node);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/*
 * Drop one reference to ctx and destroy it once the count reaches zero.
 *
 * Destruction tears down the BLAS state (if any), destroys the stream,
 * destroys the driver context unless the DONTFREE flag is set, and then
 * frees the cache and the object itself.
 */
static void cuda_free_ctx(cuda_context *ctx) {
  /* Start from NULL so a failed property lookup cannot leave us with an
     uninitialised pointer to call through. */
  gpuarray_blas_ops *blas_ops = NULL;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt != 0)
    return;

  if (ctx->blas_handle != NULL) {
    ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
    /* Only tear down when the lookup actually produced an ops table. */
    if (blas_ops != NULL)
      blas_ops->teardown(ctx);
  }

  cuStreamDestroy(ctx->s);
  if (!(ctx->flags & DONTFREE))
    cuCtxDestroy(ctx->ctx);
  cache_free(ctx->extcopy_cache);
  CLEAR(ctx);
  free(ctx);
}
/*
 * Wrap a CUDA driver context handle in a newly allocated cuda_context.
 *
 * ctx   - driver context handle, stored in the new object.
 * flags - stored verbatim in the new object's flags field.
 *
 * Returns the new object (as void *) or NULL on failure; on failure
 * everything acquired so far is released.
 *
 * NOTE(review): `err` is not declared in this function; presumably it is a
 * file-scope CUresult holding the last driver error -- confirm.
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  int64_t zero = 0;      /* initial contents of the error buffer */
  int ga_status = 0;     /* out-parameter of cuda_alloc */
  cuda_context *self;

  self = malloc(sizeof(*self));
  if (!self)
    return NULL;

  self->ctx = ctx;
  self->err = CUDA_SUCCESS;
  self->blas_handle = NULL;
  self->refcnt = 1;
  self->flags = flags;
  self->enter = 0;

  if (detect_arch(ARCH_PREFIX, self->bin_id, &err) != 0)
    goto drop_ctx;

  self->extcopy_cache = cache_alloc(64, 32);
  if (!self->extcopy_cache)
    goto drop_ctx;

  err = cuStreamCreate(&self->s, 0);
  if (err != CUDA_SUCCESS)
    goto drop_cache;

  /* Need to tag before cuda_alloc */
  TAG_CTX(self);
  self->errbuf = cuda_alloc(self, 8, &zero, GA_BUFFER_INIT, &ga_status);
  if (ga_status != GA_NO_ERROR) {
    err = self->err;
    /* Same release order as before: cache first, then the stream. */
    cache_free(self->extcopy_cache);
    cuStreamDestroy(self->s);
    goto drop_ctx;
  }

  /* Don't want to create a reference loop with the errbuf */
  self->refcnt--;
  return self;

drop_cache:
  cache_free(self->extcopy_cache);
drop_ctx:
  free(self);
  return NULL;
}
// Releases every CUDA/NVCUVID object owned by the decoder.
//
// Each handle is reset to 0 after it is destroyed so that calling this
// function again is harmless. Returns true (also when can_load is false,
// in which case nothing is released).
bool VideoDecoderCUDAPrivate::releaseCuda()
{
    if (!can_load)
        return true;

    if (dec) {
        cuvidDestroyDecoder(dec);
        dec = 0;
    }
    if (parser) {
        cuvidDestroyVideoParser(parser);
        parser = 0;
    }
    if (stream) {
        cuStreamDestroy(stream);
        stream = 0;
    }
    // The lock used to be destroyed unconditionally and never reset, so a
    // never-created or already-destroyed lock could be passed to the API.
    if (vid_ctx_lock) {
        cuvidCtxLockDestroy(vid_ctx_lock);
        vid_ctx_lock = 0;
    }
    if (cuctx) {
        checkCudaErrors(cuCtxDestroy(cuctx));
        // Avoid a second cuCtxDestroy on the same handle on re-entry.
        cuctx = 0;
    }
    return true;
}
/*
 * Drop one reference to ctx and destroy it once the count reaches zero.
 *
 * Destruction tears down the BLAS state (if any), frees the error buffer,
 * destroys the stream, destroys the driver context unless the DONTFREE
 * flag is set, and then frees the cache and the object itself.
 */
static void cuda_free_ctx(cuda_context *ctx) {
  /* Start from NULL so a failed property lookup cannot leave us with an
     uninitialised pointer to call through. */
  gpuarray_blas_ops *blas_ops = NULL;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt != 0)
    return;

  assert(ctx->enter == 0 && "Context was active when freed!");

  if (ctx->blas_handle != NULL) {
    ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
    /* Only tear down when the lookup actually produced an ops table. */
    if (blas_ops != NULL)
      blas_ops->teardown(ctx);
  }

  ctx->refcnt = 2; /* Prevent recursive calls */
  cuda_free(ctx->errbuf);
  cuStreamDestroy(ctx->s);
  if (!(ctx->flags & DONTFREE))
    cuCtxDestroy(ctx->ctx);
  cache_free(ctx->extcopy_cache);
  CLEAR(ctx);
  free(ctx);
}
int main(int argc, char **argv) { //data CUdeviceptr d_data0 = 0; CUdeviceptr d_data1 = 0; DataStruct *h_data0 = 0; DataStruct *h_data1 = 0; DataStruct h_data_reference0; DataStruct h_data_reference1; unsigned int memSize = sizeof(DataStruct); //device references CUcontext hContext = 0; CUdevice hDevice = 0; CUmodule hModule = 0; CUstream hStream = 0; // Initialize the device and get a handle to the kernel CUresult status = initialize(0, &hContext, &hDevice, &hModule, &hStream); // Allocate memory on host and device if ((h_data0 = (DataStruct *)malloc(memSize)) == NULL) { std::cerr << "Could not allocate host memory" << std::endl; exit(-1); } status = cuMemAlloc(&d_data0, memSize); if ((h_data1 = (DataStruct *)malloc(memSize)) == NULL) { std::cerr << "Could not allocate host memory" << std::endl; exit(-1); } status = cuMemAlloc(&d_data1, memSize); if (status != CUDA_SUCCESS) printf("ERROR: during cuMemAlloc\n"); /////////////////////////////////////////////////////////////////////////////// //======================= test cases ========================================// /////////////////////////////////////////////////////////////////////////////// std::string name = ""; unsigned int testnum=0; unsigned int passed=0; //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ /////////////////////// Ralf /////////////////////////////////////////////////// //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ if(runRalfFunction("test_phi_scalar", test_phi_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi2_scalar", test_phi2_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi3_scalar", test_phi3_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi4_scalar", test_phi4_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) 
passed++; testnum++; if(runRalfFunction("test_phi5_scalar", test_phi5_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi6_scalar", test_phi6_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi7_scalar", test_phi7_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi8_scalar", test_phi8_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_phi9_scalar", test_phi9_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loopbad_scalar", test_loopbad_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loop23_scalar", test_loop23_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; if(runRalfFunction("test_loop13_scalar", test_loop13_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize)) passed++; testnum++; //////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_constant"; ///////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_constant(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_calculate"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = 3.2; std::cout << 
"=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_calculate(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_parquetShader"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 1; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_parquetShader(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_dyn"; ///////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->u = h_data_reference0.u = 7; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_dyn(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simple"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = -4; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, 
h_data0, memSize); //run device function test_branch_simple(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; /////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simple"; // Branch 2 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 8; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_simple(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_simplePHI"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = -10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_simplePHI(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop"; ////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 100; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) 
//compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_math"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->f = h_data_reference0.f = 1.4; h_data0->i = h_data_reference0.i = 3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_math(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_signedOperands"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = -7; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_signedOperands(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_constantOperands"; ////////////////////////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 3; h_data0->f = h_data_reference0.f = -1.44; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_constantOperands(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) 
//compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop_semihard"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_semihard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_branch_loop_hard"; // Branch 1 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->u = h_data_reference0.u = 3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_hard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*////////////*/ name = "test_branch_loop_hard"; // Branch 2 ///////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 7; h_data0->u = h_data_reference0.u = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_branch_loop_hard(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test 
passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_binaryInst"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 5; h_data0->f = h_data_reference0.f = -121.23; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_binaryInst(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_selp"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = -15; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_selp(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_GetElementPointer_complicated"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data_reference0.s.s.f = h_data0->s.s.f = 3.11; h_data_reference0.s.sa[2].f = h_data0->s.sa[2].f = -4.32; h_data_reference0.s.sa[h_data0->i].f = h_data0->s.sa[h_data0->i].f = 111.3; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_GetElementPointer_complicated(&h_data_reference0); //run host reference 
if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_call"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 10; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_call(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*/////////////*/ name = "test_alloca"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->f = h_data_reference0.f = -3.23; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_alloca(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_alloca_complicated"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->i = h_data_reference0.i = 1; h_data0->f = h_data_reference0.f = 23.213; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_alloca_complicated(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test 
passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_globalVariables"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_globalVariables(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_specialRegisters_x"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3); //run device function runHostTestFunction(test_specialRegisters_x, &h_data_reference0, 2,3,4, 2,3); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_specialRegisters_y"; ///////////////////////// setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3); //run device function runHostTestFunction(test_specialRegisters_x, &h_data_reference0, 2,3,4, 2,3); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = 
"test_dualArgument"; ///////////////////////// setZero(h_data0,&h_data_reference0); setZero(h_data1,&h_data_reference1); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunDualTestFunction(&hModule, name, d_data0, d_data1, h_data0, h_data1, memSize); //run device function test_dualArgument(&h_data_reference0,&h_data_reference1); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} if(compareData(h_data1,&h_data_reference1)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_vector"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->fa[0] = h_data_reference0.fa[0] = 0.43f; h_data0->fa[1] = h_data_reference0.fa[1] = 0.234f; h_data0->fa[2] = h_data_reference0.fa[2] = 12893.f; h_data0->fa[3] = h_data_reference0.fa[3] = 13.33f; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_vector(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_reg2Const"; ///////////////////////// setZero(h_data0,&h_data_reference0); /* unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); */ std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device 
function test_reg2Const(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_constantMemory"; ///////////////////////// setZero(h_data0,&h_data_reference0); h_data0->fa[0] = __ptx_constant_data_global.fa[0] = 0.2348f; unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); setZero(h_data0,&h_data_reference0); std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function test_constantMemory(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; ////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_sharedMemory"; ///////////////////////// setZero(h_data0,&h_data_reference0); for(int i = 0; i < ARRAY_N/2; i++) h_data0->fa[i*2] = i; for(int i = 0; i < ARRAY_N/2; i++) h_data0->fa[i*2+1] = -i; std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 32,1,1, 1,1); //run device function for(int i = 0; i < ARRAY_N/2; i++) h_data_reference0.fa[i] = i; for(int i = 0; i < ARRAY_N/2; i++) h_data_reference0.fa[i+32] = -i; // runHostTestFunction(test_sharedMemory, &h_data_reference0, 16,1,1, 1,1); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; 
////////////////////////////////////////////////////////////////////////////////////////////////// /*///////////////*/ name = "test_lightShader"; ///////////////////////// setZero(h_data0,&h_data_reference0); /* unsigned int bytes; //size of constant CUdeviceptr devptr_const=0; status = cuModuleGetGlobal(&devptr_const, &bytes, hModule, "__ptx_constant_data_global"); cuMemcpyHtoD(devptr_const, h_data0, memSize); */ std::cout << "=============== Test " << testnum << ": " << name << " ===================\n"; loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize); //run device function /* test_lightShader(&h_data_reference0); //run host reference if(compareData(h_data0,&h_data_reference0)) //compare Data {passed++; std::cout << " => Test passed!!!\n";} testnum++; */ /////////////////////////////////////////////////////////////////////////////// //======================= test cases END ====================================// /////////////////////////////////////////////////////////////////////////////// // Check the result std::cout << "\nPASSED " << passed << " tests" << std::endl; std::cout << "FAILED " << (testnum-passed) << " tests" << std::endl; // Cleanup if (d_data0) { cuMemFree(d_data0); d_data0 = 0; } if (d_data1) { cuMemFree(d_data1); d_data1 = 0; } if (h_data0) { free(h_data0); h_data0 = 0; } if (h_data1) { free(h_data1); h_data1 = 0; } if (hModule) { cuModuleUnload(hModule); hModule = 0; } if (hStream) { cuStreamDestroy(hStream); hStream = 0; } if (hContext) { cuCtxDestroy(hContext); hContext = 0; } return 0; }
/* // Property Message // // API //static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step, int numStep, int startIndex, int sideLength, int bx, int by, CvLSVMFeaturePyramid **maps) // INPUT // image // step // numStep // startIndex // sideLength // bx // by // OUTPUT // maps // RESULT // Error status */ static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step, int numStep, int startIndex, int sideLength, int bx, int by, CvLSVMFeaturePyramid **maps) { CvLSVMFeatureMap **feature_maps; int i; int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY; float *scales; CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca; CUstream *streams; CUresult res; scales = (float *) malloc(sizeof(float) * (numStep)); devs_img = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); streams = (CUstream *) malloc(sizeof(CUstream) * (numStep)); feature_maps = (CvLSVMFeatureMap **) malloc( sizeof(CvLSVMFeatureMap *) * (numStep)); // allocate device memory for (i = 0; i < numStep; i++) { scales[i] = 1.0f / powf(step, (float) i); width = (int) (((float) image->width ) * scales[i] + 0.5); height = (int) (((float) image->height) * scales[i] + 0.5); numChannels = image->nChannels; sizeX = width / sideLength; sizeY = height / sideLength; p = NUM_SECTOR * 3; pp = NUM_SECTOR * 12; newSizeX = sizeX - 2; newSizeY = sizeY - 2; allocFeatureMapObjectGPU<float>(&devs_img[i], width, height, numChannels); allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p); allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX, newSizeY, pp); res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT); CUDA_CHECK(res, "cuStreamCreate(stream)"); } // excute main function resizeGPUStream(numStep, image, scales, devs_img, 
streams); getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm, streams); normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm, devs_map_pre_pca, streams); PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps, streams); // synchronize cuda stream for (i = 0; i < numStep; i++) { cuStreamSynchronize(streams[i]); cuStreamDestroy(streams[i]); } for (i = 0; i < numStep; i++) { (*maps)->pyramid[startIndex + i] = feature_maps[i]; }/*for(i = 0; i < numStep; i++)*/ // free device memory for (i = 0; i < numStep; i++) { freeFeatureMapObjectGPU(&devs_img[i]); freeFeatureMapObjectGPU(&devs_map_pre_norm[i]); freeFeatureMapObjectGPU(&devs_map_pre_pca[i]); } free(scales); free(devs_img); free(devs_map_pre_norm); free(devs_map_pre_pca); free(streams); free(feature_maps); return LATENT_SVM_OK; }
// Destroys `stream` with cuStreamDestroy and hands the returned status to
// cuda_check.
// NOTE(review): how cuda_check reacts to a failure is not visible here — confirm.
static void dispose(CUstream stream) { cuda_check( cuStreamDestroy(stream) ); }
int cuda_test_memcpy_async(unsigned int size) { int i; CUresult res; CUdevice dev; CUcontext ctx; CUstream stream; CUdeviceptr data_addr; unsigned int *in, *out; struct timeval tv; struct timeval tv_total_start, tv_total_end; unsigned long total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %u\n", (unsigned int)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamCreate(&stream, 0); if (res != CUDA_SUCCESS) { printf("cuStreamCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAlloc(&data_addr, size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&in, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&out, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res); return -1; } for (i = 0; i < size / 4; i++) { in[i] = i+1; out[i] = 0; } gettimeofday(&tv_h2d_start, NULL); res = cuMemcpyHtoDAsync(data_addr, in, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_d2h_start, NULL); res = cuMemcpyDtoHAsync(out, data_addr, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res); return -1; } res = 
cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_d2h_end, NULL); for (i = 0; i < size / 4; i++) { if (in[i] != out[i]) { printf("in[%d] = %u, out[%d] = %u\n", i, in[i], i, out[i]); } } res = cuMemFreeHost(out); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFreeHost(in); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFree(data_addr); if (res != CUDA_SUCCESS) { printf("cuMemFree failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamDestroy(stream); if (res != CUDA_SUCCESS) { printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000 + tv.tv_usec / 1000; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); return 0; end: return -1; }
/*
 * Builds a freshly allocated, NUL-terminated host string made of `prefix`
 * (may be NULL) followed by the string stored in the buffer base[0..len).
 * The buffer is read up to its first NUL or `len` bytes, whichever comes
 * first, so an unterminated buffer cannot cause an out-of-bounds read.
 * Returns NULL on allocation failure; the caller frees the result.
 */
static char *vq_dup_guest_string(const char *prefix, const void *base, size_t len)
{
    const char *src = base;
    size_t plen = prefix ? strlen(prefix) : 0;
    size_t slen = 0;
    char *dst;

    if (src && len) {
        const char *nul = memchr(src, '\0', len);
        slen = nul ? (size_t)(nul - src) : len;
    }

    dst = malloc(plen + slen + 1);
    if (!dst)
        return NULL;

    if (plen)
        memcpy(dst, prefix, plen);
    if (slen)
        memcpy(dst + plen, src, slen);
    dst[plen + slen] = '\0';
    return dst;
}

/*
 * Virtqueue output handler: pops every pending element, interprets the first
 * out buffer as a `struct param` request, executes the CUDA driver call
 * selected by p->syscall_type, stores the status in p->result and pushes the
 * element back exactly once. The frontend is notified once after the queue
 * has been drained.
 *
 * Result conventions kept from the original: 999 when switching the CUDA
 * context fails (now also used for host-side string/environment failures),
 * 9999 when the kernel argument array cannot be allocated.
 *
 * NOTE(review): p->nr_pages and p->val1 come from the request and are not
 * validated here against the number of scatter-gather entries in the element
 * — confirm that the frontend/transport guarantees this.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while(virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        //for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            //hardcoded path - needs improvement
            //all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
            const char *nfs_path = getenv("QEMU_NFS_PATH");
            char *binname;

            //an unset variable used to be dereferenced; report a failure instead
            if (!nfs_path) {
                p->result = 999;
                break;
            }
            //exact-size concatenation: the old size computation was 2 bytes
            //short, and its failure path reported success and pushed the
            //element a second time
            binname = vq_dup_guest_string(nfs_path, elem.out_sg[1].iov_base,
                                          elem.out_sg[1].iov_len);
            if (!binname) {
                p->result = 999;
                break;
            }
            //change current CUDA context
            //each CUDA context has its own virtual memory space - isolation is ensured by switching contexts
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(binname);
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            //sized copy instead of strcpy into a fixed 100-byte buffer; the
            //copy is now released after the call
            char *name = vq_dup_guest_string(NULL, elem.out_sg[1].iov_base,
                                             elem.out_sg[1].iov_len);
            if (!name) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetGlobal(&p->dptr,&p->size1,p->module,(const char *)name);
            free(name);
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        //large buffers are allocated in smaller chunks in guest kernel space
        //gets each chunk separately and copies it to device memory
        case CUMEMCPYHTOD: {
            unsigned long i, s, nr_pages = p->nr_pages;
            size_t offset = 0;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            //out_sg[1+2*i] holds chunk i, out_sg[1+2*i+1] holds its size
            for (i=0; i<nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            unsigned long i, s, nr_pages = p->nr_pages;
            size_t offset = 0;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i=0; i<nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        case CUMEMCPYDTOH: {
            unsigned long i, s, nr_pages = p->nr_pages;
            size_t offset = 0;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            //in_sg[2*i] receives chunk i, in_sg[2*i+1] holds its size
            for (i=0; i<nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            unsigned long i, s, nr_pages = p->nr_pages;
            size_t offset = 0;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i=0; i<nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            //the name is the first p->length bytes of out_sg[1]; copy it into
            //a host buffer instead of writing a terminator into the request
            //buffer at an unchecked index
            size_t max_len = elem.out_sg[1].iov_len;
            char *name;

            if ((size_t)p->length < max_len)
                max_len = (size_t)p->length;
            name = vq_dup_guest_string(NULL, elem.out_sg[1].iov_base, max_len);
            if (!name) {
                p->result = 999;
                break;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(name);
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            free(name);
            break;
        }
        case CULAUNCHKERNEL: {
            //one pointer per kernel argument, each living in its own out buffer
            void **args = malloc(p->val1*sizeof(void *));
            int i;

            if (!args) {
                p->result = 9999;
                break;
            }
            for (i=0; i<p->val1; i++) {
                args[i] = elem.out_sg[1+i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(args); //was leaked on this path
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }

        //single completion point: every request is pushed back exactly once
        virtqueue_push(vq, &elem, 0);
    }

    //notify frontend - trigger virtual interrupt
    virtio_notify(vdev, vq);
}
// Releases a stream of the CUDA backend: `s` is treated as a pointer to a
// heap-allocated CUstream, whose driver stream is destroyed (status passed
// to OCCA_CUDA_CHECK) before the CUstream object itself is deleted.
void device_t<CUDA>::freeStream(stream s){
  CUstream *handle = (CUstream*) s;

  OCCA_CUDA_CHECK("Device: freeStream",
                  cuStreamDestroy(*handle));

  delete handle;
}