void CudaVideoRender::initCudaVideo( ) { // bind the context lock to the CUDA context CUresult result = cuvidCtxLockCreate(&m_CtxLock, m_cuContext); if (result != CUDA_SUCCESS) { printf("cuvidCtxLockCreate failed: %d\n", result); assert(0); } std::auto_ptr<VideoDecoder> apVideoDecoder(new VideoDecoder(m_pVideoSource->format(), m_cuContext, m_eVideoCreateFlags, m_CtxLock)); std::auto_ptr<VideoParser> apVideoParser(new VideoParser(apVideoDecoder.get(), m_pFrameQueue)); m_pVideoSource->setParser(*apVideoParser.get()); m_pVideoParser = apVideoParser.release(); m_pVideoDecoder = apVideoDecoder.release(); // Create a Stream ID for handling Readback if (m_bReadback) { cutilDrvSafeCallNoSync( cuStreamCreate(&m_ReadbackSID, 0) ); cutilDrvSafeCallNoSync( cuStreamCreate(&m_KernelSID, 0) ); printf("> initCudaVideo()\n"); printf(" CUDA Streams (%s) <m_ReadbackSID = %p>\n", ((m_ReadbackSID == 0) ? "Disabled" : "Enabled"), m_ReadbackSID ); printf(" CUDA Streams (%s) <m_KernelSID = %p>\n", ((m_KernelSID == 0) ? "Disabled" : "Enabled"), m_KernelSID ); } }
void initCudaVideo() { // bind the context lock to the CUDA context CUresult result = cuvidCtxLockCreate(&g_CtxLock, g_oContext); if (result != CUDA_SUCCESS) { printf("cuvidCtxLockCreate failed: %d\n", result); assert(0); } std::auto_ptr<VideoDecoder> apVideoDecoder(new VideoDecoder(g_pVideoSource->format(), g_oContext, g_eVideoCreateFlags, g_CtxLock)); std::auto_ptr<VideoParser> apVideoParser(new VideoParser(apVideoDecoder.get(), g_pFrameQueue)); g_pVideoSource->setParser(*apVideoParser.get()); g_pVideoParser = apVideoParser.release(); g_pVideoDecoder = apVideoDecoder.release(); // Create a Stream ID for handling Readback if (g_bReadback) { checkCudaErrors(cuStreamCreate(&g_ReadbackSID, 0)); checkCudaErrors(cuStreamCreate(&g_KernelSID, 0)); printf("> initCudaVideo()\n"); printf(" CUDA Streams (%s) <g_ReadbackSID = %p>\n", ((g_ReadbackSID == 0) ? "Disabled" : "Enabled"), g_ReadbackSID); printf(" CUDA Streams (%s) <g_KernelSID = %p>\n", ((g_KernelSID == 0) ? "Disabled" : "Enabled"), g_KernelSID); } }
bool GLInteropResource::ensureResource(int w, int h, int H, GLuint tex, int plane) { Q_ASSERT(plane < 2 && "plane number must be 0 or 1 for NV12"); TexRes &r = res[plane]; if (r.texture == tex && r.w == w && r.h == h && r.H == H && r.cuRes) return true; if (!ctx) { // TODO: how to use pop/push decoder's context without the context in opengl context CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false); if (USE_STREAM) { CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT)); CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT)); } qDebug("cuda contex on gl thread: %p", ctx); CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this } if (r.cuRes) { CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false); r.cuRes = NULL; } // CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD works too for opengl, but not d3d CUDA_ENSURE(cuGraphicsGLRegisterImage(&r.cuRes, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_NONE), false); r.texture = tex; r.w = w; r.h = h; r.H = H; return true; }
/*
 * Wraps an existing CUcontext in a freshly allocated cuda_context.
 *
 * Sets up, in order: the architecture id (detect_arch), the extcopy kernel
 * cache, the main stream, a non-blocking memory stream and a 16-byte
 * host-allocated error buffer wrapped by new_gpudata and flagged
 * CUDA_MAPPED_PTR.
 *
 * Returns the new context, or NULL when any step fails; everything acquired
 * up to the failing step is released again in reverse order.
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res = malloc(sizeof(*res));
  void *host_errbuf;

  if (!res)
    return NULL;

  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;
  res->freeblocks = NULL;

  if (detect_arch(ARCH_PREFIX, res->bin_id, &err))
    goto release_ctx;

  res->extcopy_cache = cache_lru(64, 32,
                                 (cache_eq_fn)extcopy_eq,
                                 (cache_hash_fn)extcopy_hash,
                                 (cache_freek_fn)extcopy_free,
                                 (cache_freev_fn)cuda_freekernel);
  if (!res->extcopy_cache)
    goto release_ctx;

  err = cuStreamCreate(&res->s, 0);
  if (CUDA_SUCCESS != err)
    goto release_cache;

  err = cuStreamCreate(&res->mem_s, CU_STREAM_NON_BLOCKING);
  if (CUDA_SUCCESS != err)
    goto release_stream;

  err = cuMemAllocHost(&host_errbuf, 16);
  if (CUDA_SUCCESS != err)
    goto release_mem_stream;
  memset(host_errbuf, 0, 16);

  /* Need to tag for new_gpudata */
  TAG_CTX(res);

  res->errbuf = new_gpudata(res, (CUdeviceptr)host_errbuf, 16);
  if (!res->errbuf) {
    err = res->err;
    goto release_host_buf;
  }
  res->errbuf->flags |= CUDA_MAPPED_PTR;
  return res;

  /* Cleanup ladder: each label undoes one acquisition and falls through. */
release_host_buf:
  cuMemFreeHost(host_errbuf);
release_mem_stream:
  cuStreamDestroy(res->mem_s);
release_stream:
  cuStreamDestroy(res->s);
release_cache:
  cache_destroy(res->extcopy_cache);
release_ctx:
  free(res);
  return NULL;
}
bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H) { TexRes &r = res[0];// 1 NV12 texture if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes) return true; if (!ctx) { // TODO: how to use pop/push decoder's context without the context in opengl context if (!ensureD3DDevice()) return false; // CUdevice is different from decoder's CUDA_ENSURE(cuD3D9CtxCreate(&ctx, &dev, CU_CTX_SCHED_BLOCKING_SYNC, device9), false); #if USE_STREAM CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT)); CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT)); #endif //USE_STREAM qDebug("cuda contex on gl thread: %p", ctx); CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this } if (r.cuRes) { CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false); r.cuRes = NULL; } // create d3d resource for interop if (!surface9_nv12) { // TODO: need pitch from cuvid to ensure cuMemcpy2D can copy the whole pitch DX_ENSURE(device9->CreateTexture(W //, H , H*3/2 , 1 , D3DUSAGE_DYNAMIC //D3DUSAGE_DYNAMIC is lockable // 0 is from NV example. cudaD3D9.h says The primary rendertarget may not be registered with CUDA. So can not be D3DUSAGE_RENDERTARGET? //, D3DUSAGE_RENDERTARGET , D3DFMT_L8 //, (D3DFORMAT)MAKEFOURCC('N','V','1','2') // can not create nv12. use 2 textures L8+A8L8? , D3DPOOL_DEFAULT // must be D3DPOOL_DEFAULT for cuda? , &texture9_nv12 , NULL) // - Resources allocated as shared may not be registered with CUDA. , false); DX_ENSURE(device9->CreateOffscreenPlainSurface(W, H, (D3DFORMAT)MAKEFOURCC('N','V','1','2'), D3DPOOL_DEFAULT, &surface9_nv12, NULL), false); } // TODO: cudaD3D9.h says NV12 is not supported // CUDA_ERROR_INVALID_HANDLE if register D3D9 surface // TODO: why flag CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD is invalid while it's fine for opengl CUDA_ENSURE(cuGraphicsD3D9RegisterResource(&r.cuRes, texture9_nv12, CU_GRAPHICS_REGISTER_FLAGS_NONE), false); return true; }
void GPUInterface::ResizeStreamCount(int newStreamCount) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); SAFE_CUDA(cuCtxSynchronize()); if (cudaStreams != NULL) { for(int i=0; i<numStreams; i++) { if (cudaStreams[i] != NULL) SAFE_CUDA(cuStreamDestroy(cudaStreams[i])); } free(cudaStreams); } if (cudaEvents != NULL) { for(int i=0; i<numStreams; i++) { if (cudaEvents[i] != NULL) SAFE_CUDA(cuEventDestroy(cudaEvents[i])); } free(cudaEvents); } if (newStreamCount == 1) { numStreams = 1; cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); cudaStreams[0] = NULL; CUevent event; for(int i=0; i<2; i++) { SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } } else { numStreams = newStreamCount; if (numStreams > BEAGLE_STREAM_COUNT) { numStreams = BEAGLE_STREAM_COUNT; } cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); CUstream stream; cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); CUevent event; for(int i=0; i<numStreams; i++) { SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT)); cudaStreams[i] = stream; SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[numStreams] = event; } SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::ResizeStreamCount\n"); #endif }
/// Makes `ctx` current and creates a CUDA stream in it with the given flags.
/// The result of cuStreamCreate is passed through cuda_check.
static CUstream create(const vex::backend::context &ctx, unsigned flags = 0) {
    CUstream handle;

    ctx.set_current();
    cuda_check(cuStreamCreate(&handle, flags));

    return handle;
}
// Creates a new CUDA stream with default flags and returns the heap-allocated
// CUstream handle that holds it. The cuStreamCreate result is checked through
// OCCA_CUDA_CHECK.
stream device_t<CUDA>::createStream() {
  OCCA_EXTRACT_DATA(CUDA, Device);

  CUstream *streamHandle = new CUstream;

  OCCA_CUDA_CHECK("Device: createStream",
                  cuStreamCreate(streamHandle, CU_STREAM_DEFAULT));

  return streamHandle;
}
int main (int argc, char **argv) { float atime; CUstream stream; CUresult r; acc_init (acc_device_nvidia); (void) acc_get_device_num (acc_device_nvidia); init_timers (1); stream = (CUstream) acc_get_cuda_stream (0); if (stream != NULL) abort (); r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); start_timer (0); acc_wait_all_async (0); acc_wait (0); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time too long\n"); abort (); } fini_timers (); acc_shutdown (acc_device_nvidia); exit (0); }
int main (int argc, char **argv) { CUstream stream; CUresult r; struct timeval tv1, tv2; time_t t1; acc_init (acc_device_nvidia); stream = (CUstream) acc_get_cuda_stream (0); if (stream != NULL) abort (); r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); gettimeofday (&tv1, NULL); acc_wait_all_async (0); acc_wait (0); gettimeofday (&tv2, NULL); t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec); if (t1 > 1000) { fprintf (stderr, "too long\n"); abort (); } acc_shutdown (acc_device_nvidia); exit (0); }
/*
 * Wraps an existing CUcontext in a freshly allocated cuda_context.
 *
 * Sets up the architecture id (detect_arch), the extcopy cache, the stream
 * and an 8-byte error buffer allocated through cuda_alloc with
 * GA_BUFFER_INIT from a zero-valued int64_t.
 *
 * Returns the new context, or NULL when any step fails (everything acquired
 * so far is released first).
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res = malloc(sizeof(*res));
  int64_t zero = 0;
  int alloc_status = 0;

  if (!res)
    return NULL;

  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;

  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
    free(res);
    return NULL;
  }

  res->extcopy_cache = cache_alloc(64, 32);
  if (!res->extcopy_cache) {
    free(res);
    return NULL;
  }

  err = cuStreamCreate(&res->s, 0);
  if (CUDA_SUCCESS != err) {
    cache_free(res->extcopy_cache);
    free(res);
    return NULL;
  }

  /* Need to tag before cuda_alloc */
  TAG_CTX(res);

  res->errbuf = cuda_alloc(res, 8, &zero, GA_BUFFER_INIT, &alloc_status);
  if (GA_NO_ERROR != alloc_status) {
    err = res->err;
    cache_free(res->extcopy_cache);
    cuStreamDestroy(res->s);
    free(res);
    return NULL;
  }

  /* Don't want to create a reference loop with the errbuf */
  res->refcnt--;

  return res;
}
// Initializes the CUDA driver API and fills the caller-provided handles.
//
//   device    - ordinal passed to cuDeviceGet
//   phContext - receives the context created on that device
//   phDevice  - receives the device handle
//   phModule  - receives the module loaded from "PTXTestFunctions.o.ptx"
//   phStream  - receives a newly created stream
//
// Returns CUDA_SUCCESS. Any failing step prints a message and terminates the
// process with exit status 1. (The previous code exited with status 0, which
// reports success to the parent process, and did not check cuInit or
// cuDeviceGet at all.)
CUresult initialize(int device, CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUstream *phStream)
{
    // Initialize the device and create the context
    CUresult status = cuInit(0);
    if (status != CUDA_SUCCESS) {
        std::cout << "ERROR: could not initialize the CUDA driver: " << status << "\n";
        exit(1);
    }

    status = cuDeviceGet(phDevice, device);
    if (status != CUDA_SUCCESS) {
        std::cout << "ERROR: could not get device " << device << ": " << status << "\n";
        exit(1);
    }

    status = cuCtxCreate(phContext, 0, *phDevice);
    if (status != CUDA_SUCCESS) {
        std::cout << "ERROR: could not create context\n";
        exit(1);
    }

    status = cuModuleLoad(phModule, "PTXTestFunctions.o.ptx");
    if (status != CUDA_SUCCESS) {
        std::cout << "ERROR: could not load .ptx module: " << status << "\n";
        exit(1);
    }

    // Create stream
    status = cuStreamCreate(phStream, 0);
    if (status != CUDA_SUCCESS) {
        printf("ERROR: during stream creation\n");
        exit(1);
    }

    return status;
}
bool VideoDecoderCUDAPrivate::initCuda() { CUresult result = cuInit(0); if (result != CUDA_SUCCESS) { available = false; qWarning("cuInit(0) faile (%d)", result); return false; } cudev = GetMaxGflopsGraphicsDeviceId(); int clockRate; cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev); int major, minor; cuDeviceComputeCapability(&major, &minor, cudev); char devname[256]; cuDeviceGetName(devname, 256, cudev); description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000); //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO? CUcontext cuCurrent = NULL; result = cuCtxPopCurrent(&cuCurrent); if (result != CUDA_SUCCESS) { qWarning("cuCtxPopCurrent: %d\n", result); return false; } checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx)); { AutoCtxLock lock(this, vid_ctx_lock); Q_UNUSED(lock); //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING) checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT //require compute capability >= 1.1 //flag: Reserved for future use, must be 0 //cuStreamAddCallback(stream, CUstreamCallback, this, 0); } return true; }
/*
 * Wraps an existing CUcontext in a freshly allocated cuda_context with its
 * own extcopy cache and stream, then tags it with TAG_CTX.
 *
 * Returns the new context, or NULL when the allocation, the cache or the
 * stream cannot be created (anything acquired so far is released first).
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res = malloc(sizeof(*res));

  if (!res)
    return NULL;

  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;

  res->extcopy_cache = cache_alloc(64, 32);
  if (!res->extcopy_cache) {
    free(res);
    return NULL;
  }

  err = cuStreamCreate(&res->s, 0);
  if (CUDA_SUCCESS != err) {
    cache_free(res->extcopy_cache);
    free(res);
    return NULL;
  }

  TAG_CTX(res);
  return res;
}
static void try_init( void ) { int deviceCount = 0; int syncflag; char *sync; CUresult err = cuInit(0); CUresult status; struct cudaDeviceProp deviceProp; if( state.init ) { return; } // already initialised state.device = 0; if (CUDA_SUCCESS == err) CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount)); if (deviceCount == 0) { error( "No device found" ); } if( state.target_device >= deviceCount ) { error( "Invalid device requested" ); } CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&(state.device), state.target_device)); #ifdef USE_BOINC syncflag=0x4; #else syncflag=0x0; #endif sync = getenv("SWAN_SYNC" ); if( sync != NULL ) { syncflag = atoi( sync ); fprintf(stderr, "SWAN: Using synchronization method %d\n", syncflag ); } if( getenv("SWAN_PROFILE") || getenv("CUDA_PROFILE") ) { state.debug = 1; } if( state.debug ) { printf("SWAN: Built for CUDA version %d.%d\n", CUDA_MAJOR, CUDA_MINOR ); } #ifdef DEV_EXTENSIONS syncflag |= CU_CTX_MAP_HOST; #endif #ifdef USE_FIXED_DEVICE printf( "********************************************** OVERRIDING DEVICE ALLOCATION\n"); status = cuCtxCreate( &(state.context), syncflag , 0 ); // state.device + state.target_device ); if ( CUDA_SUCCESS != status ) error ( "Unable to create context\n" ); #else status = cuCtxCreate( &(state.context), syncflag , state.target_device ); if ( CUDA_SUCCESS != status ) { printf("SWAN: Failed to get requested device (compute exclusive mode). 
Trying any..\n" ); int count; cuDeviceGetCount( &count ); int i=0; while ( (i < count) && (status != CUDA_SUCCESS) ) { state.target_device = i; CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&(state.device), state.target_device)); status = cuCtxCreate( &(state.context), syncflag , state.target_device ); i++; } } #endif if ( CUDA_SUCCESS != status ) error ( "Unable to create context\n" ); state.mods = NULL; state.mod_names = NULL; state.num_mods = 0; state.funcs = NULL; state.func_names = NULL; state.num_funcs = 0; state.init = 1; cudaGetDeviceProperties(&deviceProp, state.device); cuStreamCreate( &state.stream, 0 ); state.device_version = (deviceProp.major * 100 + deviceProp.minor * 10); state.multiProcessorCount = deviceProp.multiProcessorCount; }
/* // Property Message // // API //static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step, int numStep, int startIndex, int sideLength, int bx, int by, CvLSVMFeaturePyramid **maps) // INPUT // image // step // numStep // startIndex // sideLength // bx // by // OUTPUT // maps // RESULT // Error status */ static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step, int numStep, int startIndex, int sideLength, int bx, int by, CvLSVMFeaturePyramid **maps) { CvLSVMFeatureMap **feature_maps; int i; int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY; float *scales; CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca; CUstream *streams; CUresult res; scales = (float *) malloc(sizeof(float) * (numStep)); devs_img = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * (numStep)); streams = (CUstream *) malloc(sizeof(CUstream) * (numStep)); feature_maps = (CvLSVMFeatureMap **) malloc( sizeof(CvLSVMFeatureMap *) * (numStep)); // allocate device memory for (i = 0; i < numStep; i++) { scales[i] = 1.0f / powf(step, (float) i); width = (int) (((float) image->width ) * scales[i] + 0.5); height = (int) (((float) image->height) * scales[i] + 0.5); numChannels = image->nChannels; sizeX = width / sideLength; sizeY = height / sideLength; p = NUM_SECTOR * 3; pp = NUM_SECTOR * 12; newSizeX = sizeX - 2; newSizeY = sizeY - 2; allocFeatureMapObjectGPU<float>(&devs_img[i], width, height, numChannels); allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p); allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX, newSizeY, pp); res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT); CUDA_CHECK(res, "cuStreamCreate(stream)"); } // excute main function resizeGPUStream(numStep, image, scales, devs_img, 
streams); getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm, streams); normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm, devs_map_pre_pca, streams); PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps, streams); // synchronize cuda stream for (i = 0; i < numStep; i++) { cuStreamSynchronize(streams[i]); cuStreamDestroy(streams[i]); } for (i = 0; i < numStep; i++) { (*maps)->pyramid[startIndex + i] = feature_maps[i]; }/*for(i = 0; i < numStep; i++)*/ // free device memory for (i = 0; i < numStep; i++) { freeFeatureMapObjectGPU(&devs_img[i]); freeFeatureMapObjectGPU(&devs_map_pre_norm[i]); freeFeatureMapObjectGPU(&devs_map_pre_pca[i]); } free(scales); free(devs_img); free(devs_map_pre_norm); free(devs_map_pre_pca); free(streams); free(feature_maps); return LATENT_SVM_OK; }
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount, int unpaddedPatternCount, int tipCount, long flags) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n"); #endif SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber])); unsigned int ctxFlags = CU_CTX_SCHED_AUTO; if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) { ctxFlags |= CU_CTX_MAP_HOST; } CUresult error = cuCtxCreate(&cudaContext, ctxFlags, cudaDevice); if(error != CUDA_SUCCESS) { fprintf(stderr, "CUDA error: \"%s\" (%d) from file <%s>, line %i.\n", GetCUDAErrorDescription(error), error, __FILE__, __LINE__); if (error == CUDA_ERROR_INVALID_DEVICE) { fprintf(stderr, "(The requested CUDA device is likely set to compute exclusive mode. This mode prevents multiple processes from running on the device.)"); } exit(-1); } InitializeKernelResource(paddedStateCount, flags & BEAGLE_FLAG_PRECISION_DOUBLE); if (!kernelResource) { fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount); exit(-1); } kernelResource->categoryCount = categoryCount; kernelResource->patternCount = paddedPatternCount; kernelResource->unpaddedPatternCount = unpaddedPatternCount; kernelResource->flags = flags; SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode)); if ((paddedPatternCount < BEAGLE_MULTI_GRID_MAX || flags & BEAGLE_FLAG_PARALLELOPS_GRID) && !(flags & BEAGLE_FLAG_PARALLELOPS_STREAMS)) { numStreams = 1; cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); cudaStreams[0] = NULL; CUevent event; for(int i=0; i<2; i++) { SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } } else { numStreams = tipCount/2 + 1; if (numStreams > BEAGLE_STREAM_COUNT) { numStreams = BEAGLE_STREAM_COUNT; } cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); CUstream stream; cudaEvents = 
(CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); CUevent event; for(int i=0; i<numStreams; i++) { SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT)); cudaStreams[i] = stream; SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[numStreams] = event; } SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::SetDevice\n"); #endif }
int cuda_test_memcpy_async(unsigned int size) { int i; CUresult res; CUdevice dev; CUcontext ctx; CUstream stream; CUdeviceptr data_addr; unsigned int *in, *out; struct timeval tv; struct timeval tv_total_start, tv_total_end; unsigned long total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %u\n", (unsigned int)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamCreate(&stream, 0); if (res != CUDA_SUCCESS) { printf("cuStreamCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAlloc(&data_addr, size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&in, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&out, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res); return -1; } for (i = 0; i < size / 4; i++) { in[i] = i+1; out[i] = 0; } gettimeofday(&tv_h2d_start, NULL); res = cuMemcpyHtoDAsync(data_addr, in, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_d2h_start, NULL); res = cuMemcpyDtoHAsync(out, data_addr, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res); return -1; } res = 
cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_d2h_end, NULL); for (i = 0; i < size / 4; i++) { if (in[i] != out[i]) { printf("in[%d] = %u, out[%d] = %u\n", i, in[i], i, out[i]); } } res = cuMemFreeHost(out); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFreeHost(in); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFree(data_addr); if (res != CUDA_SUCCESS) { printf("cuMemFree failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamDestroy(stream); if (res != CUDA_SUCCESS) { printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000 + tv.tv_usec / 1000; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); return 0; end: return -1; }
/* Return the ptx_stream that serves the async value ASYNC on the current
   thread's device.

   ASYNC     the OpenACC async value; values above acc_async_sync are shifted
	     up by one before being used as an index into async_streams.
   THREAD    the host thread on whose behalf the stream is requested.
   CREATE    when true, a missing stream is created (or EXISTING is
	     installed), the device's stream_lock is held for the duration of
	     the lookup, and the result is never NULL.
   EXISTING  an externally provided CUDA stream to install for ASYNC, or NULL
	     to create a new one on demand.

   Returns NULL only when CREATE is false and no stream exists for ASYNC.
   Calls GOMP_PLUGIN_fatal for a negative (shifted) ASYNC, for a
   cuStreamCreate failure, and when CREATE is false and a stream that is not
   marked multithreaded is requested from a thread other than its
   host_thread.  */
static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  /* Grow the table: double it, or jump straight to ASYNC + 1 when
	     doubling is not enough, and clear the new slots.  */
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  /* Link the new stream at the head of the active list.  */
	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.

	 pthread_t is an opaque type, so thread identities are compared with
	 pthread_equal rather than with the != operator; this also matches
	 the comparison in the !CREATE branch below.  */
      if (!pthread_equal (thread, stream->host_thread))
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } if (acc_async_test_all () != 0) { fprintf (stderr, "asynchronous operation not running\n"); abort (); } sleep ((int) (dtime / 1000.f) + 1); if (acc_async_test_all () != 1) { fprintf (stderr, "found asynchronous operation still running\n"); abort (); } acc_unmap_data (a); free (a); acc_free (d_a); acc_shutdown (acc_device_nvidia); exit (0); }
/* Build the OCaml block describing CUDA device number i.
 *
 * i : OCaml int, zero-based CUDA device ordinal.
 *
 * Raises the OCaml exception registered as "no_cuda_device" when i is not a
 * valid ordinal.  Calls wrapped in CUDA_CHECK_CALL are handled by that macro
 * (defined elsewhere); the remaining attribute queries are left unchecked,
 * as in the original code.
 *
 * Returned block (4 fields):
 *   0  general_info (9 fields): name, total memory, shared memory per block,
 *      clock rate, constant memory, multiprocessor count, ECC enabled,
 *      device index, pointer to a malloc'ed spoc_cu_context holding the new
 *      CUDA context and two streams
 *   1  cuda_info (1 field): specific_info (18 fields): compute capability
 *      major/minor, registers per block, SIMD width, memory pitch, max
 *      threads per block, max block dims, max grid dims, texture alignment,
 *      then ten attribute / driver-version values
 *   2  pointer to a malloc'ed spoc_cuda_gc_info
 *   3  event list pointer, initially NULL
 *
 * As a side effect a new CUDA context is created for the device and made
 * current on the calling thread.
 */
value spoc_getCudaDevice(value i)
{
	CAMLparam1(i);
	CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
	CAMLlocal3(device, maxT, maxG);
	/* Start at 0 so that a failing cuDeviceGetCount leads to
	 * "no_cuda_device" instead of a read of an uninitialised value. */
	int nb_devices = 0;
	CUdevprop dev_infos;
	CUdevice dev;
	CUcontext ctx;
	CUstream queue[2];
	spoc_cu_context *spoc_ctx;
	spoc_cuda_gc_info *gcInfo;
	cuda_event_list *events = NULL;
	/* CUcontext gl_ctx; */
	char infoStr[1024];
	int infoInt;
	size_t infoUInt;
	size_t totalMem;
	int major, minor;
	enum cudaError_enum cuda_error;

	cuDeviceGetCount(&nb_devices);

	/* Valid ordinals are 0 .. nb_devices-1.  The original test used '>',
	 * which let i == nb_devices (and negative values) through. */
	if (Int_val(i) < 0 || Int_val(i) >= nb_devices)
		raise_constant(*caml_named_value("no_cuda_device"));

	CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i)));
	CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

	/* ---- general_info ---- */
	general_info = caml_alloc(9, 0);
	CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));
	Store_field(general_info, 0, copy_string(infoStr));
	CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));
	Store_field(general_info, 1, Val_int(infoUInt));
	Store_field(general_info, 2, Val_int(dev_infos.sharedMemPerBlock));
	Store_field(general_info, 3, Val_int(dev_infos.clockRate));
	Store_field(general_info, 4, Val_int(dev_infos.totalConstantMemory));
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
	Store_field(general_info, 5, Val_int(infoInt));
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
	Store_field(general_info, 6, Val_bool(infoInt));
	Store_field(general_info, 7, i);

	/* Create the context and its two streams, and keep them in a C
	 * structure whose address is stored in field 8. */
	CUDA_CHECK_CALL(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, dev));
	/* Size the allocation from the pointee; the original used
	 * sizeof(spoc_cl_context), i.e. the size of a different structure. */
	spoc_ctx = malloc(sizeof(*spoc_ctx));
	spoc_ctx->ctx = ctx;
	CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
	CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
	spoc_ctx->queue[0] = queue[0];
	spoc_ctx->queue[1] = queue[1];
	Store_field(general_info, 8, (value)spoc_ctx);
	CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));

	/* ---- cuda_info / specific_info ---- */
	cuda_info = caml_alloc(1, 0); /* 0 -> Cuda */
	specific_info = caml_alloc(18, 0);
	cuDeviceComputeCapability(&major, &minor, dev);
	Store_field(specific_info, 0, Val_int(major));
	Store_field(specific_info, 1, Val_int(minor));
	Store_field(specific_info, 2, Val_int(dev_infos.regsPerBlock));
	Store_field(specific_info, 3, Val_int(dev_infos.SIMDWidth));
	Store_field(specific_info, 4, Val_int(dev_infos.memPitch));
	Store_field(specific_info, 5, Val_int(dev_infos.maxThreadsPerBlock));

	maxT = caml_alloc(3, 0);
	Store_field(maxT, 0, Val_int(dev_infos.maxThreadsDim[0]));
	Store_field(maxT, 1, Val_int(dev_infos.maxThreadsDim[1]));
	Store_field(maxT, 2, Val_int(dev_infos.maxThreadsDim[2]));
	Store_field(specific_info, 6, maxT);

	maxG = caml_alloc(3, 0);
	Store_field(maxG, 0, Val_int(dev_infos.maxGridSize[0]));
	Store_field(maxG, 1, Val_int(dev_infos.maxGridSize[1]));
	Store_field(maxG, 2, Val_int(dev_infos.maxGridSize[2]));
	Store_field(specific_info, 7, maxG);

	Store_field(specific_info, 8, Val_int(dev_infos.textureAlign));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
	Store_field(specific_info, 9, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
	Store_field(specific_info, 10, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
	Store_field(specific_info, 11, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
	Store_field(specific_info, 12, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
	Store_field(specific_info, 13, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
	Store_field(specific_info, 14, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
	Store_field(specific_info, 15, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
	Store_field(specific_info, 16, Val_int(infoInt));
	cuDriverGetVersion(&infoInt);
	Store_field(specific_info, 17, Val_int(infoInt));
	Store_field(cuda_info, 0, specific_info);

	/* ---- device ---- */
	device = caml_alloc(4, 0);
	Store_field(device, 0, general_info);
	Store_field(device, 1, cuda_info);

	gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info));
	/* cuMemGetInfo documents both arguments as outputs, so pass a real
	 * pointer for the total instead of NULL. */
	CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, &totalMem));
	/* NOTE(review): the free-memory figure minus 32 MiB is computed but not
	 * stored anywhere in this function, and gcInfo is handed over
	 * uninitialised -- confirm whether it should be recorded in gcInfo. */
	infoUInt -= (32*1024*1024);
	Store_field(device, 2, (value)gcInfo);

	Store_field(device, 3, (value)events);

	CAMLreturn(device);
}
// Sets up an NVENC H.264 encoding session whose input is a CUDA device buffer.
//
// nCodecInstanceId  Only used to build the optional output file name
//                   "nvenc-<id>.h264".
// CodecContext      Down-cast (unchecked static_cast) to
//                   CNvidiaNvencCodecContext; supplies width, height, fps,
//                   frame count and the buffer/file options read below.
//
// Steps, in order: load nvEncodeAPI64.dll and fetch the NVENC function list,
// create a CUDA context on device 0, allocate the NV12 (and, depending on the
// options, RGB32 / page-locked) buffers, create a CUDA stream, open and
// initialise the encoder, register and map the NV12 device buffer as encoder
// input, create the output bitstream buffer and optionally open the output file.
//
// Throws std::runtime_error if the NVENC library or its entry point cannot be
// loaded, or if the output file cannot be opened. CHECK_CUDA_DRV_STATUS and
// CHECK_NVENC_STATUS are defined elsewhere; how they report a failure is not
// visible here.
CNvidiaNvencCodec(DWORD nCodecInstanceId, const CCodecContextBase& CodecContext)
    : m_NvidiaNvencCodecContext(static_cast<const CNvidiaNvencCodecContext&>(CodecContext)),
      m_hNvEncodeAPI64(LoadLibraryA("nvEncodeAPI64.dll"))
{
    // Without these two checks a missing driver DLL ends in a call through a null pointer.
    if (m_hNvEncodeAPI64 == NULL)
    {
        throw std::runtime_error("could not load nvEncodeAPI64.dll!");
    }

    PNVENCODEAPICREATEINSTANCE pNvEncodeAPICreateInstance =
        reinterpret_cast<PNVENCODEAPICREATEINSTANCE>(GetProcAddress(m_hNvEncodeAPI64, "NvEncodeAPICreateInstance"));
    if (pNvEncodeAPICreateInstance == NULL)
    {
        throw std::runtime_error("could not find NvEncodeAPICreateInstance in nvEncodeAPI64.dll!");
    }

    memset(&m_FunctionList, 0, sizeof(m_FunctionList));
    m_FunctionList.version = NV_ENCODE_API_FUNCTION_LIST_VER;
    // The status used to be stored and ignored; check it like every other NVENC call here.
    CHECK_NVENC_STATUS(pNvEncodeAPICreateInstance(&m_FunctionList));

    CHECK_CUDA_DRV_STATUS(cuCtxCreate(&m_Context, 0, 0));

    // Frame dimensions are used throughout; read them once.
    // NOTE(review): assumes GetWidth()/GetHeight() are plain getters without side effects.
    const auto nWidth = m_NvidiaNvencCodecContext.GetWidth();
    const auto nHeight = m_NvidiaNvencCodecContext.GetHeight();

    if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda())
    {
        // Colour conversion runs on the host via swscale: tightly packed NV12 on the device
        // plus a page-locked host staging buffer of the same size (luma plane followed by
        // the interleaved chroma plane).
        CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pNv12Buffer, nWidth * nHeight * 3 / 2));
        m_nNv12BufferPitch = nWidth;

        CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedNv12Buffer, nWidth * nHeight * 3 / 2));

        m_pNv12Planes[0] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer);
        m_pNv12Planes[1] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer) + nWidth * nHeight;
        m_pNv12Strides[0] = nWidth;
        m_pNv12Strides[1] = nWidth;

        m_SwscaleContext = sws_getContext(nWidth, nHeight, AV_PIX_FMT_BGR32,
                                          nWidth, nHeight, AV_PIX_FMT_NV12,
                                          0, 0, 0, 0);
    }
    else
    {
        // Colour conversion runs on the device: pitched NV12 target plus an RGB32 source
        // buffer, optionally with a page-locked host staging buffer.
        CHECK_CUDA_DRV_STATUS(cuMemAllocPitch(&m_pNv12Buffer, &m_nNv12BufferPitch, nWidth, nHeight * 3 / 2, 16));

        if (m_NvidiaNvencCodecContext.GetUsePageLockedIntermediateBuffer())
        {
            CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedRgb32Buffer, nWidth * nHeight * 4));
        }

        CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pRgb32Buffer, nWidth * nHeight * 4));
    }

    CHECK_CUDA_DRV_STATUS(cuStreamCreate(&m_Stream, 0));

    // Open the encode session on the CUDA context created above.
    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS SessionParameters;
    memset(&SessionParameters, 0, sizeof(SessionParameters));
    SessionParameters.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
    SessionParameters.apiVersion = NVENCAPI_VERSION;
    SessionParameters.device = m_Context;
    SessionParameters.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
    // Previously unchecked; a failure here leaves m_pEncoder unusable for everything below.
    CHECK_NVENC_STATUS(m_FunctionList.nvEncOpenEncodeSessionEx(&SessionParameters, &m_pEncoder));

    // Start from the HP preset and override what this codec needs.
    auto PresetGuid = NV_ENC_PRESET_HP_GUID;

    NV_ENC_PRESET_CONFIG PresetConfiguration = { NV_ENC_PRESET_CONFIG_VER, 0 };
    PresetConfiguration.presetCfg.version = NV_ENC_CONFIG_VER;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncGetEncodePresetConfig(m_pEncoder, NV_ENC_CODEC_H264_GUID, PresetGuid, &PresetConfiguration));

    NV_ENC_CONFIG EncoderConfiguration = { NV_ENC_CONFIG_VER, 0 };
    EncoderConfiguration = PresetConfiguration.presetCfg;
    EncoderConfiguration.gopLength = NVENC_INFINITE_GOPLENGTH;
    EncoderConfiguration.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
    EncoderConfiguration.frameIntervalP = 1; // No B frames
    EncoderConfiguration.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
    EncoderConfiguration.encodeCodecConfig.h264Config.idrPeriod = m_NvidiaNvencCodecContext.GetFrameCount();
    EncoderConfiguration.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
    EncoderConfiguration.encodeCodecConfig.h264Config.sliceMode = 0;
    EncoderConfiguration.encodeCodecConfig.h264Config.sliceModeData = 0;

    NV_ENC_INITIALIZE_PARAMS InitializationParameters = { NV_ENC_INITIALIZE_PARAMS_VER, 0 };
    InitializationParameters.encodeGUID = NV_ENC_CODEC_H264_GUID;
    InitializationParameters.presetGUID = PresetGuid;
    InitializationParameters.frameRateNum = m_NvidiaNvencCodecContext.GetFps();
    InitializationParameters.frameRateDen = 1;
#ifdef ASYNCHRONOUS
    InitializationParameters.enableEncodeAsync = 1;
#else
    InitializationParameters.enableEncodeAsync = 0;
#endif
    InitializationParameters.enablePTD = 1; // Let the encoder decide the picture type
    InitializationParameters.reportSliceOffsets = 0;
    InitializationParameters.maxEncodeWidth = nWidth;
    InitializationParameters.maxEncodeHeight = nHeight;
    InitializationParameters.encodeConfig = &EncoderConfiguration;
    InitializationParameters.encodeWidth = nWidth;
    InitializationParameters.encodeHeight = nHeight;
    InitializationParameters.darWidth = 16;
    InitializationParameters.darHeight = 9;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncInitializeEncoder(m_pEncoder, &InitializationParameters));

    // Picture parameters that are known ahead of encoding
    m_PictureParameters = { NV_ENC_PIC_PARAMS_VER, 0 };
    m_PictureParameters.codecPicParams.h264PicParams.sliceMode = 0;
    m_PictureParameters.codecPicParams.h264PicParams.sliceModeData = 0;
    m_PictureParameters.inputWidth = nWidth;
    m_PictureParameters.inputHeight = nHeight;
    m_PictureParameters.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
    m_PictureParameters.inputPitch = static_cast<uint32_t>(m_nNv12BufferPitch);
    m_PictureParameters.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;

#ifdef ASYNCHRONOUS
    // Auto-reset event signalled by the encoder when a frame has been encoded.
    m_hCompletionEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
    m_EventParameters = { NV_ENC_EVENT_PARAMS_VER, 0 };
    m_EventParameters.completionEvent = m_hCompletionEvent;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterAsyncEvent(m_pEncoder, &m_EventParameters));
    m_PictureParameters.completionEvent = m_hCompletionEvent;
#endif

    // Register CUDA input pointer
    NV_ENC_REGISTER_RESOURCE RegisterResource = {
        NV_ENC_REGISTER_RESOURCE_VER,
        NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR,
        nWidth,
        nHeight,
        static_cast<uint32_t>(m_nNv12BufferPitch),
        0,
        reinterpret_cast<void*>(m_pNv12Buffer),
        NULL,
        NV_ENC_BUFFER_FORMAT_NV12_PL
    };
    CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterResource(m_pEncoder, &RegisterResource));

    // Map the registered buffer once; the mapped handle is reused for every frame.
    NV_ENC_MAP_INPUT_RESOURCE MapInputResource = { NV_ENC_MAP_INPUT_RESOURCE_VER, 0, 0, RegisterResource.registeredResource };
    m_pRegisteredResource = RegisterResource.registeredResource;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncMapInputResource(m_pEncoder, &MapInputResource));
    m_PictureParameters.inputBuffer = MapInputResource.mappedResource;

    // Create output bitstream buffer
    m_nOutputBitstreamSize = 2 * 1024 * 1024;
    NV_ENC_CREATE_BITSTREAM_BUFFER CreateBitstreamBuffer = { NV_ENC_CREATE_BITSTREAM_BUFFER_VER, m_nOutputBitstreamSize, NV_ENC_MEMORY_HEAP_AUTOSELECT, 0 };
    CHECK_NVENC_STATUS(m_FunctionList.nvEncCreateBitstreamBuffer(m_pEncoder, &CreateBitstreamBuffer));
    m_pOutputBitstream = CreateBitstreamBuffer.bitstreamBuffer;
    m_PictureParameters.outputBitstream = m_pOutputBitstream;

    if (m_NvidiaNvencCodecContext.GetSaveOutputToFile())
    {
        char pOutputFilename[MAX_PATH];
        // "%d" expects an int; DWORD is an unsigned long, so convert explicitly.
        sprintf_s(pOutputFilename, "nvenc-%d.h264", static_cast<int>(nCodecInstanceId));
        if (fopen_s(&m_pOutputFile, pOutputFilename, "wb") != 0)
        {
            throw std::runtime_error(std::string("could not open ").append(pOutputFilename).append(" for writing!"));
        }
    }
}
/*
 * Virtqueue handler: executes CUDA driver API calls on behalf of the guest.
 *
 * For every element popped from vq, the first "out" buffer is interpreted as
 * a struct param.  p->syscall_type selects the driver routine, the remaining
 * fields of *p (and, for some calls, further out_sg/in_sg buffers) carry the
 * arguments, and the outcome is written to p->result.  The element is then
 * pushed back exactly once and, after the queue has been drained, the
 * frontend is notified.
 *
 * p->result is either the CUresult of the driver call or one of the local
 * error markers used in this function: 999 (context switch, environment or
 * allocation failure) and 9999 (argument array allocation failure in
 * CULAUNCHKERNEL).
 *
 * NOTE(review): every value in *p comes from the guest.  Buffer indexes
 * derived from p->nr_pages / p->val1 and the offset p->length are used
 * without being checked against the number or size of the supplied
 * buffers -- confirm that the frontend guarantees them.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        /* for all library routines: get required arguments from buffer,
         * execute, and push results back in virtqueue */
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            /* hardcoded path - needs improvement
             * all .cubin files should be stored in $QEMU_NFS_PATH - currently
             * $QEMU_NFS_PATH is shared between host and guest with NFS */
            const char *nfs_path = getenv("QEMU_NFS_PATH");
            const char *relname = (const char *)elem.out_sg[1].iov_base;
            char *binname;

            /* getenv() returns NULL when the variable is unset. */
            if (!nfs_path) {
                p->result = 999;
                break;
            }

            /* Room for both strings plus the terminating NUL.  The original
             * wrote strlen(getenv(..)+1), which under-allocated by two bytes. */
            binname = malloc(strlen(nfs_path) + strlen(relname) + 1);
            if (!binname) {
                /* Report a failure (the original stored 0, i.e. success) and
                 * let the single virtqueue_push() below return the element;
                 * pushing here as well returned it twice. */
                p->result = 999;
                break;
            }
            strcpy(binname, nfs_path);
            /* NOTE(review): relname is guest-controlled and appended as-is,
             * so "../" components can leave $QEMU_NFS_PATH -- confirm
             * whether this needs validating. */
            strcat(binname, relname);

            /* change current CUDA context
             * each CUDA context has its own virtual memory space - isolation
             * is ensured by switching contexts */
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(binname);
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            /* Pass the guest's string straight through.  The original copied
             * it with strcpy() into a fixed 100-byte heap buffer (overflow
             * for longer names) that was never freed. */
            const char *name = (const char *)elem.out_sg[1].iov_base;

            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, name);
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        /* large buffers are allocated in smaller chunks in guest kernel space
         * gets each chunk separately and copies it to device memory;
         * buffers come in pairs: data at out_sg[1+2*i], its size at out_sg[1+2*i+1] */
        case CUMEMCPYHTOD: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        /* device-to-host copies use the same pairing on the "in" side:
         * destination at in_sg[2*i], its size at in_sg[2*i+1] */
        case CUMEMCPYDTOH: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;

            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            char *name = (char *)elem.out_sg[1].iov_base;

            /* Terminate the name in place at the guest-supplied length. */
            name[p->length] = '\0';
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL: {
            int i;
            /* One pointer per kernel argument; argument i lives in out_sg[1+i]. */
            void **args = malloc(p->val1*sizeof(void *));

            if (!args) {
                p->result = 9999;
                break;
            }
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1+i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                /* Do not leak the argument array on this error path. */
                free(args);
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }

        /* Hand the element back to the guest -- exactly once per request. */
        virtqueue_push(vq, &elem, 0);
    }

    /* notify frontend - trigger virtual interrupt */
    virtio_notify(vdev, vq);
    return;
}
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float atime, dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } acc_set_cuda_stream (0, stream); init_timers (1); start_timer (0); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } acc_wait (1); atime = stop_timer (0); if (atime < dtime) { fprintf (stderr, "actual time < delay time\n"); abort (); } start_timer (0); acc_wait (1); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time < delay time\n"); abort (); } acc_unmap_data (a); fini_timers (); free (a); acc_free (d_a); 
acc_shutdown (acc_device_nvidia); return 0; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay2; CUmodule module; CUresult r; int N; int i; CUstream *streams; unsigned long **a, **d_a, *tid, ticks; int nbytes; void *kargs[3]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay2, module, "delay2"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = sizeof (int); ticks = (unsigned long) (200.0 * clkrate); N = nprocs; streams = (CUstream *) malloc (N * sizeof (void *)); a = (unsigned long **) malloc (N * sizeof (unsigned long *)); d_a = (unsigned long **) malloc (N * sizeof (unsigned long *)); tid = (unsigned long *) malloc (N * sizeof (unsigned long)); for (i = 0; i < N; i++) { a[i] = (unsigned long *) malloc (sizeof (unsigned long)); *a[i] = N; d_a[i] = (unsigned long *) acc_malloc (nbytes); tid[i] = i; acc_map_data (a[i], d_a[i], nbytes); streams[i] = (CUstream) acc_get_cuda_stream (i); if (streams[i] != NULL) abort (); r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (i, streams[i])) abort (); } for (i = 0; i < N; i++) { kargs[0] = (void *) &d_a[i]; kargs[1] = (void *) &ticks; kargs[2] = (void *) &tid[i]; r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, 
streams[i], kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } ticks = (unsigned long) (50.0 * clkrate); } acc_wait_all_async (0); for (i = 0; i < N; i++) { acc_copyout (a[i], nbytes); if (*a[i] != i) abort (); } free (streams); for (i = 0; i < N; i++) { free (a[i]); } free (a); free (d_a); free (tid); acc_shutdown (acc_device_nvidia); exit (0); }