bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H) { TexRes &r = res[0];// 1 NV12 texture if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes) return true; if (!ctx) { // TODO: how to use pop/push decoder's context without the context in opengl context if (!ensureD3DDevice()) return false; // CUdevice is different from decoder's CUDA_ENSURE(cuD3D9CtxCreate(&ctx, &dev, CU_CTX_SCHED_BLOCKING_SYNC, device9), false); #if USE_STREAM CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT)); CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT)); #endif //USE_STREAM qDebug("cuda contex on gl thread: %p", ctx); CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this } if (r.cuRes) { CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false); r.cuRes = NULL; } // create d3d resource for interop if (!surface9_nv12) { // TODO: need pitch from cuvid to ensure cuMemcpy2D can copy the whole pitch DX_ENSURE(device9->CreateTexture(W //, H , H*3/2 , 1 , D3DUSAGE_DYNAMIC //D3DUSAGE_DYNAMIC is lockable // 0 is from NV example. cudaD3D9.h says The primary rendertarget may not be registered with CUDA. So can not be D3DUSAGE_RENDERTARGET? //, D3DUSAGE_RENDERTARGET , D3DFMT_L8 //, (D3DFORMAT)MAKEFOURCC('N','V','1','2') // can not create nv12. use 2 textures L8+A8L8? , D3DPOOL_DEFAULT // must be D3DPOOL_DEFAULT for cuda? , &texture9_nv12 , NULL) // - Resources allocated as shared may not be registered with CUDA. , false); DX_ENSURE(device9->CreateOffscreenPlainSurface(W, H, (D3DFORMAT)MAKEFOURCC('N','V','1','2'), D3DPOOL_DEFAULT, &surface9_nv12, NULL), false); } // TODO: cudaD3D9.h says NV12 is not supported // CUDA_ERROR_INVALID_HANDLE if register D3D9 surface // TODO: why flag CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD is invalid while it's fine for opengl CUDA_ENSURE(cuGraphicsD3D9RegisterResource(&r.cuRes, texture9_nv12, CU_GRAPHICS_REGISTER_FLAGS_NONE), false); return true; }
// Picks a CUDA device, creates the CUDA context (with D3D9 interop or plain),
// loads the NV12->ARGB PTX module, creates the video decoder and the frame
// storage, then pops the context off the calling thread.
//
// argc/argv    command line; consulted for a "device" flag and forwarded to
//              the device-selection helpers.
// bUseInterop  non-zero: context is created with cuD3D9CtxCreate on
//              g_pD3DDevice and initD3D9Surface() is called;
//              zero: cuCtxCreate and two cuMemAlloc'ed frames.
// bTCC         only consulted when the "device" flag is present.
//
// Returns S_OK when the module, the decoder and one of
// g_pImageDX / g_pInteropFrame[0] are set. Returns E_FAIL otherwise, and also
// when the PTX module cannot be loaded or cuCtxPopCurrent fails.
HRESULT initCudaResources(int argc, char **argv, int bUseInterop, int bTCC)
{
    CUdevice cuda_device = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        // The value of the "device" argument is not read here: both helpers
        // below receive argc/argv and their result was always what ended up in
        // cuda_device (presumably they parse the flag themselves - confirm).
        //
        // NOTE(review): the original comment says "If interop is disabled, then
        // we need to create a CUDA context w/o the GL context", yet the non-GL
        // finder is used when interop is ENABLED (and TCC is off). The
        // condition is kept unchanged; confirm which one is intended.
        if (bUseInterop && !bTCC)
        {
            cuda_device = findCudaDeviceDRV(argc, (const char **)argv);
        }
        else
        {
            cuda_device = findCudaGLDeviceDRV(argc, (const char **)argv);
        }

        if (cuda_device < 0)
        {
            printf("No CUDA Capable devices found, exiting...\n");
            // NOTE(review): exits with a success status even though no device
            // was found - presumably deliberate; confirm.
            exit(EXIT_SUCCESS);
        }
    }
    else
    {
        // If we want to use Graphics Interop, then choose the GPU that is capable
        cuda_device = bUseInterop ? gpuGetMaxGflopsGLDeviceIdDRV()
                                  : gpuGetMaxGflopsDeviceIdDRV();
    }

    checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));

    // get compute capabilities and the devicename
    int major = 0;
    int minor = 0;
    size_t totalGlobalMem = 0;
    char deviceName[256] = { 0 };

    checkCudaErrors(cuDeviceComputeCapability(&major, &minor, g_oDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, 256, g_oDevice));
    printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, g_oDevice));
    printf(" Total amount of global memory: %4.4f MB\n", (float)totalGlobalMem/(1024*1024));

    // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
    // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
    if (bUseInterop)
    {
        checkCudaErrors(cuD3D9CtxCreate(&g_oContext, &g_oDevice, CU_CTX_BLOCKING_SYNC, g_pD3DDevice));
    }
    else
    {
        checkCudaErrors(cuCtxCreate(&g_oContext, CU_CTX_BLOCKING_SYNC, g_oDevice));
    }

    try
    {
        // Initialize CUDA related Driver API (32-bit or 64-bit), depending the platform running
        if (sizeof(void *) == 4)
        {
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", exec_path, 2, 2, 2);
        }
        else
        {
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", exec_path, 2, 2, 2);
        }
    }
    catch (char const *p_file)
    {
        // If the CUmoduleManager constructor fails to load the PTX file, it will throw an exception
        printf("\n>> CUmoduleManager::Exception! %s not found!\n", p_file);
        printf(">> Please rebuild NV12ToARGB_drvapi.cu or re-install this sample.\n");
        return E_FAIL;
    }

    g_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi", &gfpNV12toARGB);
    g_pCudaModule->GetCudaFunction("Passthru_drvapi", &gfpPassthru);

    // Now we create the CUDA resources and the CUDA decoder context
    initCudaVideo();

    // The return expression below treats a null g_pVideoDecoder as a possible
    // outcome, so it must not be dereferenced unconditionally here. When it is
    // null the frame storage is skipped and the function ends up returning
    // E_FAIL after the context has been popped.
    if (g_pVideoDecoder)
    {
        if (bUseInterop)
        {
            initD3D9Surface(g_pVideoDecoder->targetWidth(),
                            g_pVideoDecoder->targetHeight());
        }
        else
        {
            checkCudaErrors(cuMemAlloc(&g_pInteropFrame[0], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));
            checkCudaErrors(cuMemAlloc(&g_pInteropFrame[1], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));
        }
    }

    // Detach the context from the calling thread.
    CUcontext cuCurrent = NULL;
    CUresult result = cuCtxPopCurrent(&cuCurrent);

    if (result != CUDA_SUCCESS)
    {
        printf("cuCtxPopCurrent: %d\n", result);
        assert(0);
        // assert() compiles out under NDEBUG; do not report success after a
        // failed pop.
        return E_FAIL;
    }

    return ((g_pCudaModule && g_pVideoDecoder && (g_pImageDX || g_pInteropFrame[0])) ? S_OK : E_FAIL);
}
// Picks the CUDA device, creates the CUDA context (with D3D9 interop on the
// renderer's device, or plain), loads the NV12->ARGB PTX module from "./",
// creates the video decoder, allocates the non-interop frame buffers and pops
// the context off the calling thread.
//
// bUseInterop  non-zero: graphics-capable device + cuD3D9CtxCreate; no frame
//              storage is allocated here (the initD3D9Surface call is
//              disabled in this function).
//              zero: cuCtxCreate + two cuMemAlloc'ed frames.
// bTCC         not used by this function.
//
// Returns S_OK when both the module and the decoder are set. Returns E_FAIL
// otherwise, and also when the PTX module cannot be loaded or cuCtxPopCurrent
// fails.
HRESULT CudaVideoRender::initCudaResources(int bUseInterop, int bTCC)
{
    // If we want to use Graphics Interop, then choose the GPU that is capable
    const CUdevice cuda_device = bUseInterop ? cutilDrvGetMaxGflopsGraphicsDeviceId()
                                             : cutilDrvGetMaxGflopsDeviceId();
    cutilDrvSafeCallNoSync( cuDeviceGet(&m_cuDevice, cuda_device) );

    // get compute capabilities and the devicename
    int major = 0;
    int minor = 0;
    size_t totalGlobalMem = 0;
    char deviceName[256] = { 0 };

    cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, m_cuDevice) );
    cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, m_cuDevice) );
    printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);

    cutilDrvSafeCallNoSync( cuDeviceTotalMem(&totalGlobalMem, m_cuDevice) );
    printf(" Total amount of global memory: %4.4f MB\n", (float)totalGlobalMem/(1024*1024) );

    // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
    // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
    if (bUseInterop)
    {
        cutilDrvSafeCallNoSync( cuD3D9CtxCreate(&m_cuContext, &m_cuDevice, CU_CTX_BLOCKING_SYNC, m_pRenderer9->getDevice()) );
    }
    else
    {
        cutilDrvSafeCallNoSync( cuCtxCreate(&m_cuContext, CU_CTX_BLOCKING_SYNC, m_cuDevice) );
    }

    try
    {
        // Initialize CUDA related Driver API (32-bit or 64-bit), depending the platform running
        if (sizeof(void *) == 4)
        {
            m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", "./", 2, 2, 2);
        }
        else
        {
            m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", "./", 2, 2, 2);
        }
    }
    catch (char const *p_file)
    {
        // The free-function initCudaResources in this file handles a
        // `char const *` thrown by the CUmoduleManager constructor when the
        // PTX file cannot be loaded; the same handling is applied here
        // (assumes the same CUmoduleManager class - confirm).
        printf("\n>> CUmoduleManager::Exception! %s not found!\n", p_file);
        printf(">> Please rebuild NV12ToARGB_drvapi.cu or re-install this sample.\n");
        return E_FAIL;
    }

    m_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi", &m_fpNV12toARGB);
    m_pCudaModule->GetCudaFunction("Passthru_drvapi", &m_fpPassthru);

    // Now we create the CUDA resources and the CUDA decoder context
    initCudaVideo();

    // Interop mode: nothing to allocate here (disabled call, kept for reference):
    //   initD3D9Surface( m_pVideoDecoder->targetWidth(),
    //                    m_pVideoDecoder->targetHeight() );
    //
    // The return expression below treats a null m_pVideoDecoder as a possible
    // outcome, so it is checked before being dereferenced.
    if (!bUseInterop && m_pVideoDecoder)
    {
        cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[0], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) );
        cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[1], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) );
    }

    // Detach the context from the calling thread.
    CUcontext cuCurrent = NULL;
    CUresult result = cuCtxPopCurrent(&cuCurrent);

    if (result != CUDA_SUCCESS)
    {
        printf("cuCtxPopCurrent: %d\n", result);
        assert(0);
        // assert() compiles out under NDEBUG; do not report success after a
        // failed pop.
        return E_FAIL;
    }

    return ((m_pCudaModule && m_pVideoDecoder) ? S_OK : E_FAIL);
}