예제 #1
// Makes sure the D3D9 objects and the CUDA graphics resource used for interop
// exist for the requested sizes (w, h, W, H).
// Returns true when the cached entry already matches, or after the final
// registration call; returns false when ensureD3DDevice() fails. The
// CUDA_ENSURE / DX_ENSURE macros receive `false` as their last argument —
// presumably the value returned on failure; confirm against the macro definitions.
// NOTE(review): this listing appears to have lost its brace-only lines and some
// other lines (there is an #endif without a visible #if, and the beginning of
// the texture-creation call is missing), so the nesting below is not reliable.
bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H)
    TexRes &r = res[0];// 1 NV12 texture
    // Fast path: the cached sizes equal the arguments and a CUDA resource is
    // already registered, so there is nothing to do.
    if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes)
        return true;
    // No CUDA context yet: create one on top of the D3D9 device.
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        if (!ensureD3DDevice())
            return false;
        // CUdevice is different from decoder's
        CUDA_ENSURE(cuD3D9CtxCreate(&ctx, &dev, CU_CTX_SCHED_BLOCKING_SYNC, device9), false);
        // One stream per entry of res[]; failures here only go through CUDA_WARN.
        // NOTE(review): the matching #if for the #endif below is not visible here.
        CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
        CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
#endif //USE_STREAM
        qDebug("cuda contex on gl thread: %p", ctx);
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    // Drop any previous CUDA registration before registering again below.
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;

    // create d3d resource for interop
    if (!surface9_nv12) {
        // TODO: need pitch from cuvid to ensure cuMemcpy2D can copy the whole pitch
        // NOTE(review): the opening of the call that the following argument list
        // belongs to is missing from this listing; it writes to texture9_nv12.
                                         //, H
                                         , H*3/2
                                         , 1
                                         , D3DUSAGE_DYNAMIC //D3DUSAGE_DYNAMIC is lockable // 0 is from NV example. cudaD3D9.h says The primary rendertarget may not be registered with CUDA. So can not be D3DUSAGE_RENDERTARGET?
                                         //, D3DUSAGE_RENDERTARGET
                                         , D3DFMT_L8
                                         //, (D3DFORMAT)MAKEFOURCC('N','V','1','2') // can not create nv12. use 2 textures L8+A8L8?
                                         , D3DPOOL_DEFAULT // must be D3DPOOL_DEFAULT for cuda?
                                         , &texture9_nv12
                                         , NULL) // - Resources allocated as shared may not be registered with CUDA.
                  , false);
        // Offscreen W x H surface created with the 'NV12' FOURCC format.
        DX_ENSURE(device9->CreateOffscreenPlainSurface(W, H, (D3DFORMAT)MAKEFOURCC('N','V','1','2'), D3DPOOL_DEFAULT, &surface9_nv12, NULL), false);

    // TODO: cudaD3D9.h says NV12 is not supported
    // CUDA_ERROR_INVALID_HANDLE if register D3D9 surface
    // TODO: why flag CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD is invalid while it's fine for opengl
    // Register the L8 texture (not the NV12 surface) with CUDA and keep the handle in r.cuRes.
    CUDA_ENSURE(cuGraphicsD3D9RegisterResource(&r.cuRes, texture9_nv12, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    return true;
예제 #2
// Picks a CUDA device, creates the CUDA context (with or without D3D9 interop),
// loads the NV12-to-ARGB PTX module, looks up its two kernels and allocates the
// frame buffers.
//   argc/argv   - forwarded to the command-line helper functions.
//   bUseInterop - non-zero selects the graphics-interop code paths.
//   bTCC        - only read in the device-selection condition below.
// Returns E_FAIL from the catch handler when the PTX module cannot be loaded;
// otherwise S_OK when g_pCudaModule, g_pVideoDecoder and one of
// g_pImageDX / g_pInteropFrame[0] are non-null, else E_FAIL.
// NOTE(review): this listing appears to have lost its brace-only lines as well
// as the `else` and `try` lines, so which statements belong to which branch
// cannot be determined from here — confirm against the original source.
HRESULT initCudaResources(int argc, char **argv, int bUseInterop, int bTCC)
    // hr is not referenced again in the visible lines.
    HRESULT hr = S_OK;

    CUdevice cuda_device;

    // An explicit "device" command-line flag selects the device index directly.
    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        cuda_device = getCmdLineArgumentInt(argc, (const char **) argv, "device");

        // If interop is disabled, then we need to create a CUDA context w/o the GL context
        // NOTE(review): the two assignments below were presumably the two arms of
        // an if/else whose `else` line is missing here.
        if (bUseInterop && !bTCC)
            cuda_device = findCudaDeviceDRV(argc, (const char **)argv);
            cuda_device = findCudaGLDeviceDRV(argc, (const char **)argv);

        if (cuda_device < 0)
            printf("No CUDA Capable devices found, exiting...\n");

        checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
        // If we want to use Graphics Interop, then choose the GPU that is capable
        // NOTE(review): likewise presumably an if/else pair (GL-capable vs. any device).
        if (bUseInterop)
            cuda_device = gpuGetMaxGflopsGLDeviceIdDRV();
            checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
            cuda_device = gpuGetMaxGflopsDeviceIdDRV();
            checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));

    // get compute capabilities and the devicename
    int major, minor;
    size_t totalGlobalMem;
    char deviceName[256];
    checkCudaErrors(cuDeviceComputeCapability(&major, &minor, g_oDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, 256, g_oDevice));
    printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);

    // Report total device memory, converted from bytes to MB for printing.
    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, g_oDevice));
    printf("  Total amount of global memory:     %4.4f MB\n", (float)totalGlobalMem/(1024*1024));

    // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
    // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
    if (bUseInterop)
        checkCudaErrors(cuD3D9CtxCreate(&g_oContext, &g_oDevice, CU_CTX_BLOCKING_SYNC, g_pD3DDevice));
        checkCudaErrors(cuCtxCreate(&g_oContext, CU_CTX_BLOCKING_SYNC, g_oDevice));

        // Initialize CUDA related Driver API (32-bit or 64-bit), depending the platform running
        // Pointer size decides which PTX file name (Win32 vs. x64) is passed to the loader.
        if (sizeof(void *) == 4)
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", exec_path, 2, 2, 2);
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", exec_path, 2, 2, 2);
    // NOTE(review): the `try` that this handler belongs to is not visible in this listing.
    catch (char const *p_file)
        // If the CUmoduleManager constructor fails to load the PTX file, it will throw an exception
        printf("\n>> CUmoduleManager::Exception!  %s not found!\n", p_file);
        printf(">> Please rebuild NV12ToARGB_drvapi.cu or re-install this sample.\n");
        return E_FAIL;

    // Look up the two kernels by name; the handles are stored in the globals.
    g_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi",   &gfpNV12toARGB);
    g_pCudaModule->GetCudaFunction("Passthru_drvapi",     &gfpPassthru);

    // Now we create the CUDA resources and the CUDA decoder context

    // Two device buffers of targetWidth * targetHeight * 2 bytes each.
    // NOTE(review): which branch these allocations belong to is unclear from
    // this listing.
    if (bUseInterop)
        checkCudaErrors(cuMemAlloc(&g_pInteropFrame[0], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));
        checkCudaErrors(cuMemAlloc(&g_pInteropFrame[1], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));

    // Pop the current context off this thread; a failure is only printed.
    CUcontext cuCurrent = NULL;
    CUresult result = cuCtxPopCurrent(&cuCurrent);

    if (result != CUDA_SUCCESS)
        printf("cuCtxPopCurrent: %d\n", result);

    return ((g_pCudaModule && g_pVideoDecoder && (g_pImageDX || g_pInteropFrame[0])) ? S_OK : E_FAIL);
	// Picks a CUDA device, creates the CUDA context (with or without D3D9
	// interop), loads the NV12-to-ARGB PTX module from "./", looks up its two
	// kernels and, in the non-interop branch, allocates two frame buffers.
	//   bUseInterop - non-zero selects the graphics-interop code paths.
	//   bTCC        - not referenced in the visible lines.
	// Returns S_OK when m_pCudaModule and m_pVideoDecoder are non-null, else E_FAIL.
	// NOTE(review): closing-brace lines appear to be missing from this listing,
	// so block ends are not shown — confirm against the original source.
	HRESULT CudaVideoRender::initCudaResources(int bUseInterop, int bTCC)
		// hr is not referenced again in the visible lines.
		HRESULT hr = S_OK;

		CUdevice cuda_device;
			// If we want to use Graphics Interop, then choose the GPU that is capable
			if (bUseInterop) {
				cuda_device = cutilDrvGetMaxGflopsGraphicsDeviceId();
				cutilDrvSafeCallNoSync(cuDeviceGet(&m_cuDevice, cuda_device ));
			} else {
				cuda_device = cutilDrvGetMaxGflopsDeviceId();
				cutilDrvSafeCallNoSync(cuDeviceGet(&m_cuDevice, cuda_device ));

		// get compute capabilities and the devicename
		int major, minor;
		size_t totalGlobalMem;
		char deviceName[256];
		cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, m_cuDevice) );
		cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, m_cuDevice) );
		printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);

		// Report total device memory, converted from bytes to MB for printing.
		cutilDrvSafeCallNoSync( cuDeviceTotalMem(&totalGlobalMem, m_cuDevice) );
		printf("  Total amount of global memory:     %4.4f MB\n", (float)totalGlobalMem/(1024*1024) );

		// Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
		// (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
		if (bUseInterop) {
			cutilDrvSafeCallNoSync( cuD3D9CtxCreate(&m_cuContext, &m_cuDevice, CU_CTX_BLOCKING_SYNC, m_pRenderer9->getDevice()) );
		} else {
			cutilDrvSafeCallNoSync( cuCtxCreate(&m_cuContext, CU_CTX_BLOCKING_SYNC, m_cuDevice) );

		// Initialize CUDA related Driver API (32-bit or 64-bit), depending the platform running
		// Pointer size decides which PTX file name (Win32 vs. x64) is passed to the loader.
		if (sizeof(void *) == 4) {
			m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", "./", 2, 2, 2);
		} else {
			m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", "./", 2, 2, 2);

		// Look up the two kernels by name; the handles are stored in members.
		m_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi",   &m_fpNV12toARGB);
		m_pCudaModule->GetCudaFunction("Passthru_drvapi",     &m_fpPassthru);

		// Now we create the CUDA resources and the CUDA decoder context

		// The interop branch is empty here (its surface setup is commented out);
		// the other branch allocates two buffers of targetWidth * targetHeight * 2 bytes.
		if (bUseInterop) {
			//initD3D9Surface   ( m_pVideoDecoder->targetWidth(),
			//					m_pVideoDecoder->targetHeight() );
		} else {
			cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[0], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) );
			cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[1], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) );

		// Pop the current context off this thread; a failure is only printed.
		CUcontext cuCurrent = NULL;
		CUresult result = cuCtxPopCurrent(&cuCurrent);
		if (result != CUDA_SUCCESS) {
			printf("cuCtxPopCurrent: %d\n", result);

		return ((m_pCudaModule && m_pVideoDecoder) ? S_OK : E_FAIL);