void CudaVideoRender::initCudaVideo( ) { // bind the context lock to the CUDA context CUresult result = cuvidCtxLockCreate(&m_CtxLock, m_cuContext); if (result != CUDA_SUCCESS) { printf("cuvidCtxLockCreate failed: %d\n", result); assert(0); } std::auto_ptr<VideoDecoder> apVideoDecoder(new VideoDecoder(m_pVideoSource->format(), m_cuContext, m_eVideoCreateFlags, m_CtxLock)); std::auto_ptr<VideoParser> apVideoParser(new VideoParser(apVideoDecoder.get(), m_pFrameQueue)); m_pVideoSource->setParser(*apVideoParser.get()); m_pVideoParser = apVideoParser.release(); m_pVideoDecoder = apVideoDecoder.release(); // Create a Stream ID for handling Readback if (m_bReadback) { cutilDrvSafeCallNoSync( cuStreamCreate(&m_ReadbackSID, 0) ); cutilDrvSafeCallNoSync( cuStreamCreate(&m_KernelSID, 0) ); printf("> initCudaVideo()\n"); printf(" CUDA Streams (%s) <m_ReadbackSID = %p>\n", ((m_ReadbackSID == 0) ? "Disabled" : "Enabled"), m_ReadbackSID ); printf(" CUDA Streams (%s) <m_KernelSID = %p>\n", ((m_KernelSID == 0) ? "Disabled" : "Enabled"), m_KernelSID ); } }
// Maps the registered D3D9 resources into CUDA and returns the device pointer
// and pitch of the resource selected by active_field.
//
//   ppImageData  [out] device pointer of the mapped resource
//   pImagePitch  [out] pitch reported for the mapped resource
//   active_field [in]  index into m_ppRegisteredResources
//
// One resource is mapped for progressive content, two otherwise. Resources are
// registered on first use (when slot 0 is still empty).
void CudaVideoRender::map(CUdeviceptr * ppImageData, unsigned int * pImagePitch, int active_field)
{
    unsigned int resourceCount;
    if (m_bIsProgressive) {
        resourceCount = 1;
    } else {
        resourceCount = 2;
    }

    // Lazy registration: nothing has been registered until slot 0 is filled.
    if (!m_ppRegisteredResources[0]) {
        registerResources(resourceCount);
    }

    IDirect3DResource9 **d3dResources =
        reinterpret_cast<IDirect3DResource9 **>(m_ppRegisteredResources);
    cutilDrvSafeCallNoSync( cuD3D9MapResources(resourceCount, d3dResources) );

    // Query the pointer first, then the pitch, for face 0 / level 0.
    cutilDrvSafeCallNoSync(
        cuD3D9ResourceGetMappedPointer(ppImageData, m_ppRegisteredResources[active_field], 0, 0) );
    assert(0 != *ppImageData);

    cutilDrvSafeCallNoSync(
        cuD3D9ResourceGetMappedPitch(pImagePitch, NULL, m_ppRegisteredResources[active_field], 0, 0) );
    assert(0 != *pImagePitch);
}
// Release all previously initd objects HRESULT CudaVideoRender::cleanup(bool bDestroyContext) { // Attach the CUDA Context (so we may properly free memroy) cutilDrvSafeCallNoSync( cuCtxPushCurrent(m_cuContext) ); if (m_pInteropFrame[0]) { cutilDrvSafeCallNoSync( cuMemFree(m_pInteropFrame[0]) ); } if (m_pInteropFrame[1]) { cutilDrvSafeCallNoSync( cuMemFree(m_pInteropFrame[1]) ); } // Detach from the Current thread cutilDrvSafeCallNoSync( cuCtxPopCurrent(NULL) ); terminateCudaVideo(bDestroyContext); return S_OK; }
void CudaVideoRender::terminateCudaVideo(bool bDestroyContext) { if (m_pVideoParser) delete m_pVideoParser; if (m_pVideoDecoder) delete m_pVideoDecoder; if (m_pVideoSource) delete m_pVideoSource; if (m_pFrameQueue) delete m_pFrameQueue; if (m_CtxLock) { cutilDrvSafeCallNoSync( cuvidCtxLockDestroy(m_CtxLock) ); } if (m_cuContext && bDestroyContext) { cutilDrvSafeCallNoSync( cuCtxDestroy(m_cuContext) ); m_cuContext = NULL; } if (m_ReadbackSID) cuStreamDestroy(m_ReadbackSID); if (m_KernelSID) cuStreamDestroy(m_KernelSID); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for CUDA //////////////////////////////////////////////////////////////////////////////// void runTest(int argc, char** argv) { CUcontext cuContext; // initialize CUDA CUfunction pk = NULL; const char cubin_name [] = "pass_kernel.cubin"; const char kernel_name [] = "pass_kernel"; CU_SAFE_CALL(initCuda(cuContext, argv[0], &pk, argc, argv, cubin_name, kernel_name)); printf("initCuda-returned CUfunction:\n"); // cuParamSetx, x=i f v // http://visionexperts.blogspot.com/2010/07/cuda-parameter-alignment.html - check alignment #define ALIGN_UP(offset, alignment) \ (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) size_t offset = 0; // input integers // CU paramset i. for(int i = 0 ; i < NUM_ARG ; i++) { int align = __alignof(int); ALIGN_UP(offset, align); cuParamSeti(pk, offset, i); printf ("offset %d = %d\n", i, offset); offset += sizeof(int); } // return array for updated inputs int size_int = sizeof(int); int size_array = size_int * NUM_ARG; CUdeviceptr d_return_values; cuMemAlloc (&d_return_values, size_array); void* ptr = (void*)(size_t)d_return_values; int align = __alignof(ptr); ALIGN_UP(offset, align); cuParamSetv(pk, offset, &ptr, sizeof(ptr)); printf("return values offset:%d\n", offset); offset += sizeof(ptr); CUdeviceptr d_return_N; cuMemAlloc(&d_return_N, size_int); void* ptrN = (void*)(size_t)d_return_N; int alignN = __alignof(ptrN); ALIGN_UP(offset, alignN); cuParamSetv(pk, offset, &ptrN, sizeof(ptr)); printf("return int offset:%d\n", offset); offset += sizeof(ptrN); // Calling kernel int BLOCK_SIZE_X = NUM_ARG; int BLOCK_SIZE_Y = 1; int BLOCK_SIZE_Z = 1; int GRID_SIZE = 1; cutilDrvSafeCallNoSync(cuFuncSetBlockShape(pk, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z)); printf("paramsetsize:%d\n", offset); CU_SAFE_CALL(cuParamSetSize(pk, offset)); CU_SAFE_CALL(cuLaunchGrid(pk, GRID_SIZE, GRID_SIZE)); int* h_return_values = (int*)malloc(NUM_ARG * 
sizeof(int)); CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_values, d_return_values, size_array)); CU_SAFE_CALL(cuMemFree(d_return_values)); for(int i=0;i<NUM_ARG;i++) printf("%dth value = %d\n", i, h_return_values[i]); free(h_return_values); int* h_return_N = (int*)malloc(sizeof(int)); CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_N, d_return_N, size_int)); CU_SAFE_CALL(cuMemFree(d_return_N)); printf("%d sizeof array\n", *h_return_N); if(cuContext !=NULL) cuCtxDetach(cuContext); }
void CudaVideoRender::unmap(int active_field) { int nFrames = m_bIsProgressive ? 1 : 2; cutilDrvSafeCallNoSync ( cuD3D9UnmapResources(nFrames, reinterpret_cast<IDirect3DResource9 **>(m_ppRegisteredResources) )); }
// Run the Cuda part of the computation bool CudaVideoRender::copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive) { CUVIDPARSERDISPINFO oDisplayInfo; if (m_pFrameQueue->dequeue(&oDisplayInfo)) { CCtxAutoLock lck ( m_CtxLock ); // Push the current CUDA context (only if we are using CUDA decoding path) CUresult result = cuCtxPushCurrent(m_cuContext); CUdeviceptr pDecodedFrame[2] = { 0, 0 }; CUdeviceptr pInteropFrame[2] = { 0, 0 }; int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field)); *pbIsProgressive = oDisplayInfo.progressive_frame; m_bIsProgressive = oDisplayInfo.progressive_frame ? true : false; for (int active_field=0; active_field<num_fields; active_field++) { nRepeats = oDisplayInfo.repeat_first_field; CUVIDPROCPARAMS oVideoProcessingParameters; memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS)); oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame; oVideoProcessingParameters.second_field = active_field; oVideoProcessingParameters.top_field_first = oDisplayInfo.top_field_first; oVideoProcessingParameters.unpaired_field = (num_fields == 1); unsigned int nDecodedPitch = 0; unsigned int nWidth = 0; unsigned int nHeight = 0; // map decoded video frame to CUDA surfae m_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, (unsigned int*)&pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters); nWidth = m_pVideoDecoder->targetWidth(); nHeight = m_pVideoDecoder->targetHeight(); // map DirectX texture to CUDA surface unsigned int nTexturePitch = 0; // If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks if (m_bReadback && m_bFirstFrame && m_ReadbackSID) { CUresult result; cutilDrvSafeCallNoSync( result = cuMemAllocHost( (void **)&m_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2) ) ); cutilDrvSafeCallNoSync( result = cuMemAllocHost( (void **)&m_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2) ) 
); m_bFirstFrame = false; if (result != CUDA_SUCCESS) { printf("cuMemAllocHost returned %d\n", (int)result); } } // If streams are enabled, we can perform the readback to the host while the kernel is executing if (m_bReadback && m_ReadbackSID) { //TODO: test if &m_bFrameData[active_field] is the correct void* CUresult result = cuMemcpyDtoHAsync(&m_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), m_ReadbackSID); if (result != CUDA_SUCCESS) { printf("cuMemAllocHost returned %d\n", (int)result); } } #if ENABLE_DEBUG_OUT printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n", (oDisplayInfo.progressive_frame ? "Frame" : "Field"), m_nDecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp); #endif if (true) { // map the texture surface //m_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field); //TODO: map interop frames to d3d9surface map(&pInteropFrame[active_field], &nTexturePitch, active_field); } else { pInteropFrame[active_field] = m_pInteropFrame[active_field]; nTexturePitch = m_pVideoDecoder->targetWidth() * 2; } // perform post processing on the CUDA surface (performs colors space conversion and post processing) // comment this out if we inclue the line of code seen above cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, m_pCudaModule->getModule(), m_fpNV12toARGB, m_KernelSID); if (true) { // unmap the texture surface //m_pImageDX->unmap(active_field); //TODO: map interop frames to d3d9surface unmap(active_field); } // unmap video frame // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding) m_pVideoDecoder->unmapFrame((unsigned int*)&pDecodedFrame[active_field]); // release the frame, so it can be re-used in decoder m_pFrameQueue->releaseFrame(&oDisplayInfo); m_nDecodeFrameCount++; } // Detach from the Current thread cutilDrvSafeCallNoSync( cuCtxPopCurrent(NULL) ); } else { return false; } 
// check if decoding has come to an end. // if yes, signal the app to shut down. if (!m_pVideoSource->isStarted() || m_pFrameQueue->isEndOfDecode()) { // Let's free the Frame Data if (m_ReadbackSID && m_bFrameData) { cuMemFreeHost((void *)m_bFrameData[0]); cuMemFreeHost((void *)m_bFrameData[1]); m_bFrameData[0] = NULL; m_bFrameData[1] = NULL; } // Let's just stop, and allow the user to quit, so they can at least see the results m_pVideoSource->stop(); // If we want to loop reload the video file and restart if (m_bLoop && !m_bAutoQuit) { reinitCudaResources(); m_nFrameCount = 0; m_nDecodeFrameCount = 0; m_pVideoSource->start(); } if (m_bAutoQuit) { m_bDone = true; } } return true; }
// Selects a GPU, creates the CUDA context, loads the NV12->ARGB PTX module and
// builds the video decode objects.
//
//   bUseInterop  non-zero: pick the graphics-capable device and create a D3D9
//                interop context; zero: pick the fastest device, create a
//                plain context and allocate two device frame buffers
//   bTCC         not used by this function
//
// Returns S_OK when both the module manager and the video decoder exist after
// initialization, E_FAIL otherwise.
HRESULT CudaVideoRender::initCudaResources(int bUseInterop, int bTCC)
{
    // If we want to use Graphics Interop, then choose the GPU that is capable
    CUdevice cuda_device;
    if (bUseInterop) {
        cuda_device = cutilDrvGetMaxGflopsGraphicsDeviceId();
    } else {
        cuda_device = cutilDrvGetMaxGflopsDeviceId();
    }
    cutilDrvSafeCallNoSync( cuDeviceGet(&m_cuDevice, cuda_device) );

    // Report compute capability, device name and memory size.
    int smMajor, smMinor;
    char deviceName[256];
    cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&smMajor, &smMinor, m_cuDevice) );
    cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, (int)sizeof(deviceName), m_cuDevice) );
    printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, smMajor, smMinor);

    size_t totalGlobalMem;
    cutilDrvSafeCallNoSync( cuDeviceTotalMem(&totalGlobalMem, m_cuDevice) );
    printf(" Total amount of global memory: %4.4f MB\n", (float)totalGlobalMem/(1024*1024) );

    // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
    // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
    if (!bUseInterop) {
        cutilDrvSafeCallNoSync( cuCtxCreate(&m_cuContext, CU_CTX_BLOCKING_SYNC, m_cuDevice) );
    } else {
        cutilDrvSafeCallNoSync( cuD3D9CtxCreate(&m_cuContext, &m_cuDevice, CU_CTX_BLOCKING_SYNC, m_pRenderer9->getDevice()) );
    }

    // Pick the PTX image that matches the pointer width of this build.
    if (sizeof(void *) != 4) {
        m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", "./", 2, 2, 2);
    } else {
        m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", "./", 2, 2, 2);
    }

    m_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi", &m_fpNV12toARGB);
    m_pCudaModule->GetCudaFunction("Passthru_drvapi", &m_fpPassthru);

    // Now we create the CUDA resources and the CUDA decoder context
    initCudaVideo();

    if (!bUseInterop) {
        // Without interop the output frames live in plain device memory.
        for (int i = 0; i < 2; ++i) {
            cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[i], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) );
        }
    }
    // else: D3D9 surface creation is currently disabled:
    //initD3D9Surface ( m_pVideoDecoder->targetWidth(),
    //                  m_pVideoDecoder->targetHeight() );

    // Pop the context off this thread before returning.
    CUcontext poppedContext = NULL;
    const CUresult popStatus = cuCtxPopCurrent(&poppedContext);
    if (CUDA_SUCCESS != popStatus) {
        printf("cuCtxPopCurrent: %d\n", popStatus);
        assert(0);
    }

    if (m_pCudaModule && m_pVideoDecoder) {
        return S_OK;
    }
    return E_FAIL;
}