void CudaVideoRender::initCudaVideo( )
	{
		// bind the context lock to the CUDA context
		CUresult result = cuvidCtxLockCreate(&m_CtxLock, m_cuContext);
		if (result != CUDA_SUCCESS) {
			printf("cuvidCtxLockCreate failed: %d\n", result);
			assert(0);
		}

		std::auto_ptr<VideoDecoder> apVideoDecoder(new VideoDecoder(m_pVideoSource->format(), m_cuContext, m_eVideoCreateFlags, m_CtxLock));
		std::auto_ptr<VideoParser> apVideoParser(new VideoParser(apVideoDecoder.get(), m_pFrameQueue));
		m_pVideoSource->setParser(*apVideoParser.get());

		m_pVideoParser  = apVideoParser.release();
		m_pVideoDecoder = apVideoDecoder.release();

		// Create a Stream ID for handling Readback
		if (m_bReadback) {
			cutilDrvSafeCallNoSync( cuStreamCreate(&m_ReadbackSID, 0) );
			cutilDrvSafeCallNoSync( cuStreamCreate(&m_KernelSID,   0) );
			printf("> initCudaVideo()\n");
			printf("  CUDA Streams (%s) <m_ReadbackSID = %p>\n", ((m_ReadbackSID == 0) ? "Disabled" : "Enabled"), m_ReadbackSID );
			printf("  CUDA Streams (%s) <m_KernelSID   = %p>\n", ((m_KernelSID   == 0) ? "Disabled" : "Enabled"), m_KernelSID   );
		}
	}
	// Maps the registered D3D9 resources into CUDA and returns the device
	// pointer and pitch of the resource selected by active_field.
	//
	//   ppImageData  [out] device pointer of the mapped resource
	//   pImagePitch  [out] pitch of the mapped resource
	//   active_field [in]  index into m_ppRegisteredResources
	//
	// Every successful map() is expected to be paired with unmap().
	void CudaVideoRender::map(CUdeviceptr * ppImageData, unsigned int * pImagePitch, int active_field)
	{
		// One resource for progressive content, two otherwise.
		unsigned int resourceCount = 2;
		if (m_bIsProgressive) {
			resourceCount = 1;
		}

		// Resources are registered lazily, the first time map() sees an empty slot 0.
		const bool needsRegistration = !m_ppRegisteredResources[0];
		if (needsRegistration) {
			registerResources(resourceCount);
		}

		IDirect3DResource9 ** ppResources = reinterpret_cast<IDirect3DResource9 **>(m_ppRegisteredResources);
		cutilDrvSafeCallNoSync ( cuD3D9MapResources(resourceCount, ppResources) );

		// Device pointer of face 0 / level 0 of the selected resource.
		cutilDrvSafeCallNoSync ( cuD3D9ResourceGetMappedPointer(ppImageData, m_ppRegisteredResources[active_field], 0, 0) );
		assert(*ppImageData != 0);

		// Only the row pitch is requested; the slice pitch argument is NULL.
		cutilDrvSafeCallNoSync ( cuD3D9ResourceGetMappedPitch(pImagePitch, NULL, m_ppRegisteredResources[active_field], 0, 0) );
		assert(*pImagePitch != 0);
	}
	// Release all previously initialised objects.
	//
	// Frees the two device buffers held in m_pInteropFrame (if allocated) and
	// then tears down the video objects through terminateCudaVideo().
	//
	//   bDestroyContext  forwarded to terminateCudaVideo(); when true the CUDA
	//                    context is destroyed as well.
	//
	// Always returns S_OK. Safe to call more than once: freed handles are
	// zeroed, and the context push/pop is skipped once m_cuContext is gone.
	HRESULT CudaVideoRender::cleanup(bool bDestroyContext)
	{
		if (m_cuContext) {
			// Attach the CUDA context (so we may properly free memory)
			cutilDrvSafeCallNoSync( cuCtxPushCurrent(m_cuContext) );

			for (int i = 0; i < 2; ++i) {
				if (m_pInteropFrame[i]) {
					cutilDrvSafeCallNoSync( cuMemFree(m_pInteropFrame[i]) );
					// Forget the handle so a later cleanup() cannot free it twice.
					m_pInteropFrame[i] = 0;
				}
			}

			// Detach from the current thread
			cutilDrvSafeCallNoSync( cuCtxPopCurrent(NULL) );
		}

		terminateCudaVideo(bDestroyContext);
		return S_OK;
	}
	// Destroys the video parser, decoder, source and frame queue, the cuvid
	// context lock, the readback/kernel streams and (optionally) the CUDA context.
	//
	//   bDestroyContext  when true and a context exists, m_cuContext is
	//                    destroyed and reset to NULL.
	//
	// Every handle is reset after it is released, so calling this function a
	// second time is harmless.
	void CudaVideoRender::terminateCudaVideo(bool bDestroyContext)
	{
		// delete on a null pointer is a no-op, so no guards are needed.
		delete m_pVideoParser;   m_pVideoParser  = NULL;
		delete m_pVideoDecoder;  m_pVideoDecoder = NULL;
		delete m_pVideoSource;   m_pVideoSource  = NULL;
		delete m_pFrameQueue;    m_pFrameQueue   = NULL;

		if (m_CtxLock) {
			cutilDrvSafeCallNoSync( cuvidCtxLockDestroy(m_CtxLock) );
			m_CtxLock = NULL;
		}

		// Streams are released BEFORE the context is destroyed; destroying them
		// afterwards would hand stale handles to the driver.
		// Return codes are intentionally ignored (best-effort teardown).
		// NOTE(review): the caller cleanup() pops the context before calling
		// this function — confirm cuStreamDestroy succeeds without a current context.
		if (m_ReadbackSID) {
			cuStreamDestroy(m_ReadbackSID);
			m_ReadbackSID = NULL;
		}
		if (m_KernelSID) {
			cuStreamDestroy(m_KernelSID);
			m_KernelSID = NULL;
		}

		if (m_cuContext && bDestroyContext)  {
			cutilDrvSafeCallNoSync( cuCtxDestroy(m_cuContext) );
			m_cuContext = NULL;
		}
	}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
//!
//! Loads kernel "pass_kernel" from "pass_kernel.cubin" through initCuda(),
//! passes it NUM_ARG integers (0 .. NUM_ARG-1) followed by two device
//! pointers (an int[NUM_ARG] buffer and a single int), launches it on a 1x1
//! grid with a NUM_ARG x 1 x 1 block, then copies both buffers back and
//! prints them.
//!
//! @param argc  command line argument count, forwarded to initCuda()
//! @param argv  command line arguments, forwarded to initCuda()
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char** argv)
{
    CUcontext cuContext = NULL;

    // initialize CUDA
    CUfunction pk = NULL;
    const char cubin_name [] = "pass_kernel.cubin";
    const char kernel_name [] = "pass_kernel";

    CU_SAFE_CALL(initCuda(cuContext, argv[0], &pk, argc, argv, cubin_name, kernel_name));
    printf("initCuda-returned CUfunction:\n");

    // cuParamSetx, x=i f v
    // http://visionexperts.blogspot.com/2010/07/cuda-parameter-alignment.html - check alignment
    // Rounds `offset` up to the next multiple of `alignment` (a power of two).
    #define ALIGN_UP(offset, alignment)					\
        (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

    // Running byte offset into the kernel parameter block.
    size_t offset = 0;

    // input integers
    // CU paramset i.
    const int int_align = __alignof(int);
    for (int i = 0; i < NUM_ARG; i++)
    {
        ALIGN_UP(offset, int_align);
        CU_SAFE_CALL(cuParamSeti(pk, (int)offset, i));
        // offset is a size_t; cast so it matches the %d conversion.
        printf ("offset %d = %d\n", i, (int)offset);
        offset += sizeof(int);
    }

    // return array for updated inputs
    const int size_int   = sizeof(int);
    const int size_array = size_int * NUM_ARG;

    CUdeviceptr d_return_values;
    CU_SAFE_CALL(cuMemAlloc(&d_return_values, size_array));
    void* ptr = (void*)(size_t)d_return_values;
    int align = __alignof(ptr);
    ALIGN_UP(offset, align);
    CU_SAFE_CALL(cuParamSetv(pk, (int)offset, &ptr, sizeof(ptr)));
    printf("return values offset:%d\n", (int)offset);
    offset += sizeof(ptr);

    // single int written by the kernel
    CUdeviceptr d_return_N;
    CU_SAFE_CALL(cuMemAlloc(&d_return_N, size_int));
    void* ptrN = (void*)(size_t)d_return_N;
    int alignN = __alignof(ptrN);
    ALIGN_UP(offset, alignN);
    CU_SAFE_CALL(cuParamSetv(pk, (int)offset, &ptrN, sizeof(ptrN)));
    printf("return int offset:%d\n", (int)offset);
    offset += sizeof(ptrN);

    // Calling kernel
    int BLOCK_SIZE_X = NUM_ARG;
    int BLOCK_SIZE_Y = 1;
    int BLOCK_SIZE_Z = 1;
    int GRID_SIZE = 1;
    cutilDrvSafeCallNoSync(cuFuncSetBlockShape(pk, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z));

    printf("paramsetsize:%d\n", (int)offset);
    CU_SAFE_CALL(cuParamSetSize(pk, (unsigned int)offset));
    CU_SAFE_CALL(cuLaunchGrid(pk, GRID_SIZE, GRID_SIZE));

    // Read back the array; the blocking copy also waits for the kernel.
    int* h_return_values = (int*)malloc(NUM_ARG * sizeof(int));
    if (h_return_values == NULL) {
        fprintf(stderr, "runTest: unable to allocate host buffer\n");
        exit(EXIT_FAILURE);
    }
    CU_SAFE_CALL(cuMemcpyDtoH((void*)h_return_values, d_return_values, size_array));
    CU_SAFE_CALL(cuMemFree(d_return_values));

    for (int i = 0; i < NUM_ARG; i++)
        printf("%dth value = %d\n", i, h_return_values[i]);
    free(h_return_values);

    // A single int needs no heap allocation (the original malloc was never freed).
    int h_return_N = 0;
    CU_SAFE_CALL(cuMemcpyDtoH((void*)&h_return_N, d_return_N, size_int));
    CU_SAFE_CALL(cuMemFree(d_return_N));

    printf("%d sizeof array\n", h_return_N);

    if (cuContext != NULL) cuCtxDetach(cuContext);
}
	void CudaVideoRender::unmap(int active_field)
	{
		int nFrames = m_bIsProgressive ? 1 : 2;

		cutilDrvSafeCallNoSync ( cuD3D9UnmapResources(nFrames, reinterpret_cast<IDirect3DResource9 **>(m_ppRegisteredResources) ));
	}
	// Run the Cuda part of the computation
	bool CudaVideoRender::copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive)
	{
		CUVIDPARSERDISPINFO oDisplayInfo;
		if (m_pFrameQueue->dequeue(&oDisplayInfo))
		{
			CCtxAutoLock lck  ( m_CtxLock );
			// Push the current CUDA context (only if we are using CUDA decoding path)
			CUresult result = cuCtxPushCurrent(m_cuContext);

			CUdeviceptr	 pDecodedFrame[2] = { 0, 0 };
			CUdeviceptr  pInteropFrame[2] = { 0, 0 };

			int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field));
			*pbIsProgressive = oDisplayInfo.progressive_frame;
			m_bIsProgressive = oDisplayInfo.progressive_frame ? true : false;
			for (int active_field=0; active_field<num_fields; active_field++)
			{
				nRepeats = oDisplayInfo.repeat_first_field;
				CUVIDPROCPARAMS oVideoProcessingParameters;
				memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS));

				oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame;
				oVideoProcessingParameters.second_field      = active_field;
				oVideoProcessingParameters.top_field_first   = oDisplayInfo.top_field_first;
				oVideoProcessingParameters.unpaired_field    = (num_fields == 1);

				unsigned int nDecodedPitch = 0;
				unsigned int nWidth = 0;
				unsigned int nHeight = 0;

				// map decoded video frame to CUDA surfae
				m_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, (unsigned int*)&pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters);
				nWidth  = m_pVideoDecoder->targetWidth();
				nHeight = m_pVideoDecoder->targetHeight();
				// map DirectX texture to CUDA surface
				unsigned int nTexturePitch = 0;

				// If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks
				if (m_bReadback && m_bFirstFrame && m_ReadbackSID) {
					CUresult result;
					cutilDrvSafeCallNoSync( result = cuMemAllocHost( (void **)&m_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2) ) );
					cutilDrvSafeCallNoSync( result = cuMemAllocHost( (void **)&m_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2) ) );
					m_bFirstFrame = false;
					if (result != CUDA_SUCCESS) {
						printf("cuMemAllocHost returned %d\n", (int)result);
					}
				}

				// If streams are enabled, we can perform the readback to the host while the kernel is executing
				if (m_bReadback && m_ReadbackSID) {
					//TODO: test if &m_bFrameData[active_field] is the correct void*
					CUresult result = cuMemcpyDtoHAsync(&m_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), m_ReadbackSID);
					if (result != CUDA_SUCCESS) {
						printf("cuMemAllocHost returned %d\n", (int)result);
					}
				}

#if ENABLE_DEBUG_OUT
				printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n", 
					(oDisplayInfo.progressive_frame ? "Frame" : "Field"),
					m_nDecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp);
#endif

				if (true) {
					// map the texture surface
					//m_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field);
					//TODO: map interop frames to d3d9surface
					map(&pInteropFrame[active_field], &nTexturePitch, active_field);
				} else {
					pInteropFrame[active_field] = m_pInteropFrame[active_field];
					nTexturePitch = m_pVideoDecoder->targetWidth() * 2; 
				}

				// perform post processing on the CUDA surface (performs colors space conversion and post processing)
				// comment this out if we inclue the line of code seen above 
				cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, m_pCudaModule->getModule(), m_fpNV12toARGB, m_KernelSID);
				if (true) {
					// unmap the texture surface
					//m_pImageDX->unmap(active_field);
					//TODO: map interop frames to d3d9surface
					unmap(active_field);
				}

				// unmap video frame
				// unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
				m_pVideoDecoder->unmapFrame((unsigned int*)&pDecodedFrame[active_field]);
				// release the frame, so it can be re-used in decoder
				m_pFrameQueue->releaseFrame(&oDisplayInfo);
				m_nDecodeFrameCount++;
			}

			// Detach from the Current thread
			cutilDrvSafeCallNoSync( cuCtxPopCurrent(NULL) );
		} else {
			return false;
		}

		// check if decoding has come to an end.
		// if yes, signal the app to shut down.
		if (!m_pVideoSource->isStarted() || m_pFrameQueue->isEndOfDecode())
		{
			// Let's free the Frame Data
			if (m_ReadbackSID && m_bFrameData) {
				cuMemFreeHost((void *)m_bFrameData[0]);
				cuMemFreeHost((void *)m_bFrameData[1]);
				m_bFrameData[0] = NULL;
				m_bFrameData[1] = NULL;
			}

			// Let's just stop, and allow the user to quit, so they can at least see the results
			m_pVideoSource->stop();

			// If we want to loop reload the video file and restart
			if (m_bLoop && !m_bAutoQuit) {
				reinitCudaResources();
				m_nFrameCount = 0;
				m_nDecodeFrameCount = 0;
				m_pVideoSource->start();
			}
			if (m_bAutoQuit) {
				m_bDone = true;
			}
		}
		return true;
	}
	// Selects a CUDA device, creates the CUDA context, loads the NV12->ARGB
	// PTX module and initialises the video decoder objects.
	//
	//   bUseInterop  non-zero: pick the graphics-capable device and create a
	//                D3D9 interop context; zero: plain CUDA context plus two
	//                device buffers in m_pInteropFrame.
	//   bTCC         not used by this function.
	//
	// Returns S_OK when both the module manager and the video decoder exist,
	// E_FAIL otherwise. The context is popped again before returning.
	HRESULT CudaVideoRender::initCudaResources(int bUseInterop, int bTCC)
	{
		// With graphics interop the device must be the one capable of it.
		CUdevice deviceOrdinal = bUseInterop ? cutilDrvGetMaxGflopsGraphicsDeviceId()
		                                     : cutilDrvGetMaxGflopsDeviceId();
		cutilDrvSafeCallNoSync(cuDeviceGet(&m_cuDevice, deviceOrdinal ));

		// Report compute capability, device name and memory size.
		int smMajor, smMinor;
		cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&smMajor, &smMinor, m_cuDevice) );

		char deviceName[256];
		cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, m_cuDevice) );
		printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", deviceOrdinal, deviceName, smMajor, smMinor);

		size_t totalGlobalMem;
		cutilDrvSafeCallNoSync( cuDeviceTotalMem(&totalGlobalMem, m_cuDevice) );
		const float totalMegabytes = (float)totalGlobalMem/(1024*1024);
		printf("  Total amount of global memory:     %4.4f MB\n", totalMegabytes );

		// Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
		// (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
		if (!bUseInterop) {
			cutilDrvSafeCallNoSync( cuCtxCreate(&m_cuContext, CU_CTX_BLOCKING_SYNC, m_cuDevice) );
		} else {
			cutilDrvSafeCallNoSync( cuD3D9CtxCreate(&m_cuContext, &m_cuDevice, CU_CTX_BLOCKING_SYNC, m_pRenderer9->getDevice()) );
		}

		// The PTX file is chosen by pointer width (32-bit vs 64-bit build).
		if (sizeof(void *) != 4) {
			m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", "./", 2, 2, 2);
		} else {
			m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", "./", 2, 2, 2);
		}

		m_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi",   &m_fpNV12toARGB);
		m_pCudaModule->GetCudaFunction("Passthru_drvapi",     &m_fpPassthru);

		// Now we create the CUDA resources and the CUDA decoder context
		initCudaVideo();

		// Without interop the converted frames go to two device buffers of
		// width * height * 2 bytes each. (With interop nothing is allocated
		// here; the initD3D9Surface call is disabled.)
		if (!bUseInterop) {
			for (int i = 0; i < 2; ++i) {
				cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[i], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) );
			}
		}

		// Detach the context from this thread again.
		CUcontext poppedContext = NULL;
		const CUresult popStatus = cuCtxPopCurrent(&poppedContext);
		if (CUDA_SUCCESS != popStatus) {
			printf("cuCtxPopCurrent: %d\n", popStatus);
			assert(0);
		}

		if (m_pCudaModule && m_pVideoDecoder) {
			return S_OK;
		}
		return E_FAIL;
	}