static inline CUresult cuMemcpyDtoH2DAsync(void * A, size_t lda, size_t ai, size_t aj, CUdeviceptr B, size_t ldb, size_t bi, size_t bj, size_t m, size_t n, size_t elemSize, CUstream stream) { CUDA_MEMCPY2D copy = { bi * elemSize, bj, CU_MEMORYTYPE_DEVICE, NULL, B, 0, ldb * elemSize, ai * elemSize, aj, CU_MEMORYTYPE_HOST, A, 0, 0, lda * elemSize, m * elemSize, n }; return cuMemcpy2DAsync(©, stream); }
bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS ¶m, GLuint tex, int w, int h, int H, int plane) { AutoCtxLock locker((cuda_api*)this, lock); Q_UNUSED(locker); if (!ensureResource(w, h, H, tex, plane)) // TODO surface size instead of frame size because we copy the device data return false; //CUDA_ENSURE(cuCtxPushCurrent(ctx), false); CUdeviceptr devptr; unsigned int pitch; CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(¶m)), false); CUVIDAutoUnmapper unmapper(this, dec, devptr); Q_UNUSED(unmapper); // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false); CUarray array; CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false); CUDA_MEMCPY2D cu2d; memset(&cu2d, 0, sizeof(cu2d)); cu2d.srcDevice = devptr; cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE; cu2d.srcPitch = pitch; cu2d.dstArray = array; cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY; cu2d.dstPitch = pitch; // the whole size or copy size? cu2d.WidthInBytes = pitch; cu2d.Height = h; if (plane == 1) { cu2d.srcXInBytes = 0;// +srcY*srcPitch + srcXInBytes cu2d.srcY = H; // skip the padding height cu2d.Height /= 2; } if (res[plane].stream) CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false); else CUDA_ENSURE(cuMemcpy2D(&cu2d), false); //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)? // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error // so I simply unmap the resource here if (WORKAROUND_UNMAP_CONTEXT_SWITCH) { if (res[plane].stream) { //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery? CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize } /* * This function provides the synchronization guarantee that any CUDA work issued * in \p stream before ::cuGraphicsUnmapResources() will complete before any * subsequently issued graphics work begins. * The graphics API from which \p resources were registered * should not access any resources while they are mapped by CUDA. If an * application does so, the results are undefined. */ CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); } else { // call it at last. current context will be used by other cuda calls (unmap() for example) CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // not required } return true; }
bool EGLInteropResource::map(int picIndex, const CUVIDPROCPARAMS ¶m, GLuint tex, int w, int h, int H, int plane) { // plane is always 0 because frame is rgb AutoCtxLock locker((cuda_api*)this, lock); Q_UNUSED(locker); if (!ensureResource(w, h, param.Reserved[0], H, tex)) // TODO surface size instead of frame size because we copy the device data return false; //CUDA_ENSURE(cuCtxPushCurrent(ctx), false); CUdeviceptr devptr; unsigned int pitch; CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(¶m)), false); CUVIDAutoUnmapper unmapper(this, dec, devptr); Q_UNUSED(unmapper); // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false); CUarray array; CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false); CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); // mapped array still accessible! CUDA_MEMCPY2D cu2d; memset(&cu2d, 0, sizeof(cu2d)); // Y plane cu2d.srcDevice = devptr; cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE; cu2d.srcPitch = pitch; cu2d.dstArray = array; cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY; cu2d.dstPitch = pitch; // the whole size or copy size? cu2d.WidthInBytes = res[plane].W; // the same value as texture9_nv12 cu2d.Height = H*3/2; if (res[plane].stream) CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false); else CUDA_ENSURE(cuMemcpy2D(&cu2d), false); //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)? // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error // so I simply unmap the resource here if (WORKAROUND_UNMAP_CONTEXT_SWITCH) { if (res[plane].stream) { //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery? CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize } /* * This function provides the synchronization guarantee that any CUDA work issued * in \p stream before ::cuGraphicsUnmapResources() will complete before any * subsequently issued graphics work begins. * The graphics API from which \p resources were registered * should not access any resources while they are mapped by CUDA. If an * application does so, the results are undefined. */ // CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); } D3DLOCKED_RECT rect_src, rect_dst; DX_ENSURE(texture9_nv12->LockRect(0, &rect_src, NULL, D3DLOCK_READONLY), false); DX_ENSURE(surface9_nv12->LockRect(&rect_dst, NULL, D3DLOCK_DISCARD), false); memcpy(rect_dst.pBits, rect_src.pBits, res[plane].W*H*3/2); // exactly w and h DX_ENSURE(surface9_nv12->UnlockRect(), false); DX_ENSURE(texture9_nv12->UnlockRect(0), false); #if 0 //IDirect3DSurface9 *raw_surface = NULL; //DX_ENSURE(texture9_nv12->GetSurfaceLevel(0, &raw_surface), false); const RECT src = { 0, 0, w, h*3/2}; DX_ENSURE(device9->StretchRect(raw_surface, &src, surface9_nv12, NULL, D3DTEXF_NONE), false); #endif if (!map(surface9_nv12, tex, w, h, H)) return false; return true; }