// NV12 is 4:2:0 format (12bpc) // Luma followed by U/V chroma interleaved (12bpc), chroma is subsampled (w/2,h/2) void VideoEncoder::CopyNV12Frame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock) { // Source is NV12 in pitch linear memory // Because we are assume input is NV12 (if we take input in the native format), the encoder handles NV12 as a native format in pitch linear memory // Luma/Chroma can be done in a single transfer CUDA_MEMCPY2D stCopyNV12; memset((void *)&stCopyNV12, 0, sizeof(stCopyNV12)); stCopyNV12.srcXInBytes = 0; stCopyNV12.srcY = 0; stCopyNV12.srcMemoryType = CU_MEMORYTYPE_HOST; stCopyNV12.srcHost = sFrameParams.picBuf; stCopyNV12.srcDevice = 0; stCopyNV12.srcArray = 0; stCopyNV12.srcPitch = sFrameParams.Width; stCopyNV12.dstXInBytes = 0; stCopyNV12.dstY = 0; stCopyNV12.dstMemoryType = CU_MEMORYTYPE_DEVICE; stCopyNV12.dstHost = 0; stCopyNV12.dstDevice = dptr_VideoFrame; stCopyNV12.dstArray = 0; stCopyNV12.dstPitch = m_pEncoderParams->nDeviceMemPitch; stCopyNV12.WidthInBytes = m_pEncoderParams->iInputSize[0]; stCopyNV12.Height =(m_pEncoderParams->iInputSize[1] * 3) >> 1; // Don't forget we need to lock/unlock between memcopies checkCudaErrors(cuvidCtxLock(ctxLock, 0)); checkCudaErrors(cuMemcpy2D(&stCopyNV12)); // Now DMA Luma/Chroma checkCudaErrors(cuvidCtxUnlock(ctxLock, 0)); }
// UYVY/YUY2 are both 4:2:2 formats (16bpc) // Luma, U, V are interleaved, chroma is subsampled (w/2,h) void VideoEncoder::CopyUYVYorYUY2Frame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock) { // Source is YUVY/YUY2 4:2:2, the YUV data in a packed and interleaved // YUV Copy setup CUDA_MEMCPY2D stCopyYUV422; memset((void *)&stCopyYUV422, 0, sizeof(stCopyYUV422)); stCopyYUV422.srcXInBytes = 0; stCopyYUV422.srcY = 0; stCopyYUV422.srcMemoryType = CU_MEMORYTYPE_HOST; stCopyYUV422.srcHost = sFrameParams.picBuf; stCopyYUV422.srcDevice = 0; stCopyYUV422.srcArray = 0; stCopyYUV422.srcPitch = sFrameParams.Width * 2; stCopyYUV422.dstXInBytes = 0; stCopyYUV422.dstY = 0; stCopyYUV422.dstMemoryType = CU_MEMORYTYPE_DEVICE; stCopyYUV422.dstHost = 0; stCopyYUV422.dstDevice = dptr_VideoFrame; stCopyYUV422.dstArray = 0; stCopyYUV422.dstPitch = m_pEncoderParams->nDeviceMemPitch; stCopyYUV422.WidthInBytes = m_pEncoderParams->iInputSize[0]*2; stCopyYUV422.Height = m_pEncoderParams->iInputSize[1]; // Don't forget we need to lock/unlock between memcopies checkCudaErrors(cuvidCtxLock(ctxLock, 0)); checkCudaErrors(cuMemcpy2D(&stCopyYUV422)); // Now DMA Luma/Chroma checkCudaErrors(cuvidCtxUnlock(ctxLock, 0)); }
// Releases the per-buffer resources created by AllocateIOBuffers(): the CUDA
// input surfaces, the NVENC bitstream buffers and, on Windows, the async
// completion events. Always returns NV_ENC_SUCCESS.
NVENCSTATUS VideoEncoder::ReleaseIOBuffers()
{
    for (uint32_t i = 0; i < m_uEncodeBufferCount; i++)
    {
        // Device memory frees must be bracketed by the CUVID context lock.
        __cu(cuvidCtxLock(m_ctxLock, 0));
        cuMemFree(m_stEncodeBuffer[i].stInputBfr.pNV12devPtr);
        __cu(cuvidCtxUnlock(m_ctxLock, 0));
        // NOTE(review): stInputBfr.nvRegisteredResource does not appear to be
        // unregistered (NvEncUnregisterResource) before the backing memory is
        // freed — confirm this is done elsewhere or intended.

        m_pNvHWEncoder->NvEncDestroyBitstreamBuffer(m_stEncodeBuffer[i].stOutputBfr.hBitstreamBuffer);
        m_stEncodeBuffer[i].stOutputBfr.hBitstreamBuffer = NULL;

#if defined(NV_WINDOWS)
        // Async (event-driven) output is Windows-only: unregister the event
        // from the encoder, then close the handle.
        m_pNvHWEncoder->NvEncUnregisterAsyncEvent(m_stEncodeBuffer[i].stOutputBfr.hOutputEvent);
        nvCloseFile(m_stEncodeBuffer[i].stOutputBfr.hOutputEvent);
        m_stEncodeBuffer[i].stOutputBfr.hOutputEvent = NULL;
#endif
    }

    // EOS notification event (only ever allocated on Windows; on other
    // platforms hOutputEvent is NULL and this block is skipped).
    if (m_stEOSOutputBfr.hOutputEvent)
    {
#if defined(NV_WINDOWS)
        m_pNvHWEncoder->NvEncUnregisterAsyncEvent(m_stEOSOutputBfr.hOutputEvent);
        nvCloseFile(m_stEOSOutputBfr.hOutputEvent);
        m_stEOSOutputBfr.hOutputEvent = NULL;
#endif
    }

    return NV_ENC_SUCCESS;
}
// Submits one frame (already resident in device memory) to the hardware
// encoder, recycling the oldest pending buffer when the ring is full.
//
// @param pEncodeFrame  device frame (dptr/pitch/width/height); must be non-NULL
//                      unless bFlush is set
// @param picType       picture structure (frame / field order) for this frame
// @param bFlush        when true, drains the encoder and returns immediately
// @return NV_ENC_SUCCESS, or the failing NVENC status from map/encode
NVENCSTATUS VideoEncoder::EncodeFrame(EncodeFrameConfig *pEncodeFrame, NV_ENC_PIC_STRUCT picType, bool bFlush)
{
    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

    if (bFlush)
    {
        FlushEncoder();
        return NV_ENC_SUCCESS;
    }

    assert(pEncodeFrame);

    EncodeBuffer *pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
    if (!pEncodeBuffer)
    {
        // Ring is full: retire the oldest pending buffer to free one up.
        pEncodeBuffer = m_EncodeBufferQueue.GetPending();
        m_pNvHWEncoder->ProcessOutput(pEncodeBuffer);
        // UnMap the input buffer after frame done
        if (pEncodeBuffer->stInputBfr.hInputSurface)
        {
            nvStatus = m_pNvHWEncoder->NvEncUnmapInputResource(pEncodeBuffer->stInputBfr.hInputSurface);
            pEncodeBuffer->stInputBfr.hInputSurface = NULL;
        }
        pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
        // After retiring one pending buffer a free one must exist; the old
        // code dereferenced this pointer unchecked.
        assert(pEncodeBuffer);
    }

    // Encode width and height come from the pre-allocated input surface.
    unsigned int dwWidth = pEncodeBuffer->stInputBfr.dwWidth;
    unsigned int dwHeight = pEncodeBuffer->stInputBfr.dwHeight;

    // Device-to-device copy of the caller's NV12 frame into the encoder's
    // registered input surface (which may have a different pitch).
    cuvidCtxLock(m_ctxLock, 0);
    assert(pEncodeFrame->width == dwWidth && pEncodeFrame->height == dwHeight);
    CUDA_MEMCPY2D memcpy2D = {0};
    memcpy2D.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    memcpy2D.srcDevice = pEncodeFrame->dptr;
    memcpy2D.srcPitch = pEncodeFrame->pitch;
    memcpy2D.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    memcpy2D.dstDevice = (CUdeviceptr)pEncodeBuffer->stInputBfr.pNV12devPtr;
    memcpy2D.dstPitch = pEncodeBuffer->stInputBfr.uNV12Stride;
    memcpy2D.WidthInBytes = dwWidth;
    memcpy2D.Height = dwHeight*3/2;   // NV12: luma rows + half-height chroma rows
    // NOTE(review): if __cu() returns early on failure, m_ctxLock stays
    // locked — confirm the macro's failure behavior is acceptable here.
    __cu(cuMemcpy2D(&memcpy2D));
    cuvidCtxUnlock(m_ctxLock, 0);

    nvStatus = m_pNvHWEncoder->NvEncMapInputResource(pEncodeBuffer->stInputBfr.nvRegisteredResource, &pEncodeBuffer->stInputBfr.hInputSurface);
    if (nvStatus != NV_ENC_SUCCESS)
    {
        PRINTERR("Failed to Map input buffer %p\n", pEncodeBuffer->stInputBfr.hInputSurface);
        return nvStatus;
    }

    // BUG FIX: the encode status was previously discarded and the function
    // returned NV_ENC_SUCCESS unconditionally; propagate failures instead.
    nvStatus = m_pNvHWEncoder->NvEncEncodeFrame(pEncodeBuffer, NULL, pEncodeFrame->width, pEncodeFrame->height, picType);
    if (nvStatus != NV_ENC_SUCCESS)
    {
        PRINTERR("Failed to encode frame\n");
        return nvStatus;
    }

    m_iEncodedFrames++;
    return NV_ENC_SUCCESS;
}
// Allocates and registers the encoder's I/O buffer ring: one pitched NV12
// device surface + one NVENC bitstream buffer per ring slot, plus (on
// Windows) the async completion events and the EOS event.
//
// @param pEncodeConfig  supplies width/height and the number of B-frames
//                       (ring size = numB + 4 to cover reordering latency)
// @return NV_ENC_SUCCESS, or the first failing NVENC status
NVENCSTATUS VideoEncoder::AllocateIOBuffers(EncodeConfig* pEncodeConfig)
{
    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

    m_uEncodeBufferCount = pEncodeConfig->numB + 4;

    uint32_t uInputWidth = pEncodeConfig->width;
    uint32_t uInputHeight = pEncodeConfig->height;
    m_EncodeBufferQueue.Initialize(m_stEncodeBuffer, m_uEncodeBufferCount);

    // Allocate input buffers
    for (uint32_t i = 0; i < m_uEncodeBufferCount; i++)
    {
        // BUG FIX: cuMemAllocPitch writes a size_t through its pPitch
        // argument. The old code cast &uNV12Stride (a 32-bit field) to
        // size_t*, which clobbers the 4 bytes following it on 64-bit builds.
        // Receive the pitch in a local size_t and narrow it afterwards.
        size_t nv12Pitch = 0;
        __cu(cuvidCtxLock(m_ctxLock, 0));
        __cu(cuMemAllocPitch(&m_stEncodeBuffer[i].stInputBfr.pNV12devPtr,
                             &nv12Pitch,
                             uInputWidth, uInputHeight * 3 / 2, 16));
        __cu(cuvidCtxUnlock(m_ctxLock, 0));
        m_stEncodeBuffer[i].stInputBfr.uNV12Stride = (uint32_t)nv12Pitch;

        // Register the CUDA allocation with NVENC so it can be mapped as an
        // input surface each frame.
        nvStatus = m_pNvHWEncoder->NvEncRegisterResource(NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR,
                                                         (void*)m_stEncodeBuffer[i].stInputBfr.pNV12devPtr,
                                                         uInputWidth, uInputHeight,
                                                         m_stEncodeBuffer[i].stInputBfr.uNV12Stride,
                                                         &m_stEncodeBuffer[i].stInputBfr.nvRegisteredResource);
        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;

        m_stEncodeBuffer[i].stInputBfr.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
        m_stEncodeBuffer[i].stInputBfr.dwWidth = uInputWidth;
        m_stEncodeBuffer[i].stInputBfr.dwHeight = uInputHeight;

        // Output side: one bitstream buffer per slot.
        nvStatus = m_pNvHWEncoder->NvEncCreateBitstreamBuffer(BITSTREAM_BUFFER_SIZE, &m_stEncodeBuffer[i].stOutputBfr.hBitstreamBuffer);
        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;
        m_stEncodeBuffer[i].stOutputBfr.dwBitstreamBufferSize = BITSTREAM_BUFFER_SIZE;

#if defined(NV_WINDOWS)
        // Async mode (Windows only): completion event per output buffer.
        nvStatus = m_pNvHWEncoder->NvEncRegisterAsyncEvent(&m_stEncodeBuffer[i].stOutputBfr.hOutputEvent);
        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;
        m_stEncodeBuffer[i].stOutputBfr.bWaitOnEvent = true;
#else
        m_stEncodeBuffer[i].stOutputBfr.hOutputEvent = NULL;
#endif
    }

    // Dedicated end-of-stream notification buffer.
    m_stEOSOutputBfr.bEOSFlag = TRUE;
#if defined(NV_WINDOWS)
    nvStatus = m_pNvHWEncoder->NvEncRegisterAsyncEvent(&m_stEOSOutputBfr.hOutputEvent);
    if (nvStatus != NV_ENC_SUCCESS)
        return nvStatus;
#else
    m_stEOSOutputBfr.hOutputEvent = NULL;
#endif

    return NV_ENC_SUCCESS;
}
// YV12/IYUV are both 4:2:0 planar formats (12bpc):
// a full-resolution luma plane followed by two separate chroma planes,
// each subsampled to (w/2, h/2). The encoder converts this to NV12.
void VideoEncoder::CopyYV12orIYUVFrame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock)
{
    // Source is YV12/IYUV; this native format is converted to NV12 by the video encoder.

    // (1) Luma copy setup: Height rows of Width bytes at full pitch.
    CUDA_MEMCPY2D stCopyLuma;
    memset((void *)&stCopyLuma, 0, sizeof(stCopyLuma));
    stCopyLuma.srcXInBytes = 0;
    stCopyLuma.srcY = 0;
    stCopyLuma.srcMemoryType = CU_MEMORYTYPE_HOST;
    stCopyLuma.srcHost = sFrameParams.picBuf;
    stCopyLuma.srcDevice = 0;
    stCopyLuma.srcArray = 0;
    stCopyLuma.srcPitch = sFrameParams.Width;
    stCopyLuma.dstXInBytes = 0;
    stCopyLuma.dstY = 0;
    stCopyLuma.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    stCopyLuma.dstHost = 0;
    stCopyLuma.dstDevice = dptr_VideoFrame;
    stCopyLuma.dstArray = 0;
    stCopyLuma.dstPitch = m_pEncoderParams->nDeviceMemPitch;
    stCopyLuma.WidthInBytes = m_pEncoderParams->iInputSize[0];
    stCopyLuma.Height = m_pEncoderParams->iInputSize[1];

    // (2) Chroma copy setup. Both half-width chroma planes are transferred in
    // one copy by treating them as consecutive rows at HALF the pitch.
    // Because srcY/dstY are measured in rows of the (halved) pitch, the luma
    // plane — Height rows at full pitch — corresponds to Height*2 rows at
    // half pitch, hence the `<<1` offsets below.
    CUDA_MEMCPY2D stCopyChroma;
    memset((void *)&stCopyChroma, 0, sizeof(stCopyChroma));
    stCopyChroma.srcXInBytes = 0;
    stCopyChroma.srcY = m_pEncoderParams->iInputSize[1]<<1; // skip the luma plane: Height rows at full pitch = Height*2 rows at half pitch
    stCopyChroma.srcMemoryType = CU_MEMORYTYPE_HOST;
    stCopyChroma.srcHost = sFrameParams.picBuf;
    stCopyChroma.srcDevice = 0;
    stCopyChroma.srcArray = 0;
    stCopyChroma.srcPitch = sFrameParams.Width>>1; // chroma rows are half-width; the two planes lie back-to-back
    stCopyChroma.dstXInBytes = 0;
    stCopyChroma.dstY = m_pEncoderParams->iInputSize[1]<<1; // same trick on the device side (srcY*srcPitch now points at the chroma planes)
    stCopyChroma.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    stCopyChroma.dstHost = 0;
    stCopyChroma.dstDevice = dptr_VideoFrame;
    stCopyChroma.dstArray = 0;
    stCopyChroma.dstPitch = m_pEncoderParams->nDeviceMemPitch>>1;
    stCopyChroma.WidthInBytes = m_pEncoderParams->iInputSize[0]>>1;
    stCopyChroma.Height =
        m_pEncoderParams->iInputSize[1]; // U plane (h/2 rows) + V plane (h/2 rows) copied together as h rows

    // The CUVID context lock must be held around driver-API memcopies.
    checkCudaErrors(cuvidCtxLock(ctxLock, 0));
    checkCudaErrors(cuMemcpy2D(&stCopyLuma));   // Now DMA Luma
    checkCudaErrors(cuMemcpy2D(&stCopyChroma)); // Now DMA Chroma channels (U and V planes back-to-back)
    checkCudaErrors(cuvidCtxUnlock(ctxLock, 0));
}
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) { int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field; for (int active_field = 0; active_field < num_fields; ++active_field) { CUVIDPROCPARAMS proc_params; memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS)); proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config proc_params.second_field = active_field == 1; //check user config proc_params.top_field_first = cuviddisp->top_field_first; proc_params.unpaired_field = cuviddisp->progressive_frame == 1; CUdeviceptr devptr; unsigned int pitch; cuvidCtxLock(vid_ctx_lock, 0); CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params); if (cuStatus != CUDA_SUCCESS) { qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } #define PAD_ALIGN(x,mask) ( (x + mask) & ~mask ) //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F); uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //? 
#undef PAD_ALIGN int size = pitch*h*3/2; if (size > host_data_size && host_data) { cuMemFreeHost(host_data); host_data = 0; host_data_size = 0; } if (!host_data) { cuStatus = cuMemAllocHost((void**)&host_data, size); if (cuStatus != CUDA_SUCCESS) { qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } host_data_size = size; } if (!host_data) { qWarning("No valid staging memory!"); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream); if (cuStatus != CUDA_SUCCESS) { qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); return false; } cuStatus = cuCtxSynchronize(); if (cuStatus != CUDA_SUCCESS) { qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus)); } cuvidUnmapVideoFrame(dec, devptr); cuvidCtxUnlock(vid_ctx_lock, 0); //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index); surface_in_use[cuviddisp->picture_index] = false; uchar *planes[] = { host_data, host_data + pitch * h }; int pitches[] = { (int)pitch, (int)pitch }; VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12); frame.setBits(planes); frame.setBytesPerLine(pitches); //TODO: is clone required? may crash on clone, I should review clone() //frame = frame.clone(); if (outFrame) { *outFrame = frame.clone(); } #if COPY_ON_DECODE frame_queue.put(frame.clone()); #endif //qDebug("frame queue size: %d", frame_queue.size()); } return true; }
// Auto-lock helper for C++ applications CCtxAutoLock::CCtxAutoLock(CUvideoctxlock ctx) : m_ctx(ctx) { cuvidCtxLock(m_ctx, 0); }