// NV12 is a 4:2:0 format (12 bits per pixel): a full-resolution luma plane
// followed by interleaved U/V chroma subsampled to (w/2, h/2).
void
VideoEncoder::CopyNV12Frame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock)
{
    // The encoder consumes NV12 natively in pitch-linear memory, so luma and
    // chroma can be moved host -> device in one 2D copy. memset() zeroes every
    // descriptor field; only the non-zero ones are filled in below.
    CUDA_MEMCPY2D copyParams;
    memset((void *)&copyParams, 0, sizeof(copyParams));

    // Source: host-resident NV12 buffer, tightly packed at the frame width.
    copyParams.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParams.srcHost       = sFrameParams.picBuf;
    copyParams.srcPitch      = sFrameParams.Width;

    // Destination: pitched device memory owned by the encoder.
    copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    copyParams.dstDevice     = dptr_VideoFrame;
    copyParams.dstPitch      = m_pEncoderParams->nDeviceMemPitch;

    // Height covers luma (h) plus the interleaved chroma rows (h/2) in one shot.
    copyParams.WidthInBytes  = m_pEncoderParams->iInputSize[0];
    copyParams.Height        = (m_pEncoderParams->iInputSize[1] * 3) >> 1;

    // The CUVID context lock must be held around driver-API memcopies.
    checkCudaErrors(cuvidCtxLock(ctxLock, 0));
    checkCudaErrors(cuMemcpy2D(&copyParams));    // DMA luma + chroma together
    checkCudaErrors(cuvidCtxUnlock(ctxLock, 0));
}
// UYVY and YUY2 are both packed 4:2:2 formats (16 bits per pixel):
// luma, U and V are interleaved in a single plane, chroma subsampled to (w/2, h).
void
VideoEncoder::CopyUYVYorYUY2Frame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock)
{
    // Packed 4:2:2 means one interleaved plane, so a single 2D copy suffices.
    // memset() zeroes every descriptor field; only non-zero ones are set below.
    CUDA_MEMCPY2D copyParams;
    memset((void *)&copyParams, 0, sizeof(copyParams));

    // Source: host buffer; each pixel is 2 bytes, hence pitch = width * 2.
    copyParams.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParams.srcHost       = sFrameParams.picBuf;
    copyParams.srcPitch      = sFrameParams.Width * 2;

    // Destination: pitched device memory owned by the encoder.
    copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    copyParams.dstDevice     = dptr_VideoFrame;
    copyParams.dstPitch      = m_pEncoderParams->nDeviceMemPitch;

    // One full-height copy of the interleaved YUV rows.
    copyParams.WidthInBytes  = m_pEncoderParams->iInputSize[0] * 2;
    copyParams.Height        = m_pEncoderParams->iInputSize[1];

    // The CUVID context lock must be held around driver-API memcopies.
    checkCudaErrors(cuvidCtxLock(ctxLock, 0));
    checkCudaErrors(cuMemcpy2D(&copyParams));     // DMA the packed YUV plane
    checkCudaErrors(cuvidCtxUnlock(ctxLock, 0));
}
// Releases all per-frame I/O resources allocated by AllocateIOBuffers():
// the pitched CUDA input surfaces, the NVENC bitstream buffers, and (on
// Windows) the async completion events, plus the dedicated EOS event.
// Always returns NV_ENC_SUCCESS; individual free failures are not propagated.
NVENCSTATUS VideoEncoder::ReleaseIOBuffers()
{
    for (uint32_t i = 0; i < m_uEncodeBufferCount; i++)
    {
        // The CUVID context lock must be held while calling driver-API
        // functions such as cuMemFree on this context.
        __cu(cuvidCtxLock(m_ctxLock, 0));
        cuMemFree(m_stEncodeBuffer[i].stInputBfr.pNV12devPtr);
        __cu(cuvidCtxUnlock(m_ctxLock, 0));

        m_pNvHWEncoder->NvEncDestroyBitstreamBuffer(m_stEncodeBuffer[i].stOutputBfr.hBitstreamBuffer);
        m_stEncodeBuffer[i].stOutputBfr.hBitstreamBuffer = NULL;

#if defined(NV_WINDOWS)
        // Async-event output is Windows-only; unregister from NVENC, then
        // close the underlying event handle and clear the stale pointer.
        m_pNvHWEncoder->NvEncUnregisterAsyncEvent(m_stEncodeBuffer[i].stOutputBfr.hOutputEvent);
        nvCloseFile(m_stEncodeBuffer[i].stOutputBfr.hOutputEvent);
        m_stEncodeBuffer[i].stOutputBfr.hOutputEvent = NULL;
#endif
    }

    // The EOS marker buffer has its own event (see AllocateIOBuffers); on
    // non-Windows builds hOutputEvent is NULL and this block is a no-op.
    if (m_stEOSOutputBfr.hOutputEvent)
    {
#if defined(NV_WINDOWS)
        m_pNvHWEncoder->NvEncUnregisterAsyncEvent(m_stEOSOutputBfr.hOutputEvent);
        nvCloseFile(m_stEOSOutputBfr.hOutputEvent);
        m_stEOSOutputBfr.hOutputEvent = NULL;
#endif
    }

    return NV_ENC_SUCCESS;
}
// Submits one decoded frame (already in device memory) to the hardware encoder.
//   pEncodeFrame - source frame descriptor (device pointer, pitch, dimensions);
//                  must be non-NULL unless bFlush is set.
//   picType      - picture structure (progressive / field order) for this frame.
//   bFlush       - when true, drains the encoder and returns immediately.
// Returns NV_ENC_SUCCESS, or an NVENC error from buffer mapping/acquisition.
NVENCSTATUS VideoEncoder::EncodeFrame(EncodeFrameConfig *pEncodeFrame, NV_ENC_PIC_STRUCT picType, bool bFlush)
{
    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

    if (bFlush)
    {
        FlushEncoder();
        return NV_ENC_SUCCESS;
    }

    assert(pEncodeFrame);

    EncodeBuffer *pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
    if (!pEncodeBuffer)
    {
        // Ring is full: retire the oldest pending buffer so it can be reused.
        pEncodeBuffer = m_EncodeBufferQueue.GetPending();
        m_pNvHWEncoder->ProcessOutput(pEncodeBuffer);
        // UnMap the input buffer after frame done
        if (pEncodeBuffer->stInputBfr.hInputSurface)
        {
            nvStatus = m_pNvHWEncoder->NvEncUnmapInputResource(pEncodeBuffer->stInputBfr.hInputSurface);
            pEncodeBuffer->stInputBfr.hInputSurface = NULL;
        }
        pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
        // Bug fix: the original dereferenced this pointer unconditionally.
        // If no buffer became available even after draining, fail cleanly
        // instead of crashing.
        if (!pEncodeBuffer)
            return NV_ENC_ERR_OUT_OF_MEMORY;
    }

    // encode width and height
    unsigned int dwWidth  = pEncodeBuffer->stInputBfr.dwWidth;
    unsigned int dwHeight = pEncodeBuffer->stInputBfr.dwHeight;

    // Device-to-device copy from the decoder's surface into the encoder's
    // pitched NV12 input buffer; the CUVID lock must be held around it.
    cuvidCtxLock(m_ctxLock, 0);
    assert(pEncodeFrame->width == dwWidth && pEncodeFrame->height == dwHeight);

    CUDA_MEMCPY2D memcpy2D  = {0};
    memcpy2D.srcMemoryType  = CU_MEMORYTYPE_DEVICE;
    memcpy2D.srcDevice      = pEncodeFrame->dptr;
    memcpy2D.srcPitch       = pEncodeFrame->pitch;
    memcpy2D.dstMemoryType  = CU_MEMORYTYPE_DEVICE;
    memcpy2D.dstDevice      = (CUdeviceptr)pEncodeBuffer->stInputBfr.pNV12devPtr;
    memcpy2D.dstPitch       = pEncodeBuffer->stInputBfr.uNV12Stride;
    memcpy2D.WidthInBytes   = dwWidth;
    memcpy2D.Height         = dwHeight*3/2;   // NV12: luma rows + chroma rows
    __cu(cuMemcpy2D(&memcpy2D));

    cuvidCtxUnlock(m_ctxLock, 0);

    nvStatus = m_pNvHWEncoder->NvEncMapInputResource(pEncodeBuffer->stInputBfr.nvRegisteredResource, &pEncodeBuffer->stInputBfr.hInputSurface);
    if (nvStatus != NV_ENC_SUCCESS)
    {
        PRINTERR("Failed to Map input buffer %p\n", pEncodeBuffer->stInputBfr.hInputSurface);
        return nvStatus;
    }

    // NOTE(review): NvEncEncodeFrame's status is intentionally not propagated
    // here (it may legitimately return NEED_MORE_INPUT in async operation);
    // preserving the original contract of returning success once submitted.
    m_pNvHWEncoder->NvEncEncodeFrame(pEncodeBuffer, NULL, pEncodeFrame->width, pEncodeFrame->height, picType);
    m_iEncodedFrames++;

    return NV_ENC_SUCCESS;
}
// Allocates the encode buffer ring: for each slot, a pitched CUDA device
// surface (NV12, height*3/2 rows), its NVENC resource registration, and a
// bitstream output buffer (plus a Windows async completion event).
// Returns NV_ENC_SUCCESS or the first NVENC error encountered.
NVENCSTATUS VideoEncoder::AllocateIOBuffers(EncodeConfig* pEncodeConfig)
{
    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

    // Enough buffers to cover the B-frame reorder window plus pipelining slack.
    m_uEncodeBufferCount = pEncodeConfig->numB + 4;

    uint32_t uInputWidth  = pEncodeConfig->width;
    uint32_t uInputHeight = pEncodeConfig->height;
    m_EncodeBufferQueue.Initialize(m_stEncodeBuffer, m_uEncodeBufferCount);

    //Allocate input buffer
    for (uint32_t i = 0; i < m_uEncodeBufferCount; i++) {
        // Bug fix: the original passed (size_t*)&uNV12Stride directly to
        // cuMemAllocPitch. On 64-bit builds the driver writes 8 bytes through
        // that pointer, overflowing the (narrower) stride field and corrupting
        // the adjacent struct members. Receive the pitch in a real size_t and
        // narrow it explicitly afterwards.
        size_t nv12Pitch = 0;
        __cu(cuvidCtxLock(m_ctxLock, 0));
        __cu(cuMemAllocPitch(&m_stEncodeBuffer[i].stInputBfr.pNV12devPtr,
            &nv12Pitch,
            uInputWidth, uInputHeight * 3 / 2, 16));
        __cu(cuvidCtxUnlock(m_ctxLock, 0));
        m_stEncodeBuffer[i].stInputBfr.uNV12Stride = (uint32_t)nv12Pitch;

        nvStatus = m_pNvHWEncoder->NvEncRegisterResource(NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR,
            (void*)m_stEncodeBuffer[i].stInputBfr.pNV12devPtr,
            uInputWidth, uInputHeight,
            m_stEncodeBuffer[i].stInputBfr.uNV12Stride,
            &m_stEncodeBuffer[i].stInputBfr.nvRegisteredResource);

        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;

        m_stEncodeBuffer[i].stInputBfr.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
        m_stEncodeBuffer[i].stInputBfr.dwWidth = uInputWidth;
        m_stEncodeBuffer[i].stInputBfr.dwHeight = uInputHeight;

        nvStatus = m_pNvHWEncoder->NvEncCreateBitstreamBuffer(BITSTREAM_BUFFER_SIZE, &m_stEncodeBuffer[i].stOutputBfr.hBitstreamBuffer);
        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;
        m_stEncodeBuffer[i].stOutputBfr.dwBitstreamBufferSize = BITSTREAM_BUFFER_SIZE;

#if defined(NV_WINDOWS)
        // Async mode on Windows: each output buffer signals completion via an event.
        nvStatus = m_pNvHWEncoder->NvEncRegisterAsyncEvent(&m_stEncodeBuffer[i].stOutputBfr.hOutputEvent);
        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;
        m_stEncodeBuffer[i].stOutputBfr.bWaitOnEvent = true;
#else
        m_stEncodeBuffer[i].stOutputBfr.hOutputEvent = NULL;
#endif
    }

    // Dedicated end-of-stream marker buffer (event-only, no input surface).
    m_stEOSOutputBfr.bEOSFlag = TRUE;
#if defined(NV_WINDOWS)
    nvStatus = m_pNvHWEncoder->NvEncRegisterAsyncEvent(&m_stEOSOutputBfr.hOutputEvent);
    if (nvStatus != NV_ENC_SUCCESS)
        return nvStatus;
#else
    m_stEOSOutputBfr.hOutputEvent = NULL;
#endif

    return NV_ENC_SUCCESS;
}
// YV12 and IYUV are both fully planar 4:2:0 formats (12 bits per pixel):
// a luma plane followed by two half-resolution chroma planes (w/2, h/2).
void
VideoEncoder::CopyYV12orIYUVFrame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock)
{
    // The encoder converts this native planar layout to NV12 internally;
    // here we just stage the planes into device memory with two 2D copies.

    // (1) Luma plane: full resolution, packed at the frame width.
    CUDA_MEMCPY2D lumaCopy;
    memset((void *)&lumaCopy, 0, sizeof(lumaCopy));
    lumaCopy.srcMemoryType   = CU_MEMORYTYPE_HOST;
    lumaCopy.srcHost         = sFrameParams.picBuf;
    lumaCopy.srcPitch        = sFrameParams.Width;
    lumaCopy.dstMemoryType   = CU_MEMORYTYPE_DEVICE;
    lumaCopy.dstDevice       = dptr_VideoFrame;
    lumaCopy.dstPitch        = m_pEncoderParams->nDeviceMemPitch;
    lumaCopy.WidthInBytes    = m_pEncoderParams->iInputSize[0];
    lumaCopy.Height          = m_pEncoderParams->iInputSize[1];

    // (2) Chroma planes: U and V sit back to back after the luma plane, so
    // they can travel in one copy with half-width rows at half pitch.
    CUDA_MEMCPY2D chromaCopy;
    memset((void *)&chromaCopy, 0, sizeof(chromaCopy));
    // srcY is counted in rows of srcPitch; height*width luma bytes equal
    // 2*height rows of (width/2)-byte chroma rows, hence the <<1 offset.
    chromaCopy.srcY          = m_pEncoderParams->iInputSize[1]<<1;
    chromaCopy.srcMemoryType = CU_MEMORYTYPE_HOST;
    chromaCopy.srcHost       = sFrameParams.picBuf;
    chromaCopy.srcPitch      = sFrameParams.Width>>1;
    // Same row-offset trick on the destination (dstY * dstPitch lands just
    // past the pitched luma plane).
    chromaCopy.dstY          = m_pEncoderParams->iInputSize[1]<<1;
    chromaCopy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    chromaCopy.dstDevice     = dptr_VideoFrame;
    chromaCopy.dstPitch      = m_pEncoderParams->nDeviceMemPitch>>1;
    chromaCopy.WidthInBytes  = m_pEncoderParams->iInputSize[0]>>1;
    // h/2 rows of U plus h/2 rows of V = h rows total.
    chromaCopy.Height        = m_pEncoderParams->iInputSize[1];

    // The CUVID context lock must be held around driver-API memcopies.
    checkCudaErrors(cuvidCtxLock(ctxLock, 0));
    checkCudaErrors(cuMemcpy2D(&lumaCopy));       // DMA luma
    checkCudaErrors(cuMemcpy2D(&chromaCopy));     // DMA U and V back to back
    checkCudaErrors(cuvidCtxUnlock(ctxLock, 0));
}
// Beispiel #7 (Example #7 — scraped-page separator; stray vote-count "0" folded into this comment)
// Maps each displayable field/frame from the CUVID decoder, copies it to a
// reusable pinned host staging buffer, and wraps it as an NV12 VideoFrame.
// Returns false on any CUDA failure (map, host alloc, or async copy);
// the context lock is released and the frame unmapped on every exit path.
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) {
    // Progressive content yields one field pass; interlaced yields two
    // (plus one more when repeat_first_field is set).
    int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field;

    for (int active_field = 0; active_field < num_fields; ++active_field) {
        CUVIDPROCPARAMS proc_params;
        memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS));
        proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config
        proc_params.second_field = active_field == 1; //check user config
        proc_params.top_field_first = cuviddisp->top_field_first;
        proc_params.unpaired_field = cuviddisp->progressive_frame == 1;

        // Map the decoded surface into a device pointer; the CUVID lock must
        // be held from here until cuvidUnmapVideoFrame.
        CUdeviceptr devptr;
        unsigned int pitch;
        cuvidCtxLock(vid_ctx_lock, 0);
        CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus));
            // NOTE(review): devptr is unset when the map fails; unmapping it
            // here mirrors the original code but looks questionable — verify.
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
#define PAD_ALIGN(x,mask) ( (x + mask) & ~mask )
        //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F);
        uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //?
#undef PAD_ALIGN
        // NV12 occupies pitch*h luma bytes plus pitch*h/2 chroma bytes.
        int size = pitch*h*3/2;
        // Grow-only staging buffer: reallocate the pinned host block when the
        // frame no longer fits.
        if (size > host_data_size && host_data) {
            cuMemFreeHost(host_data);
            host_data = 0;
            host_data_size = 0;
        }
        if (!host_data) {
            cuStatus = cuMemAllocHost((void**)&host_data, size);
            if (cuStatus != CUDA_SUCCESS) {
                qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
                cuvidUnmapVideoFrame(dec, devptr);
                cuvidCtxUnlock(vid_ctx_lock, 0);
                return false;
            }
            host_data_size = size;
        }
        // Belt-and-braces re-check; normally unreachable after a successful alloc.
        if (!host_data) {
            qWarning("No valid staging memory!");
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        // Async device->host copy into the pinned buffer, then synchronize so
        // the data is valid before the surface is unmapped and consumed below.
        cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuCtxSynchronize();
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
        }
        cuvidUnmapVideoFrame(dec, devptr);
        cuvidCtxUnlock(vid_ctx_lock, 0);
        //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index);
        surface_in_use[cuviddisp->picture_index] = false;

        // NV12 plane layout in the staging buffer: luma first, interleaved
        // chroma starting pitch*h bytes in; both planes share the same pitch.
        uchar *planes[] = {
            host_data,
            host_data + pitch * h
        };
        int pitches[] = { (int)pitch, (int)pitch };
        VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12);
        frame.setBits(planes);
        frame.setBytesPerLine(pitches);
        //TODO: is clone required? may crash on clone, I should review clone()
        //frame = frame.clone();
        if (outFrame) {
            *outFrame = frame.clone();
        }
#if COPY_ON_DECODE
        frame_queue.put(frame.clone());
#endif
        //qDebug("frame queue size: %d", frame_queue.size());
    }
    return true;
}
// Beispiel #8 (Example #8 — scraped-page separator; stray vote-count "0" folded into this comment)
// Scoped lock helper for C++ code: acquires the CUVID context lock on
// construction (presumably released by the destructor — not visible here).
CCtxAutoLock::CCtxAutoLock(CUvideoctxlock ctx)
    : m_ctx(ctx)
{
    // Take the lock immediately; flags argument is reserved and must be 0.
    cuvidCtxLock(ctx, 0);
}