Ejemplo n.º 1
0
    // Maps decoded picture `picIdx` with cuvidMapVideoFrame() and returns a GpuMat
    // header built directly on the mapped device pointer, using the pitch reported
    // by the map call as the row step.
    //
    // The header is single-channel 8-bit, targetWidth() columns wide and
    // targetHeight() * 3 / 2 rows tall (presumably a full NV12 surface: luma rows
    // followed by half as many chroma rows -- TODO confirm against the decoder setup).
    //
    // NOTE(review): nothing in this function unmaps the frame; presumably the
    // caller is responsible for that -- verify.
    cuda::GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
    {
        CUdeviceptr ptr;
        unsigned int pitch;

        // Error handling for the map call is delegated to cuSafeCall.
        cuSafeCall( cuvidMapVideoFrame(decoder_, picIdx, &ptr, &pitch, &videoProcParams) );

        const int surfaceRows = targetHeight() * 3 / 2;
        const int surfaceCols = targetWidth();
        void* const mappedData = (void*) ptr;

        return cuda::GpuMat(surfaceRows, surfaceCols, CV_8UC1, mappedData, pitch);
    }
Ejemplo n.º 2
0
void
VideoDecoder::mapFrame(int iPictureIndex, CUdeviceptr *ppDevice, unsigned int *pPitch, CUVIDPROCPARAMS *pVideoProcessingParameters)
{
    CUresult oResult = cuvidMapVideoFrame(oDecoder_,
                                          iPictureIndex,
                                          ppDevice,
                                          pPitch, pVideoProcessingParameters);
    assert(CUDA_SUCCESS == oResult);
    assert(0 != *ppDevice);
    assert(0 != *pPitch);
}
Ejemplo n.º 3
0
void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS &param, int width, int height, int coded_height)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), NULL);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    uchar* host_data = NULL;
    const size_t host_size = pitch*coded_height*3/2;
    CUDA_ENSURE(cuMemAllocHost((void**)&host_data, host_size), NULL);
    // copy to the memory not allocated by cuda is possible but much slower
    CUDA_ENSURE(cuMemcpyDtoH(host_data, devptr, host_size), NULL);

    VideoFrame frame(width, height, VideoFormat::Format_NV12);
    uchar *planes[] = {
        host_data,
        host_data + pitch * coded_height
    };
    frame.setBits(planes);
    int pitches[] = { (int)pitch, (int)pitch };
    frame.setBytesPerLine(pitches);

    VideoFrame *f = reinterpret_cast<VideoFrame*>(handle);
    frame.setTimestamp(f->timestamp());
    frame.setDisplayAspectRatio(f->displayAspectRatio());
    if (format == frame.format())
        *f = frame.clone();
    else
        *f = frame.to(format);

    cuMemFreeHost(host_data);
    return f;
}
Ejemplo n.º 4
0
/*!
 * Fetches the decoded picture described by cuviddisp from the decoder and exposes
 * it as an NV12 VideoFrame backed by the host staging buffer host_data.
 *
 * One pass is made per field: a single pass for a progressive picture, otherwise
 * 2 + repeat_first_field passes. Each pass, under vid_ctx_lock, maps the picture,
 * (re)allocates host_data when it is missing or too small, copies
 * pitch * height * 3 / 2 bytes to it with cuMemcpyDtoHAsync() on stream, waits with
 * cuCtxSynchronize(), and unmaps the picture again.
 *
 * \param cuviddisp display info of the picture to fetch
 * \param outFrame  if not null, receives a clone of the frame built in each pass
 *                  (so the last pass determines its final content)
 * \return false if mapping, staging allocation or the device-to-host copy fails;
 *         true otherwise. A cuCtxSynchronize() failure is only logged.
 */
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) {
    int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field;

    for (int active_field = 0; active_field < num_fields; ++active_field) {
        CUVIDPROCPARAMS proc_params;
        memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS));
        proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config
        proc_params.second_field = active_field == 1; //check user config
        proc_params.top_field_first = cuviddisp->top_field_first;
        proc_params.unpaired_field = cuviddisp->progressive_frame == 1;

        CUdeviceptr devptr = 0;
        unsigned int pitch = 0;
        cuvidCtxLock(vid_ctx_lock, 0);
        CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus));
            // Nothing was mapped, so there is nothing to unmap: devptr holds no
            // valid mapping here and must not be handed to cuvidUnmapVideoFrame().
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        // The decoder's configured height is used as-is; no extra alignment padding is applied.
        uint h = dec_create_info.ulHeight;
        int size = pitch*h*3/2;
        // Drop a staging buffer that is too small; it is re-allocated just below.
        if (size > host_data_size && host_data) {
            cuMemFreeHost(host_data);
            host_data = 0;
            host_data_size = 0;
        }
        if (!host_data) {
            cuStatus = cuMemAllocHost((void**)&host_data, size);
            if (cuStatus != CUDA_SUCCESS) {
                qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
                cuvidUnmapVideoFrame(dec, devptr);
                cuvidCtxUnlock(vid_ctx_lock, 0);
                return false;
            }
            host_data_size = size;
        }
        if (!host_data) {
            qWarning("No valid staging memory!");
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        // The copy is asynchronous: wait for it before unmapping the source and reading host_data.
        cuStatus = cuCtxSynchronize();
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
        }
        cuvidUnmapVideoFrame(dec, devptr);
        cuvidCtxUnlock(vid_ctx_lock, 0);
        //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index);
        // NOTE(review): the surface is flagged free after the first pass although
        // later passes of an interlaced picture map the same index again -- confirm.
        surface_in_use[cuviddisp->picture_index] = false;

        // Two planes in the staging buffer: the first at offset 0, the second
        // pitch * h bytes further on, both with pitch bytes per line.
        uchar *planes[] = {
            host_data,
            host_data + pitch * h
        };
        int pitches[] = { (int)pitch, (int)pitch };
        VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12);
        frame.setBits(planes);
        frame.setBytesPerLine(pitches);
        //TODO: is clone required? may crash on clone, I should review clone()
        //frame = frame.clone();
        if (outFrame) {
            *outFrame = frame.clone();
        }
#if COPY_ON_DECODE
        frame_queue.put(frame.clone());
#endif
        //qDebug("frame queue size: %d", frame_queue.size());
    }
    return true;
}
Ejemplo n.º 5
0
static int cuvid_output_frame(AVCodecContext *avctx, AVFrame *frame)
{
    CuvidContext *ctx = avctx->priv_data;
    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    CUdeviceptr mapped_frame = 0;
    int ret = 0, eret = 0;

    av_log(avctx, AV_LOG_TRACE, "cuvid_output_frame\n");

    if (ctx->decoder_flushing) {
        ret = cuvid_decode_packet(avctx, NULL);
        if (ret < 0 && ret != AVERROR_EOF)
            return ret;
    }

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0)
        return ret;

    if (av_fifo_size(ctx->frame_queue)) {
        CuvidParsedFrame parsed_frame;
        CUVIDPROCPARAMS params;
        unsigned int pitch = 0;
        int offset = 0;
        int i;

        av_fifo_generic_read(ctx->frame_queue, &parsed_frame, sizeof(CuvidParsedFrame), NULL);

        memset(&params, 0, sizeof(params));
        params.progressive_frame = parsed_frame.dispinfo.progressive_frame;
        params.second_field = parsed_frame.second_field;
        params.top_field_first = parsed_frame.dispinfo.top_field_first;

        ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, parsed_frame.dispinfo.picture_index, &mapped_frame, &pitch, &params));
        if (ret < 0)
            goto error;

        if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
            ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n");
                goto error;
            }

            ret = ff_decode_frame_props(avctx, frame);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n");
                goto error;
            }

            for (i = 0; i < 2; i++) {
                CUDA_MEMCPY2D cpy = {
                    .srcMemoryType = CU_MEMORYTYPE_DEVICE,
                    .dstMemoryType = CU_MEMORYTYPE_DEVICE,
                    .srcDevice     = mapped_frame,
                    .dstDevice     = (CUdeviceptr)frame->data[i],
                    .srcPitch      = pitch,
                    .dstPitch      = frame->linesize[i],
                    .srcY          = offset,
                    .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
                    .Height        = avctx->height >> (i ? 1 : 0),
                };

                ret = CHECK_CU(cuMemcpy2D(&cpy));
                if (ret < 0)
                    goto error;

                offset += avctx->coded_height;
            }
        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
Ejemplo n.º 6
0
/*!
 * Copies one plane of decoded picture picIndex from the decoder surface into the
 * CUDA array of the graphics resource res[plane].cuRes.
 *
 * Steps: ensureResource() for tex/plane, map the video frame, map the graphics
 * resource and fetch its array, then a 2D copy of pitch bytes per row. For plane 0
 * the copy covers h rows starting at row 0; for plane 1 it covers h/2 rows starting
 * at source row H. The copy is issued on res[plane].stream when that stream is set,
 * otherwise synchronously.
 *
 * \param picIndex index of the decoded picture to map
 * \param param    processing parameters forwarded to cuvidMapVideoFrame()
 * \param tex      texture passed to ensureResource()
 * \param w        width passed to ensureResource()
 * \param h        number of rows copied for plane 0 (halved for plane 1)
 * \param H        source row at which plane 1 starts
 * \param plane    plane index (selects res[plane]; 1 selects the second-plane offsets)
 * \return false if ensureResource() fails or (presumably, via CUDA_ENSURE) a CUDA call
 *         fails; true otherwise
 *
 * NOTE(review): if CUDA_ENSURE returns early after cuGraphicsMapResources() succeeded,
 * the graphics resource looks like it stays mapped -- confirm how callers recover.
 */
bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, H, tex, plane)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    // presumably unmaps devptr when it goes out of scope -- verify against CUVIDAutoUnmapper
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);

    // Describe a device-memory -> array 2D copy; fields not set below stay zero.
    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch;
    // the whole size or copy size?
    cu2d.WidthInBytes = pitch;
    cu2d.Height = h;
    if (plane == 1) {
        // Second plane: start reading at source row H and copy half as many rows.
        cu2d.srcXInBytes = 0;// +srcY*srcPitch + srcXInBytes
        cu2d.srcY = H; // skip the padding height
        cu2d.Height /= 2;
    }
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
    // so I simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            // Wait for the asynchronous copy issued above before unmapping.
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    } else {
        // NOTE(review): the matching cuCtxPushCurrent() above is commented out -- confirm this pop is intended.
        // call it at last. current context will be used by other cuda calls (unmap() for example)
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // not required
    }
    return true;
}
Ejemplo n.º 7
0
/*!
 * Copies decoded picture picIndex into the CUDA array of res[plane].cuRes, then
 * copies texture9_nv12 into surface9_nv12 on the CPU and finishes by calling the
 * map(surface9_nv12, tex, w, h, H) overload.
 *
 * The CUDA copy moves res[plane].W bytes per row for H*3/2 rows in a single
 * transfer; the CPU copy moves res[plane].W*H*3/2 bytes between the two locked
 * D3D9 rectangles.
 *
 * \param picIndex index of the decoded picture to map
 * \param param    processing parameters forwarded to cuvidMapVideoFrame();
 *                 param.Reserved[0] is also passed to ensureResource()
 * \param tex      texture passed to ensureResource() and to the final map() call
 * \param w        width passed to ensureResource() and to the final map() call
 * \param h        height passed to ensureResource() and to the final map() call
 * \param H        row count from which the copied height (H*3/2) is derived
 * \param plane    index into res[] (the original comment states it is always 0)
 * \return false if ensureResource() fails, the final map() call fails, or
 *         (presumably, via CUDA_ENSURE / DX_ENSURE) a CUDA or D3D call fails;
 *         true otherwise
 */
bool EGLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    // plane is always 0 because frame is rgb
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, param.Reserved[0], H, tex)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    // presumably unmaps devptr when it goes out of scope -- verify against CUVIDAutoUnmapper
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);
    // NOTE(review): the resource is unmapped here, BEFORE the copy below writes to `array`;
    // the original author relies on the array staying usable -- confirm this is safe.
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); // mapped array still accessible!

    // Describe a device-memory -> array 2D copy; fields not set below stay zero.
    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    // Source starts at the Y plane; the H*3/2 rows copied below cover the whole surface in one transfer
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch;
    // the whole size or copy size?
    cu2d.WidthInBytes = res[plane].W; // the same value as texture9_nv12
    cu2d.Height = H*3/2;
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
    // so I simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            // Wait for the asynchronous copy issued above before the CPU reads the texture.
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
//        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    }
    // CPU-side copy from the texture to the surface through locked rectangles.
    // NOTE(review): the memcpy copies one contiguous run and ignores the Pitch of both
    // locked rects; this assumes both are tightly packed with a row size of res[plane].W -- confirm.
    // NOTE(review): if the second LockRect fails and DX_ENSURE returns, texture9_nv12 looks like it stays locked -- confirm.
    D3DLOCKED_RECT rect_src, rect_dst;
    DX_ENSURE(texture9_nv12->LockRect(0, &rect_src, NULL, D3DLOCK_READONLY), false);
    DX_ENSURE(surface9_nv12->LockRect(&rect_dst, NULL, D3DLOCK_DISCARD), false);
    memcpy(rect_dst.pBits, rect_src.pBits, res[plane].W*H*3/2); // exactly w and h
    DX_ENSURE(surface9_nv12->UnlockRect(), false);
    DX_ENSURE(texture9_nv12->UnlockRect(0), false);
#if 0
    //IDirect3DSurface9 *raw_surface = NULL;
    //DX_ENSURE(texture9_nv12->GetSurfaceLevel(0, &raw_surface), false);
    const RECT src = { 0, 0, w, h*3/2};
    DX_ENSURE(device9->StretchRect(raw_surface, &src, surface9_nv12, NULL, D3DTEXF_NONE), false);
#endif
    // Hand the filled surface to the surface-based map() overload.
    if (!map(surface9_nv12, tex, w, h, H))
        return false;
    return true;
}
Ejemplo n.º 8
0
/*
 * Main transcoding thread
 * Initializes CUDA device, decodes frames with NVCUVID API and adds them to frame queue, which passes them to NVENC for encoding, then output
 *
 * Sequence: validate the input/output file names, open the output file, create the
 * CUDA context and the NVCUVID context lock, set up decoder, frame queue and encoder,
 * start the decode thread, then map/encode/unmap every dequeued frame until the queue
 * reports end of decode and is empty. Finally flush the encoder, join the decode
 * thread, publish the timing results and tear everything down.
 *
 * Failures are reported by emitting Error() with the matching ERR_* code, after which
 * the function returns immediately.
 * NOTE(review): those early returns do not release what was created before the failure
 * (output file, CUDA context, decoder/encoder objects) -- confirm whether that matters
 * for repeated runs.
 */
void NVENCGUI::Transcode()
{
	CUresult result;

	// initialize CUDA
	result = cuInit(0);
	if (result != CUDA_SUCCESS)
	{
		emit Error(ERR_CUDA_INIT);
		return;
	}

	NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

	// no input file
	if (encodeConfig.inputFileName == NULL)
	{
		emit Error(ERR_INPUT);
		return;
	}

	// no output file
	if (encodeConfig.outputFileName == NULL)
	{
		emit Error(ERR_OUTPUT);
		return;
	}

	// unable to open input file
	// The file is opened only to check that it is readable; close the handle again
	// right away so that it is not leaked.
	FILE* fInputProbe = fopen(encodeConfig.inputFileName, "r");
	if (fInputProbe == NULL)
	{
		emit Error(ERR_INPUT);
		return;
	}
	fclose(fInputProbe);

	encodeConfig.fOutput = fopen(encodeConfig.outputFileName, "wb");
	// unable to open output file
	if (encodeConfig.fOutput == NULL)
	{
		emit Error(ERR_OUTPUT);
		return;
	}

	// initialize CUDA on device and set CUDA context
	CUcontext cudaCtx;
	CUdevice device;

	result = cuDeviceGet(&device, encodeConfig.deviceID);
	if (result != CUDA_SUCCESS)
	{
		emit Error(ERR_CUDA_DEVICE);
		return;
	}
	result = cuCtxCreate(&cudaCtx, CU_CTX_SCHED_AUTO, device);
	if (result != CUDA_SUCCESS)
	{
		emit Error(ERR_CUDA_CTX);
		return;
	}

	// initialize NVCUVID context
	// The context created above is popped again and handed to the context lock.
	CUcontext curCtx;
	CUvideoctxlock ctxLock;
	result = cuCtxPopCurrent(&curCtx);
	if (result != CUDA_SUCCESS)
	{
		emit Error(ERR_CUDA_CTX);
		return;
	}
	result = cuvidCtxLockCreate(&ctxLock, curCtx);
	if (result != CUDA_SUCCESS)
	{
		emit Error(ERR_CUDA_CTX);
		return;
	}

	CudaDecoder* pDecoder = new CudaDecoder;
	FrameQueue* pFrameQueue = new CUVIDFrameQueue(ctxLock);
	pDecoder->InitVideoDecoder(encodeConfig.inputFileName, ctxLock, pFrameQueue, encodeConfig.width, encodeConfig.height);

	int decodedW, decodedH, decodedFRN, decodedFRD;
	pDecoder->GetCodecParam(&decodedW, &decodedH, &decodedFRN, &decodedFRD);

	// If the width/height is not set, set to same as source
	if (encodeConfig.width <= 0 || encodeConfig.height <= 0) {
		encodeConfig.width = decodedW;
		encodeConfig.height = decodedH;
	}

	// same, except for fps
	// (falls back to 30 when the source frame rate is unknown; otherwise integer division)
	if (encodeConfig.fps <= 0) {
		if (decodedFRN <= 0 || decodedFRD <= 0)
			encodeConfig.fps = 30;
		else
			encodeConfig.fps = decodedFRN / decodedFRD;
	}

	// initialize frame queue with width/height
	pFrameQueue->init(encodeConfig.width, encodeConfig.height);

	VideoEncoder* pEncoder = new VideoEncoder(ctxLock);
	assert(pEncoder->GetHWEncoder());

	// initialize NVENC HW Encoder
	nvStatus = pEncoder->GetHWEncoder()->Initialize(cudaCtx, NV_ENC_DEVICE_TYPE_CUDA);
	if (nvStatus != NV_ENC_SUCCESS)
	{
		emit Error(ERR_NVENC_ENC_INIT);
		return;
	}

	// get preset GUID
	encodeConfig.presetGUID = pEncoder->GetHWEncoder()->GetPresetGUID(encodeConfig.encoderPreset, encodeConfig.codec);

	// create encoder
	nvStatus = pEncoder->GetHWEncoder()->CreateEncoder(&encodeConfig);
	if (nvStatus != NV_ENC_SUCCESS)
	{
		emit Error(ERR_NVENC_ENC_CREATE);
		return;
	}

	// create buffer
	nvStatus = pEncoder->AllocateIOBuffers(&encodeConfig);
	if (nvStatus != NV_ENC_SUCCESS)
	{
		emit Error(ERR_NVENC_ENC_BUFFER);
		return;
	}

	// print details to text window, start counter
	emit PrintDetails();
	NvQueryPerformanceCounter(&results.lStart);

	//start decoding thread
#ifdef _WIN32
	HANDLE decodeThread = CreateThread(NULL, 0, DecodeProc, (LPVOID)pDecoder, 0, NULL);
#else
	pthread_t pid;
	pthread_create(&pid, NULL, DecodeProc, (void*)pDecoder);
#endif

	int encodedFrames = 0;

	//start encoding thread
	// Runs until the queue reports end of decode AND has been drained.
	while (!(pFrameQueue->isEndOfDecode() && pFrameQueue->isEmpty())) 
	{
		CUVIDPARSERDISPINFO pInfo;
		if (pFrameQueue->dequeue(&pInfo)) 
		{
			CUdeviceptr dMappedFrame = 0;
			unsigned int pitch;
			CUVIDPROCPARAMS oVPP = { 0 };
			oVPP.unpaired_field = 1;
			oVPP.progressive_frame = 1;

			result = cuvidMapVideoFrame(pDecoder->GetDecoder(), pInfo.picture_index, &dMappedFrame, &pitch, &oVPP);
			if (result != CUDA_SUCCESS)
			{
				// Nothing was mapped: encoding from dMappedFrame or unmapping it would
				// operate on an invalid pointer. Report, hand the surface back to the
				// queue and carry on with the next frame.
				fprintf(stderr, "cuvidMapVideoFrame failed for picture %d (error %d)\n", pInfo.picture_index, (int)result);
				pFrameQueue->releaseFrame(&pInfo);
				continue;
			}

			EncodeFrameConfig stEncodeConfig = { 0 };
			stEncodeConfig.dptr = dMappedFrame;
			stEncodeConfig.pitch = pitch;
			stEncodeConfig.width = encodeConfig.width;
			stEncodeConfig.height = encodeConfig.height;
			pEncoder->EncodeFrame(&stEncodeConfig);

			cuvidUnmapVideoFrame(pDecoder->GetDecoder(), dMappedFrame);
			pFrameQueue->releaseFrame(&pInfo);
			//emit IncrementEncodedFrames();
		}
	}

	// flush
	pEncoder->EncodeFrame(NULL, true);

	// end decoding thread
#ifdef _WIN32
	WaitForSingleObject(decodeThread, INFINITE);
#else
	pthread_join(pid, NULL);
#endif

	// print transcoding details
	if (pEncoder->GetEncodedFrames() > 0)
	{
		results.decodedFrames = pDecoder->m_decodedFrames;
		results.encodedFrames = pEncoder->GetEncodedFrames();

		NvQueryPerformanceCounter(&results.lEnd);
		NvQueryPerformanceFrequency(&results.lFreq);
		results.elapsedTime = (double)(results.lEnd - results.lStart) / (double)results.lFreq;
	}
	emit TranscodingEnd();

	// clean up

	cuvidCtxLockDestroy(ctxLock);
	pEncoder->Deinitialize();
	delete pDecoder;
	delete pEncoder;
	delete pFrameQueue;

	result = cuCtxDestroy(cudaCtx);
	if (result != CUDA_SUCCESS)
	{
		emit Error(ERR_CUDA_CTX_DESTROY);
		return;
	}

	return;
}
Ejemplo n.º 9
0
static int cuvid_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
{
    CuvidContext *ctx = avctx->priv_data;
    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    AVFrame *frame = data;
    CUVIDSOURCEDATAPACKET cupkt;
    AVPacket filter_packet = { 0 };
    AVPacket filtered_packet = { 0 };
    CUdeviceptr mapped_frame = 0;
    int ret = 0, eret = 0;

    if (ctx->bsf && avpkt->size) {
        if ((ret = av_packet_ref(&filter_packet, avpkt)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_packet_ref failed\n");
            return ret;
        }

        if ((ret = av_bsf_send_packet(ctx->bsf, &filter_packet)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_bsf_send_packet failed\n");
            av_packet_unref(&filter_packet);
            return ret;
        }

        if ((ret = av_bsf_receive_packet(ctx->bsf, &filtered_packet)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_bsf_receive_packet failed\n");
            return ret;
        }

        avpkt = &filtered_packet;
    }

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0) {
        av_packet_unref(&filtered_packet);
        return ret;
    }

    memset(&cupkt, 0, sizeof(cupkt));

    if (avpkt->size) {
        cupkt.payload_size = avpkt->size;
        cupkt.payload = avpkt->data;

        if (avpkt->pts != AV_NOPTS_VALUE) {
            cupkt.flags = CUVID_PKT_TIMESTAMP;
            if (avctx->pkt_timebase.num && avctx->pkt_timebase.den)
                cupkt.timestamp = av_rescale_q(avpkt->pts, avctx->pkt_timebase, (AVRational){1, 10000000});
            else
                cupkt.timestamp = avpkt->pts;
        }
    } else {
        cupkt.flags = CUVID_PKT_ENDOFSTREAM;
    }

    ret = CHECK_CU(cuvidParseVideoData(ctx->cuparser, &cupkt));

    av_packet_unref(&filtered_packet);

    if (ret < 0) {
        goto error;
    }

    // cuvidParseVideoData doesn't return an error just because stuff failed...
    if (ctx->internal_error) {
        av_log(avctx, AV_LOG_ERROR, "cuvid decode callback error\n");
        ret = ctx->internal_error;
        goto error;
    }

    if (av_fifo_size(ctx->frame_queue)) {
        CUVIDPARSERDISPINFO dispinfo;
        CUVIDPROCPARAMS params;
        unsigned int pitch = 0;
        int offset = 0;
        int i;

        av_fifo_generic_read(ctx->frame_queue, &dispinfo, sizeof(CUVIDPARSERDISPINFO), NULL);

        memset(&params, 0, sizeof(params));
        params.progressive_frame = dispinfo.progressive_frame;
        params.second_field = 0;
        params.top_field_first = dispinfo.top_field_first;

        ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, dispinfo.picture_index, &mapped_frame, &pitch, &params));
        if (ret < 0)
            goto error;

        if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
            ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n");
                goto error;
            }

            ret = ff_decode_frame_props(avctx, frame);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n");
                goto error;
            }

            for (i = 0; i < 2; i++) {
                CUDA_MEMCPY2D cpy = {
                    .srcMemoryType = CU_MEMORYTYPE_DEVICE,
                    .dstMemoryType = CU_MEMORYTYPE_DEVICE,
                    .srcDevice     = mapped_frame,
                    .dstDevice     = (CUdeviceptr)frame->data[i],
                    .srcPitch      = pitch,
                    .dstPitch      = frame->linesize[i],
                    .srcY          = offset,
                    .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
                    .Height        = avctx->coded_height >> (i ? 1 : 0),
                };

                ret = CHECK_CU(cuMemcpy2D(&cpy));
                if (ret < 0)
                    goto error;

                offset += avctx->coded_height;
            }
        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {