Example #1
void GPUInterface::LaunchKernelConcurrent(GPUFunction deviceFunction,
                                         Dim3Int block,
                                         Dim3Int grid,
                                         int streamIndex,
                                         int waitIndex,
                                         int parameterCountV,
                                         int totalParameterCount,
                                         ...) { // parameters
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernelConcurrent\n");
#endif

    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    void** params;
    GPUPtr* paramPtrs;
    unsigned int* paramInts;

    params = (void**)malloc(sizeof(void*) * totalParameterCount);
    paramPtrs = (GPUPtr*)malloc(sizeof(GPUPtr) * totalParameterCount);
    paramInts = (unsigned int*)malloc(sizeof(unsigned int) * totalParameterCount);

    va_list parameters;
    va_start(parameters, totalParameterCount);
    for(int i = 0; i < parameterCountV; i++) {
       paramPtrs[i] = (GPUPtr)(size_t)va_arg(parameters, GPUPtr);
       params[i] = (void*)&paramPtrs[i];
    }
    for(int i = parameterCountV; i < totalParameterCount; i++) {
       paramInts[i-parameterCountV] = va_arg(parameters, unsigned int);
       params[i] = (void*)&paramInts[i-parameterCountV];
    }

    va_end(parameters);

    if (streamIndex >= 0) {
        int streamIndexMod = streamIndex % numStreams;

        if (waitIndex >= 0) {
            int waitIndexMod = waitIndex % numStreams;
            SAFE_CUDA(cuStreamWaitEvent(cudaStreams[streamIndexMod], cudaEvents[waitIndexMod], 0));
        }

        SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z,
                                 block.x, block.y, block.z, 0,
                                 cudaStreams[streamIndexMod], params, NULL));

        SAFE_CUDA(cuEventRecord(cudaEvents[streamIndexMod], cudaStreams[streamIndexMod]));
    } else {
        SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z,
                                 block.x, block.y, block.z, 0,
                                 cudaStreams[0], params, NULL));
    }

    free(params);
    free(paramPtrs);
    free(paramInts);

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::LaunchKernelConcurrent\n");
#endif

}
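The variadic convention is the subtle part of this interface: the first parameterCountV arguments must be GPUPtr device pointers, and the remaining totalParameterCount - parameterCountV arguments must be unsigned ints, in that order. A hypothetical call site illustrating the packing (kernel and buffer names are placeholders, and Dim3Int is assumed to be aggregate-initializable):

// Hypothetical caller: kMatMul is a GPUFunction loaded elsewhere;
// dOut, dA, dB are GPUPtr device buffers. Three pointer arguments
// come first, then two unsigned ints, matching parameterCountV = 3
// and totalParameterCount = 5.
Dim3Int block = {16, 16, 1};
Dim3Int grid  = {64, 64, 1};
gpu->LaunchKernelConcurrent(kMatMul, block, grid,
                            /* streamIndex */ 2, /* waitIndex */ 1,
                            /* parameterCountV */ 3,
                            /* totalParameterCount */ 5,
                            dOut, dA, dB,
                            (unsigned int)512, (unsigned int)512);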
int gib_recover ( void *buffers, int buf_size, int *buf_ids, int recover_last,
		  gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if !GIB_USE_MMAP
  if (buf_size > gib_buf_size) {
    int rc = gib_cpu_recover(buffers, buf_size, buf_ids, recover_last, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int i, j;
  int n = c->n;
  int m = c->m;
  unsigned char A[128*128], inv[128*128], modA[128*128];
  for (i = n; i < n+recover_last; i++)
    if (buf_ids[i] >= n) {
      fprintf(stderr, "Attempting to recover a parity buffer, not allowed\n");
      /* Pop the context pushed above before bailing out */
      ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
      return GIB_ERR;
    }

  gib_galois_gen_A(A, m+n, n);

  /* Modify the matrix to have the failed drives reflected */
  for (i = 0; i < n; i++) 
    for (j = 0; j < n; j++) 
      modA[i*n+j] = A[buf_ids[i]*n+j];

  gib_galois_gaussian_elim(modA, inv, n, n);

  /* Copy row buf_ids[i] into row i */
  for (i = n; i < n+recover_last; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = inv[buf_ids[i]*n+j];

  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, modA+n*n, (c->m)*(c->n)));

#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->recover, nthreads_per_block, 
				       1, 1));
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)gpu_c->buffers;
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &buf_size, 
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &recover_last, 
			       sizeof(recover_last)));
  offset += sizeof(recover_last);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->recover, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->recover, nblocks, 1));
#if !GIB_USE_MMAP
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, recover_last*buf_size));
#else
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
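cuFuncSetBlockShape, cuParamSetv, cuParamSetSize, and cuLaunchGrid are the legacy execution-control API, deprecated since CUDA 4.0. For comparison, a sketch of the same launch through the modern cuLaunchKernel, which takes the kernel arguments as an array of pointers (variables as in gib_recover above):

/* Equivalent modern launch (CUDA >= 4.0): arguments are passed as an
 * array of pointers to their values, replacing the cuParamSet* calls. */
void *args[] = { &ptr, &buf_size, &recover_last };
ERROR_CHECK_FAIL(cuLaunchKernel(gpu_c->recover,
                                nblocks, 1, 1,            /* grid dims */
                                nthreads_per_block, 1, 1, /* block dims */
                                0,                        /* shared memory */
                                0,                        /* default stream */
                                args, NULL));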
	// Run the Cuda part of the computation
	bool CudaVideoRender::copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive)
	{
		CUVIDPARSERDISPINFO oDisplayInfo;
		if (m_pFrameQueue->dequeue(&oDisplayInfo))
		{
			CCtxAutoLock lck  ( m_CtxLock );
			// Push the current CUDA context (only if we are using CUDA decoding path)
			CUresult result = cuCtxPushCurrent(m_cuContext);

			CUdeviceptr	 pDecodedFrame[2] = { 0, 0 };
			CUdeviceptr  pInteropFrame[2] = { 0, 0 };

			int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field));
			*pbIsProgressive = oDisplayInfo.progressive_frame;
			m_bIsProgressive = oDisplayInfo.progressive_frame ? true : false;
			for (int active_field=0; active_field<num_fields; active_field++)
			{
				nRepeats = oDisplayInfo.repeat_first_field;
				CUVIDPROCPARAMS oVideoProcessingParameters;
				memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS));

				oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame;
				oVideoProcessingParameters.second_field      = active_field;
				oVideoProcessingParameters.top_field_first   = oDisplayInfo.top_field_first;
				oVideoProcessingParameters.unpaired_field    = (num_fields == 1);

				unsigned int nDecodedPitch = 0;
				unsigned int nWidth = 0;
				unsigned int nHeight = 0;

				// map decoded video frame to CUDA surface
				m_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, (unsigned int*)&pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters);
				nWidth  = m_pVideoDecoder->targetWidth();
				nHeight = m_pVideoDecoder->targetHeight();
				// map DirectX texture to CUDA surface
				unsigned int nTexturePitch = 0;

				// If we are encoding and this is the first frame, make sure we allocate system memory for readbacks
				if (m_bReadback && m_bFirstFrame && m_ReadbackSID) {
					CUresult result;
					cutilDrvSafeCallNoSync( result = cuMemAllocHost( (void **)&m_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2) ) );
					cutilDrvSafeCallNoSync( result = cuMemAllocHost( (void **)&m_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2) ) );
					m_bFirstFrame = false;
					if (result != CUDA_SUCCESS) {
						printf("cuMemAllocHost returned %d\n", (int)result);
					}
				}

				// If streams are enabled, we can perform the readback to the host while the kernel is executing
				if (m_bReadback && m_ReadbackSID) {
					//TODO: test if &m_bFrameData[active_field] is the correct void*
					CUresult result = cuMemcpyDtoHAsync(&m_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), m_ReadbackSID);
					if (result != CUDA_SUCCESS) {
						printf("cuMemAllocHost returned %d\n", (int)result);
					}
				}

#if ENABLE_DEBUG_OUT
				printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n", 
					(oDisplayInfo.progressive_frame ? "Frame" : "Field"),
					m_nDecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp);
#endif

				if (true) {
					// map the texture surface
					//m_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field);
					//TODO: map interop frames to d3d9surface
					map(&pInteropFrame[active_field], &nTexturePitch, active_field);
				} else {
					pInteropFrame[active_field] = m_pInteropFrame[active_field];
					nTexturePitch = m_pVideoDecoder->targetWidth() * 2; 
				}

				// perform post processing on the CUDA surface (color space conversion and post processing)
				// comment this out if we include the line of code seen above
				cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, m_pCudaModule->getModule(), m_fpNV12toARGB, m_KernelSID);
				if (true) {
					// unmap the texture surface
					//m_pImageDX->unmap(active_field);
					//TODO: unmap interop frames from d3d9surface
					unmap(active_field);
				}

				// unmap video frame
				// unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
				m_pVideoDecoder->unmapFrame((unsigned int*)&pDecodedFrame[active_field]);
				// release the frame, so it can be re-used in decoder
				m_pFrameQueue->releaseFrame(&oDisplayInfo);
				m_nDecodeFrameCount++;
			}

			// Detach from the Current thread
			cutilDrvSafeCallNoSync( cuCtxPopCurrent(NULL) );
		} else {
			return false;
		}

		// check if decoding has come to an end.
		// if yes, signal the app to shut down.
		if (!m_pVideoSource->isStarted() || m_pFrameQueue->isEndOfDecode())
		{
			// Let's free the Frame Data
			if (m_ReadbackSID && m_bFrameData) {
				cuMemFreeHost((void *)m_bFrameData[0]);
				cuMemFreeHost((void *)m_bFrameData[1]);
				m_bFrameData[0] = NULL;
				m_bFrameData[1] = NULL;
			}

			// Let's just stop, and allow the user to quit, so they can at least see the results
			m_pVideoSource->stop();

			// If we want to loop reload the video file and restart
			if (m_bLoop && !m_bAutoQuit) {
				reinitCudaResources();
				m_nFrameCount = 0;
				m_nDecodeFrameCount = 0;
				m_pVideoSource->start();
			}
			if (m_bAutoQuit) {
				m_bDone = true;
			}
		}
		return true;
	}
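The readback path above uses cuMemAllocHost because cuMemcpyDtoHAsync only overlaps with other GPU work when the host buffer is page-locked; with pageable memory the transfer is effectively synchronous. The core pattern, reduced to a minimal self-contained sketch (the parameter names stand in for the decoder pitch, height, and readback stream used above):

/* Minimal pinned-readback sketch: page-locked host memory lets the
 * async copy actually overlap with work queued on other streams. */
static void readback_sketch(CUdeviceptr devFrame, unsigned int nDecodedPitch,
                            unsigned int nHeight, CUstream stream)
{
    void *hostBuf = NULL;
    size_t nbytes = (size_t)nDecodedPitch * nHeight * 3 / 2; /* NV12 frame */
    cuMemAllocHost(&hostBuf, nbytes);            /* page-locked allocation */
    cuMemcpyDtoHAsync(hostBuf, devFrame, nbytes, stream);
    /* ... other GPU work can be queued here ... */
    cuStreamSynchronize(stream);                 /* wait before using hostBuf */
    cuMemFreeHost(hostBuf);
}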
int gib_generate ( void *buffers, int buf_size, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  /* Do it all at once if the buffers are small enough */
#if !GIB_USE_MMAP
  /* This is too large to do at once in the GPU memory we have allocated.
   * Split it into several noncontiguous jobs. 
   */
  if (buf_size > gib_buf_size) {
    int rc = gib_generate_nc(buffers, buf_size, buf_size, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;
  
  unsigned char F[256*256];
  gib_galois_gen_F(F, c->m, c->n);
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, (c->m)*(c->n)));
  
#if !GIB_USE_MMAP
  /* Copy the buffers to memory */
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, 
				(c->n)*buf_size));
#endif
  /* Configure and launch */
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->checksum, nthreads_per_block,
				       1, 1));
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)(gpu_c->buffers);
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &buf_size,
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->checksum, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->checksum, nblocks, 1));

  /* Get the results back */
#if !GIB_USE_MMAP
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, (c->m)*buf_size));
#else
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC; 
}
int gib_free ( void *buffers, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  ERROR_CHECK_FAIL(cuMemFreeHost(buffers));
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
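gib_free pairs with an allocation made inside the same context. The matching allocator is not shown in this excerpt; a minimal sketch of what it presumably looks like for the non-mmap path (the real gibraltar allocator also handles the GIB_USE_MMAP mapping flags, omitted here):

/* Hypothetical minimal counterpart to gib_free: page-locked host
 * allocation performed inside the library's CUDA context. */
int gib_alloc_sketch ( void **buffers, size_t size, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  ERROR_CHECK_FAIL(cuMemAllocHost(buffers, size));
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}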
Example #6
static av_cold int cudascale_config_props(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    AVFilterLink *inlink = outlink->src->inputs[0];
    CUDAScaleContext *s  = ctx->priv;
    AVHWFramesContext     *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    int w, h;
    int ret;

    extern char vf_scale_cuda_ptx[];

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0)
        goto fail;

    ret = CHECK_CU(cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx));
    if (ret < 0) {
        CHECK_CU(cuCtxPopCurrent(&dummy));
        goto fail;
    }

    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4"));

    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex"));

    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER));

    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR));

    CHECK_CU(cuCtxPopCurrent(&dummy));

    if ((ret = ff_scale_eval_dimensions(s,
                                        s->w_expr, s->h_expr,
                                        inlink, outlink,
                                        &w, &h)) < 0)
        goto fail;

    if (((int64_t)h * inlink->w) > INT_MAX  ||
        ((int64_t)w * inlink->h) > INT_MAX)
        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");

    outlink->w = w;
    outlink->h = h;

    ret = init_processing_chain(ctx, inlink->w, inlink->h, w, h);
    if (ret < 0)
        return ret;

    av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d -> w:%d h:%d\n",
           inlink->w, inlink->h, outlink->w, outlink->h);

    if (inlink->sample_aspect_ratio.num) {
        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w,
                                                             outlink->w*inlink->h},
                                                inlink->sample_aspect_ratio);
    } else {
        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
    }

    return 0;

fail:
    return ret;
}
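CHECK_CU wraps every driver call so a failing CUresult is logged and converted to a negative error code that works with the goto fail cleanup style. The actual ffmpeg macro delegates to a shared helper; a representative sketch of the idea:

/* Sketch of a CHECK_CU-style helper: log the failing call and map
 * CUresult onto a negative AVERROR value. */
static int check_cu(AVFilterContext *ctx, CUresult err, const char *func)
{
    const char *err_name = NULL;
    if (err == CUDA_SUCCESS)
        return 0;
    cuGetErrorName(err, &err_name);
    av_log(ctx, AV_LOG_ERROR, "%s failed: %s\n", func,
           err_name ? err_name : "unknown error");
    return AVERROR_EXTERNAL;
}
#define CHECK_CU(x) check_cu(ctx, (x), #x)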
Example #7
void cuda_enter(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  if (!ctx->enter)
    cuCtxPushCurrent(ctx->ctx);
  ctx->enter++;
}
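cuda_enter keeps a nesting count so the context is pushed only on the outermost entry. The matching exit presumably decrements the count and pops only when it returns to zero; a sketch:

/* Sketch of the matching exit: pop only when the outermost
 * cuda_enter/cuda_exit pair unwinds. */
void cuda_exit(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  ctx->enter--;
  if (!ctx->enter)
    cuCtxPopCurrent(NULL);
}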
Example #8
static int cuvid_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
{
    CuvidContext *ctx = avctx->priv_data;
    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    AVFrame *frame = data;
    CUVIDSOURCEDATAPACKET cupkt;
    AVPacket filter_packet = { 0 };
    AVPacket filtered_packet = { 0 };
    CUdeviceptr mapped_frame = 0;
    int ret = 0, eret = 0;

    if (ctx->bsf && avpkt->size) {
        if ((ret = av_packet_ref(&filter_packet, avpkt)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_packet_ref failed\n");
            return ret;
        }

        if ((ret = av_bsf_send_packet(ctx->bsf, &filter_packet)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_bsf_send_packet failed\n");
            av_packet_unref(&filter_packet);
            return ret;
        }

        if ((ret = av_bsf_receive_packet(ctx->bsf, &filtered_packet)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_bsf_receive_packet failed\n");
            return ret;
        }

        avpkt = &filtered_packet;
    }

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0) {
        av_packet_unref(&filtered_packet);
        return ret;
    }

    memset(&cupkt, 0, sizeof(cupkt));

    if (avpkt->size) {
        cupkt.payload_size = avpkt->size;
        cupkt.payload = avpkt->data;

        if (avpkt->pts != AV_NOPTS_VALUE) {
            cupkt.flags = CUVID_PKT_TIMESTAMP;
            if (avctx->pkt_timebase.num && avctx->pkt_timebase.den)
                cupkt.timestamp = av_rescale_q(avpkt->pts, avctx->pkt_timebase, (AVRational){1, 10000000});
            else
                cupkt.timestamp = avpkt->pts;
        }
    } else {
        cupkt.flags = CUVID_PKT_ENDOFSTREAM;
    }

    ret = CHECK_CU(cuvidParseVideoData(ctx->cuparser, &cupkt));

    av_packet_unref(&filtered_packet);

    if (ret < 0) {
        goto error;
    }

    // cuvidParseVideoData doesn't return an error just because stuff failed...
    if (ctx->internal_error) {
        av_log(avctx, AV_LOG_ERROR, "cuvid decode callback error\n");
        ret = ctx->internal_error;
        goto error;
    }

    if (av_fifo_size(ctx->frame_queue)) {
        CUVIDPARSERDISPINFO dispinfo;
        CUVIDPROCPARAMS params;
        unsigned int pitch = 0;
        int offset = 0;
        int i;

        av_fifo_generic_read(ctx->frame_queue, &dispinfo, sizeof(CUVIDPARSERDISPINFO), NULL);

        memset(&params, 0, sizeof(params));
        params.progressive_frame = dispinfo.progressive_frame;
        params.second_field = 0;
        params.top_field_first = dispinfo.top_field_first;

        ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, dispinfo.picture_index, &mapped_frame, &pitch, &params));
        if (ret < 0)
            goto error;

        if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
            ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n");
                goto error;
            }

            ret = ff_decode_frame_props(avctx, frame);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n");
                goto error;
            }

            for (i = 0; i < 2; i++) {
                CUDA_MEMCPY2D cpy = {
                    .srcMemoryType = CU_MEMORYTYPE_DEVICE,
                    .dstMemoryType = CU_MEMORYTYPE_DEVICE,
                    .srcDevice     = mapped_frame,
                    .dstDevice     = (CUdeviceptr)frame->data[i],
                    .srcPitch      = pitch,
                    .dstPitch      = frame->linesize[i],
                    .srcY          = offset,
                    .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
                    .Height        = avctx->coded_height >> (i ? 1 : 0),
                };

                ret = CHECK_CU(cuMemcpy2D(&cpy));
                if (ret < 0)
                    goto error;

                offset += avctx->coded_height;
            }
        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {