static AVBufferRef *cuda_pool_alloc(void *opaque, int size) { AVHWFramesContext *ctx = opaque; AVCUDADeviceContext *hwctx = ctx->device_ctx->hwctx; CudaFunctions *cu = hwctx->internal->cuda_dl; AVBufferRef *ret = NULL; CUcontext dummy = NULL; CUdeviceptr data; CUresult err; err = cu->cuCtxPushCurrent(hwctx->cuda_ctx); if (err != CUDA_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "Error setting current CUDA context\n"); return NULL; } err = cu->cuMemAlloc(&data, size); if (err != CUDA_SUCCESS) goto fail; ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0); if (!ret) { cu->cuMemFree(data); goto fail; } fail: cu->cuCtxPopCurrent(&dummy); return ret; }
static void cuda_buffer_free(void *opaque, uint8_t *data) { AVHWFramesContext *ctx = opaque; AVCUDADeviceContext *hwctx = ctx->device_ctx->hwctx; CudaFunctions *cu = hwctx->internal->cuda_dl; CUcontext dummy; cu->cuCtxPushCurrent(hwctx->cuda_ctx); cu->cuMemFree((CUdeviceptr)data); cu->cuCtxPopCurrent(&dummy); }
static int cuda_device_create(AVHWDeviceContext *ctx, const char *device, AVDictionary *opts, int flags) { AVCUDADeviceContext *hwctx = ctx->hwctx; CudaFunctions *cu; CUdevice cu_device; CUcontext dummy; CUresult err; int device_idx = 0; if (device) device_idx = strtol(device, NULL, 0); if (cuda_device_init(ctx) < 0) goto error; cu = hwctx->internal->cuda_dl; err = cu->cuInit(0); if (err != CUDA_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n"); goto error; } err = cu->cuDeviceGet(&cu_device, device_idx); if (err != CUDA_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx); goto error; } err = cu->cuCtxCreate(&hwctx->cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, cu_device); if (err != CUDA_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n"); goto error; } cu->cuCtxPopCurrent(&dummy); hwctx->internal->is_allocated = 1; return 0; error: cuda_device_uninit(ctx); return AVERROR_UNKNOWN; }
static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in) { AVFilterContext *ctx = link->dst; CUDAScaleContext *s = ctx->priv; AVFilterLink *outlink = ctx->outputs[0]; CudaFunctions *cu = s->hwctx->internal->cuda_dl; AVFrame *out = NULL; CUcontext dummy; int ret = 0; out = av_frame_alloc(); if (!out) { ret = AVERROR(ENOMEM); goto fail; } ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); if (ret < 0) goto fail; ret = cudascale_scale(ctx, out, in); CHECK_CU(cu->cuCtxPopCurrent(&dummy)); if (ret < 0) goto fail; av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den, (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w, (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h, INT_MAX); av_frame_free(&in); return ff_filter_frame(outlink, out); fail: av_frame_free(&in); av_frame_free(&out); return ret; }
static int cuda_transfer_data_from(AVHWFramesContext *ctx, AVFrame *dst, const AVFrame *src) { CUDAFramesContext *priv = ctx->internal->priv; AVCUDADeviceContext *device_hwctx = ctx->device_ctx->hwctx; CudaFunctions *cu = device_hwctx->internal->cuda_dl; CUcontext dummy; CUresult err; int i; err = cu->cuCtxPushCurrent(device_hwctx->cuda_ctx); if (err != CUDA_SUCCESS) return AVERROR_UNKNOWN; for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) { CUDA_MEMCPY2D cpy = { .srcMemoryType = CU_MEMORYTYPE_DEVICE, .dstMemoryType = CU_MEMORYTYPE_HOST, .srcDevice = (CUdeviceptr)src->data[i], .dstHost = dst->data[i], .srcPitch = src->linesize[i], .dstPitch = dst->linesize[i], .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]), .Height = src->height >> (i ? priv->shift_height : 0), }; err = cu->cuMemcpy2D(&cpy); if (err != CUDA_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n"); return AVERROR_UNKNOWN; } } cu->cuCtxPopCurrent(&dummy); return 0; } static int cuda_transfer_data_to(AVHWFramesContext *ctx, AVFrame *dst, const AVFrame *src) { CUDAFramesContext *priv = ctx->internal->priv; AVCUDADeviceContext *device_hwctx = ctx->device_ctx->hwctx; CudaFunctions *cu = device_hwctx->internal->cuda_dl; CUcontext dummy; CUresult err; int i; err = cu->cuCtxPushCurrent(device_hwctx->cuda_ctx); if (err != CUDA_SUCCESS) return AVERROR_UNKNOWN; for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) { CUDA_MEMCPY2D cpy = { .srcMemoryType = CU_MEMORYTYPE_HOST, .dstMemoryType = CU_MEMORYTYPE_DEVICE, .srcHost = src->data[i], .dstDevice = (CUdeviceptr)dst->data[i], .srcPitch = src->linesize[i], .dstPitch = dst->linesize[i], .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]), .Height = src->height >> (i ? priv->shift_height : 0), }; err = cu->cuMemcpy2D(&cpy); if (err != CUDA_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n"); return AVERROR_UNKNOWN; } } cu->cuCtxPopCurrent(&dummy); return 0; } static void cuda_device_uninit(AVHWDeviceContext *ctx) { AVCUDADeviceContext *hwctx = ctx->hwctx; if (hwctx->internal) { if (hwctx->internal->is_allocated && hwctx->cuda_ctx) { hwctx->internal->cuda_dl->cuCtxDestroy(hwctx->cuda_ctx); hwctx->cuda_ctx = NULL; } cuda_free_functions(&hwctx->internal->cuda_dl); } av_freep(&hwctx->internal); }
static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, int channels, uint8_t *src_dptr, int src_width, int src_height, int src_pitch, uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, int pixel_size) { CUDAScaleContext *s = ctx->priv; CudaFunctions *cu = s->hwctx->internal->cuda_dl; CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr; CUtexObject tex = 0; void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height }; int ret; CUDA_TEXTURE_DESC tex_desc = { .filterMode = CU_TR_FILTER_MODE_LINEAR, .flags = CU_TRSF_READ_AS_INTEGER, }; CUDA_RESOURCE_DESC res_desc = { .resType = CU_RESOURCE_TYPE_PITCH2D, .res.pitch2D.format = pixel_size == 1 ? CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16, .res.pitch2D.numChannels = channels, .res.pitch2D.width = src_width, .res.pitch2D.height = src_height, .res.pitch2D.pitchInBytes = src_pitch * pixel_size, .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, }; ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); if (ret < 0) goto exit; ret = CHECK_CU(cu->cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); exit: if (tex) CHECK_CU(cu->cuTexObjectDestroy(tex)); return ret; } static int scalecuda_resize(AVFilterContext *ctx, AVFrame *out, AVFrame *in) { AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in->hw_frames_ctx->data; CUDAScaleContext *s = ctx->priv; switch (in_frames_ctx->sw_format) { case AV_PIX_FMT_YUV420P: call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0], in->width, in->height, in->linesize[0], out->data[0], out->width, out->height, out->linesize[0], 1); call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[1], in->width/2, in->height/2, in->linesize[0]/2, out->data[1], out->width/2, out->height/2, out->linesize[0]/2, 1); call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[2], in->width/2, in->height/2, in->linesize[0]/2, out->data[2], out->width/2, out->height/2, out->linesize[0]/2, 1); break; case AV_PIX_FMT_YUV444P: call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0], in->width, in->height, in->linesize[0], out->data[0], out->width, out->height, out->linesize[0], 1); call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[1], in->width, in->height, in->linesize[0], out->data[1], out->width, out->height, out->linesize[0], 1); call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[2], in->width, in->height, in->linesize[0], out->data[2], out->width, out->height, out->linesize[0], 1); break; case AV_PIX_FMT_YUV444P16: call_resize_kernel(ctx, s->cu_func_ushort, 1, in->data[0], in->width, in->height, in->linesize[0] / 2, out->data[0], out->width, out->height, out->linesize[0] / 2, 2); call_resize_kernel(ctx, s->cu_func_ushort, 1, in->data[1], in->width, in->height, in->linesize[1] / 2, out->data[1], out->width, out->height, out->linesize[1] / 2, 2); call_resize_kernel(ctx, s->cu_func_ushort, 1, in->data[2], in->width, in->height, in->linesize[2] / 2, out->data[2], out->width, out->height, out->linesize[2] / 2, 2); break; case AV_PIX_FMT_NV12: call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0], in->width, in->height, in->linesize[0], out->data[0], out->width, out->height, out->linesize[0], 1); call_resize_kernel(ctx, s->cu_func_uchar2, 2, in->data[1], in->width/2, in->height/2, in->linesize[1], out->data[1], out->width/2, out->height/2, out->linesize[1]/2, 1); break; case AV_PIX_FMT_P010LE: call_resize_kernel(ctx, s->cu_func_ushort, 1, in->data[0], in->width, in->height, in->linesize[0]/2, out->data[0], out->width, out->height, out->linesize[0]/2, 2); call_resize_kernel(ctx, s->cu_func_ushort2, 2, in->data[1], in->width / 2, in->height / 2, in->linesize[1]/2, out->data[1], out->width / 2, out->height / 2, out->linesize[1] / 4, 2); break; case AV_PIX_FMT_P016LE: call_resize_kernel(ctx, s->cu_func_ushort, 1, in->data[0], in->width, in->height, in->linesize[0] / 2, out->data[0], out->width, out->height, out->linesize[0] / 2, 2); call_resize_kernel(ctx, s->cu_func_ushort2, 2, in->data[1], in->width / 2, in->height / 2, in->linesize[1] / 2, out->data[1], out->width / 2, out->height / 2, out->linesize[1] / 4, 2); break; default: return AVERROR_BUG; } return 0; }
static av_cold int cudascale_config_props(AVFilterLink *outlink) { AVFilterContext *ctx = outlink->src; AVFilterLink *inlink = outlink->src->inputs[0]; CUDAScaleContext *s = ctx->priv; AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; CudaFunctions *cu = device_hwctx->internal->cuda_dl; int w, h; int ret; extern char vf_scale_cuda_ptx[]; s->hwctx = device_hwctx; s->cu_stream = s->hwctx->stream; ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); if (ret < 0) goto fail; ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx)); if (ret < 0) goto fail; CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar")); if (ret < 0) goto fail; CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2")); if (ret < 0) goto fail; CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4")); if (ret < 0) goto fail; CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort")); if (ret < 0) goto fail; CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2")); if (ret < 0) goto fail; CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4")); if (ret < 0) goto fail; CHECK_CU(cu->cuCtxPopCurrent(&dummy)); if ((ret = ff_scale_eval_dimensions(s, s->w_expr, s->h_expr, inlink, outlink, &w, &h)) < 0) goto fail; if (((int64_t)h * inlink->w) > INT_MAX || ((int64_t)w * inlink->h) > INT_MAX) av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n"); outlink->w = w; outlink->h = h; ret = init_processing_chain(ctx, inlink->w, inlink->h, w, h); if (ret < 0) return ret; av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d -> w:%d h:%d\n", inlink->w, inlink->h, outlink->w, outlink->h); if (inlink->sample_aspect_ratio.num) { outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w, outlink->w*inlink->h}, inlink->sample_aspect_ratio); } else { outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; } return 0; fail: return ret; }