[FFmpeg-devel] [PATCH] avfilter: add libvmaf_cuda

Mon Aug 14 20:09:07 EEST 2023

> From 2665b7ddaefe6739bfeef4573183981582bdb995 Mon Sep 17 00:00:00 2001
> From: Kyle Swanson <kswanson at netflix.com>
> Date: Mon, 7 Aug 2023 10:38:12 -0700
> Subject: [PATCH] avfilter: add libvmaf_cuda
> 
> ---
>  configure                |   4 +
>  libavfilter/Makefile     |   1 +
>  libavfilter/allfilters.c |   1 +
>  libavfilter/vf_libvmaf.c | 211 ++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 216 insertions(+), 1 deletion(-)
> 
> diff --git a/configure b/configure
> index d9372c5be6..387221c314 100755
> --- a/configure
> +++ b/configure
> @@ -286,6 +286,7 @@ External library support:
>    --enable-libv4l2         enable libv4l2/v4l-utils [no]
>    --enable-libvidstab      enable video stabilization using vid.stab [no]
>    --enable-libvmaf         enable vmaf filter via libvmaf [no]
> +  --enable-libvmaf-cuda    enable cuda vmaf filter via libvmaf [no]
>    --enable-libvo-amrwbenc  enable AMR-WB encoding via libvo-amrwbenc [no]
>    --enable-libvorbis       enable Vorbis en/decoding via libvorbis,
>                             native implementation exists [no]
> @@ -1902,6 +1903,7 @@ EXTERNAL_LIBRARY_LIST="
>      libuavs3d
>      libv4l2
>      libvmaf
> +    libvmaf_cuda
>      libvorbis
>      libvpx
>      libwebp
> @@ -3830,6 +3832,7 @@ vflip_vulkan_filter_deps="vulkan spirv_compiler"
>  vidstabdetect_filter_deps="libvidstab"
>  vidstabtransform_filter_deps="libvidstab"
>  libvmaf_filter_deps="libvmaf"
> +libvmaf_cuda_filter_deps="libvmaf"

This is missing dependencies on at least ffnvcodec.

>  zmq_filter_deps="libzmq"
>  zoompan_filter_deps="swscale"
>  zscale_filter_deps="libzimg const_nan"
> @@ -6806,6 +6809,7 @@ enabled libuavs3d         && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uav
>  enabled libv4l2           && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl
>  enabled libvidstab        && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit
>  enabled libvmaf           && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init
> +enabled libvmaf_cuda      && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init

Why is this a separate library if it checks for literally the same thing?
Shouldn't this at the very least check that the vmaf cuda header is there?
Or check for the specific version since which it's included?

>  enabled libvo_amrwbenc    && require libvo_amrwbenc vo-amrwbenc/enc_if.h E_IF_init -lvo-amrwbenc
>  enabled libvorbis         && require_pkg_config libvorbis vorbis vorbis/codec.h vorbis_info_init &&
>                               require_pkg_config libvorbisenc vorbisenc vorbis/vorbisenc.h vorbis_encode_init
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 30a0e22ef8..c1405ae924 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -361,6 +361,7 @@ OBJS-$(CONFIG_LENSCORRECTION_FILTER)         += vf_lenscorrection.o
>  OBJS-$(CONFIG_LENSFUN_FILTER)                += vf_lensfun.o
>  OBJS-$(CONFIG_LIBPLACEBO_FILTER)             += vf_libplacebo.o vulkan.o vulkan_filter.o
>  OBJS-$(CONFIG_LIBVMAF_FILTER)                += vf_libvmaf.o framesync.o
> +OBJS-$(CONFIG_LIBVMAF_CUDA_FILTER)           += vf_libvmaf.o framesync.o
>  OBJS-$(CONFIG_LIMITDIFF_FILTER)              += vf_limitdiff.o framesync.o
>  OBJS-$(CONFIG_LIMITER_FILTER)                += vf_limiter.o
>  OBJS-$(CONFIG_LOOP_FILTER)                   += f_loop.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 089ad3a0ed..8349ba9469 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -337,6 +337,7 @@ extern const AVFilter ff_vf_lenscorrection;
>  extern const AVFilter ff_vf_lensfun;
>  extern const AVFilter ff_vf_libplacebo;
>  extern const AVFilter ff_vf_libvmaf;
> +extern const AVFilter ff_vf_libvmaf_cuda;
>  extern const AVFilter ff_vf_limitdiff;
>  extern const AVFilter ff_vf_limiter;
>  extern const AVFilter ff_vf_loop;
> diff --git a/libavfilter/vf_libvmaf.c b/libavfilter/vf_libvmaf.c
> index 2586f37d99..33fcbcb4ae 100644
> --- a/libavfilter/vf_libvmaf.c
> +++ b/libavfilter/vf_libvmaf.c
> @@ -24,6 +24,8 @@
>   * Calculate the VMAF between two input videos.
>   */
>  
> +#include "config.h"
> +
>  #include <libvmaf.h>
>  
>  #include "libavutil/avstring.h"
> @@ -36,6 +38,13 @@
>  #include "internal.h"
>  #include "video.h"
>  
> +#ifdef CONFIG_LIBVMAF_CUDA
> +#include <libvmaf_cuda.h>
> +
> +#include "libavutil/hwcontext.h"
> +#include "libavutil/hwcontext_cuda_internal.h"
> +#endif
> +
>  typedef struct LIBVMAFContext {
>      const AVClass *class;
>      FFFrameSync fs;
> @@ -58,6 +67,7 @@ typedef struct LIBVMAFContext {
>      unsigned model_cnt;
>      unsigned frame_cnt;
>      unsigned bpc;
> +    VmafCudaState *cu_state;

Looks like it's missing the usual #ifdef around it.

>  } LIBVMAFContext;
>  
>  #define OFFSET(x) offsetof(LIBVMAFContext, x)
> @@ -682,7 +692,8 @@ static const AVFilterPad libvmaf_inputs[] = {
>      {
>          .name         = "main",
>          .type         = AVMEDIA_TYPE_VIDEO,
> -    },{
> +    },
> +    {

Unrelated change.

>          .name         = "reference",
>          .type         = AVMEDIA_TYPE_VIDEO,
>          .config_props = config_input_ref,
> @@ -710,3 +721,201 @@ const AVFilter ff_vf_libvmaf = {
>      FILTER_OUTPUTS(libvmaf_outputs),
>      FILTER_PIXFMTS_ARRAY(pix_fmts),
>  };
> +
> +#ifdef CONFIG_LIBVMAF_CUDA
> +static const enum AVPixelFormat supported_formats[] = {
> +    AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_YUV444P16,
> +};
> +
> +static int format_is_supported(enum AVPixelFormat fmt)
> +{
> +    int i;
> +
> +    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
> +        if (supported_formats[i] == fmt)
> +            return 1;
> +    return 0;
> +}
> +
> +static int config_props_cuda(AVFilterLink *outlink)
> +{
> +    int err;
> +    AVFilterContext *ctx = outlink->src;
> +    LIBVMAFContext *s = ctx->priv;
> +    AVFilterLink *inlink = ctx->inputs[0];
> +    AVHWFramesContext *frames_ctx = (AVHWFramesContext*) inlink->hw_frames_ctx->data;
> +    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
> +    CUcontext cu_ctx = device_hwctx->cuda_ctx;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frames_ctx->sw_format);
> +
> +    VmafConfiguration cfg = {
> +        .log_level = log_level_map(av_log_get_level()),
> +        .n_subsample = s->n_subsample,
> +        .n_threads = s->n_threads,
> +    };
> +
> +    VmafCudaPictureConfiguration cuda_pic_cfg = {
> +        .pic_params = {
> +            .bpc = desc->comp[0].depth,
> +            .w = inlink->w,
> +            .h = inlink->h,
> +            .pix_fmt = pix_fmt_map(frames_ctx->sw_format),
> +        },
> +        .pic_prealloc_method = VMAF_CUDA_PICTURE_PREALLOCATION_METHOD_DEVICE,
> +    };
> +
> +    VmafCudaConfiguration cuda_cfg = {
> +        .cu_ctx = cu_ctx,
> +    };
> +
> +    if (!format_is_supported(frames_ctx->sw_format)) {
> +        av_log(s, AV_LOG_ERROR,
> +               "Unsupported input format: %s\n", desc->name);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    err = vmaf_init(&s->vmaf, cfg);
> +    if (err)
> +        return AVERROR(EINVAL);
> +
> +    err = vmaf_cuda_state_init(&s->cu_state, cuda_cfg);
> +    if (err)
> +        return AVERROR(EINVAL);
> +
> +    err = vmaf_cuda_import_state(s->vmaf, s->cu_state);
> +    if (err)
> +        return AVERROR(EINVAL);
> +
> +    err = vmaf_cuda_preallocate_pictures(s->vmaf, cuda_pic_cfg);
> +    if (err < 0)
> +        return err;
> +
> +    err = parse_deprecated_options(ctx);
> +    if (err)
> +        return err;
> +
> +    err = parse_models(ctx);
> +    if (err)
> +        return err;
> +
> +    err = parse_features(ctx);
> +    if (err)
> +        return err;
> +
> +    return config_output(outlink);
> +}
> +
> +static int copy_picture_data_cuda(VmafContext* vmaf,
> +                                  AVCUDADeviceContext* device_hwctx,
> +                                  AVFrame* src, VmafPicture* dst,
> +                                  enum AVPixelFormat pix_fmt)
> +{
> +    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
> +    CudaFunctions *cu = device_hwctx->internal->cuda_dl;
> +
> +    CUDA_MEMCPY2D m = {
> +        .srcMemoryType = CU_MEMORYTYPE_DEVICE,
> +        .dstMemoryType = CU_MEMORYTYPE_DEVICE,
> +    };
> +
> +    int err = vmaf_cuda_fetch_preallocated_picture(vmaf, dst);
> +    if (err)
> +        return AVERROR(ENOMEM);
> +
> +    err = cu->cuCtxPushCurrent(device_hwctx->cuda_ctx);
> +    if (err)

We tend to do a bit more checking here for CUDA calls, but for push/pop
it's probably fine.
I would still prefer to see this compared against CUDA_SUCCESS, even
though it's technically 0.

> +        return AVERROR(ENOMEM);

ENOMEM does not sound correct.
Not sure what fits here; we usually just return AVERROR_EXTERNAL.

> +
> +    for (unsigned i = 0; i < pix_desc->nb_components; i++) {
> +        m.srcDevice = (CUdeviceptr) src->data[i];
> +        m.srcPitch = src->linesize[i];
> +        m.dstDevice = (CUdeviceptr) dst->data[i];
> +        m.dstPitch = dst->stride[i];
> +        m.WidthInBytes = dst->w[i] * ((dst->bpc + 7) / 8);
> +        m.Height = dst->h[i];
> +
> +        err = cu->cuMemcpy2D(&m);
> +        if (err)
> +            return AVERROR(ENOMEM);

This is also not an out-of-memory situation, so ENOMEM is wrong here too.

> +        break;
> +    }
> +
> +    err = cu->cuCtxPopCurrent(NULL);
> +    if (err)
> +        return AVERROR(ENOMEM);
> +
> +    return 0;
> +}
> +
> +static int do_vmaf_cuda(FFFrameSync* fs)
> +{
> +    AVFilterContext* ctx = fs->parent;
> +    LIBVMAFContext* s = ctx->priv;
> +    AVFilterLink *inlink = ctx->inputs[0];
> +    AVHWFramesContext *frames_ctx = (AVHWFramesContext*) inlink->hw_frames_ctx->data;
> +    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
> +    VmafPicture pic_ref, pic_dist;
> +    AVFrame *ref, *dist;
> +
> +    int err = 0;
> +
> +    err = ff_framesync_dualinput_get(fs, &dist, &ref);
> +    if (err < 0)
> +        return err;
> +    if (ctx->is_disabled || !ref)
> +        return ff_filter_frame(ctx->outputs[0], dist);
> +
> +    err = copy_picture_data_cuda(s->vmaf, device_hwctx, ref, &pic_ref,
> +                                 frames_ctx->sw_format);
> +    if (err) {
> +        av_log(s, AV_LOG_ERROR, "problem during copy_picture_data_cuda.\n");
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    err = copy_picture_data_cuda(s->vmaf, device_hwctx, dist, &pic_dist,
> +                                 frames_ctx->sw_format);
> +    if (err) {
> +        av_log(s, AV_LOG_ERROR, "problem during copy_picture_data_cuda.\n");
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    err = vmaf_read_pictures(s->vmaf, &pic_ref, &pic_dist, s->frame_cnt++);
> +    if (err) {
> +        av_log(s, AV_LOG_ERROR, "problem during vmaf_read_pictures.\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    return ff_filter_frame(ctx->outputs[0], dist);
> +}
> +
> +static av_cold int init_cuda(AVFilterContext *ctx)
> +{
> +    LIBVMAFContext *s = ctx->priv;
> +    s->fs.on_event = do_vmaf_cuda;
> +    return 0;
> +}
> +
> +static const AVFilterPad libvmaf_outputs_cuda[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = config_props_cuda,
> +    },
> +};
> +
> +const AVFilter ff_vf_libvmaf_cuda = {
> +    .name           = "libvmaf_cuda",
> +    .description    = NULL_IF_CONFIG_SMALL("Calculate the VMAF between two video streams."),
> +    .preinit        = libvmaf_framesync_preinit,
> +    .init           = init_cuda,
> +    .uninit         = uninit,
> +    .activate       = activate,
> +    .priv_size      = sizeof(LIBVMAFContext),
> +    .priv_class     = &libvmaf_class,
> +    FILTER_INPUTS(libvmaf_inputs),
> +    FILTER_OUTPUTS(libvmaf_outputs_cuda),
> +    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
> +#endif
> -- 
> 2.24.3 (Apple Git-128)
> 

The rest of the code generally looks sound to me.
copy_picture_data_cuda mostly looks like a copy of
av_hwframe_transfer_data(), but wrapping the vmaf image into an AVFrame
is probably just as much boilerplate.