[FFmpeg-devel] [PATCH 04/15] avfilter/vf_bwdif: Add neon for filter_intra

Martin Storsjö martin at martin.st
Sun Jul 2 00:37:35 EEST 2023


On Thu, 29 Jun 2023, John Cox wrote:

> Signed-off-by: John Cox <jc at kynesim.co.uk>
> ---
> libavfilter/aarch64/vf_bwdif_init_aarch64.c | 17 +++++++
> libavfilter/aarch64/vf_bwdif_neon.S         | 53 +++++++++++++++++++++
> 2 files changed, 70 insertions(+)
>
> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> index 86d53b2ca1..3ffaa07ab3 100644
> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> @@ -24,6 +24,22 @@
> #include "libavfilter/bwdif.h"
> #include "libavutil/aarch64/cpu.h"
>
> +void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
> +                                int prefs3, int mrefs3, int parity, int clip_max);
> +
> +
> +static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
> +                                int prefs3, int mrefs3, int parity, int clip_max)
> +{
> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
> +
> +    ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
> +
> +    if (w0 < w)
> +        ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0,
> +                                w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
> +}
> +
> void
> ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
> {
> @@ -35,5 +51,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>     if (!have_neon(cpu_flags))
>         return;
>
> +    s->filter_intra = filter_intra_helper;
> }
>
> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
> index a8f0ed525a..b863b3447d 100644
> --- a/libavfilter/aarch64/vf_bwdif_neon.S
> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
> @@ -69,3 +69,56 @@ coeffs:
>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>         .hword          5077, 981                       // sp[0] = v0.h[6]
>
> +// ============================================================================
> +//
> +// void ff_bwdif_filter_intra_neon(
> +//      void *dst1,     // x0
> +//      void *cur1,     // x1
> +//      int w,          // w2
> +//      int prefs,      // w3
> +//      int mrefs,      // w4
> +//      int prefs3,     // w5
> +//      int mrefs3,     // w6
> +//      int parity,     // w7       unused
> +//      int clip_max)   // [sp, #0] unused

This comment block mapping the arguments to registers is great to have.

> +
> +function ff_bwdif_filter_intra_neon, export=1
> +        cmp             w2, #0
> +        ble             99f
> +
> +        ldr             q0, coeffs
> +
> +//    for (x = 0; x < w; x++) {
> +10:
> +
> +//        interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;

The style with intermixed C code is a bit uncommon in our assembly, but 
as long as it doesn't affect the overall code style, I guess we can 
keep it.

> +        ldr             q31, [x1, w4, SXTW]
> +        ldr             q30, [x1, w3, SXTW]
> +        ldr             q29, [x1, w6, SXTW]
> +        ldr             q28, [x1, w5, SXTW]

Don't use the shouty uppercase SXTW here; write it in lowercase (sxtw).

> +
> +        uaddl           v20.8h,  v31.8b,  v30.8b
> +        uaddl2          v21.8h,  v31.16b, v30.16b
> +
> +        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]
> +
> +        uaddl           v20.8h,  v29.8b,  v28.8b
> +        uaddl2          v21.8h,  v29.16b, v28.16b
> +
> +        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]
> +
> +//        dst[0] = av_clip(interpol, 0, clip_max);
> +        SQSHRUNN        v2, v2, v3, v4, v5, 13
> +        str             q2, [x0], #16
> +
> +//        dst++;
> +//        cur++;
> +//    }
> +
> +        subs            w2,  w2,  #16
> +        add             x1,  x1,  #16

For in-order cores, it might be good to update these variables somewhat 
earlier, e.g. before the str instruction. But such scheduling breaks your 
neat mapping between the C code and the assembly.

// Martin



More information about the ffmpeg-devel mailing list