[FFmpeg-devel] [PATCH 11/15] avfilter/vf_bwdif: Add neon for filter_line

Martin Storsjö martin at martin.st
Sun Jul 2 00:44:10 EEST 2023


On Thu, 29 Jun 2023, John Cox wrote:

> Signed-off-by: John Cox <jc at kynesim.co.uk>
> ---
> libavfilter/aarch64/vf_bwdif_init_aarch64.c |  21 ++
> libavfilter/aarch64/vf_bwdif_neon.S         | 215 ++++++++++++++++++++
> 2 files changed, 236 insertions(+)
>
> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> index e75cf2f204..21e67884ab 100644
> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>                                 int prefs3, int mrefs3, int parity, int clip_max);
>
> +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
> +                               int parity, int clip_max);
> +
> +
> +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
> +                               int parity, int clip_max)
> +{
> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
> +
> +    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
> +                              w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
> +
> +    if (w0 < w)
> +        ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
> +                               w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
> +}
>
> static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
>                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
> @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>         return;
>
>     s->filter_intra = filter_intra_helper;
> +    s->filter_line  = filter_line_helper;
>     s->filter_edge  = filter_edge_helper;
> }
>
> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
> index a33b235882..675e97d966 100644
> --- a/libavfilter/aarch64/vf_bwdif_neon.S
> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
> @@ -128,6 +128,221 @@ coeffs:
>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>         .hword          5077, 981                       // sp[0] = v0.h[6]
>
> +// ===========================================================================
> +//
> +// void filter_line(
> +//      void *dst1,     // x0
> +//      void *prev1,    // x1
> +//      void *cur1,     // x2
> +//      void *next1,    // x3
> +//      int w,          // w4
> +//      int prefs,      // w5
> +//      int mrefs,      // w6
> +//      int prefs2,     // w7
> +//      int mrefs2,     // [sp, #0]
> +//      int prefs3,     // [sp, #8]
> +//      int mrefs3,     // [sp, #16]
> +//      int prefs4,     // [sp, #24]
> +//      int mrefs4,     // [sp, #32]
> +//      int parity,     // [sp, #40]
> +//      int clip_max)   // [sp, #48]
> +
> +function ff_bwdif_filter_line_neon, export=1
> +        // Sanity check w
> +        cmp             w4, #0
> +        ble             99f
> +
> +        // Rearrange regs to be the same as line3 for ease of debug!
> +        mov             w10, w4                         // w10 = loop count
> +        mov             w9,  w6                         // w9  = mref
> +        mov             w12, w7                         // w12 = pref2
> +        mov             w11, w5                         // w11 = pref
> +        ldr             w8,  [sp, #0]                   // w8 =  mref2
> +        ldr             w7,  [sp, #16]                  // w7  = mref3
> +        ldr             w6,  [sp, #32]                  // w6  = mref4
> +        ldr             w13, [sp, #8]                   // w13 = pref3
> +        ldr             w14, [sp, #24]                  // w14 = pref4

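Btw, remember that you can load two adjacent arguments from the stack at 
once with ldp, e.g. "ldp x8, x13, [sp, #0]" to fetch mrefs2 and prefs3 in a 
single instruction. If the corresponding parameters in the C prototype are 
declared as intptr_t/ptrdiff_t instead of int, you won't have an issue with 
garbage in the upper 32 bits of the 64-bit registers either.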



> +
> +        mov             x4,  x3
> +        mov             x3,  x2
> +        mov             x2,  x1
> +
> +// #define prev2 cur
> +//        const uint8_t * restrict next2 = parity ? prev : next;
> +        ldr             w17, [sp, #40]                  // parity
> +        cmp             w17, #0
> +        csel            x17, x2, x4, ne
> +
> +        // We want all the V registers - save all the ones we must
> +        stp             d14, d15, [sp, #-64]!
> +        stp             d8,  d9,  [sp, #48]
> +        stp             d10, d11, [sp, #32]
> +        stp             d12, d13, [sp, #16]

The order looks a bit weird here, even though the registers do end up 
sequential on the stack. If you fill the save area from the bottom up, e.g.

stp d8, d9, [sp, #-64]!
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]

they're sequential both in code and on the stack.

// Martin



More information about the ffmpeg-devel mailing list