[FFmpeg-cvslog] avfilter/vf_bwdif: Add neon for filter_intra

Thu Jul 6 00:20:26 EEST 2023

ffmpeg | branch: master | John Cox <jc at kynesim.co.uk> | Tue Jul  4 14:04:40 2023 +0000| [866028970358b7d22d1fe5c53b6f80119dba4272] | committer: Martin Storsjö

avfilter/vf_bwdif: Add neon for filter_intra

Adds an outline for aarch neon functions
Adds common macros and consts for aarch64 neon
Exports C filter_intra needed for tail fixup of neon code
Adds neon for filter_intra

Signed-off-by: John Cox <jc at kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin at martin.st>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=866028970358b7d22d1fe5c53b6f80119dba4272
---

 libavfilter/aarch64/Makefile                |   2 +
 libavfilter/aarch64/vf_bwdif_init_aarch64.c |  56 ++++++++++++
 libavfilter/aarch64/vf_bwdif_neon.S         | 136 ++++++++++++++++++++++++++++
 libavfilter/bwdif.h                         |   4 +
 libavfilter/vf_bwdif.c                      |   8 +-
 5 files changed, 203 insertions(+), 3 deletions(-)

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index b58daa3a3f..b68209bc94 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,3 +1,5 @@
+OBJS-$(CONFIG_BWDIF_FILTER)                  += aarch64/vf_bwdif_init_aarch64.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
 
+NEON-OBJS-$(CONFIG_BWDIF_FILTER)             += aarch64/vf_bwdif_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
new file mode 100644
index 0000000000..3ffaa07ab3
--- /dev/null
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -0,0 +1,56 @@
+/*
+ * bwdif aarch64 NEON optimisations
+ *
+ * Copyright (c) 2023 John Cox <jc at kynesim.co.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavfilter/bwdif.h"
+#include "libavutil/aarch64/cpu.h"
+
+void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                                int prefs3, int mrefs3, int parity, int clip_max);
+
+
+static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                                int prefs3, int mrefs3, int parity, int clip_max)
+{
+    const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+    ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
+
+    if (w0 < w)
+        ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0,
+                                w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
+}
+
+void
+ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
+{
+    const int cpu_flags = av_get_cpu_flags();
+
+    if (bit_depth != 8)
+        return;
+
+    if (!have_neon(cpu_flags))
+        return;
+
+    s->filter_intra = filter_intra_helper;
+}
+
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
new file mode 100644
index 0000000000..e288efbe6c
--- /dev/null
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -0,0 +1,136 @@
+/*
+ * bwdif aarch64 NEON optimisations
+ *
+ * Copyright (c) 2023 John Cox <jc at kynesim.co.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/aarch64/asm.S"
+
+// Space taken on the stack by an int (32-bit)
+#ifdef __APPLE__
+.set    SP_INT, 4
+#else
+.set    SP_INT, 8
+#endif
+
+.macro SQSHRUNN b, s0, s1, s2, s3, n
+        sqshrun         \s0\().4h, \s0\().4s, #\n - 8
+        sqshrun2        \s0\().8h, \s1\().4s, #\n - 8
+        sqshrun         \s1\().4h, \s2\().4s, #\n - 8
+        sqshrun2        \s1\().8h, \s3\().4s, #\n - 8
+        uzp2            \b\().16b, \s0\().16b, \s1\().16b
+.endm
+
+.macro SMULL4K a0, a1, a2, a3, s0, s1, k
+        smull           \a0\().4s, \s0\().4h, \k
+        smull2          \a1\().4s, \s0\().8h, \k
+        smull           \a2\().4s, \s1\().4h, \k
+        smull2          \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMULL4K a0, a1, a2, a3, s0, s1, k
+        umull           \a0\().4s, \s0\().4h, \k
+        umull2          \a1\().4s, \s0\().8h, \k
+        umull           \a2\().4s, \s1\().4h, \k
+        umull2          \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
+        umlal           \a0\().4s, \s0\().4h, \k
+        umlal2          \a1\().4s, \s0\().8h, \k
+        umlal           \a2\().4s, \s1\().4h, \k
+        umlal2          \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
+        umlsl           \a0\().4s, \s0\().4h, \k
+        umlsl2          \a1\().4s, \s0\().8h, \k
+        umlsl           \a2\().4s, \s1\().4h, \k
+        umlsl2          \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro LDR_COEFFS d, t0
+        movrel          \t0, coeffs, 0
+        ld1             {\d\().8h}, [\t0]
+.endm
+
+// static const uint16_t coef_lf[2] = { 4309, 213 };
+// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
+// static const uint16_t coef_sp[2] = { 5077, 981 };
+
+const coeffs, align=4   // align 4 means align on 2^4 boundry
+        .hword          4309 * 4, 213 * 4               // lf[0]*4 = v0.h[0]
+        .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
+        .hword          5077, 981                       // sp[0] = v0.h[6]
+endconst
+
+// ============================================================================
+//
+// void ff_bwdif_filter_intra_neon(
+//      void *dst1,     // x0
+//      void *cur1,     // x1
+//      int w,          // w2
+//      int prefs,      // w3
+//      int mrefs,      // w4
+//      int prefs3,     // w5
+//      int mrefs3,     // w6
+//      int parity,     // w7       unused
+//      int clip_max)   // [sp, #0] unused
+
+function ff_bwdif_filter_intra_neon, export=1
+        cmp             w2, #0
+        ble             99f
+
+        LDR_COEFFS      v0, x17
+
+//    for (x = 0; x < w; x++) {
+10:
+
+//        interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
+        ldr             q31, [x1, w4, sxtw]
+        ldr             q30, [x1, w3, sxtw]
+        ldr             q29, [x1, w6, sxtw]
+        ldr             q28, [x1, w5, sxtw]
+
+        uaddl           v20.8h,  v31.8b,  v30.8b
+        uaddl2          v21.8h,  v31.16b, v30.16b
+
+        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]
+
+        uaddl           v20.8h,  v29.8b,  v28.8b
+        uaddl2          v21.8h,  v29.16b, v28.16b
+
+        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]
+
+//        dst[0] = av_clip(interpol, 0, clip_max);
+        SQSHRUNN        v2, v2, v3, v4, v5, 13
+        str             q2, [x0], #16
+
+//        dst++;
+//        cur++;
+//    }
+
+        subs            w2,  w2,  #16
+        add             x1,  x1,  #16
+        bgt             10b
+
+99:
+        ret
+endfunc
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
index 5749345f78..ae6f6ce223 100644
--- a/libavfilter/bwdif.h
+++ b/libavfilter/bwdif.h
@@ -39,5 +39,9 @@ typedef struct BWDIFContext {
 
 void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
 void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
+void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
+
+void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                             int prefs3, int mrefs3, int parity, int clip_max);
 
 #endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index e278cf1217..035fc58670 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -122,8 +122,8 @@ typedef struct ThreadData {
         next2++; \
     }
 
-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
-                         int prefs3, int mrefs3, int parity, int clip_max)
+void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                             int prefs3, int mrefs3, int parity, int clip_max)
 {
     uint8_t *dst = dst1;
     uint8_t *cur = cur1;
@@ -362,13 +362,15 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
         s->filter_line  = filter_line_c_16bit;
         s->filter_edge  = filter_edge_16bit;
     } else {
-        s->filter_intra = filter_intra;
+        s->filter_intra = ff_bwdif_filter_intra_c;
         s->filter_line  = filter_line_c;
         s->filter_edge  = filter_edge;
     }
 
 #if ARCH_X86
     ff_bwdif_init_x86(s, bit_depth);
+#elif ARCH_AARCH64
+    ff_bwdif_init_aarch64(s, bit_depth);
 #endif
 }