[FFmpeg-devel] [PATCH] avfilter/scene_sad: add AArch64 SIMD

quinkblack at foxmail.com quinkblack at foxmail.com
Sat Feb 1 11:57:17 EET 2020


From: Zhao Zhili <quinkblack at foxmail.com>

For 8 bit depth:
    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null -benchmark -

    Test results on Snapdragon 845:
    Before:
        frame=  250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.924x
        bench: utime=8.360s stime=2.350s rtime=10.820s
    After:
        frame=  250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=2.04x
        bench: utime=2.650s stime=2.210s rtime=4.909s

    Test results on HiSilicon Kirin 970:
    Before:
        frame=  250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.239x
        bench: utime=35.156s stime=6.604s rtime=41.820s
    After:
        frame=  250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.403x
        bench: utime=18.400s stime=6.376s rtime=24.798s

For 16 bit depth:
    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null -benchmark -

    Test results on Snapdragon 845:
    Before:
        frame=  250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.756x
        bench: utime=8.700s stime=4.410s rtime=13.226s
    After:
        frame=  250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=1.07x
        bench: utime=4.920s stime=4.350s rtime=9.356s

    Test results on HiSilicon Kirin 970:
    Before:
        frame=  250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.161x
        bench: utime=48.868s stime=13.124s rtime=62.110s
    After:
        frame=  250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.205x
        bench: utime=35.600s stime=13.036s rtime=48.708s
---
 libavfilter/aarch64/Makefile         |   2 +
 libavfilter/aarch64/scene_sad_init.c |  37 +++++++
 libavfilter/aarch64/scene_sad_neon.S | 149 +++++++++++++++++++++++++++
 libavfilter/scene_sad.c              |   2 +
 libavfilter/scene_sad.h              |   2 +
 5 files changed, 192 insertions(+)
 create mode 100644 libavfilter/aarch64/scene_sad_init.c
 create mode 100644 libavfilter/aarch64/scene_sad_neon.S

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index 6c727f9859..3a458f511f 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,7 +1,9 @@
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_afir_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_anlmdn_init.o
+OBJS-$(CONFIG_SCENE_SAD)                     += aarch64/scene_sad_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
 
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_afir_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_anlmdn_neon.o
+NEON-OBJS-$(CONFIG_SCENE_SAD)                += aarch64/scene_sad_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/scene_sad_init.c b/libavfilter/aarch64/scene_sad_init.c
new file mode 100644
index 0000000000..8de769ac10
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_init.c
@@ -0,0 +1,37 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/scene_sad.h"
+
+/* NEON implementations, defined in scene_sad_neon.S. */
+void ff_scene_sad_neon(SCENE_SAD_PARAMS);
+void ff_scene_sad16_neon(SCENE_SAD_PARAMS);
+
+/* Pick the NEON SAD kernel for depth; NULL when NEON or the depth is unsupported. */
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth)
+{
+    const int flags = av_get_cpu_flags();
+    if (!have_neon(flags))
+        return NULL; /* CPU does not advertise NEON */
+    switch (depth) {
+    case 8:  return ff_scene_sad_neon;
+    case 16: return ff_scene_sad16_neon;
+    default: return NULL; /* no NEON kernel for this depth */
+    }
+}
diff --git a/libavfilter/aarch64/scene_sad_neon.S b/libavfilter/aarch64/scene_sad_neon.S
new file mode 100644
index 0000000000..5b3b027a53
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_neon.S
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Zhao Zhili
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void ff_scene_sad{,16}_neon(const uint8_t *src1, ptrdiff_t stride1,
+//                             const uint8_t *src2, ptrdiff_t stride2,
+//                             ptrdiff_t width, ptrdiff_t height,
+//                             uint64_t *sum)
+.macro	scene_sad_neon, depth=8
+	// Sums |src1[x] - src2[x]| over width x height samples and stores
+	// the 64-bit total to *sum. Arguments as passed in registers:
+	// x0: src1, advanced by x1 (stride1, in bytes) after each row
+	// x2: src2, advanced by x3 (stride2, in bytes) after each row
+	// x4: width, in samples (1 byte each for depth 8, 2 bytes for 16)
+	// x5: height, in rows
+	// x6: sum (output pointer)
+	// Scratch registers:
+	// x7: step of width loop (samples consumed per vector iteration)
+	// x8: index of row
+	// x9: width / x7 * x7 (end column of the vector loop)
+	// x10: sad
+	// x11: index of column
+	// w12: src1[x]     x14/x15: addresses loaded by the vector loop
+	// w13: src2[x]     v0: sad accumulator while in the vector loop
+	// NOTE(review): the scalar and row loops test their exit condition after the body, so width and height are assumed to be >= 1
+	mov	x8, xzr		// row = 0
+	mov	x10, xzr		// sad = 0
+
+.if \depth == 8
+	mov	x7, #64		// 64 one-byte samples per vector iteration
+	and	x9, x4, #0xFFFFFFFFFFFFFFC0	// width & ~63
+.endif
+
+.if \depth == 16
+	mov	x7, #32		// 32 two-byte samples per vector iteration
+	and	x9, x4, #0xFFFFFFFFFFFFFFE0	// width & ~31
+.endif
+
+1:	cmp	x4, x7		// row start: compare width with the vector step
+	mov	x11, xzr		// column = 0
+	b.lt	3f		// width < step: scalar loop only
+
+	mov	v0.d[0], x10		// seed the vector accumulator with the running sad
+
+	// vector loop: 64 bytes of each source per iteration
+2:
+.if \depth == 8
+	add	x14, x0, x11		// &src1[column]
+	add	x15, x2, x11		// &src2[column]
+.endif
+
+.if \depth == 16
+	add	x14, x0, x11, lsl #1	// &src1[column], 2 bytes per sample
+	add	x15, x2, x11, lsl #1	// &src2[column], 2 bytes per sample
+.endif
+	ld1	{v16.4S, v17.4S, v18.4S, v19.4S}, [x14]	// 64 bytes of src1
+	ld1	{v20.4S, v21.4S, v22.4S, v23.4S}, [x15]	// 64 bytes of src2
+	add	x11, x11, x7		// column += step
+	cmp	x9, x11		// sets the flags tested by b.ne at the end of the loop
+
+.if \depth == 8
+	uabd	v16.16B, v16.16B, v20.16B	// |src1 - src2| per byte
+	uabd	v17.16B, v17.16B, v21.16B
+	uabd	v18.16B, v18.16B, v22.16B
+	uabd	v19.16B, v19.16B, v23.16B
+	uaddlv	h16, v16.16B		// sum the 16 byte lanes of each register
+	uaddlv	h17, v17.16B
+	uaddlv	h18, v18.16B
+	uaddlv	h19, v19.16B
+.endif
+
+.if \depth == 16
+	uabd	v16.8H, v16.8H, v20.8H	// |src1 - src2| per halfword
+	uabd	v17.8H, v17.8H, v21.8H
+	uabd	v18.8H, v18.8H, v22.8H
+	uabd	v19.8H, v19.8H, v23.8H
+	uaddlv	s16, v16.8H		// sum the 8 halfword lanes of each register
+	uaddlv	s17, v17.8H
+	uaddlv	s18, v18.8H
+	uaddlv	s19, v19.8H
+.endif
+
+	add	d16, d16, d17		// combine the four partial sums pairwise
+	add	d18, d18, d19
+	add	d0, d0, d16		// and add both halves to the accumulator
+	add	d0, d0, d18
+
+	b.ne	2b		// repeat until column == x9
+
+	cmp	x9, x4		// is the row width a multiple of the step?
+	fmov	x10, d0		// sad = vector accumulator
+	b.eq	4f		// yes: no leftover samples in this row
+
+	// scalar loop: one sample per iteration until column == width
+3:
+.if \depth == 8
+	ldrb	w12, [x0, x11]		// src1[column]
+	ldrb	w13, [x2, x11]		// src2[column]
+.endif
+
+.if \depth == 16
+	ldrh	w12, [x0, x11, lsl #1]	// src1[column]
+	ldrh	w13, [x2, x11, lsl #1]	// src2[column]
+.endif
+	add	x11, x11, #1		// ++column
+	subs	w12, w12, w13		// difference; the flags feed cneg
+	cneg	w12, w12, mi		// negate if negative: absolute difference
+	add	x10, x10, x12		// sad += absolute difference
+	cmp	x11, x4
+	b.ne	3b		// repeat until column == width
+
+	// next row
+4:
+	add	x8, x8, #1              // ++row
+	add	x0, x0, x1		// src1 += stride1
+	cmp	x8, x5
+	add	x2, x2, x3		// src2 += stride2
+	b.ne	1b		// repeat until row == height
+
+5:
+	str	x10, [x6]		// *sum = sad
+	ret
+.endm
+
+function ff_scene_sad_neon, export=1
+	scene_sad_neon	depth=8		// one-byte samples
+endfunc
+
+function ff_scene_sad16_neon, export=1
+	scene_sad_neon	depth=16		// two-byte samples
+endfunc
diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
index 73d3eacbfa..ee0c71f659 100644
--- a/libavfilter/scene_sad.c
+++ b/libavfilter/scene_sad.c
@@ -61,6 +61,8 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
     ff_scene_sad_fn sad = NULL;
     if (ARCH_X86)
         sad = ff_scene_sad_get_fn_x86(depth);
+    if (ARCH_AARCH64)
+        sad = ff_scene_sad_get_fn_aarch64(depth);
     if (!sad) {
         if (depth == 8)
             sad = ff_scene_sad_c;
diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
index 173a051f2b..c868200dc4 100644
--- a/libavfilter/scene_sad.h
+++ b/libavfilter/scene_sad.h
@@ -37,6 +37,8 @@ void ff_scene_sad_c(SCENE_SAD_PARAMS);
 
 void ff_scene_sad16_c(SCENE_SAD_PARAMS);
 
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth);
+
 ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
 
 ff_scene_sad_fn ff_scene_sad_get_fn(int depth);
-- 
2.22.0



More information about the ffmpeg-devel mailing list