[FFmpeg-devel] [PATCH] avfilter/scene_sad: add AArch64 SIMD

Marton Balint cus at passwd.hu
Sat Feb 1 22:26:47 EET 2020



On Sat, 1 Feb 2020, quinkblack at foxmail.com wrote:

> From: Zhao Zhili <quinkblack at foxmail.com>
>
> For 8 bit depth:
>    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null -benchmark -
>
>    Test results on Snapdragon 845:
>    Before:
>        frame=  250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.924x
> 	bench: utime=8.360s stime=2.350s rtime=10.820s
>    After:
>        frame=  250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=2.04x
> 	bench: utime=2.650s stime=2.210s rtime=4.909s
>
>    Test results on HiSilicon Kirin 970:
>    Before:
>        frame=  250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.239x
>        bench: utime=35.156s stime=6.604s rtime=41.820s
>    After:
>        frame=  250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.403x
> 	bench: utime=18.400s stime=6.376s rtime=24.798s
>
> For 16 bit depth:
>    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null -benchmark -
>
>    Test results on Snapdragon 845
>    Before:
>        frame=  250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.756x
> 	bench: utime=8.700s stime=4.410s rtime=13.226s
>    After:
> 	frame=  250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=1.07x
> 	bench: utime=4.920s stime=4.350s rtime=9.356s
>
>    Test results on HiSilicon Kirin 970:
>    Before:
>        frame=  250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.161x
> 	bench: utime=48.868s stime=13.124s rtime=62.110s
>    After:
>        frame=  250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.205x
> 	bench: utime=35.600s stime=13.036s rtime=48.708s
> ---
> libavfilter/aarch64/Makefile         |   2 +
> libavfilter/aarch64/scene_sad_init.c |  37 +++++++
> libavfilter/aarch64/scene_sad_neon.S | 149 +++++++++++++++++++++++++++
> libavfilter/scene_sad.c              |   2 +
> libavfilter/scene_sad.h              |   2 +
> 5 files changed, 192 insertions(+)
> create mode 100644 libavfilter/aarch64/scene_sad_init.c
> create mode 100644 libavfilter/aarch64/scene_sad_neon.S

Does your ASM handle cases when width is not a multiple of the 
vector size? If not, then you should probably do something similar to what 
is done for X86.

Thanks,
Marton

>
> diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
> index 6c727f9859..3a458f511f 100644
> --- a/libavfilter/aarch64/Makefile
> +++ b/libavfilter/aarch64/Makefile
> @@ -1,7 +1,9 @@
> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_afir_init.o
> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_anlmdn_init.o
> +OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/scene_sad_init.o
> OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
> 
> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_afir_neon.o
> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_anlmdn_neon.o
> +NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/scene_sad_neon.o
> NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
> diff --git a/libavfilter/aarch64/scene_sad_init.c b/libavfilter/aarch64/scene_sad_init.c
> new file mode 100644
> index 0000000000..8de769ac10
> --- /dev/null
> +++ b/libavfilter/aarch64/scene_sad_init.c
> @@ -0,0 +1,37 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/cpu.h"
> +#include "libavfilter/scene_sad.h"
> +
> +void ff_scene_sad_neon(SCENE_SAD_PARAMS);
> +
> +void ff_scene_sad16_neon(SCENE_SAD_PARAMS);
> +
> +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +    if (have_neon(cpu_flags)) {
> +        if (depth == 8)
> +            return ff_scene_sad_neon;
> +        if (depth == 16)
> +            return ff_scene_sad16_neon;
> +    }
> +
> +    return NULL;
> +}
> diff --git a/libavfilter/aarch64/scene_sad_neon.S b/libavfilter/aarch64/scene_sad_neon.S
> new file mode 100644
> index 0000000000..5b3b027a53
> --- /dev/null
> +++ b/libavfilter/aarch64/scene_sad_neon.S
> @@ -0,0 +1,149 @@
> +/*
> + * Copyright (c) 2020 Zhao Zhili
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +// void ff_scene_sadx_neon(const uint8_t *src1, ptrdiff_t stride1,
> +//                         const uint8_t *src2, ptrdiff_t stride2,
> +//                         ptrdiff_t width, ptrdiff_t height,
> +//                         uint64_t *sum)
> +.macro	scene_sad_neon, depth=8
> +	// x0: src1
> +	// x1: stride1
> +	// x2: src2
> +	// x3: stride2
> +	// x4: width
> +	// x5: height
> +	// x6: sum
> +
> +	// x7: step of width loop
> +	// x8: index of row
> +	// x9: width / x7 * x7
> +	// x10: sad
> +	// x11: index of column
> +	// w12: src1[x]
> +	// w13: src2[x]
> +
> +	mov	x8, xzr
> +	mov	x10, xzr
> +
> +.if \depth == 8
> +	mov	x7, #64
> +	and	x9, x4, #0xFFFFFFFFFFFFFFC0
> +.endif
> +
> +.if \depth == 16
> +	mov	x7, #32
> +	and	x9, x4, #0xFFFFFFFFFFFFFFE0
> +.endif
> +
> +1:	cmp	x4, x7		// check width
> +	mov	x11, xzr
> +	b.lt	3f
> +
> +	mov	v0.d[0], x10
> +
> +	// vector loop
> +2:
> +.if \depth == 8
> +	add	x14, x0, x11
> +	add	x15, x2, x11
> +.endif
> +
> +.if \depth == 16
> +	add	x14, x0, x11, lsl #1
> +	add	x15, x2, x11, lsl #1
> +.endif
> +	ld1	{v16.4S, v17.4S, v18.4S, v19.4S}, [x14]
> +	ld1	{v20.4S, v21.4S, v22.4S, v23.4S}, [x15]
> +	add	x11, x11, x7
> +	cmp	x9, x11
> +
> +.if \depth == 8
> +	uabd	v16.16B, v16.16B, v20.16B
> +	uabd	v17.16B, v17.16B, v21.16B
> +	uabd	v18.16B, v18.16B, v22.16B
> +	uabd	v19.16B, v19.16B, v23.16B
> +	uaddlv	h16, v16.16B
> +	uaddlv	h17, v17.16B
> +	uaddlv	h18, v18.16B
> +	uaddlv	h19, v19.16B
> +.endif
> +
> +.if \depth == 16
> +	uabd	v16.8H, v16.8H, v20.8H
> +	uabd	v17.8H, v17.8H, v21.8H
> +	uabd	v18.8H, v18.8H, v22.8H
> +	uabd	v19.8H, v19.8H, v23.8H
> +	uaddlv	s16, v16.8H
> +	uaddlv	s17, v17.8H
> +	uaddlv	s18, v18.8H
> +	uaddlv	s19, v19.8H
> +.endif
> +
> +	add	d16, d16, d17
> +	add	d18, d18, d19
> +	add	d0, d0, d16
> +	add	d0, d0, d18
> +
> +	b.ne	2b
> +
> +	cmp	x9, x4
> +	fmov	x10, d0
> +	b.eq	4f
> +
> +	// scalar loop
> +3:
> +.if \depth == 8
> +	ldrb	w12, [x0, x11]
> +	ldrb	w13, [x2, x11]
> +.endif
> +
> +.if \depth == 16
> +	ldrh	w12, [x0, x11, lsl #1]
> +	ldrh	w13, [x2, x11, lsl #1]
> +.endif
> +	add	x11, x11, #1
> +	subs	w12, w12, w13
> +	cneg	w12, w12, mi
> +	add	x10, x10, x12
> +	cmp	x11, x4
> +	b.ne	3b
> +
> +	// next row
> +4:
> +	add	x8, x8, #1              // =1
> +	add	x0, x0, x1
> +	cmp	x8, x5
> +	add	x2, x2, x3
> +	b.ne	1b
> +
> +5:
> +	str	x10, [x6]
> +	ret
> +.endm
> +
> +function ff_scene_sad_neon, export=1
> +	scene_sad_neon	depth=8
> +endfunc
> +
> +function ff_scene_sad16_neon, export=1
> +	scene_sad_neon	depth=16
> +endfunc
> diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
> index 73d3eacbfa..ee0c71f659 100644
> --- a/libavfilter/scene_sad.c
> +++ b/libavfilter/scene_sad.c
> @@ -61,6 +61,8 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
>     ff_scene_sad_fn sad = NULL;
>     if (ARCH_X86)
>         sad = ff_scene_sad_get_fn_x86(depth);
> +    if (ARCH_AARCH64)
> +        sad = ff_scene_sad_get_fn_aarch64(depth);
>     if (!sad) {
>         if (depth == 8)
>             sad = ff_scene_sad_c;
> diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
> index 173a051f2b..c868200dc4 100644
> --- a/libavfilter/scene_sad.h
> +++ b/libavfilter/scene_sad.h
> @@ -37,6 +37,8 @@ void ff_scene_sad_c(SCENE_SAD_PARAMS);
> 
> void ff_scene_sad16_c(SCENE_SAD_PARAMS);
> 
> +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth);
> +
> ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
> 
> ff_scene_sad_fn ff_scene_sad_get_fn(int depth);
> -- 
> 2.22.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".


More information about the ffmpeg-devel mailing list