[FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif
Thomas Mundt
tmundt75 at gmail.com
Sat Mar 11 18:14:29 EET 2023
Hi James,
Am Mo., 20. Feb. 2023 um 20:59 Uhr schrieb James Darnley <jdarnley at obe.tv>:
> 2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3
> ---
> libavfilter/x86/vf_bwdif.asm | 29 ++++++++++++++++++++++++-----
> libavfilter/x86/vf_bwdif_init.c | 12 ++++++++++++
> 2 files changed, 36 insertions(+), 5 deletions(-)
>
> diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
> index 0b453da53b..5cc61435fd 100644
> --- a/libavfilter/x86/vf_bwdif.asm
> +++ b/libavfilter/x86/vf_bwdif.asm
> @@ -26,18 +26,22 @@
>
> %include "libavutil/x86/x86util.asm"
>
> -SECTION_RODATA
> +SECTION_RODATA 32
>
> -pw_coefhf: times 4 dw 1016, 5570
> -pw_coefhf1: times 8 dw -3801
> -pw_coefsp: times 4 dw 5077, -981
> -pw_splfdif: times 4 dw -768, 768
> +pw_coefhf: times 8 dw 1016, 5570
> +pw_coefhf1: times 16 dw -3801
> +pw_coefsp: times 8 dw 5077, -981
> +pw_splfdif: times 8 dw -768, 768
>
> SECTION .text
>
> %macro LOAD8 2
> + %if mmsize == 32
> + pmovzxbw %1, %2
> + %else
> movh %1, %2
> punpcklbw %1, m7
> + %endif
> %endmacro
>
> %macro LOAD12 2
> @@ -45,8 +49,14 @@ SECTION .text
> %endmacro
>
> %macro DISP8 0
> + %if mmsize == 32
> + vextracti128 xm1, m2, 1
> + packuswb xm2, xm1
> + movu [dstq], xm2
> + %else
> packuswb m2, m2
> movh [dstq], m2
> + %endif
> %endmacro
>
> %macro DISP12 0
> @@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst,
> prev, cur, next, w, \
> prefs, mrefs, prefs2,
> mrefs2, \
> prefs3, mrefs3, prefs4, \
> mrefs4, parity, clip_max
> + %if mmsize == 32
> + vpbroadcastd m12, DWORD clip_maxm
>
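I get a green pattern at bit depths > 8 with this line.
vpbroadcastd replicates the whole 32-bit clip_max value, so every second
16-bit lane of m12 ends up holding its (zero) upper half instead of clip_max,
unlike the movd + SPLATW path below, which copies the low word to all lanes.
It looks good with:
vpbroadcastw m12, WORD clip_maxm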
> + %else
> movd m12, DWORD clip_maxm
> SPLATW m12, m12, 0
> + %endif
> %else
> cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
> prefs, mrefs, prefs2,
> mrefs2, \
> @@ -264,3 +278,8 @@ INIT_XMM ssse3
> BWDIF
> INIT_XMM sse2
> BWDIF
> +
> +%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
> +INIT_YMM avx2
> +BWDIF
> +%endif
> diff --git a/libavfilter/x86/vf_bwdif_init.c
> b/libavfilter/x86/vf_bwdif_init.c
> index ba7bc40c3d..f833318c10 100644
> --- a/libavfilter/x86/vf_bwdif_init.c
> +++ b/libavfilter/x86/vf_bwdif_init.c
> @@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev,
> void *cur, void *next,
> int w, int prefs, int mrefs, int prefs2,
> int mrefs2, int prefs3, int mrefs3, int
> prefs4,
> int mrefs4, int parity, int clip_max);
> +void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void
> *next,
> + int w, int prefs, int mrefs, int prefs2,
> + int mrefs2, int prefs3, int mrefs3, int
> prefs4,
> + int mrefs4, int parity, int clip_max);
>
> void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur,
> void *next,
> int w, int prefs, int mrefs, int
> prefs2,
> @@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void
> *prev, void *cur, void *ne
> int w, int prefs, int mrefs, int
> prefs2,
> int mrefs2, int prefs3, int mrefs3,
> int prefs4,
> int mrefs4, int parity, int
> clip_max);
> +void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur,
> void *next,
> + int w, int prefs, int mrefs, int
> prefs2,
> + int mrefs2, int prefs3, int mrefs3,
> int prefs4,
> + int mrefs4, int parity, int
> clip_max);
>
> av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
> {
> @@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif,
> int bit_depth)
> bwdif->filter_line = ff_bwdif_filter_line_sse2;
> if (EXTERNAL_SSSE3(cpu_flags))
> bwdif->filter_line = ff_bwdif_filter_line_ssse3;
> + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
> + bwdif->filter_line = ff_bwdif_filter_line_avx2;
> } else if (bit_depth <= 12) {
> if (EXTERNAL_SSE2(cpu_flags))
> bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
> if (EXTERNAL_SSSE3(cpu_flags))
> bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
> + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
> + bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
> }
> }
> --
> 2.39.1
More information about the ffmpeg-devel
mailing list