[FFmpeg-devel] [PATCH 3/3] avfilter: add avx2 filter_line function for bwdif
James Darnley
jdarnley at obe.tv
Mon Feb 20 21:57:03 EET 2023
2.24x faster than ssse3 (ssse3: 1925±1.3 decicycles, avx2: 859±2.2 decicycles)
---
libavfilter/x86/vf_bwdif.asm | 29 ++++++++++++++++++++++++-----
libavfilter/x86/vf_bwdif_init.c | 12 ++++++++++++
2 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
index 0b453da53b..5cc61435fd 100644
--- a/libavfilter/x86/vf_bwdif.asm
+++ b/libavfilter/x86/vf_bwdif.asm
@@ -26,18 +26,22 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32 ; 32-byte alignment so each table below can be read as one aligned ymm vector
-pw_coefhf: times 4 dw 1016, 5570
-pw_coefhf1: times 8 dw -3801
-pw_coefsp: times 4 dw 5077, -981
-pw_splfdif: times 4 dw -768, 768
+pw_coefhf: times 8 dw 1016, 5570 ; 16 words = 32 bytes (was 8 words = 16 bytes)
+pw_coefhf1: times 16 dw -3801 ; 16 words = 32 bytes
+pw_coefsp: times 8 dw 5077, -981 ; 16 words = 32 bytes
+pw_splfdif: times 8 dw -768, 768 ; 16 words = 32 bytes
SECTION .text
%macro LOAD8 2
+ %if mmsize == 32
+ pmovzxbw %1, %2 ; ymm build: zero-extend the bytes at %2 to words in %1 in a single instruction
+ %else
movh %1, %2
punpcklbw %1, m7
+ %endif
%endmacro
%macro LOAD12 2
@@ -45,8 +49,14 @@ SECTION .text
%endmacro
%macro DISP8 0
+ %if mmsize == 32
+ vextracti128 xm1, m2, 1 ; xm1 = upper 128 bits of m2 (xm1 is used as scratch here)
+ packuswb xm2, xm1 ; saturate words to unsigned bytes: low half from xm2, high half from xm1
+ movu [dstq], xm2 ; unaligned store of the 16 packed bytes to dst
+ %else
packuswb m2, m2
movh [dstq], m2
+ %endif
%endmacro
%macro DISP12 0
@@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
prefs, mrefs, prefs2, mrefs2, \
prefs3, mrefs3, prefs4, \
mrefs4, parity, clip_max
+ %if mmsize == 32
+ vpbroadcastw m12, WORD clip_maxm ; clip_max must fill every 16-bit lane, like SPLATW below; a dword broadcast would leave odd lanes 0
+ %else
movd m12, DWORD clip_maxm
SPLATW m12, m12, 0
+ %endif
%else
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
prefs, mrefs, prefs2, mrefs2, \
@@ -264,3 +278,8 @@ INIT_XMM ssse3
BWDIF
INIT_XMM sse2
BWDIF
+
+%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
+INIT_YMM avx2 ; instantiate BWDIF with ymm registers, selecting the mmsize == 32 branches of the macros
+BWDIF
+%endif
diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
index ba7bc40c3d..f833318c10 100644
--- a/libavfilter/x86/vf_bwdif_init.c
+++ b/libavfilter/x86/vf_bwdif_init.c
@@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
+void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int prefs4,
+ int mrefs4, int parity, int clip_max);
void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2,
@@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
+void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int prefs4,
+ int mrefs4, int parity, int clip_max);
av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
{
@@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
bwdif->filter_line = ff_bwdif_filter_line_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_ssse3;
+ if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+ bwdif->filter_line = ff_bwdif_filter_line_avx2;
} else if (bit_depth <= 12) {
if (EXTERNAL_SSE2(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
+ if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+ bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
}
}
--
2.39.1
More information about the ffmpeg-devel mailing list