[FFmpeg-devel] [PATCH 3/3] avfilter/yadif: add avx2 filter_line function
James Darnley
jdarnley at obe.tv
Fri Feb 10 15:06:57 EET 2023
Zen 2 (Ryzen 7 3700X):
1.73x faster (3603±586.3 vs. 2082±317.1 decicycles) compared with ssse3
With an SD y4m file, speed increases from ~3600 fps to ~4700 fps.
---
libavfilter/x86/vf_yadif.asm | 83 +++++++++++++++++++++++----------
libavfilter/x86/vf_yadif_init.c | 4 ++
2 files changed, 62 insertions(+), 25 deletions(-)
diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm
index 809cebdd3f..571febfca3 100644
--- a/libavfilter/x86/vf_yadif.asm
+++ b/libavfilter/x86/vf_yadif.asm
@@ -25,11 +25,30 @@
SECTION_RODATA
-pb_1: times 16 db 1
-pw_1: times 8 dw 1
+pb_1: times 32 db 1
+pw_1: times 16 dw 1
SECTION .text
+%unmacro RSHIFT 2
+
+%macro RSHIFT 2 ; %1 = vector register (shifted in place), %2 = shift amount in bytes
+%if mmsize == 32
+ vextracti128 xm7, %1, 1 ; xm7 = upper 128 bits of %1 (xm7 is clobbered as scratch)
+ palignr xmm %+ %1, xm7, xmm %+ %1, %2 ; low lane = bytes %2..%2+15 of the 256-bit value; must honour %2 (callers pass 1 and 2)
+%else
+ psrldq %1, %2
+%endif
+%endmacro
+
+%macro UNPACK 1 ; %1 = vector register, widened in place from bytes to words
+%if mmsize == 32
+ pmovzxbw %1, xmm %+ %1 ; zero-extend the 16 bytes of the low lane to 16 words
+%else
+ punpcklbw %1, m7 ; interleave the low bytes of %1 with the bytes of m7
+%endif
+%endmacro
+
%macro CHECK 2
movu m2, [curq+t1+%1]
movu m3, [curq+t0+%2]
@@ -40,7 +59,7 @@ SECTION .text
pand m4, [pb_1]
psubusb m5, m4
RSHIFT m5, 1
- punpcklbw m5, m7
+ UNPACK m5
mova m4, m2
psubusb m2, m3
psubusb m3, m4
@@ -49,9 +68,9 @@ SECTION .text
mova m4, m2
RSHIFT m3, 1
RSHIFT m4, 2
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
+ UNPACK m2
+ UNPACK m3
+ UNPACK m4
paddw m2, m3
paddw m2, m4
%endmacro
@@ -81,13 +100,19 @@ SECTION .text
%endmacro
%macro LOAD 2
- movh %1, %2
- punpcklbw %1, m7
+ %if mmsize == 32
+ pmovzxbw %1, %2 ; load bytes from %2 and zero-extend them to words in %1
+ %else
+ movh %1, %2
+ punpcklbw %1, m7 ; interleave with m7, which the FILTER loop clears with pxor in this path
+ %endif
%endmacro
%macro FILTER 3
.loop%1:
- pxor m7, m7
+ %if mmsize != 32
+ pxor m7, m7
+ %endif
LOAD m0, [curq+t1]
LOAD m1, [curq+t0]
LOAD m2, [%2]
@@ -95,9 +120,9 @@ SECTION .text
mova m4, m3
paddw m3, m2
psraw m3, 1
- mova [rsp+ 0], m0
- mova [rsp+16], m3
- mova [rsp+32], m1
+ mova [rsp+0*mmsize], m0
+ mova [rsp+1*mmsize], m3
+ mova [rsp+2*mmsize], m1
psubw m2, m4
ABS1 m2, m4
LOAD m3, [prevq+t1]
@@ -119,7 +144,7 @@ SECTION .text
paddw m3, m4
psrlw m3, 1
pmaxsw m2, m3
- mova [rsp+48], m2
+ mova [rsp+3*mmsize], m2
paddw m1, m0
paddw m0, m0
@@ -134,9 +159,9 @@ SECTION .text
psubusb m3, m4
pmaxub m2, m3
mova m3, m2
- psrldq m3, 2
- punpcklbw m2, m7
- punpcklbw m3, m7
+ RSHIFT m3, 2
+ UNPACK m2
+ UNPACK m3
paddw m0, m2
paddw m0, m3
psubw m0, [pw_1]
@@ -150,7 +175,7 @@ SECTION .text
CHECK 1, -3
CHECK2
- mova m6, [rsp+48]
+ mova m6, [rsp+3*mmsize]
cmp DWORD r8m, 2
jge .end%1
LOAD m2, [%2+t1*2]
@@ -161,9 +186,9 @@ SECTION .text
paddw m3, m5
psrlw m2, 1
psrlw m3, 1
- mova m4, [rsp+ 0]
- mova m5, [rsp+16]
- mova m7, [rsp+32]
+ mova m4, [rsp+0*mmsize]
+ mova m5, [rsp+1*mmsize]
+ mova m7, [rsp+2*mmsize]
psubw m2, m4
psubw m3, m7
mova m0, m5
@@ -182,15 +207,21 @@ SECTION .text
pmaxsw m6, m4
.end%1:
- mova m2, [rsp+16]
+ mova m2, [rsp+1*mmsize]
mova m3, m2
psubw m2, m6
paddw m3, m6
pmaxsw m1, m2
pminsw m1, m3
- packuswb m1, m1
- movh [dstq], m1
+ %if mmsize == 32
+ vextracti128 xm4, ym1, 1
+ packuswb xm1, xm4
+ movu [dstq], xm1
+ %else
+ packuswb m1, m1
+ movh [dstq], m1
+ %endif
add dstq, mmsize/2
add prevq, mmsize/2
add curq, mmsize/2
@@ -201,10 +232,10 @@ SECTION .text
%macro YADIF 0
%if ARCH_X86_32
-cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 6, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
mrefs, parity, mode
%else
-cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 7, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
mrefs, parity, mode
%endif
%if ARCH_X86_32
@@ -233,3 +264,5 @@ INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
+INIT_YMM avx2
+YADIF
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index d648f0f835..48858dc295 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -29,6 +29,8 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
+void ff_yadif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int parity, int mode);
void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
@@ -68,5 +70,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth)
yadif->filter_line = ff_yadif_filter_line_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_ssse3;
+ if (EXTERNAL_AVX2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_avx2;
}
}
--
2.39.1
More information about the ffmpeg-devel
mailing list