[FFmpeg-devel] [PATCH 2/4] libswscale: AVX2 hscale can process input of any size that is a multiple of 4.
Alan Kelly
alankelly at google.com
Mon Jan 10 16:58:34 EET 2022
The main loop processes blocks of 16 pixels. The tail loop processes
the remaining pixels in blocks of 4.
---
libswscale/x86/scale_avx2.asm | 48 +++++++++++++++++++++++++++++++++--
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index 20acdbd633..dc42abb100 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize,
mova m14, [four]
shr fltsized, 2
%endif
+ cmp wq, 16
+ jl .tail_loop
+ mov countq, 0x10
.loop:
movu m1, [fltposq]
movu m2, [fltposq+32]
@@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize,
vpsrad m6, 7
vpackssdw m5, m5, m6
vpermd m5, m15, m5
- vmovdqu [dstq + countq * 2], m5
+ vmovdqu [dstq], m5
+ add dstq, 0x20
add fltposq, 0x40
add countq, 0x10
cmp countq, wq
- jl .loop
+ jle .loop
+
+ sub countq, 0x10
+ cmp countq, wq
+ jge .end
+
+.tail_loop:
+ movu xm1, [fltposq]
+%ifidn %1, X4
+ pxor xm9, xm9
+ pxor xm10, xm10
+ xor innerq, innerq
+.tail_innerloop:
+%endif
+ vpcmpeqd xm13, xm13
+ vpgatherdd xm3,[srcmemq + xm1], xm13
+ vpunpcklbw xm5, xm3, xm0
+ vpunpckhbw xm6, xm3, xm0
+ vpmaddwd xm5, xm5, [filterq]
+ vpmaddwd xm6, xm6, [filterq + 16]
+ add filterq, 0x20
+%ifidn %1, X4
+ paddd xm9, xm5
+ paddd xm10, xm6
+ paddd xm1, xm14
+ add innerq, 1
+ cmp innerq, fltsizeq
+ jl .tail_innerloop
+ vphaddd xm5, xm9, xm10
+%else
+ vphaddd xm5, xm5, xm6
+%endif
+ vpsrad xm5, 7
+ vpackssdw xm5, xm5, xm5
+ vmovq [dstq], xm5
+ add dstq, 0x8
+ add fltposq, 0x10
+ add countq, 0x4
+ cmp countq, wq
+ jl .tail_loop
+.end:
REP_RET
%endmacro
--
2.34.1.575.g55b058a8bb-goog
More information about the ffmpeg-devel
mailing list