[FFmpeg-devel] [PATCH v2 7/7] aarch64: me_cmp: Improve scheduling in vsse_intra8
Grzegorz Bernacki
gjb at semihalf.com
Mon Oct 3 17:10:20 EEST 2022
From: Martin Storsjö <martin at martin.st>
Before:       Cortex A53    A72    A73
vsse_5_neon:        74.7   31.5   26.0
After:
vsse_5_neon:        62.7   32.5   25.7
---
libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 61e4f68335..d8a18cd4b8 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1113,11 +1113,11 @@ function vsse_intra8_neon, export=1
// x3 ptrdiff_t stride
// w4 int h
+ sub w4, w4, #1 // we need to make h-1 iterations
ld1 {v0.8b}, [x1], x3
+ cmp w4, #3
movi v16.4s, #0
- sub w4, w4, #1 // we need to make h-1 iterations
- cmp w4, #3
b.lt 2f
1:
@@ -1127,13 +1127,13 @@ function vsse_intra8_neon, export=1
ld1 {v2.8b}, [x1], x3
uabd v30.8b, v0.8b, v1.8b
ld1 {v3.8b}, [x1], x3
- umull v29.8h, v30.8b, v30.8b
uabd v27.8b, v1.8b, v2.8b
- uadalp v16.4s, v29.8h
- umull v26.8h, v27.8b, v27.8b
+ umull v29.8h, v30.8b, v30.8b
uabd v25.8b, v2.8b, v3.8b
- uadalp v16.4s, v26.8h
+ umull v26.8h, v27.8b, v27.8b
+ uadalp v16.4s, v29.8h
umull v24.8h, v25.8b, v25.8b
+ uadalp v16.4s, v26.8h
sub w4, w4, #3
uadalp v16.4s, v24.8h
cmp w4, #3
--
2.37.1
More information about the ffmpeg-devel mailing list