[FFmpeg-devel] [PATCH v2 2/7] aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon

Grzegorz Bernacki gjb at semihalf.com
Mon Oct 3 17:10:15 EEST 2022


From: Martin Storsjö <martin at martin.st>

Before:       Cortex A53    A72    A73
pix_abs_1_2_neon:   73.7   31.0   25.7
After:
pix_abs_1_2_neon:   61.7   30.2   24.7

Signed-off-by: Martin Storsjö <martin at martin.st>
---
 libavcodec/aarch64/me_cmp_neon.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 43e068bb7f..3662419edf 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -193,21 +193,20 @@ function ff_pix_abs8_y2_neon, export=1
 1:
         ld1             {v2.8b}, [x2], x3
         ld1             {v0.8b}, [x1], x3
-        ld1             {v6.8b}, [x1], x3
         urhadd          v30.8b, v1.8b, v2.8b
         ld1             {v5.8b}, [x2], x3
-        ld1             {v21.8b}, [x1], x3
+        ld1             {v6.8b}, [x1], x3
         uabal           v26.8h, v0.8b, v30.8b
         urhadd          v29.8b, v2.8b, v5.8b
         ld1             {v20.8b}, [x2], x3
-        ld1             {v24.8b}, [x1], x3
+        ld1             {v21.8b}, [x1], x3
         uabal           v26.8h, v6.8b, v29.8b
         urhadd          v28.8b, v5.8b, v20.8b
-        uabal           v26.8h, v21.8b, v28.8b
-        ld1             {v23.8b}, [x2], x3
-        mov             v1.8b, v23.8b
+        ld1             {v1.8b},  [x2], x3
+        ld1             {v24.8b}, [x1], x3
+        urhadd          v27.8b, v20.8b, v1.8b
         sub             w4, w4, #4
-        urhadd          v27.8b, v20.8b, v23.8b
+        uabal           v26.8h, v21.8b, v28.8b
         cmp             w4, #4
         uabal           v26.8h, v24.8b, v27.8b
 
-- 
2.37.1



More information about the ffmpeg-devel mailing list