[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: Add pix_abs16_x2 neon implementation
Swinney, Jonathan
jswinney at amazon.com
Mon Jul 11 22:59:40 EEST 2022
> + // accumulate the result in d18
> + add d18, d18, d16
> + add d18, d18, d17
> + add d18, d18, d19
> + add d18, d18, d21
Did you experiment with distributing these instructions to each of the iteration blocks? It might be marginally faster since you could reduce the data dependencies in adjacent instructions.
--
Jonathan Swinney
From: Hubert Mazur <hum at semihalf.com>
Date: Monday, July 11, 2022 at 7:23 AM
To: "ffmpeg-devel at ffmpeg.org" <ffmpeg-devel at ffmpeg.org>
Cc: "Pop, Sebastian" <spop at amazon.com>, "Swinney, Jonathan" <jswinney at amazon.com>, Martin Storsjö <martin at martin.st>, Grzegorz Bernacki <gjb at semihalf.com>, Marcin Wojtas <mw at semihalf.com>
Subject: RE: [EXTERNAL][PATCH 2/2] lavc/aarch64: Add pix_abs16_x2 neon implementation
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Hi, do you have any feedback regarding the patch?
Regards,
Hubert
On Wed, Jun 29, 2022 at 10:25 AM Hubert Mazur <hum at semihalf.com> wrote:
Provide neon implementation for pix_abs16_x2 function.
Performance tests of the implementation are below.
- pix_abs_0_1_c: 291.9
- pix_abs_0_1_neon: 73.7
Benchmarks and tests run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum at semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 +
libavcodec/aarch64/me_cmp_neon.S | 134 +++++++++++++++++++++++
2 files changed, 137 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index bec9148a1a..136b008eb7 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -27,6 +27,8 @@ int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
+int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -34,6 +36,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
if (have_neon(cpu_flags)) {
c->pix_abs[0][0] = ff_pix_abs16_neon;
+ c->pix_abs[0][1] = ff_pix_abs16_x2_neon;
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
c->sad[0] = ff_pix_abs16_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index a7937bd8be..c2fd94f4b3 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -203,3 +203,137 @@ function ff_pix_abs16_xy2_neon, export=1
fmov w0, s0 // copy result to general purpose register
ret
endfunc
+
+function ff_pix_abs16_x2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // x4           int h (expected > 0: the single-row loop runs once before testing w4)
+        // w0 (return)  sum over h rows of abs(pix1[i] - avg2(pix2[i], pix2[i + 1])), i = 0..15
+        // preserve d8-d11 (callee-saved low halves of v8-v11), used as load targets below
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+
+        // initialize accumulator (d18), rounding constant (v20) and second pix2 pointer (x5)
+        movi            d18, #0
+        movi            v20.8h, #1
+        add             x5, x2, #1                  // pix2 + 1
+        cmp             w4, #4
+        b.lt            2f                          // fewer than 4 rows: single-row loop only
+
+// make 4 iterations at once
+1:
+        // v0 - pix1
+        // v1 - pix2
+        // v2 - pix2 + 1
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+        ld1             {v2.16b}, [x5], x3
+
+        ld1             {v3.16b}, [x1], x3
+        ld1             {v4.16b}, [x2], x3
+        ld1             {v5.16b}, [x5], x3
+
+        ld1             {v6.16b}, [x1], x3
+        ld1             {v7.16b}, [x2], x3
+        ld1             {v8.16b}, [x5], x3
+
+        ld1             {v9.16b}, [x1], x3
+        ld1             {v10.16b}, [x2], x3
+        ld1             {v11.16b}, [x5], x3
+
+        // abs(pix1[0] - avg2(pix2[0], pix2[1]))
+        // avg2(a,b) = (((a) + (b) + 1) >> 1)
+        // abs(x) = (x < 0 ? -x : x)
+
+        // pix2[0] + pix2[1], widened to 16 bits (low half in v30, high half in v29)
+        uaddl           v30.8h, v1.8b, v2.8b
+        uaddl2          v29.8h, v1.16b, v2.16b
+        // add one to each element
+        add             v30.8h, v30.8h, v20.8h
+        add             v29.8h, v29.8h, v20.8h
+        // divide by 2, narrow width and store in v30
+        uqshrn          v30.8b, v30.8h, #1
+        uqshrn2         v30.16b, v29.8h, #1
+
+        // abs(pix1[0] - avg2(pix2[0], pix2[1]))
+        uabd            v16.16b, v0.16b, v30.16b
+        uaddlv          h16, v16.16b                // h16 = row sum; scalar write clears the rest of v16
+
+        // 2nd iteration
+        uaddl           v28.8h, v4.8b, v5.8b
+        uaddl2          v27.8h, v4.16b, v5.16b
+        add             v28.8h, v28.8h, v20.8h
+        add             v27.8h, v27.8h, v20.8h
+
+        uqshrn          v28.8b, v28.8h, #1
+        uqshrn2         v28.16b, v27.8h, #1
+
+        uabd            v17.16b, v3.16b, v28.16b
+        uaddlv          h17, v17.16b
+
+        // 3rd iteration
+        uaddl           v26.8h, v7.8b, v8.8b
+        uaddl2          v25.8h, v7.16b, v8.16b
+        add             v26.8h, v26.8h, v20.8h
+        add             v25.8h, v25.8h, v20.8h
+
+        uqshrn          v26.8b, v26.8h, #1
+        uqshrn2         v26.16b, v25.8h, #1
+
+        uabd            v19.16b, v6.16b, v26.16b
+        uaddlv          h19, v19.16b
+
+        // 4th iteration
+        uaddl           v24.8h, v10.8b, v11.8b
+        uaddl2          v23.8h, v10.16b, v11.16b
+        add             v24.8h, v24.8h, v20.8h
+        add             v23.8h, v23.8h, v20.8h
+
+        uqshrn          v24.8b, v24.8h, #1
+        uqshrn2         v24.16b, v23.8h, #1
+
+        uabd            v21.16b, v9.16b, v24.16b
+        uaddlv          h21, v21.16b
+
+        sub             w4, w4, #4                  // four rows consumed
+
+        // accumulate the result in d18
+        add             d18, d18, d16
+        add             d18, d18, d17
+        add             d18, d18, d19
+        add             d18, d18, d21
+
+        cmp             w4, #4
+        b.ge            1b                          // at least 4 rows left: another unrolled pass
+        cbz             w4, 3f                      // nothing left: skip the single-row loop
+
+// iterate by one
+2:
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+        ld1             {v2.16b}, [x5], x3
+
+        uaddl           v30.8h, v1.8b, v2.8b
+        uaddl2          v29.8h, v1.16b, v2.16b
+        add             v30.8h, v30.8h, v20.8h
+        add             v29.8h, v29.8h, v20.8h
+
+        uqshrn          v30.8b, v30.8h, #1
+        uqshrn2         v30.16b, v29.8h, #1         // high 8 lanes come from v29, not the v20 constant
+
+        uabd            v28.16b, v0.16b, v30.16b
+        uaddlv          h28, v28.16b
+
+        add             d18, d18, d28
+        subs            w4, w4, #1
+        b.ne            2b
+
+3:
+        fmov            w0, s18                     // return the low 32 bits of the accumulator
+        ldp             d8,  d9,  [sp], #0x10
+        ldp             d10, d11, [sp], #0x10
+
+        ret
+endfunc
--
2.34.1
More information about the ffmpeg-devel
mailing list