[FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4
Swinney, Jonathan
jswinney at amazon.com
Sat Jul 23 00:30:57 EEST 2022
As Martin noted, this patch doesn't build. Other than that, it would be nice if there were a comment on each line noting which of the 4 iterations that instruction belongs to. That would make the code a little easier to read, in my opinion, since the instructions are manually reordered.
Thanks,
--
Jonathan Swinney
On 7/15/22, 3:03 AM, "Hubert Mazur" <hum at semihalf.com> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Provide neon implementation for sse4 function.
Performance comparison tests are shown below.
- sse_2_c: 74.0
- sse_2_neon: 24.0
Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
Signed-off-by: Hubert Mazur <hum at semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++
libavcodec/aarch64/me_cmp_neon.S | 65 ++++++++++++++++++++++++
2 files changed, 68 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 3ff5767bd0..72a2062e7e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h);
+int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->sad[0] = ff_pix_abs16_neon;
c->sse[0] = sse16_neon;
+ c->sse[2] = sse4_neon;
}
}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 88cd335443..bacf151314 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -360,3 +360,68 @@ function sse16_neon, export=1
ret
endfunc
+
+function sse4_neon, export=1
+        // Sum of squared differences of two blocks, 4 bytes per row, h rows.
+        // x0 - unused, x1 - pix1, x2 - pix2
+        // x3 - stride, added to both pointers after every row load
+        // w4 - h, row count; must be > 0 (the tail loop tests after subs)
+        // Result: low 32 bits of the accumulated sum, returned in w0.
+
+        movi            d18, #0                         // d18 = running total
+        // d17 needs no clearing: uaddlv overwrites it before every use
+        cmp             w4, #4
+        b.lt            2f                              // fewer than 4 rows: tail loop only
+
+// make 4 iterations at once
+1:
+        // Each row is loaded into lane s[0] (4 bytes). uabdl widens all 8
+        // byte lanes, but only the low .4h half is squared, so the
+        // untouched upper bytes of the source registers never contribute.
+
+        ld1             {v0.s}[0], [x1], x3             // row 0: load pix1
+        ld1             {v1.s}[0], [x2], x3             // row 0: load pix2
+        uabdl           v30.8h, v0.8b, v1.8b            // row 0: abs(pix1 - pix2)
+        ld1             {v2.s}[0], [x1], x3             // row 1: load pix1
+        ld1             {v3.s}[0], [x2], x3             // row 1: load pix2
+        umull           v16.4s, v30.4h, v30.4h          // row 0: square, restarts v16
+        uabdl           v29.8h, v2.8b, v3.8b            // row 1: abs(pix1 - pix2)
+        ld1             {v4.s}[0], [x1], x3             // row 2: load pix1
+        ld1             {v5.s}[0], [x2], x3             // row 2: load pix2
+        umlal           v16.4s, v29.4h, v29.4h          // row 1: square and accumulate
+        uabdl           v28.8h, v4.8b, v5.8b            // row 2: abs(pix1 - pix2)
+        ld1             {v6.s}[0], [x1], x3             // row 3: load pix1
+        ld1             {v7.s}[0], [x2], x3             // row 3: load pix2
+        umlal           v16.4s, v28.4h, v28.4h          // row 2: square and accumulate
+        uabdl           v27.8h, v6.8b, v7.8b            // row 3: abs(pix1 - pix2)
+        umlal           v16.4s, v27.4h, v27.4h          // row 3: square and accumulate
+
+        uaddlv          d17, v16.4s                     // sum the 4 lanes of this batch
+        add             d18, d18, d17                   // fold into the running total
+
+        sub             w4, w4, #4
+        cmp             w4, #4
+        b.ge            1b                              // at least 4 rows left
+
+        cbnz            w4, 2f                          // 1-3 rows left: finish in tail loop
+        fmov            w0, s18
+
+        ret
+
+// iterate by one
+2:
+        ld1             {v0.s}[0], [x1], x3             // load one row of pix1
+        ld1             {v1.s}[0], [x2], x3             // load one row of pix2
+        uabdl           v30.8h, v0.8b, v1.8b            // abs(pix1 - pix2)
+        umull           v16.4s, v30.4h, v30.4h          // square the low 4 lanes
+
+        uaddlv          d17, v16.4s                     // sum the 4 lanes of this row
+        add             d18, d18, d17                   // fold into the running total
+
+        subs            w4, w4, #1
+        b.ne            2b
+        fmov            w0, s18
+
+        ret
+
+endfunc
--
2.34.1
More information about the ffmpeg-devel
mailing list