[FFmpeg-devel] [PATCH] swscale/aarch64/rgb2rgb: add deinterleaveBytes neon implementation
Ramiro Polla
ramiro.polla at gmail.com
Fri Aug 30 21:56:55 EEST 2024
                                     A55                A76
deinterleave_bytes_c:              70342.0            34497.5
deinterleave_bytes_neon:           21594.5 ( 3.26x)    5535.2 ( 6.23x)
deinterleave_bytes_aligned_c:      71340.8            34651.2
deinterleave_bytes_aligned_neon:    8616.8 ( 8.28x)    3996.2 ( 8.67x)
---
libswscale/aarch64/rgb2rgb.c | 4 ++
libswscale/aarch64/rgb2rgb_neon.S | 59 +++++++++++++++++++++++
tests/checkasm/sw_rgb.c | 77 +++++++++++++++++++++++++++++++
3 files changed, 140 insertions(+)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index a9bf6ff9e0..31db23bff4 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -30,6 +30,9 @@
void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
+void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
+ int width, int height, int srcStride,
+ int dst1Stride, int dst2Stride);
av_cold void rgb2rgb_init_aarch64(void)
{
@@ -37,5 +40,6 @@ av_cold void rgb2rgb_init_aarch64(void)
if (have_neon(cpu_flags)) {
interleaveBytes = ff_interleave_bytes_neon;
+ deinterleaveBytes = ff_deinterleave_bytes_neon;
}
}
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index d81110ec57..2e4f2fb766 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -77,3 +77,62 @@ function ff_interleave_bytes_neon, export=1
0:
ret
endfunc
+
+// void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
+// int width, int height, int srcStride,
+// int dst1Stride, int dst2Stride);
+function ff_deinterleave_bytes_neon, export=1
+ sub w5, w5, w3, lsl #1 // w5 = srcStride - 2*width: gap from end of one src row to the next
+ sub w6, w6, w3 // w6 = dst1Stride - width
+ sub w7, w7, w3 // w7 = dst2Stride - width
+1: // start of one row
+ ands w8, w3, #0xfffffff0 // w8 = width & ~15: output bytes handled 16 at a time
+ b.eq 3f // width < 16: only the tail code is needed
+2:
+ ld2 {v0.16b, v1.16b}, [x0], #32 // read 32 src bytes: even bytes -> v0, odd bytes -> v1
+ subs w8, w8, #16 // sets the flags consumed by b.gt below
+ st1 {v0.16b}, [x1], #16 // 16 bytes to dst1
+ st1 {v1.16b}, [x2], #16 // 16 bytes to dst2
+ b.gt 2b // the stores do not touch the flags set by subs
+
+ tst w3, #15
+ b.eq 9f // width is a multiple of 16: row complete
+
+3: // tail: the low 4 bits of width select 8, 4 and 1..3 leftover pairs
+ tst w3, #8
+ b.eq 4f
+ ld2 {v0.8b, v1.8b}, [x0], #16 // 8 pairs: even bytes -> v0, odd bytes -> v1
+ st1 {v0.8b}, [x1], #8
+ st1 {v1.8b}, [x2], #8
+4:
+ tst w3, #4
+ b.eq 5f
+
+ ld1 {v0.8b}, [x0], #8 // 4 pairs, still interleaved
+ shrn v1.8b, v0.8h, #8 // high byte of each 16-bit lane (the odd src bytes)
+ xtn v0.8b, v0.8h // low byte of each 16-bit lane (the even src bytes)
+ st1 {v0.s}[0], [x1], #4 // only the low 4 bytes are valid results
+ st1 {v1.s}[0], [x2], #4
+
+5:
+ ands w8, w3, #3 // w8 = remaining pairs (0..3), done one at a time
+ b.eq 9f
+6:
+ ldrh w9, [x0], #2 // one pair as a halfword; presumably relies on little-endian data order
+ subs w8, w8, #1
+ ubfx w10, w9, #8, #8 // bits 8..15 of the halfword
+ strb w9, [x1], #1 // low byte -> dst1
+ strb w10, [x2], #1 // bits 8..15 -> dst2
+ b.gt 6b
+
+9: // end of row
+ subs w4, w4, #1 // one row fewer to go
+ b.eq 0f
+ add x0, x0, w5, sxtw // strides are sign-extended before being added
+ add x1, x1, w6, sxtw
+ add x2, x2, w7, sxtw
+ b 1b
+
+0:
+ ret
+endfunc
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index f278454d3d..987841a54f 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -182,6 +182,80 @@ static void check_interleave_bytes(void)
}
}
+static void check_deinterleave_bytes(void)
+{
+    LOCAL_ALIGNED_16(uint8_t, src_buf, [2*MAX_STRIDE*MAX_HEIGHT+2]);
+    LOCAL_ALIGNED_16(uint8_t, dst0_u_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, dst0_v_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, dst1_u_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, dst1_v_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    // Intentionally using unaligned buffers (the extra +2/+1 bytes above leave
+    // room for the offsets), as this function has no alignment requirements.
+    uint8_t *src = src_buf + 2;
+    uint8_t *dst0_u = dst0_u_buf + 1;
+    uint8_t *dst0_v = dst0_v_buf + 1;
+    uint8_t *dst1_u = dst1_u_buf + 1;
+    uint8_t *dst1_v = dst1_v_buf + 1;
+
+    declare_func(void, const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
+                 int width, int height, int srcStride,
+                 int dst1Stride, int dst2Stride);
+
+    randomize_buffers(src, 2*MAX_STRIDE*MAX_HEIGHT); // src = src_buf + 2: do not run past src_buf
+
+    if (check_func(deinterleaveBytes, "deinterleave_bytes")) {
+        for (int i = 0; i <= 16; i++) {
+            // Try all widths [1,16] (i > 0), and one random width (i == 0).
+
+            int w = i > 0 ? i : (1 + (rnd() % (MAX_STRIDE-2)));
+            int h = 1 + (rnd() % (MAX_HEIGHT-2));
+
+            int src_offset = 0, src_stride = 2 * MAX_STRIDE;
+            int dst_u_offset = 0, dst_u_stride = MAX_STRIDE;
+            int dst_v_offset = 0, dst_v_stride = MAX_STRIDE;
+
+            memset(dst0_u, 0, MAX_STRIDE * MAX_HEIGHT);
+            memset(dst0_v, 0, MAX_STRIDE * MAX_HEIGHT);
+            memset(dst1_u, 0, MAX_STRIDE * MAX_HEIGHT);
+            memset(dst1_v, 0, MAX_STRIDE * MAX_HEIGHT);
+
+            // Try different combinations of negative strides (low 3 bits of i)
+            if (i & 1) {
+                src_offset = (h-1)*src_stride;
+                src_stride = -src_stride;
+            }
+            if (i & 2) {
+                dst_u_offset = (h-1)*dst_u_stride;
+                dst_u_stride = -dst_u_stride;
+            }
+            if (i & 4) {
+                dst_v_offset = (h-1)*dst_v_stride;
+                dst_v_stride = -dst_v_stride;
+            }
+
+            call_ref(src + src_offset, dst0_u + dst_u_offset, dst0_v + dst_v_offset,
+                     w, h, src_stride, dst_u_stride, dst_v_stride);
+            call_new(src + src_offset, dst1_u + dst_u_offset, dst1_v + dst_v_offset,
+                     w, h, src_stride, dst_u_stride, dst_v_stride);
+            // Check a one pixel-pair edge around the destination area,
+            // to catch overwrites past the end.
+            checkasm_check(uint8_t, dst0_u, MAX_STRIDE, dst1_u, MAX_STRIDE,
+                           w + 1, h + 1, "dst_u");
+            checkasm_check(uint8_t, dst0_v, MAX_STRIDE, dst1_v, MAX_STRIDE,
+                           w + 1, h + 1, "dst_v");
+        }
+
+        bench_new(src, dst1_u, dst1_v, 127, MAX_HEIGHT,
+                  2*MAX_STRIDE, MAX_STRIDE, MAX_STRIDE);
+    }
+    if (check_func(deinterleaveBytes, "deinterleave_bytes_aligned")) {
+        // Bench the function in a more typical case, with aligned
+        // buffers and widths.
+        bench_new(src_buf, dst1_u_buf, dst1_v_buf, 128, MAX_HEIGHT,
+                  2*MAX_STRIDE, MAX_STRIDE, MAX_STRIDE);
+    }
+}
+
#define MAX_LINE_SIZE 1920
static const int input_sizes[] = {8, 128, 1080, MAX_LINE_SIZE};
static const enum AVPixelFormat rgb_formats[] = {
@@ -315,6 +389,9 @@ void checkasm_check_sw_rgb(void)
check_interleave_bytes();
report("interleave_bytes");
+ check_deinterleave_bytes();
+ report("deinterleave_bytes");
+
ctx = sws_getContext(MAX_LINE_SIZE, MAX_LINE_SIZE, AV_PIX_FMT_RGB24,
MAX_LINE_SIZE, MAX_LINE_SIZE, AV_PIX_FMT_YUV420P,
SWS_ACCURATE_RND | SWS_BITEXACT, NULL, NULL, NULL);
--
2.30.2
More information about the ffmpeg-devel
mailing list