[FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
Rémi Denis-Courmont
remi at remlab.net
Wed Jun 5 09:29:57 EEST 2024
On 4 June 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack at foxmail.com> wrote:
>From: Zhao Zhili <zhilizhao at tencent.com>
>
>Test on Apple M1:
>
>rgb24_to_uv_1080_c: 7.2
>rgb24_to_uv_1080_neon: 5.5
>rgb24_to_uv_1280_c: 8.2
>rgb24_to_uv_1280_neon: 6.2
>rgb24_to_uv_1920_c: 12.5
>rgb24_to_uv_1920_neon: 9.5
>
>rgb24_to_uv_half_540_c: 6.5
>rgb24_to_uv_half_540_neon: 3.0
>rgb24_to_uv_half_640_c: 7.5
>rgb24_to_uv_half_640_neon: 3.2
>rgb24_to_uv_half_960_c: 12.5
>rgb24_to_uv_half_960_neon: 6.0
>
>rgb24_to_y_1080_c: 4.5
>rgb24_to_y_1080_neon: 3.5
>rgb24_to_y_1280_c: 5.2
>rgb24_to_y_1280_neon: 4.2
>rgb24_to_y_1920_c: 8.0
>rgb24_to_y_1920_neon: 6.0
>
>Signed-off-by: Zhao Zhili <zhilizhao at tencent.com>
>---
> libswscale/aarch64/Makefile | 1 +
> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c | 25 ++++
> 3 files changed, 255 insertions(+)
> create mode 100644 libswscale/aarch64/input.S
>
>diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>index da1d909561..adfd90a1b6 100644
>--- a/libswscale/aarch64/Makefile
>+++ b/libswscale/aarch64/Makefile
>@@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
> aarch64/swscale_unscaled.o \
>
> NEON-OBJS += aarch64/hscale.o \
>+ aarch64/input.o \
> aarch64/output.o \
> aarch64/rgb2rgb_neon.o \
> aarch64/yuv2rgb_neon.o \
>diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>new file mode 100644
>index 0000000000..ee0d223c6e
>--- /dev/null
>+++ b/libswscale/aarch64/input.S
>@@ -0,0 +1,229 @@
>+/*
>+ * Copyright (c) 2024 Zhao Zhili <quinkblack at foxmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/aarch64/asm.S"
>+
>+.macro rgb24_to_yuv_load_rgb, src
>+ ld3 { v16.16b, v17.16b, v18.16b }, [\src]
>+ ushll v19.8h, v16.8b, #0 // v19: r
>+ ushll v20.8h, v17.8b, #0 // v20: g
>+ ushll v21.8h, v18.8b, #0 // v21: b
>+ ushll2 v22.8h, v16.16b, #0 // v22: r
>+ ushll2 v23.8h, v17.16b, #0 // v23: g
>+ ushll2 v24.8h, v18.16b, #0 // v24: b
>+.endm
>+
>+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>+ mov \dst1\().16b, v6.16b // dst1 = const_offset
>+ mov \dst2\().16b, v6.16b // dst2 = const_offset
>+ smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r
>+ smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r
>+ smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g
>+ smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g
>+ smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b
>+ smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b
>+ sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift
>+ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
>+.endm
>+
>+function ff_rgb24ToY_neon, export=1
>+ cmp w4, #0 // check width > 0
>+ b.le 4f
>+
>+ ldp w10, w11, [x5], #8 // w10: ry, w11: gy
I don't think it affects anything on your out-of-order execution hardware, but you're using the result of this load immediately in the next instruction. The same applies below. This may hurt performance on less capable in-order CPUs.
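For instance, a rough, untested reordering that issues both loads before any of their results are consumed (assuming x5 is not needed afterwards, so the post-index can be dropped):

    ldp     w10, w11, [x5]      // w10: ry, w11: gy
    ldr     w12, [x5, #8]       // w12: by
    dup     v0.8h, w10
    dup     v1.8h, w11
    dup     v2.8h, w12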
>+ dup v0.8h, w10
>+ dup v1.8h, w11
>+ ldr w12, [x5] // w12: by
>+ dup v2.8h, w12
>+
>+ mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
>+ movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
>+ dup v6.4s, w9 // v6: const_offset
>+
>+ mov x2, #0 // w2: i
>+ and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
>+ cbz w3, 3f
>+1:
>+ rgb24_to_yuv_load_rgb x1
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>+ stp q16, q17, [x0], #32 // store to dst
>+
>+ add w2, w2, #16 // i += 16
>+ add x1, x1, #48 // src += 48
>+ cmp w2, w3 // i < (width / 16 * 16)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w13, [x1] // w13: r
>+ ldrb w14, [x1, #1] // w14: g
>+ ldrb w15, [x1, #2] // w15: b
You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
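Something like this (untested), loading the bytes in reverse order so that the last load can post-increment the source pointer and the separate `add x1, x1, #3` goes away:

    ldrb    w15, [x1, #2]       // w15: b
    ldrb    w14, [x1, #1]       // w14: g
    ldrb    w13, [x1], #3       // w13: r; src += 3 via post-index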
I don't get why the performance gain is so low; or is this an artefact of Apple CPUs?
>+
>+ smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
>+ smaddl x13, w14, w11, x13 // x13 += gy * g
>+ smaddl x13, w15, w12, x13 // x13 += by * b
>+ asr w13, w13, #9 // w13 >>= 9
>+ strh w13, [x0], #2 // store to dst
>+
>+ add w2, w2, #1 // i++
>+ add x1, x1, #3 // src += 3
>+3:
>+ cmp w2, w4 // i < width
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>+
>+.macro rgb24_load_uv_coeff half
>+ add x6, x6, #12
>+
>+ ldp w10, w11, [x6], #8 // w10: ru, w11: gu
>+ dup v0.8h, w10
>+ dup v1.8h, w11
>+
>+ ldp w12, w13, [x6], #8 // w12: bu, w13: rv
>+ dup v2.8h, w12
>+ dup v3.8h, w13
>+
>+ ldp w14, w15, [x6], #8 // w14: gv, w15: bv
>+ dup v4.8h, w14
>+ dup v5.8h, w15
>+
>+ .if \half
>+ mov w9, #512
>+ movk w9, #128, lsl #16 // w9: const_offset
>+ .else
>+ mov w9, #256
>+ movk w9, #64, lsl #16 // w9: const_offset
>+ .endif
>+ dup v6.4s, w9
>+.endm
>+
>+function ff_rgb24ToUV_half_neon, export=1
>+ cmp w5, #0 // check width > 0
>+ b.le 4f
>+
>+ rgb24_load_uv_coeff half=1
>+
>+ mov x9, #0 // x9: i
>+ and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8
>+ cbz w7, 3f
>+1:
>+ ld3 { v16.16b, v17.16b, v18.16b }, [x3]
>+ uaddlp v19.8h, v16.16b // v19: r
>+ uaddlp v20.8h, v17.16b // v20: g
>+ uaddlp v21.8h, v18.16b // v21: b
>+
>+ rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>+ str q16, [x0], #16 // store dst_u
>+ rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>+ str q17, [x1], #16 // store dst_v
>+
>+ add w9, w9, #8 // i += 8
>+ add x3, x3, #48 // src += 48
>+ cmp w9, w7 // i < (width / 8 * 8)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w2, [x3] // w2: r1
>+ ldrb w4, [x3, #3] // w4: r2
>+ add w2, w2, w4 // w2 = r1 + r2
>+
>+ ldrb w4, [x3, #1] // w4: g1
>+ ldrb w7, [x3, #4] // w7: g2
>+ add w4, w4, w7 // w4 = g1 + g2
>+
>+ ldrb w7, [x3, #2] // w7: b1
>+ ldrb w8, [x3, #5] // w8: b2
>+ add w7, w7, w8 // w7 = b1 + b2
>+
>+ umov w8, v6.s[0] // dst_u = const_offset
>+ smaddl x8, w2, w10, x8 // dst_u += ru * r
>+ smaddl x8, w4, w11, x8 // dst_u += gu * g
>+ smaddl x8, w7, w12, x8 // dst_u += bu * b
>+ asr x8, x8, #10 // dst_u >>= 10
>+ strh w8, [x0], #2 // store dst_u
>+
>+ umov w8, v6.s[0] // dst_v = const_offset
>+ smaddl x8, w2, w13, x8 // dst_v += rv * r
>+ smaddl x8, w4, w14, x8 // dst_v += gv * g
>+ smaddl x8, w7, w15, x8 // dst_v += bv * b
>+ asr x8, x8, #10 // dst_v >>= 10
>+ strh w8, [x1], #2 // store dst_v
>+
>+ add w9, w9, #1 // i++
>+ add x3, x3, #6 // src += 6
>+3:
>+ cmp w9, w5
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>+
>+function ff_rgb24ToUV_neon, export=1
>+ cmp w5, #0 // check width > 0
>+ b.le 4f
>+
>+ rgb24_load_uv_coeff half=0
>+
>+ mov x2, #0 // w2: i
>+ and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16
>+ cbz w4, 3f
>+1:
>+ rgb24_to_yuv_load_rgb x3
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>+ stp q16, q17, [x0], #32 // store to dst_u
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
>+ stp q16, q17, [x1], #32 // store to dst_v
>+
>+ add w2, w2, #16 // i += 16
>+ add x3, x3, #48 // src += 48
>+ cmp w2, w4 // i < (width / 16 * 16)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w16, [x3] // w16: r
>+ ldrb w17, [x3, #1] // w17: g
>+ ldrb w4, [x3, #2] // w4: b
>+
>+ umov w7, v6.s[0] // w7 = const_offset
>+
>+ smaddl x8, w16, w10, x7 // x8 = ru * r + const_offset
>+ smaddl x8, w17, w11, x8 // x8 += gu * g
>+ smaddl x8, w4, w12, x8 // x8 += bu * b
>+ asr w8, w8, #9 // w8 >>= 9
>+ strh w8, [x0], #2 // store to dst_u
>+
>+ smaddl x8, w16, w13, x7 // x8 = rv * r + const_offset
>+ smaddl x8, w17, w14, x8 // x8 += gv * g
>+ smaddl x8, w4, w15, x8 // x8 += bv * b
>+ asr w8, w8, #9 // w8 >>= 9
>+ strh w8, [x1], #2 // store to dst_v
>+
>+ add w2, w2, #1 // i++
>+ add x3, x3, #3 // src += 3
>+3:
>+ cmp w2, w5 // i < width
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>index bbd9719a44..4c4ea39dc1 100644
>--- a/libswscale/aarch64/swscale.c
>+++ b/libswscale/aarch64/swscale.c
>@@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
> default: break; \
> }
>
>+void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>+ const uint8_t *unused2, int width,
>+ uint32_t *rgb2yuv, void *opq);
>+
>+void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>+ const uint8_t *src1,
>+ const uint8_t *src2, int width, uint32_t *rgb2yuv,
>+ void *opq);
>+
>+void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>+ const uint8_t *src1,
>+ const uint8_t *src2, int width, uint32_t *rgb2yuv,
>+ void *opq);
>+
> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
>@@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> if (c->dstBpc == 8) {
> c->yuv2planeX = ff_yuv2planeX_8_neon;
> }
>+ switch (c->srcFormat) {
>+ case AV_PIX_FMT_RGB24:
>+ c->lumToYV12 = ff_rgb24ToY_neon;
>+ if (c->chrSrcHSubSample)
>+ c->chrToYV12 = ff_rgb24ToUV_half_neon;
>+ else
>+ c->chrToYV12 = ff_rgb24ToUV_neon;
>+ break;
>+ default:
>+ break;
>+ }
> }
> }