[FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation
Rémi Denis-Courmont
remi at remlab.net
Wed Jun 5 09:29:57 EEST 2024
On 4 June 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkblack at foxmail.com> wrote:
>From: Zhao Zhili <zhilizhao at tencent.com>
>
>Test on Apple M1:
>
>rgb24_to_uv_1080_c: 7.2
>rgb24_to_uv_1080_neon: 5.5
>rgb24_to_uv_1280_c: 8.2
>rgb24_to_uv_1280_neon: 6.2
>rgb24_to_uv_1920_c: 12.5
>rgb24_to_uv_1920_neon: 9.5
>
>rgb24_to_uv_half_540_c: 6.5
>rgb24_to_uv_half_540_neon: 3.0
>rgb24_to_uv_half_640_c: 7.5
>rgb24_to_uv_half_640_neon: 3.2
>rgb24_to_uv_half_960_c: 12.5
>rgb24_to_uv_half_960_neon: 6.0
>
>rgb24_to_y_1080_c: 4.5
>rgb24_to_y_1080_neon: 3.5
>rgb24_to_y_1280_c: 5.2
>rgb24_to_y_1280_neon: 4.2
>rgb24_to_y_1920_c: 8.0
>rgb24_to_y_1920_neon: 6.0
>
>Signed-off-by: Zhao Zhili <zhilizhao at tencent.com>
>---
> libswscale/aarch64/Makefile | 1 +
> libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c | 25 ++++
> 3 files changed, 255 insertions(+)
> create mode 100644 libswscale/aarch64/input.S
>
>diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
>index da1d909561..adfd90a1b6 100644
>--- a/libswscale/aarch64/Makefile
>+++ b/libswscale/aarch64/Makefile
>@@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
> aarch64/swscale_unscaled.o \
>
> NEON-OBJS += aarch64/hscale.o \
>+ aarch64/input.o \
> aarch64/output.o \
> aarch64/rgb2rgb_neon.o \
> aarch64/yuv2rgb_neon.o \
>diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>new file mode 100644
>index 0000000000..ee0d223c6e
>--- /dev/null
>+++ b/libswscale/aarch64/input.S
>@@ -0,0 +1,229 @@
>+/*
>+ * Copyright (c) 2024 Zhao Zhili <quinkblack at foxmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/aarch64/asm.S"
>+
>+.macro rgb24_to_yuv_load_rgb, src
>+ ld3 { v16.16b, v17.16b, v18.16b }, [\src]
>+ ushll v19.8h, v16.8b, #0 // v19: r
>+ ushll v20.8h, v17.8b, #0 // v20: g
>+ ushll v21.8h, v18.8b, #0 // v21: b
>+ ushll2 v22.8h, v16.16b, #0 // v22: r
>+ ushll2 v23.8h, v17.16b, #0 // v23: g
>+ ushll2 v24.8h, v18.16b, #0 // v24: b
>+.endm
>+
>+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
>+ mov \dst1\().16b, v6.16b // dst1 = const_offset
>+ mov \dst2\().16b, v6.16b // dst2 = const_offset
>+ smlal \dst1\().4s, \coef0\().4h, \r\().4h // dst1 += rx * r
>+ smlal2 \dst2\().4s, \coef0\().8h, \r\().8h // dst2 += rx * r
>+ smlal \dst1\().4s, \coef1\().4h, \g\().4h // dst1 += gx * g
>+ smlal2 \dst2\().4s, \coef1\().8h, \g\().8h // dst2 += gx * g
>+ smlal \dst1\().4s, \coef2\().4h, \b\().4h // dst1 += bx * b
>+ smlal2 \dst2\().4s, \coef2\().8h, \b\().8h // dst2 += bx * b
>+ sqshrn \dst\().4h, \dst1\().4s, \right_shift // dst_lower_half = dst1 >> right_shift
>+ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift
>+.endm
>+
>+function ff_rgb24ToY_neon, export=1
>+ cmp w4, #0 // check width > 0
>+ b.le 4f
>+
>+ ldp w10, w11, [x5], #8 // w10: ry, w11: gy
I don't think it affects anything on your out-of-order execution hardware, but you're using the result of this load immediately in the next instruction. The same applies below. This may hurt performance on less capable in-order CPUs.
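For instance, a rough, untested reordering that issues both loads before any of their results are consumed (assuming x5 is not needed afterwards, so the post-index can be dropped):

    ldp     w10, w11, [x5]      // w10: ry, w11: gy
    ldr     w12, [x5, #8]       // w12: by
    dup     v0.8h, w10
    dup     v1.8h, w11
    dup     v2.8h, w12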
>+ dup v0.8h, w10
>+ dup v1.8h, w11
>+ ldr w12, [x5] // w12: by
>+ dup v2.8h, w12
>+
>+ mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
>+ movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
>+ dup v6.4s, w9 // v6: const_offset
>+
>+ mov x2, #0 // w2: i
>+ and w3, w4, #0xFFFFFFF0 // w3 = width / 16 * 16
>+ cbz w3, 3f
>+1:
>+ rgb24_to_yuv_load_rgb x1
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>+ stp q16, q17, [x0], #32 // store to dst
>+
>+ add w2, w2, #16 // i += 16
>+ add x1, x1, #48 // src += 48
>+ cmp w2, w3 // i < (width / 16 * 16)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w13, [x1] // w13: r
>+ ldrb w14, [x1, #1] // w14: g
>+ ldrb w15, [x1, #2] // w15: b
You can reorder instructions a little to use post-index and eliminate the ADD, though that won't make much difference.
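Something like this (untested), loading the bytes in reverse order so that the last load can post-increment the source pointer and the separate `add x1, x1, #3` goes away:

    ldrb    w15, [x1, #2]       // w15: b
    ldrb    w14, [x1, #1]       // w14: g
    ldrb    w13, [x1], #3       // w13: r; src += 3 via post-index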
I don't get why the performance gain is so low; or is this an artefact of Apple CPUs?
>+
>+ smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
>+ smaddl x13, w14, w11, x13 // x13 += gy * g
>+ smaddl x13, w15, w12, x13 // x13 += by * b
>+ asr w13, w13, #9 // w13 >>= 9
>+ strh w13, [x0], #2 // store to dst
>+
>+ add w2, w2, #1 // i++
>+ add x1, x1, #3 // src += 3
>+3:
>+ cmp w2, w4 // i < width
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>+
>+.macro rgb24_load_uv_coeff half
>+ add x6, x6, #12
>+
>+ ldp w10, w11, [x6], #8 // w10: ru, w11: gu
>+ dup v0.8h, w10
>+ dup v1.8h, w11
>+
>+ ldp w12, w13, [x6], #8 // w12: bu, w13: rv
>+ dup v2.8h, w12
>+ dup v3.8h, w13
>+
>+ ldp w14, w15, [x6], #8 // w14: gv, w15: bv
>+ dup v4.8h, w14
>+ dup v5.8h, w15
>+
>+ .if \half
>+ mov w9, #512
>+ movk w9, #128, lsl #16 // w9: const_offset
>+ .else
>+ mov w9, #256
>+ movk w9, #64, lsl #16 // w9: const_offset
>+ .endif
>+ dup v6.4s, w9
>+.endm
>+
>+function ff_rgb24ToUV_half_neon, export=1
>+ cmp w5, #0 // check width > 0
>+ b.le 4f
>+
>+ rgb24_load_uv_coeff half=1
>+
>+ mov x9, #0 // x9: i
>+ and w7, w5, #0xFFFFFFF8 // w7 = width / 8 * 8
>+ cbz w7, 3f
>+1:
>+ ld3 { v16.16b, v17.16b, v18.16b }, [x3]
>+ uaddlp v19.8h, v16.16b // v19: r
>+ uaddlp v20.8h, v17.16b // v20: g
>+ uaddlp v21.8h, v18.16b // v21: b
>+
>+ rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>+ str q16, [x0], #16 // store dst_u
>+ rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>+ str q17, [x1], #16 // store dst_v
>+
>+ add w9, w9, #8 // i += 8
>+ add x3, x3, #48 // src += 48
>+ cmp w9, w7 // i < (width / 8 * 8)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w2, [x3] // w2: r1
>+ ldrb w4, [x3, #3] // w4: r2
>+ add w2, w2, w4 // w2 = r1 + r2
>+
>+ ldrb w4, [x3, #1] // w4: g1
>+ ldrb w7, [x3, #4] // w7: g2
>+ add w4, w4, w7 // w4 = g1 + g2
>+
>+ ldrb w7, [x3, #2] // w7: b1
>+ ldrb w8, [x3, #5] // w8: b2
>+ add w7, w7, w8 // w7 = b1 + b2
>+
>+ umov w8, v6.s[0] // dst_u = const_offset
>+ smaddl x8, w2, w10, x8 // dst_u += ru * r
>+ smaddl x8, w4, w11, x8 // dst_u += gu * g
>+ smaddl x8, w7, w12, x8 // dst_u += bu * b
>+ asr x8, x8, #10 // dst_u >>= 10
>+ strh w8, [x0], #2 // store dst_u
>+
>+ umov w8, v6.s[0] // dst_v = const_offset
>+ smaddl x8, w2, w13, x8 // dst_v += rv * r
>+ smaddl x8, w4, w14, x8 // dst_v += gv * g
>+ smaddl x8, w7, w15, x8 // dst_v += bv * b
>+ asr x8, x8, #10 // dst_v >>= 10
>+ strh w8, [x1], #2 // store dst_v
>+
>+ add w9, w9, #1 // i++
>+ add x3, x3, #6 // src += 6
>+3:
>+ cmp w9, w5
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>+
>+function ff_rgb24ToUV_neon, export=1
>+ cmp w5, #0 // check width > 0
>+ b.le 4f
>+
>+ rgb24_load_uv_coeff half=0
>+
>+ mov x2, #0 // w2: i
>+ and w4, w5, #0xFFFFFFF0 // w4: width / 16 * 16
>+ cbz w4, 3f
>+1:
>+ rgb24_to_yuv_load_rgb x3
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>+ stp q16, q17, [x0], #32 // store to dst_u
>+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
>+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
>+ stp q16, q17, [x1], #32 // store to dst_v
>+
>+ add w2, w2, #16 // i += 16
>+ add x3, x3, #48 // src += 48
>+ cmp w2, w4 // i < (width / 16 * 16)
>+ b.lt 1b
>+ b 3f
>+2:
>+ ldrb w16, [x3] // w16: r
>+ ldrb w17, [x3, #1] // w17: g
>+ ldrb w4, [x3, #2] // w4: b
>+
>+ umov w7, v6.s[0] // w7 = const_offset
>+
>+ smaddl x8, w16, w10, x7 // x8 = ru * r + const_offset
>+ smaddl x8, w17, w11, x8 // x8 += gu * g
>+ smaddl x8, w4, w12, x8 // x8 += bu * b
>+ asr w8, w8, #9 // w8 >>= 9
>+ strh w8, [x0], #2 // store to dst_u
>+
>+ smaddl x8, w16, w13, x7 // x8 = rv * r + const_offset
>+ smaddl x8, w17, w14, x8 // x8 += gv * g
>+ smaddl x8, w4, w15, x8 // x8 += bv * b
>+ asr w8, w8, #9 // w8 >>= 9
>+ strh w8, [x1], #2 // store to dst_v
>+
>+ add w2, w2, #1 // i++
>+ add x3, x3, #3 // src += 3
>+3:
>+ cmp w2, w5 // i < width
>+ b.lt 2b
>+4:
>+ ret
>+endfunc
>diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>index bbd9719a44..4c4ea39dc1 100644
>--- a/libswscale/aarch64/swscale.c
>+++ b/libswscale/aarch64/swscale.c
>@@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
> default: break; \
> }
>
>+void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
>+ const uint8_t *unused2, int width,
>+ uint32_t *rgb2yuv, void *opq);
>+
>+void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>+ const uint8_t *src1,
>+ const uint8_t *src2, int width, uint32_t *rgb2yuv,
>+ void *opq);
>+
>+void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
>+ const uint8_t *src1,
>+ const uint8_t *src2, int width, uint32_t *rgb2yuv,
>+ void *opq);
>+
> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
>@@ -212,5 +226,16 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> if (c->dstBpc == 8) {
> c->yuv2planeX = ff_yuv2planeX_8_neon;
> }
>+ switch (c->srcFormat) {
>+ case AV_PIX_FMT_RGB24:
>+ c->lumToYV12 = ff_rgb24ToY_neon;
>+ if (c->chrSrcHSubSample)
>+ c->chrToYV12 = ff_rgb24ToUV_half_neon;
>+ else
>+ c->chrToYV12 = ff_rgb24ToUV_neon;
>+ break;
>+ default:
>+ break;
>+ }
> }
> }