[FFmpeg-devel] [PATCH 3/3] aarch64/vvc: Add put_qpel_hv

Tue Sep 10 20:35:06 EEST 2024

From: Zhao Zhili <zhilizhao at tencent.com>

With Apple M1 (no i8mm):

put_luma_hv_8_4x4_c:                                     2.2 ( 1.00x)
put_luma_hv_8_4x4_neon:                                  0.8 ( 3.00x)
put_luma_hv_8_8x8_c:                                     7.0 ( 1.00x)
put_luma_hv_8_8x8_neon:                                  0.8 ( 9.33x)
put_luma_hv_8_16x16_c:                                  22.8 ( 1.00x)
put_luma_hv_8_16x16_neon:                                2.5 ( 9.10x)
put_luma_hv_8_32x32_c:                                  84.8 ( 1.00x)
put_luma_hv_8_32x32_neon:                                9.5 ( 8.92x)
put_luma_hv_8_64x64_c:                                 333.0 ( 1.00x)
put_luma_hv_8_64x64_neon:                               35.5 ( 9.38x)
put_luma_hv_8_128x128_c:                              1294.5 ( 1.00x)
put_luma_hv_8_128x128_neon:                            137.8 ( 9.40x)

With Pixel 8 Pro:

put_luma_hv_8_4x4_c:                                     5.0 ( 1.00x)
put_luma_hv_8_4x4_neon:                                  0.8 ( 6.67x)
put_luma_hv_8_4x4_i8mm:                                  0.2 (20.00x)
put_luma_hv_8_8x8_c:                                    13.2 ( 1.00x)
put_luma_hv_8_8x8_neon:                                  1.2 (10.60x)
put_luma_hv_8_8x8_i8mm:                                  1.2 (10.60x)
put_luma_hv_8_16x16_c:                                  44.2 ( 1.00x)
put_luma_hv_8_16x16_neon:                                4.5 ( 9.83x)
put_luma_hv_8_16x16_i8mm:                                4.2 (10.41x)
put_luma_hv_8_32x32_c:                                 160.8 ( 1.00x)
put_luma_hv_8_32x32_neon:                               17.5 ( 9.19x)
put_luma_hv_8_32x32_i8mm:                               16.0 (10.05x)
put_luma_hv_8_64x64_c:                                 611.2 ( 1.00x)
put_luma_hv_8_64x64_neon:                               68.0 ( 8.99x)
put_luma_hv_8_64x64_i8mm:                               62.2 ( 9.82x)
put_luma_hv_8_128x128_c:                              2384.8 ( 1.00x)
put_luma_hv_8_128x128_neon:                            268.8 ( 8.87x)
put_luma_hv_8_128x128_i8mm:                            245.8 ( 9.70x)
---
 libavcodec/aarch64/h26x/dsp.h       |   8 ++
 libavcodec/aarch64/h26x/qpel_neon.S | 140 ++++++++++++++++++++++++++++
 libavcodec/aarch64/vvc/dsp_init.c   |  14 +++
 3 files changed, 162 insertions(+)

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 881091f39a..c54906dde2 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -282,4 +282,12 @@ void ff_vvc_put_qpel_v8_8_neon(int16_t *dst, const uint8_t *_src,
                                ptrdiff_t _srcstride, int height,
                                const int8_t *hf, const int8_t *vf, int width);
 
+NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width),); /* empty suffix: presumably the ff_vvc_put_qpel_hv{4..128}_8_neon prototypes */
+
+NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width), _i8mm); /* _i8mm suffix: presumably the ..._neon_i8mm prototypes */
+
 #endif
diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S
index 671942109a..1b3da375ba 100644
--- a/libavcodec/aarch64/h26x/qpel_neon.S
+++ b/libavcodec/aarch64/h26x/qpel_neon.S
@@ -4142,9 +4142,15 @@ endfunc
 DISABLE_I8MM
 #endif
 
+function vvc_put_qpel_hv4_8_end_neon   // VVC entry to the 4-wide second pass; shares the HEVC body below
+        vvc_load_qpel_filterh x5        // filter setup from x5 (vf) through the VVC macro, defined elsewhere
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)  // x7 = offset between rows read from sp at label 1
+        b               1f              // label 1 sits in hevc_put_hevc_qpel_hv4_8_end_neon, which must follow directly
+endfunc
 
 function hevc_put_hevc_qpel_hv4_8_end_neon
         load_qpel_filterh x5, x4
+1:
         ldr             d16, [sp]
         ldr             d17, [sp, x7]
         add             sp, sp, x7, lsl #1
@@ -4196,9 +4202,16 @@ function hevc_put_hevc_qpel_hv6_8_end_neon
         ret
 endfunc
 
+function vvc_put_qpel_hv8_8_end_neon   // VVC entry to the 8-wide second pass; shares the HEVC body below
+        vvc_load_qpel_filterh x5        // filter setup from x5 (vf) through the VVC macro, defined elsewhere
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)  // replaces the HEVC value of 128 for x7 (row offset)
+        b               1f              // skip the HEVC setup; label 1 is in the routine that follows
+endfunc
+
 function hevc_put_hevc_qpel_hv8_8_end_neon
         mov             x7, #128
         load_qpel_filterh x5, x4
+1:
         ldr             q16, [sp]
         ldr             q17, [sp, x7]
         add             sp, sp, x7, lsl #1
@@ -4249,9 +4262,16 @@ function hevc_put_hevc_qpel_hv12_8_end_neon
         ret
 endfunc
 
+function vvc_put_qpel_hv16_8_end_neon  // VVC entry to the 16-wide second pass; shares the HEVC body below
+        vvc_load_qpel_filterh x5        // filter setup from x5 (vf) through the VVC macro, defined elsewhere
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)  // replaces the HEVC value of 128 for x7 (post-increment of the ld1 row loads)
+        b               1f              // skip the HEVC setup; label 1 is in the routine that follows
+endfunc
+
 function hevc_put_hevc_qpel_hv16_8_end_neon
         mov             x7, #128
         load_qpel_filterh x5, x4
+1:
         ld1             {v16.8h, v17.8h}, [sp], x7
         ld1             {v18.8h, v19.8h}, [sp], x7
         ld1             {v20.8h, v21.8h}, [sp], x7
@@ -4274,6 +4294,12 @@ function hevc_put_hevc_qpel_hv16_8_end_neon
         ret
 endfunc
 
+function vvc_put_qpel_hv32_8_end_neon  // VVC entry to the 32-wide second pass; shares the HEVC body below
+        vvc_load_qpel_filterh x5        // filter setup from x5 (vf) through the VVC macro, defined elsewhere
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)  // replaces the HEVC value of 128 for x7
+        b               0f              // NOTE(review): label 0 is expected after the HEVC setup below; not visible in this hunk
+endfunc
+
 function hevc_put_hevc_qpel_hv32_8_end_neon
         mov             x7, #128
         load_qpel_filterh x5, x4
@@ -4327,6 +4353,25 @@ function ff_hevc_put_hevc_qpel_hv4_8_\suffix, export=1
         b               hevc_put_hevc_qpel_hv4_8_end_neon
 endfunc
 
+function ff_vvc_put_qpel_hv4_8_\suffix, export=1   // 4-wide hv qpel: h pass into a stack buffer, then tail call to the end routine
+        add             w10, w3, #8                 // w10 = height + 8
+        lsl             x10, x10, #8                // x10 = (height + 8) * 256 bytes
+        mov             x14, sp                     // x14 = sp on entry
+        sub             sp, sp, x10         // tmp_array: x10 bytes carved out below the entry sp
+        stp             x5,  x30, [sp, #-48]!       // 48-byte save area under tmp_array: vf and lr
+        stp             x0,  x3,  [sp, #16]         // dst and height
+        str             x14,      [sp, #32]         // entry sp
+        add             x0, sp, #48                 // h pass destination = start of tmp_array
+        sub             x1, x1, x2, lsl #1          // src -= 2 * srcstride
+        add             x3, x3, #7                  // h pass produces height + 7 rows
+        sub             x1, x1, x2                  // src -= srcstride, 3 rows up in total
+        bl              X(ff_vvc_put_qpel_h4_8_\suffix)
+        ldr             x14,      [sp, #32]         // reload entry sp after the call
+        ldp             x0,  x3,  [sp, #16]         // reload dst and height
+        ldp             x5,  x30, [sp], #48         // reload vf and lr; sp is back at tmp_array
+        b               vvc_put_qpel_hv4_8_end_neon // tail call; lr still points at our caller
+endfunc
+
 function ff_hevc_put_hevc_qpel_hv6_8_\suffix, export=1
         add             w10, w3, #8
         mov             x7, #128
@@ -4366,6 +4411,25 @@ function ff_hevc_put_hevc_qpel_hv8_8_\suffix, export=1
         b               hevc_put_hevc_qpel_hv8_8_end_neon
 endfunc
 
+function ff_vvc_put_qpel_hv8_8_\suffix, export=1   // 8-wide hv qpel: h pass into a stack buffer, then tail call to the end routine
+        add             w10, w3, #8                 // w10 = height + 8
+        lsl             x10, x10, #8                // x10 = (height + 8) * 256 bytes
+        sub             x1, x1, x2, lsl #1          // src -= 2 * srcstride
+        mov             x14, sp                     // x14 = sp on entry
+        sub             sp, sp, x10         // tmp_array: x10 bytes carved out below the entry sp
+        stp             x5,  x30, [sp, #-48]!       // 48-byte save area under tmp_array: vf and lr
+        stp             x0,  x3,  [sp, #16]         // dst and height
+        str             x14,      [sp, #32]         // entry sp
+        add             x0, sp, #48                 // h pass destination = start of tmp_array
+        add             x3, x3, #7                  // h pass produces height + 7 rows
+        sub             x1, x1, x2                  // src -= srcstride, 3 rows up in total
+        bl              X(ff_vvc_put_qpel_h8_8_\suffix)
+        ldr             x14,      [sp, #32]         // reload entry sp after the call
+        ldp             x0,  x3,  [sp, #16]         // reload dst and height
+        ldp             x5,  x30, [sp], #48         // reload vf and lr; sp is back at tmp_array
+        b               vvc_put_qpel_hv8_8_end_neon // tail call; lr still points at our caller
+endfunc
+
 function ff_hevc_put_hevc_qpel_hv12_8_\suffix, export=1
         add             w10, w3, #8
         lsl             x10, x10, #7
@@ -4405,6 +4469,25 @@ function ff_hevc_put_hevc_qpel_hv16_8_\suffix, export=1
         b               hevc_put_hevc_qpel_hv16_8_end_neon
 endfunc
 
+function ff_vvc_put_qpel_hv16_8_\suffix, export=1  // 16-wide hv qpel: h pass into a stack buffer, then tail call to the end routine
+        add             w10, w3, #8                 // w10 = height + 8
+        lsl             x10, x10, #8                // x10 = (height + 8) * 256 bytes
+        sub             x1, x1, x2, lsl #1          // src -= 2 * srcstride
+        mov             x14, sp                     // x14 = sp on entry
+        sub             sp, sp, x10         // tmp_array: x10 bytes carved out below the entry sp
+        stp             x5,  x30, [sp, #-48]!       // 48-byte save area under tmp_array: vf and lr
+        stp             x0,  x3,  [sp, #16]         // dst and height
+        str             x14,      [sp, #32]         // entry sp
+        add             x3, x3, #7                  // h pass produces height + 7 rows
+        add             x0, sp, #48                 // h pass destination = start of tmp_array
+        sub             x1, x1, x2                  // src -= srcstride, 3 rows up in total
+        bl              X(ff_vvc_put_qpel_h16_8_\suffix)
+        ldr             x14,      [sp, #32]         // reload entry sp after the call
+        ldp             x0,  x3,  [sp, #16]         // reload dst and height
+        ldp             x5,  x30, [sp], #48         // reload vf and lr; sp is back at tmp_array
+        b               vvc_put_qpel_hv16_8_end_neon // tail call; lr still points at our caller
+endfunc
+
 function ff_hevc_put_hevc_qpel_hv24_8_\suffix, export=1
         stp             x4, x5, [sp, #-64]!
         stp             x2, x3, [sp, #16]
@@ -4441,6 +4524,26 @@ function ff_hevc_put_hevc_qpel_hv32_8_\suffix, export=1
         b               hevc_put_hevc_qpel_hv32_8_end_neon
 endfunc
 
+function ff_vvc_put_qpel_hv32_8_\suffix, export=1  // 32-wide hv qpel: h pass into a stack buffer, then tail call to the end routine
+        add             w10, w3, #8                 // w10 = height + 8
+        sub             x1, x1, x2, lsl #1          // src -= 2 * srcstride
+        lsl             x10, x10, #8                // x10 = (height + 8) * 256 bytes
+        sub             x1, x1, x2                  // src -= srcstride, 3 rows up in total
+        mov             x14, sp                     // x14 = sp on entry
+        sub             sp, sp, x10         // tmp_array: x10 bytes carved out below the entry sp
+        stp             x5,  x30, [sp, #-48]!       // 48-byte save area under tmp_array: vf and lr
+        stp             x0,  x3,  [sp, #16]         // dst and height
+        str             x14,      [sp, #32]         // entry sp
+        add             x3, x3, #7                  // h pass produces height + 7 rows
+        add             x0, sp, #48                 // h pass destination = start of tmp_array
+        mov             w6, #32                     // width = 32, overriding whatever the caller passed in w6
+        bl              X(ff_vvc_put_qpel_h32_8_\suffix)
+        ldr             x14,      [sp, #32]         // reload entry sp after the call
+        ldp             x0,  x3,  [sp, #16]         // reload dst and height
+        ldp             x5,  x30, [sp], #48         // reload vf and lr; sp is back at tmp_array
+        b               vvc_put_qpel_hv32_8_end_neon // tail call; lr still points at our caller
+endfunc
+
 function ff_hevc_put_hevc_qpel_hv48_8_\suffix, export=1
         stp             x4, x5, [sp, #-64]!
         stp             x2, x3, [sp, #16]
@@ -4474,6 +4577,43 @@ function ff_hevc_put_hevc_qpel_hv64_8_\suffix, export=1
         ldr             x30, [sp], #16
         ret
 endfunc
+
+function ff_vvc_put_qpel_hv64_8_\suffix, export=1  // 64-wide hv qpel done as two 32-wide calls
+        stp             x4, x5, [sp, #-64]!         // 64-byte frame: hf and vf
+        stp             x2, x3, [sp, #16]           // srcstride and height
+        stp             x0, x1, [sp, #32]           // dst and src
+        str             x30, [sp, #48]              // lr
+        mov             x6, #32                     // NOTE(review): hv32 sets w6 = 32 itself before using it
+        bl              X(ff_vvc_put_qpel_hv32_8_\suffix)   // columns 0..31
+        ldp             x0, x1, [sp, #32]           // reload dst and src
+        ldp             x2, x3, [sp, #16]           // reload srcstride and height
+        ldp             x4, x5, [sp], #48           // reload hf and vf; sp now points at the saved lr
+        add             x1, x1, #32                 // src += 32 bytes
+        add             x0, x0, #64                 // dst += 64 bytes (32 int16_t)
+        mov             x6, #32
+        bl              X(ff_vvc_put_qpel_hv32_8_\suffix)   // columns 32..63
+        ldr             x30, [sp], #16              // restore lr and drop the last 16 bytes of the frame
+        ret
+endfunc
+
+function ff_vvc_put_qpel_hv128_8_\suffix, export=1 // 128-wide hv qpel done as two 64-wide calls
+        stp             x4, x5, [sp, #-64]!         // 64-byte frame: hf and vf
+        stp             x2, x3, [sp, #16]           // srcstride and height
+        stp             x0, x1, [sp, #32]           // dst and src
+        str             x30, [sp, #48]              // lr
+        mov             x6, #64                     // NOTE(review): hv64 overwrites x6 with 32 before calling hv32
+        bl              X(ff_vvc_put_qpel_hv64_8_\suffix)   // columns 0..63
+        ldp             x0, x1, [sp, #32]           // reload dst and src
+        ldp             x2, x3, [sp, #16]           // reload srcstride and height
+        ldp             x4, x5, [sp], #48           // reload hf and vf; sp now points at the saved lr
+        add             x1, x1, #64                 // src += 64 bytes
+        add             x0, x0, #128                // dst += 128 bytes (64 int16_t)
+        mov             x6, #64
+        bl              X(ff_vvc_put_qpel_hv64_8_\suffix)   // columns 64..127
+        ldr             x30, [sp], #16              // restore lr and drop the last 16 bytes of the frame
+        ret
+endfunc
+
 .endm
 
 qpel_hv neon
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index ba3a49aa1a..934d918ffd 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -67,6 +67,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put[0][5][1][0] =
         c->inter.put[0][6][1][0] = ff_vvc_put_qpel_v8_8_neon;
 
+        c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon;
+        c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon;
+        c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon;
+        c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon;
+        c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon;
+        c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon;
+
         c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
         c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
         c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
@@ -103,6 +110,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[0][4][0][1] = ff_vvc_put_qpel_h32_8_neon_i8mm;
             c->inter.put[0][5][0][1] = ff_vvc_put_qpel_h64_8_neon_i8mm;
             c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h128_8_neon_i8mm;
+
+            c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon_i8mm;
+            c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon_i8mm;
+            c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon_i8mm;
+            c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon_i8mm;
+            c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon_i8mm;
+            c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon_i8mm;
         }
     } else if (bd == 10) {
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
-- 
2.42.0