[FFmpeg-devel] [PATCH v2 13/14] aarch64/vvc: Add put_epel_hv

Wed Sep 11 21:06:17 EEST 2024

From: Zhao Zhili <zhilizhao at tencent.com>

On Apple M1:

put_chroma_hv_8_4x4_c:                                   1.7 ( 1.00x)
put_chroma_hv_8_4x4_neon:                                0.2 ( 7.67x)
put_chroma_hv_8_8x8_c:                                   5.5 ( 1.00x)
put_chroma_hv_8_8x8_neon:                                0.5 (11.53x)
put_chroma_hv_8_16x16_c:                                18.5 ( 1.00x)
put_chroma_hv_8_16x16_neon:                              1.5 (12.53x)
put_chroma_hv_8_32x32_c:                                72.5 ( 1.00x)
put_chroma_hv_8_32x32_neon:                              4.7 (15.34x)
put_chroma_hv_8_64x64_c:                               274.0 ( 1.00x)
put_chroma_hv_8_64x64_neon:                             18.5 (14.83x)
put_chroma_hv_8_128x128_c:                            1058.7 ( 1.00x)
put_chroma_hv_8_128x128_neon:                           75.2 (14.07x)

On Android Pixel 8 Pro:

put_chroma_hv_8_4x4_c:                                   1.2 ( 1.00x)
put_chroma_hv_8_4x4_neon:                                0.0 ( 0.00x)
put_chroma_hv_8_4x4_i8mm:                                0.2 ( 5.00x)
put_chroma_hv_8_8x8_c:                                   4.0 ( 1.00x)
put_chroma_hv_8_8x8_neon:                                0.5 ( 8.00x)
put_chroma_hv_8_8x8_i8mm:                                0.5 ( 8.00x)
put_chroma_hv_8_16x16_c:                                15.2 ( 1.00x)
put_chroma_hv_8_16x16_neon:                              2.5 ( 6.10x)
put_chroma_hv_8_16x16_i8mm:                              2.2 ( 6.78x)
put_chroma_hv_8_32x32_c:                                61.0 ( 1.00x)
put_chroma_hv_8_32x32_neon:                              9.8 ( 6.26x)
put_chroma_hv_8_32x32_i8mm:                              8.5 ( 7.18x)
put_chroma_hv_8_64x64_c:                               229.5 ( 1.00x)
put_chroma_hv_8_64x64_neon:                             38.5 ( 5.96x)
put_chroma_hv_8_64x64_i8mm:                             34.0 ( 6.75x)
put_chroma_hv_8_128x128_c:                             919.8 ( 1.00x)
put_chroma_hv_8_128x128_neon:                          154.5 ( 5.95x)
put_chroma_hv_8_128x128_i8mm:                          140.0 ( 6.57x)
---
 libavcodec/aarch64/h26x/dsp.h       |   8 ++
 libavcodec/aarch64/h26x/epel_neon.S | 125 ++++++++++++++++++++++++++++
 libavcodec/aarch64/vvc/dsp_init.c   |  14 ++++
 3 files changed, 147 insertions(+)

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 90a42d7108..0fefb4d70f 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -297,4 +297,12 @@ NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
         const uint8_t *src, ptrdiff_t srcstride, int height,
         const int8_t *hf, const int8_t *vf, int width), _i8mm);
 
+NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width),);
+
+NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width), _i8mm);
+
 #endif
diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
index cad8f2a5f4..e44a448b1f 100644
--- a/libavcodec/aarch64/h26x/epel_neon.S
+++ b/libavcodec/aarch64/h26x/epel_neon.S
@@ -72,6 +72,11 @@ endconst
         sxtl            v0.8h, v0.8b
 .endm
 
+.macro vvc_load_epel_filterh freg
+        ld1             {v0.8b}, [\freg]                // read 8 bytes of filter coefficients from the address held in the freg argument
+        sxtl            v0.8h, v0.8b                    // sign-extend each byte to 16 bits; coefficients end up in v0.8h
+.endm
+
 .macro calc_epelh dst, src0, src1, src2, src3
         smull           \dst\().4s, \src0\().4h, v0.h[0]
         smlal           \dst\().4s, \src1\().4h, v0.h[1]
@@ -2299,10 +2304,16 @@ endfunc
 DISABLE_I8MM
 #endif
 
+function vvc_put_epel_hv4_8_end_neon
+        vvc_load_epel_filterh x5                        // VVC entry to the shared hv4 tail: v0.8h = coefficients read from [x5]
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)     // x10 = byte step between the rows the shared code loads from sp
+        b               0f                              // join the code at label 0 in hevc_put_hevc_epel_hv4_8_end_neon
+endfunc
 
 function hevc_put_hevc_epel_hv4_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
         ldr             d16, [sp]
         ldr             d17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2339,9 +2350,16 @@ function hevc_put_hevc_epel_hv6_8_end_neon
 2:      ret
 endfunc
 
+function vvc_put_epel_hv8_8_end_neon
+        vvc_load_epel_filterh x5                        // VVC entry to the shared hv8 tail: v0.8h = coefficients read from [x5]
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)     // x10 = byte step between the rows the shared code loads from sp
+        b               0f                              // join the code at label 0 in hevc_put_hevc_epel_hv8_8_end_neon
+endfunc
+
 function hevc_put_hevc_epel_hv8_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
         ldr             q16, [sp]
         ldr             q17, [sp, x10]
         add             sp, sp, x10, lsl #1
@@ -2379,9 +2397,16 @@ function hevc_put_hevc_epel_hv12_8_end_neon
 2:      ret
 endfunc
 
+function vvc_put_epel_hv16_8_end_neon
+        vvc_load_epel_filterh x5                        // VVC entry to the shared hv16 tail: v0.8h = coefficients read from [x5]
+        mov             x10, #(VVC_MAX_PB_SIZE * 2)     // x10 = post-increment applied to sp after each row load in the shared code
+        b               0f                              // join the code at label 0 in hevc_put_hevc_epel_hv16_8_end_neon
+endfunc
+
 function hevc_put_hevc_epel_hv16_8_end_neon
         load_epel_filterh x5, x4
         mov             x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
         ld1             {v16.8h, v17.8h}, [sp], x10
         ld1             {v18.8h, v19.8h}, [sp], x10
         ld1             {v20.8h, v21.8h}, [sp], x10
@@ -2437,6 +2462,21 @@ function ff_hevc_put_hevc_epel_hv4_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv4_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv4_8_\suffix, export=1
+        add             w10, w3, #3                     // args per dsp.h prototype: x0 dst, x1 src, x2 srcstride, w3 height, x4 hf, x5 vf, x6 width; w10 = height + 3
+        lsl             x10, x10, #8                    // x10 = (height + 3) * 256 bytes
+        sub             sp, sp, x10                     // tmp_array: reserve that many bytes on the stack
+        stp             x5, x30, [sp, #-32]!            // push 32 more bytes; keep x5 (read again by the end routine) and the return address
+        stp             x0, x3, [sp, #16]               // keep the caller's x0 and x3 across the call
+        add             x0, sp, #32                     // x0 = start of tmp_array, handed to the h4 routine as its first argument
+        sub             x1, x1, x2                      // x1 = x1 - x2, i.e. start one srcstride before src
+        add             w3, w3, #3                      // request height + 3 rows from the h4 routine
+        bl              X(ff_vvc_put_epel_h4_8_\suffix)
+        ldp             x0, x3, [sp, #16]               // restore the caller's x0 and x3
+        ldp             x5, x30, [sp], #32              // restore x5 and x30; sp now points at tmp_array
+        b               vvc_put_epel_hv4_8_end_neon     // tail-branch: the end routine reads tmp_array from sp and advances sp itself
+endfunc
+
 function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2467,6 +2507,21 @@ function ff_hevc_put_hevc_epel_hv8_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv8_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv8_8_\suffix, export=1
+        add             w10, w3, #3                     // args per dsp.h prototype: x0 dst, x1 src, x2 srcstride, w3 height, x4 hf, x5 vf, x6 width; w10 = height + 3
+        lsl             x10, x10, #8                    // x10 = (height + 3) * 256 bytes
+        sub             sp, sp, x10                     // tmp_array: reserve that many bytes on the stack
+        stp             x5, x30, [sp, #-32]!            // push 32 more bytes; keep x5 (read again by the end routine) and the return address
+        stp             x0, x3, [sp, #16]               // keep the caller's x0 and x3 across the call
+        add             x0, sp, #32                     // x0 = start of tmp_array, handed to the h8 routine as its first argument
+        sub             x1, x1, x2                      // x1 = x1 - x2, i.e. start one srcstride before src
+        add             w3, w3, #3                      // request height + 3 rows from the h8 routine
+        bl              X(ff_vvc_put_epel_h8_8_\suffix)
+        ldp             x0, x3, [sp, #16]               // restore the caller's x0 and x3
+        ldp             x5, x30, [sp], #32              // restore x5 and x30; sp now points at tmp_array
+        b               vvc_put_epel_hv8_8_end_neon     // tail-branch: the end routine reads tmp_array from sp and advances sp itself
+endfunc
+
 function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2497,6 +2552,21 @@ function ff_hevc_put_hevc_epel_hv16_8_\suffix, export=1
         b               hevc_put_hevc_epel_hv16_8_end_neon
 endfunc
 
+function ff_vvc_put_epel_hv16_8_\suffix, export=1
+        add             w10, w3, #3                     // args per dsp.h prototype: x0 dst, x1 src, x2 srcstride, w3 height, x4 hf, x5 vf, x6 width; w10 = height + 3
+        lsl             x10, x10, #8                    // x10 = (height + 3) * 256 bytes
+        sub             sp, sp, x10                     // tmp_array: reserve that many bytes on the stack
+        stp             x5, x30, [sp, #-32]!            // push 32 more bytes; keep x5 (read again by the end routine) and the return address
+        stp             x0, x3, [sp, #16]               // keep the caller's x0 and x3 across the call
+        add             x0, sp, #32                     // x0 = start of tmp_array, handed to the h16 routine as its first argument
+        sub             x1, x1, x2                      // x1 = x1 - x2, i.e. start one srcstride before src
+        add             w3, w3, #3                      // request height + 3 rows from the h16 routine
+        bl              X(ff_vvc_put_epel_h16_8_\suffix)
+        ldp             x0, x3, [sp, #16]               // restore the caller's x0 and x3
+        ldp             x5, x30, [sp], #32              // restore x5 and x30; sp now points at tmp_array
+        b               vvc_put_epel_hv16_8_end_neon    // tail-branch: the end routine reads tmp_array from sp and advances sp itself
+endfunc
+
 function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1
         add             w10, w3, #3
         lsl             x10, x10, #7
@@ -2530,6 +2600,24 @@ function ff_hevc_put_hevc_epel_hv32_8_\suffix, export=1
         ret
 endfunc
 
+function ff_vvc_put_epel_hv32_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!             // width 32 = two hv16 calls; push a 64-byte frame, x4/x5 at [sp]
+        stp             x2, x3, [sp, #16]               // x2/x3 at [sp, #16]
+        stp             x0, x1, [sp, #32]               // x0/x1 at [sp, #32]
+        str             x30, [sp, #48]                  // return address at [sp, #48]
+        mov             x6, #16                         // x6 (width argument in the dsp.h prototype) = 16
+        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
+        ldp             x0, x1, [sp, #32]               // reload the original arguments for the second call
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48               // also drops 48 bytes, leaving the saved x30 at [sp]
+        add             x0, x0, #32                     // x0 += 32 bytes (16 int16_t outputs, per the prototype's dst type)
+        add             x1, x1, #16                     // x1 += 16 bytes (16 uint8_t inputs, per the prototype's src type)
+        mov             x6, #16                         // x6 = 16 again for the second call
+        bl              X(ff_vvc_put_epel_hv16_8_\suffix)
+        ldr             x30, [sp], #16                  // restore the return address and release the last 16 bytes
+        ret
+endfunc
+
 function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1
         stp             x4, x5, [sp, #-64]!
         stp             x2, x3, [sp, #16]
@@ -2579,6 +2667,43 @@ function ff_hevc_put_hevc_epel_hv64_8_\suffix, export=1
         ldr             x30, [sp], #16
         ret
 endfunc
+
+function ff_vvc_put_epel_hv64_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!             // width 64 = two hv32 calls; push a 64-byte frame, x4/x5 at [sp]
+        stp             x2, x3, [sp, #16]               // x2/x3 at [sp, #16]
+        stp             x0, x1, [sp, #32]               // x0/x1 at [sp, #32]
+        str             x30, [sp, #48]                  // return address at [sp, #48]
+        mov             x6, #32                         // x6 (width argument in the dsp.h prototype) = 32
+        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
+        ldp             x0, x1, [sp, #32]               // reload the original arguments for the second call
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48               // also drops 48 bytes, leaving the saved x30 at [sp]
+        add             x0, x0, #64                     // x0 += 64 bytes (32 int16_t outputs, per the prototype's dst type)
+        add             x1, x1, #32                     // x1 += 32 bytes (32 uint8_t inputs, per the prototype's src type)
+        mov             x6, #32                         // x6 = 32 again for the second call
+        bl              X(ff_vvc_put_epel_hv32_8_\suffix)
+        ldr             x30, [sp], #16                  // restore the return address and release the last 16 bytes
+        ret
+endfunc
+
+function ff_vvc_put_epel_hv128_8_\suffix, export=1
+        stp             x4, x5, [sp, #-64]!             // width 128 = two hv64 calls; push a 64-byte frame, x4/x5 at [sp]
+        stp             x2, x3, [sp, #16]               // x2/x3 at [sp, #16]
+        stp             x0, x1, [sp, #32]               // x0/x1 at [sp, #32]
+        str             x30, [sp, #48]                  // return address at [sp, #48]
+        mov             x6, #64                         // x6 (width argument in the dsp.h prototype) = 64
+        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
+        ldp             x0, x1, [sp, #32]               // reload the original arguments for the second call
+        ldp             x2, x3, [sp, #16]
+        ldp             x4, x5, [sp], #48               // also drops 48 bytes, leaving the saved x30 at [sp]
+        add             x0, x0, #128                    // x0 += 128 bytes (64 int16_t outputs, per the prototype's dst type)
+        add             x1, x1, #64                     // x1 += 64 bytes (64 uint8_t inputs, per the prototype's src type)
+        mov             x6, #64                         // x6 = 64 again for the second call
+        bl              X(ff_vvc_put_epel_hv64_8_\suffix)
+        ldr             x30, [sp], #16                  // restore the return address and release the last 16 bytes
+        ret
+endfunc
+
 .endm
 
 epel_hv neon
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index c947885145..4867491620 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -84,6 +84,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put[1][5][0][1] =
         c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
 
+        c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
+        c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
+        c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
+        c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
+        c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
+        c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
+
         c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
         c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
         c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
@@ -134,6 +141,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
             c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
             c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
+
+            c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
+            c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
+            c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
+            c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
+            c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
+            c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
         }
     } else if (bd == 10) {
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
-- 
2.42.0