[FFmpeg-devel] [PATCH v3 1/3] aarch64/vvc: Add w_avg

Thu Sep 26 18:57:30 EEST 2024

From: Zhao Zhili <zhilizhao at tencent.com>

w_avg_8_2x2_c:                                           0.0 ( 0.00x)
w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
w_avg_8_4x4_c:                                           0.2 ( 1.00x)
w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
w_avg_8_8x8_c:                                           1.2 ( 1.00x)
w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
w_avg_8_16x16_c:                                         4.2 ( 1.00x)
w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
w_avg_8_32x32_c:                                        16.2 ( 1.00x)
w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
w_avg_8_64x64_c:                                        64.5 ( 1.00x)
w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
w_avg_8_128x128_c:                                     269.5 ( 1.00x)
w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
w_avg_10_2x2_c:                                          0.2 ( 1.00x)
w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
w_avg_10_4x4_c:                                          0.2 ( 1.00x)
w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
w_avg_10_8x8_c:                                          1.0 ( 1.00x)
w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
w_avg_10_16x16_c:                                        4.2 ( 1.00x)
w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
w_avg_10_32x32_c:                                       16.2 ( 1.00x)
w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
w_avg_10_64x64_c:                                       66.2 ( 1.00x)
w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
w_avg_10_128x128_c:                                    277.8 ( 1.00x)
w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
w_avg_12_2x2_c:                                          0.0 ( 0.00x)
w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
w_avg_12_4x4_c:                                          0.2 ( 1.00x)
w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
w_avg_12_8x8_c:                                          1.2 ( 1.00x)
w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
w_avg_12_16x16_c:                                        4.8 ( 1.00x)
w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
w_avg_12_32x32_c:                                       17.0 ( 1.00x)
w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
w_avg_12_64x64_c:                                       64.0 ( 1.00x)
w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
w_avg_12_128x128_c:                                    269.2 ( 1.00x)
w_avg_12_128x128_neon:                                  42.0 ( 6.41x)
---
 libavcodec/aarch64/vvc/dsp_init.c | 34 +++++++++++
 libavcodec/aarch64/vvc/inter.S    | 99 +++++++++++++++++++++++++------
 2 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index ad767d17e2..41d0e02d62 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -52,6 +52,37 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *src0, const int16_t *src1, int width,
                         int height);
 
+void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                         const int16_t *src0, const int16_t *src1,
+                         int width, int height,
+                         uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                         const int16_t *src0, const int16_t *src1,
+                         int width, int height,
+                         uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
+                          const int16_t *src0, const int16_t *src1,
+                          int width, int height,
+                          uintptr_t w0_w1, uintptr_t offset_shift);
+/* Apple platforms diverge from the standard ARM64 ABI when passing function
+ * arguments, so w_avg is not implemented directly in asm. This C wrapper packs
+ * w0/w1 and offset/shift into the high/low 32 bits of two uintptr_t values. */
+#define W_AVG_FUN(bit_depth) \
+static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride, \
+    const int16_t *src0, const int16_t *src1, int width, int height, \
+    int denom, int w0, int w1, int o0, int o1) \
+{ \
+    const int shift = denom + FFMAX(3, 15 - bit_depth); \
+    const int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
+    uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
+    uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
+    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
+}
+
+W_AVG_FUN(8)
+W_AVG_FUN(10)
+W_AVG_FUN(12)
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -123,6 +154,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
 
         c->inter.avg = ff_vvc_avg_8_neon;
+        c->inter.w_avg = vvc_w_avg_8;
 
         for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
             c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -163,11 +195,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         }
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
+        c->inter.w_avg = vvc_w_avg_10;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
     } else if (bd == 12) {
         c->inter.avg = ff_vvc_avg_12_neon;
+        c->inter.w_avg = vvc_w_avg_12;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 2f69274b86..c4c6ab1a72 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -22,9 +22,9 @@
 
 #define VVC_MAX_PB_SIZE 128
 
-.macro vvc_avg, bit_depth
+.macro vvc_avg type, bit_depth
 
-.macro vvc_avg_\bit_depth\()_2_4, tap
+.macro vvc_\type\()_\bit_depth\()_2_4 tap
 .if \tap == 2
         ldr             s0, [src0]
         ldr             s2, [src1]
@@ -32,9 +32,19 @@
         ldr             d0, [src0]
         ldr             d2, [src1]
 .endif
+
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         add             v4.4s, v4.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+.endif
+
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
 .if \tap == 2
@@ -57,7 +67,7 @@
         add             dst, dst, dst_stride
 .endm
 
-function ff_vvc_avg_\bit_depth\()_neon, export=1
+function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         dst             .req x0
         dst_stride      .req x1
         src0            .req x2
@@ -67,42 +77,64 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
 
         mov             x10, #(VVC_MAX_PB_SIZE * 2)
         cmp             width, #8
-.if \bit_depth == 8
-        movi            v16.4s, #64
-.else
-.if \bit_depth == 10
-        mov             w6, #1023
-        movi            v16.4s, #16
+.ifc \type, avg
+        movi            v16.4s, #(1 << (14 - \bit_depth))
 .else
-        mov             w6, #4095
-        movi            v16.4s, #4
-.endif
+        lsr             x11, x6, #32        // weight0
+        mov             w12, w6             // weight1
+        lsr             x13, x7, #32        // offset
+        mov             w14, w7             // shift
+
+        dup             v19.8h, w11
+        neg             w14, w14            // so we can use sqshl
+        dup             v20.8h, w12
+        dup             v16.4s, w13
+        dup             v22.4s, w14
+.endif // avg
+
+ .if \bit_depth >= 10
+        // clip pixel
+        mov             w6, #((1 << \bit_depth) - 1)
         movi            v18.8h, #0
         dup             v17.8h, w6
 .endif
+
         b.eq            8f
         b.hi            16f
         cmp             width, #4
         b.eq            4f
 2:      // width == 2
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 2
+        vvc_\type\()_\bit_depth\()_2_4 2
         b.ne            2b
         b               32f
 4:      // width == 4
         subs            height, height, #1
-        vvc_avg_\bit_depth\()_2_4 4
+        vvc_\type\()_\bit_depth\()_2_4 4
         b.ne            4b
         b               32f
 8:      // width == 8
         ld1             {v0.8h}, [src0], x10
         ld1             {v2.8h}, [src1], x10
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         add             v4.4s, v4.4s, v16.4s
         add             v5.4s, v5.4s, v16.4s
         sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+.else
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn2          v4.8h, v5.4s
+.endif
         subs            height, height, #1
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -122,6 +154,7 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
 17:
         ldp             q0, q1, [x7], #32
         ldp             q2, q3, [x8], #32
+.ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         saddl2          v5.4s, v0.8h, v2.8h
         saddl           v6.4s, v1.4h, v3.4h
@@ -134,6 +167,28 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
         sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
         sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
         sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+.else   // avg
+        mov             v4.16b, v16.16b
+        mov             v5.16b, v16.16b
+        mov             v6.16b, v16.16b
+        mov             v7.16b, v16.16b
+        smlal           v4.4s, v0.4h, v19.4h
+        smlal           v4.4s, v2.4h, v20.4h
+        smlal2          v5.4s, v0.8h, v19.8h
+        smlal2          v5.4s, v2.8h, v20.8h
+        smlal           v6.4s, v1.4h, v19.4h
+        smlal           v6.4s, v3.4h, v20.4h
+        smlal2          v7.4s, v1.8h, v19.8h
+        smlal2          v7.4s, v3.8h, v20.8h
+        sqshl           v4.4s, v4.4s, v22.4s
+        sqshl           v5.4s, v5.4s, v22.4s
+        sqshl           v6.4s, v6.4s, v22.4s
+        sqshl           v7.4s, v7.4s, v22.4s
+        sqxtn           v4.4h, v4.4s
+        sqxtn           v6.4h, v6.4s
+        sqxtn2          v4.8h, v5.4s
+        sqxtn2          v6.8h, v7.4s
+.endif  // w_avg
         subs            w6, w6, #16
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
@@ -155,9 +210,19 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
         b.ne            16b
 32:
         ret
+
+.unreq dst
+.unreq dst_stride
+.unreq src0
+.unreq src1
+.unreq width
+.unreq height
 endfunc
 .endm
 
-vvc_avg 8
-vvc_avg 10
-vvc_avg 12
+vvc_avg avg, 8
+vvc_avg avg, 10
+vvc_avg avg, 12
+vvc_avg w_avg, 8
+vvc_avg w_avg, 10
+vvc_avg w_avg, 12
-- 
2.46.0