[FFmpeg-devel] [PATCH] avcodec/aarch64/vvc: Optimize derive_bdof_vx_vy
Nuo Mi
nuomi2021 at gmail.com
Sun Jun 22 04:42:22 EEST 2025
On Fri, Jun 20, 2025 at 9:15 PM Zhao Zhili <quinkblack-at-foxmail.com at ffmpeg.org> wrote:
> From: Zhao Zhili <zhilizhao at tencent.com>
>
>                          |     Before      |      After
> -----------------------------------------------------------------
> apply_bdof_8_8x16_c: | 7375.5 ( 1.00x) | 7473.8 ( 1.00x)
> apply_bdof_8_8x16_neon: | 1875.1 ( 3.93x) | 1135.8 ( 6.58x)
> apply_bdof_8_16x8_c: | 7273.9 ( 1.00x) | 7204.0 ( 1.00x)
> apply_bdof_8_16x8_neon: | 1738.2 ( 4.18x) | 1013.0 ( 7.11x)
> apply_bdof_8_16x16_c: | 14744.9 ( 1.00x) | 14712.6 ( 1.00x)
> apply_bdof_8_16x16_neon: | 3446.7 ( 4.28x) | 1997.7 ( 7.36x)
> apply_bdof_10_8x16_c: | 7352.4 ( 1.00x) | 7485.7 ( 1.00x)
> apply_bdof_10_8x16_neon: | 1861.0 ( 3.95x) | 1134.1 ( 6.60x)
> apply_bdof_10_16x8_c: | 7330.5 ( 1.00x) | 7232.8 ( 1.00x)
> apply_bdof_10_16x8_neon: | 1747.2 ( 4.20x) | 1002.6 ( 7.21x)
> apply_bdof_10_16x16_c: | 14522.4 ( 1.00x) | 14664.8 ( 1.00x)
> apply_bdof_10_16x16_neon: | 3490.5 ( 4.16x) | 1978.4 ( 7.41x)
> apply_bdof_12_8x16_c: | 7389.0 ( 1.00x) | 7380.1 ( 1.00x)
> apply_bdof_12_8x16_neon: | 1861.3 ( 3.97x) | 1134.0 ( 6.51x)
> apply_bdof_12_16x8_c: | 7283.1 ( 1.00x) | 7336.9 ( 1.00x)
> apply_bdof_12_16x8_neon: | 1749.1 ( 4.16x) | 1002.3 ( 7.32x)
> apply_bdof_12_16x16_c: | 14580.7 ( 1.00x) | 14502.7 ( 1.00x)
> apply_bdof_12_16x16_neon: | 3472.9 ( 4.20x) | 1978.3 ( 7.33x)
>
Great! ~2x faster than the previous version:
https://github.com/FFmpeg/FFmpeg/commit/952508ae059dcf5448a48f525d15f0fbf52df92c
Thank you, Zhili.
> ---
> libavcodec/aarch64/vvc/dsp_init.c | 17 +-
> libavcodec/aarch64/vvc/inter.S | 632 ++++++++++++++++-----------
> libavcodec/aarch64/vvc/of_template.c | 15 +-
> 3 files changed, 399 insertions(+), 265 deletions(-)
>
> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
> index 9a171234f6..1db38ebb1d 100644
> --- a/libavcodec/aarch64/vvc/dsp_init.c
> +++ b/libavcodec/aarch64/vvc/dsp_init.c
> @@ -37,11 +37,18 @@ void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
>                                       ptrdiff_t src_stride,
>                                       int width, int height);
>
> -void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1,
> - int pad_mask,
> - const int16_t **gradient_h,
> - const int16_t **gradient_v,
> - int16_t *vx, int16_t *vy);
> +void ff_vvc_derive_bdof_vx_vy_8x_neon(const int16_t *_src0,
> + const int16_t *_src1,
> + int16_t *const gradient_h[2],
> + int16_t *const gradient_v[2],
> + int16_t vx[16], int16_t vy[16],
> + int block_h);
> +void ff_vvc_derive_bdof_vx_vy_16x_neon(const int16_t *_src0,
> + const int16_t *_src1,
> + int16_t *const gradient_h[2],
> + int16_t *const gradient_v[2],
> + int16_t vx[16], int16_t vy[16],
> + int block_h);
> #define BIT_DEPTH 8
> #include "alf_template.c"
> #include "of_template.c"
> diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
> index c299e6f68b..06c6f3619b 100644
> --- a/libavcodec/aarch64/vvc/inter.S
> +++ b/libavcodec/aarch64/vvc/inter.S
> @@ -804,262 +804,388 @@ function ff_vvc_apply_bdof_block_12_neon, export=1
>          vvc_apply_bdof_block 12
> endfunc
>
> -function ff_vvc_derive_bdof_vx_vy_neon, export=1
> - src0 .req x0
> - src1 .req x1
> - pad_mask .req w2
> - gh .req x3
> - gv .req x4
> - vx .req x5
> - vy .req x6
> -
> - gh0 .req x7
> - gh1 .req x8
> - gv0 .req x9
> - gv1 .req x10
> - y .req x12
> -
> - sgx2 .req w7
> - sgy2 .req w8
> - sgxgy .req w9
> - sgxdi .req w10
> - sgydi .req w11
> -
> - sgx2_v .req v22
> - sgy2_v .req v23
> - sgxgy_v .req v24
> - sgxdi_v .req v25
> - sgydi_v .req v26
> -
> - sgx2_v2 .req v27
> - sgy2_v2 .req v28
> - sgxgy_v2 .req v29
> - sgxdi_v2 .req v30
> - sgydi_v2 .req v31
> -
> - ldp gh0, gh1, [gh]
> - ldp gv0, gv1, [gv]
> - movi sgx2_v.4s, #0
> - movi sgy2_v.4s, #0
> - movi sgxgy_v.4s, #0
> - movi sgxdi_v.4s, #0
> - movi sgydi_v.4s, #0
> - movi sgx2_v2.4s, #0
> - movi sgy2_v2.4s, #0
> - movi sgxgy_v2.4s, #0
> - movi sgxdi_v2.4s, #0
> - movi sgydi_v2.4s, #0
> - mov x13, #-1 // dy
> - movi v6.4s, #0
> - mov y, #-1
> - tbz pad_mask, #1, 1f // check pad top
> - mov x13, #0 // dy: pad top
> +const bdof_vx_vy_8x_tbl
> + .byte 0, 1, 16, 16, 16, 16, 8, 9
> + .byte 6, 7, 16, 16, 16, 16, 14, 15
> +endconst
> +
> +const bdof_vx_vy_16x_tbl
> + .byte 0, 1, 64, 64, 64, 64, 8, 9
> + .byte 6, 7, 64, 64, 64, 64, 16, 17
> + .byte 14, 15, 64, 64, 64, 64, 24, 25
> + .byte 22, 23, 64, 64, 64, 64, 30, 31
> +endconst
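
Nice use of tbl for the horizontal padding: out-of-range indices (16 for
the one-register form, 64 for the two-register form) read as 0, so one
shuffle builds the shifted neighbours and the zero gaps at once. If I
read the tables and the saddl/addp chains right, each row ends up
reduced, per 4x4 block, to a 6-column sum with the missing edge column
replicated. A scalar sketch of that reduction (untested; hsum6 and the
fixed width of 8 are my own names, not from the patch):

    // 6-tap horizontal window for the 4x4 block starting at column x0
    // (0 or 4 in the 8-wide case), replicating the edge columns.
    static int hsum6(const int16_t h[8], int x0)
    {
        int sum = 0;
        for (int i = -1; i <= 4; i++) {
            int c = x0 + i;
            c = c < 0 ? 0 : (c > 7 ? 7 : c);
            sum += h[c];
        }
        return sum;
    }
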
> +
> +// line(-1), line0, line1, line2, line3, line4
> +// line3 and line4 become line(-1) and line0 in the next block.
> +.macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
> + mov \tmp0\().16b, v28.16b
> + mov \tmp1\().16b, v29.16b
> + mov \tmp2\().16b, v30.16b
> + mov \tmp3\().16b, v31.16b
> + mov \tmp4\().16b, v8.16b
> +.endm
> +
> +.macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
> + add v25.4s, v25.4s, \tmp0\().4s
> + add v27.4s, v27.4s, \tmp1\().4s
> + add v23.4s, v23.4s, \tmp2\().4s
> + sub v26.4s, v26.4s, \tmp3\().4s
> + sub v24.4s, v24.4s, \tmp4\().4s
> +.endm
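
The vertical direction reads nicely too: each 4x4 block sums rows
dy = -1..4, and rows 3 and 4 of one block row are exactly rows -1 and 0
of the next, which is what these save/add macros keep in registers
instead of recomputing. A scalar model of the loop structure (untested;
accumulate_row is a hypothetical stand-in for the five per-row partial
sums):

    static void accumulate_row(int y);  // hypothetical

    static void window_rows(int block_h)
    {
        for (int by = 0; by < block_h; by += 4) {
            for (int dy = -1; dy <= 4; dy++) {
                int y = by + dy;
                if (y < 0)           y = 0;             // pad top
                if (y > block_h - 1) y = block_h - 1;   // pad bottom
                accumulate_row(y);
            }
        }
    }
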
> +
> +.macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst
> + tbl \tmp0\().16b, { \src\().16b }, v0.16b
> + saddl \tmp1\().4s, \tmp0\().4h, \src\().4h
> + saddl2 \dst\().4s, \tmp0\().8h, \src\().8h
> + addp \dst\().4s, \tmp1\().4s, \dst\().4s
> +.endm
> +
> +.macro bdof_vx_vy_sign src, tmp0, tmp1, dst
> + cmlt \tmp0\().8h, \src\().8h, #0
> + cmgt \tmp1\().8h, \src\().8h, #0
> + sub \dst\().8h, \tmp0\().8h, \tmp1\().8h
> +.endm
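
(The two compare masks subtract to (x > 0) - (x < 0), i.e. this is the
vector form of

    static inline int bdof_sign(int x)
    {
        return (x > 0) - (x < 0);  // -1, 0 or +1, like VVC_SIGN()
    }

which I'll reuse in the sketches below.)
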
> +
> +.macro bdof_vx_vy_clip_mask src, max, min, mask, dst
> + smin \src\().4s, \src\().4s, \max\().4s
> + smax \src\().4s, \src\().4s, \min\().4s
> + cmgt \mask\().4s, \mask\().4s, #0
> + and \dst\().16b, \src\().16b, \mask\().16b
> +.endm
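
Likewise bdof_vx_vy_clip_mask is the clamp-and-gate step, roughly:

    static inline int bdof_clip(int v, int denom)
    {
        v = v > 15 ? 15 : (v < -15 ? -15 : v);  // clip to [-15, 15]
        return denom > 0 ? v : 0;               // zero when the sum is 0
    }
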
> +
> +.macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
> + mov \tmp0\().16b, v29.16b
> + mov \tmp1\().16b, v30.16b
> + mov \tmp2\().16b, v31.16b
> + mov \tmp3\().16b, v8.16b
> + mov \tmp4\().16b, v9.16b
> +.endm
> +
> +.macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
> + add v25.4s, v25.4s, \tmp0\().4s
> + add v24.4s, v24.4s, \tmp1\().4s
> + add v26.4s, v26.4s, \tmp2\().4s
> + sub v28.4s, v28.4s, \tmp3\().4s
> + sub v27.4s, v27.4s, \tmp4\().4s
> +.endm
> +
> +.macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst
> + tbl \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b
> + tbl v2.16b, {\src0\().16b, \src1\().16b}, v1.16b
> + saddl \tmp1\().4s, \tmp0\().4h, \src0\().4h
> + saddl \tmp2\().4s, v2.4h, \src1\().4h
> + saddl2 \tmp0\().4s, \tmp0\().8h, \src0\().8h
> + saddl2 \dst\().4s, v2.8h, \src1\().8h
> + addp \tmp0\().4s, \tmp1\().4s, \tmp0\().4s
> + addp \dst\().4s, \tmp2\().4s, \dst\().4s
> + addp \dst\().4s, \tmp0\().4s, \dst\().4s
> +.endm
> +
> +/*
> + * x0: const int16_t *_src0,
> + * x1: const int16_t *_src1,
> + * x2: int16_t *gradient_h[2],
> + * x3: int16_t *gradient_v[2],
> + * x4: int16_t vx[16],
> + * x5: int16_t vy[16],
> + * w6: int block_h
> + */
> +function ff_vvc_derive_bdof_vx_vy_8x_neon, export=1
> + stp d11, d10, [sp, #-0x20]!
> + stp d9, d8, [sp, #0x10]
> +
> + ldp x14, x13, [x2] // gh0, gh1
> + ldp x10, x9, [x3] // gv0, gv1
> + movrel x11, bdof_vx_vy_8x_tbl
> + ldr q0, [x11] // table
> +        mvni            v2.4s, #30              // -31, for log2
> + movi v3.4s, #15 // clip to 15
> + mvni v4.4s, #14 // clip to -15
> +
> + mov w11, #0x8
> + mov w12, w6 // y = block_h
> + b 4f
> +
> 1:
> - mov x16, #-2 // dx
> - add x14, src0, x13, lsl #8 // local src0
> - add x15, src1, x13, lsl #8 // local src1
> - add x17, x16, x13, lsl #5
> - ldr q0, [x14, x16]
> - ldr q1, [x15, x16]
> - ldr q2, [gh0, x17]
> - ldr q3, [gh1, x17]
> - ldr q4, [gv0, x17]
> - ldr q5, [gv1, x17]
> - add x16, x16, #8
> - add x17, x17, #8
> - ins v0.s[3], v6.s[3]
> - ins v1.s[3], v6.s[3]
> - ins v2.s[3], v6.s[3]
> - ins v3.s[3], v6.s[3]
> - ins v4.s[3], v6.s[3]
> - ins v5.s[3], v6.s[3]
> -
> - ldr q16, [x14, x16]
> - ldr q17, [x15, x16]
> - ldr q18, [gh0, x17]
> - ldr q19, [gh1, x17]
> - ldr q20, [gv0, x17]
> - ldr q21, [gv1, x17]
> - ins v16.s[3], v6.s[3]
> - ins v17.s[3], v6.s[3]
> - ins v18.s[3], v6.s[3]
> - ins v19.s[3], v6.s[3]
> - ins v20.s[3], v6.s[3]
> - ins v21.s[3], v6.s[3]
> -
> - tbz pad_mask, #0, 20f
> - // pad left
> - ins v0.h[0], v0.h[1]
> - ins v1.h[0], v1.h[1]
> - ins v2.h[0], v2.h[1]
> - ins v3.h[0], v3.h[1]
> - ins v4.h[0], v4.h[1]
> - ins v5.h[0], v5.h[1]
> -20:
> - tbz pad_mask, #2, 21f
> - // pad right
> - ins v16.h[5], v16.h[4]
> - ins v17.h[5], v17.h[4]
> - ins v18.h[5], v18.h[4]
> - ins v19.h[5], v19.h[4]
> - ins v20.h[5], v20.h[4]
> - ins v21.h[5], v21.h[4]
> -21:
> - sshr v0.8h, v0.8h, #4
> - sshr v1.8h, v1.8h, #4
> - add v2.8h, v2.8h, v3.8h
> - add v4.8h, v4.8h, v5.8h
> - sub v0.8h, v0.8h, v1.8h // diff
> - sshr v2.8h, v2.8h, #1 // temph
> - sshr v4.8h, v4.8h, #1 // tempv
> -
> - sshr v16.8h, v16.8h, #4
> - sshr v17.8h, v17.8h, #4
> - add v18.8h, v18.8h, v19.8h
> - add v20.8h, v20.8h, v21.8h
> - sub v16.8h, v16.8h, v17.8h // diff
> - sshr v18.8h, v18.8h, #1 // temph
> - sshr v20.8h, v20.8h, #1 // tempv
> -
> - abs v3.8h, v2.8h
> - abs v5.8h, v4.8h
> - uxtl v19.4s, v3.4h
> - uxtl v21.4s, v5.4h
> - uxtl2 v3.4s, v3.8h
> - uxtl2 v5.4s, v5.8h
> - add v3.4s, v3.4s, v19.4s
> - add v5.4s, v5.4s, v21.4s
> - add sgx2_v.4s, sgx2_v.4s, v3.4s
> - add sgy2_v.4s, sgy2_v.4s, v5.4s
> -
> - abs v3.8h, v18.8h
> - abs v5.8h, v20.8h
> - uxtl v19.4s, v3.4h
> - uxtl v21.4s, v5.4h
> - uxtl2 v3.4s, v3.8h
> - uxtl2 v5.4s, v5.8h
> - add v3.4s, v3.4s, v19.4s
> - add v5.4s, v5.4s, v21.4s
> - add sgx2_v2.4s, sgx2_v2.4s, v3.4s
> - add sgy2_v2.4s, sgy2_v2.4s, v5.4s
> -
> - cmgt v17.8h, v4.8h, #0
> - cmlt v7.8h, v4.8h, #0
> - cmgt v19.8h, v20.8h, #0
> - cmlt v21.8h, v20.8h, #0
> - sub v17.8h, v7.8h, v17.8h // VVC_SIGN(tempv)
> - sub v19.8h, v21.8h, v19.8h // VVC_SIGN(tempv)
> -
> - smlal sgxgy_v.4s, v17.4h, v2.4h
> - smlal2 sgxgy_v.4s, v17.8h, v2.8h
> - smlsl sgydi_v.4s, v17.4h, v0.4h
> - smlsl2 sgydi_v.4s, v17.8h, v0.8h
> -
> - cmgt v3.8h, v2.8h, #0
> - cmlt v5.8h, v2.8h, #0
> - cmgt v17.8h, v18.8h, #0
> - cmlt v21.8h, v18.8h, #0
> - sub v3.8h, v5.8h, v3.8h // VVC_SIGN(temph)
> - sub v17.8h, v21.8h, v17.8h // VVC_SIGN(temph)
> -
> - smlal sgxgy_v2.4s, v19.4h, v18.4h
> - smlal2 sgxgy_v2.4s, v19.8h, v18.8h
> - smlsl sgydi_v2.4s, v19.4h, v16.4h
> - smlsl2 sgydi_v2.4s, v19.8h, v16.8h
> -
> - smlsl sgxdi_v.4s, v3.4h, v0.4h
> - smlsl2 sgxdi_v.4s, v3.8h, v0.8h
> - smlsl sgxdi_v2.4s, v17.4h, v16.4h
> - smlsl2 sgxdi_v2.4s, v17.8h, v16.8h
> -3:
> - add y, y, #1
> - cmp y, #(BDOF_MIN_BLOCK_SIZE)
> - mov x13, y
> - b.gt 4f
> - b.lt 1b
> - tbz pad_mask, #3, 1b
> - sub x13, x13, #1 // pad bottom
> - b 1b
> + // save line4 results
> + bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
> +2:
> + addp v25.4s, v25.4s, v25.4s
> + addp v27.4s, v27.4s, v27.4s
> + addp v26.4s, v26.4s, v26.4s
> + addp v23.4s, v23.4s, v23.4s
> + addp v24.4s, v24.4s, v24.4s
> +
> + clz v28.4s, v25.4s
> + add v28.4s, v28.4s, v2.4s // log2
> + shl v26.4s, v26.4s, #0x2
> + sshl v26.4s, v26.4s, v28.4s
> +
> + bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
> + sqxtn v26.4h, v25.4s
> + st1 {v26.s}[0], [x4], x11
> +
> + subs x12, x12, #(BDOF_MIN_BLOCK_SIZE)
> +
> + clz v26.4s, v27.4s
> + add v26.4s, v26.4s, v2.4s
> + shl v24.4s, v24.4s, #0x2
> + mul v23.4s, v25.4s, v23.4s
> + sshr v23.4s, v23.4s, #0x1
> + sub v23.4s, v24.4s, v23.4s
> + sshl v23.4s, v23.4s, v26.4s
> +
> + bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
> + sqxtn v23.4h, v23.4s
> + st1 {v23.s}[0], [x5], x11
> +
> + b.eq 16f
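
The clz trick here is neat: v2 holds -31, so clz(sgx2) + v2 is
-floor(log2(sgx2)) for sgx2 > 0, and sshl with a negative count becomes
the arithmetic right shift. In scalar terms the block above computes,
per 4x4 block (untested sketch, reusing bdof_clip() from my note above;
the accumulators already carry the minus sign from the sub steps):

    vx = bdof_clip((sgx_di * 4) >> av_log2(sgx2), sgx2);
    vy = bdof_clip((sgy_di * 4 - ((vx * sgx_gy) >> 1)) >> av_log2(sgy2),
                   sgy2);
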
> 4:
> - addv s22, sgx2_v.4s
> - addv s23, sgy2_v.4s
> - addv s24, sgxgy_v.4s
> - addv s25, sgxdi_v.4s
> - addv s26, sgydi_v.4s
> -
> - mov w3, #31
> - mov w16, #-15
> - mov w17, #15
> -40:
> - mov w14, #0
> -
> - mov sgx2, v22.s[0]
> - mov sgy2, v23.s[0]
> - mov sgxgy, v24.s[0]
> - mov sgxdi, v25.s[0]
> - mov sgydi, v26.s[0]
> -
> - cbz sgx2, 5f
> - clz w12, sgx2
> - lsl sgxdi, sgxdi, #2
> - sub w13, w3, w12 // log2(sgx2)
> - asr sgxdi, sgxdi, w13
> - cmp sgxdi, w16
> - csel w14, w16, sgxdi, lt // clip to -15
> - b.le 5f
> - cmp sgxdi, w17
> - csel w14, w17, sgxdi, gt // clip to 15
> +        mov             x15, #0x0               // dy, inner loop
> +
> + movi v25.2d, #0
> + movi v27.2d, #0
> + movi v23.2d, #0
> + movi v26.2d, #0
> + movi v24.2d, #0
> + b 8f
> +
> 5:
> - strh w14, [vx], #2
> -
> - mov w15, #0
> - cbz sgy2, 6f
> - lsl sgydi, sgydi, #2
> - smull x14, w14, sgxgy
> - asr w14, w14, #1
> - sub sgydi, sgydi, w14
> - clz w12, sgy2
> - sub w13, w3, w12 // log2(sgy2)
> - asr sgydi, sgydi, w13
> - cmp sgydi, w16
> - csel w15, w16, sgydi, lt // clip to -15
> - b.le 6f
> - cmp sgydi, w17
> - csel w15, w17, sgydi, gt // clip to 15
> -6:
> - strh w15, [vy], #2
> - cbz x0, 7f
> - addv s22, sgx2_v2.4s
> - addv s23, sgy2_v2.4s
> - addv s24, sgxgy_v2.4s
> - addv s25, sgxdi_v2.4s
> - addv s26, sgydi_v2.4s
> - mov x0, #0
> - b 40b
> -7:
> + // add line(-1) and line0 from previous results
> + bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
> + bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
> + add x15, x15, #1
> +8:
> + cmp w12, w6
> + b.hs 9f
> + // y < block_h && dy == 0, reuse previous results
> + cbz x15, 5b
> +9:
> +        ldr             q28, [x0]               // src0
> +        ldr             q29, [x1]               // src1
> + ldr q30, [x14], #(BDOF_BLOCK_SIZE * 2) // gh0
> + ldr q31, [x13], #(BDOF_BLOCK_SIZE * 2) // gh1
> + ldr q8, [x10], #(BDOF_BLOCK_SIZE * 2) // gv0
> + ldr q9, [x9], #(BDOF_BLOCK_SIZE * 2) // gv1
> + add x0, x0, #(VVC_MAX_PB_SIZE * 2)
> + add x1, x1, #(VVC_MAX_PB_SIZE * 2)
> +
> + sshr v28.8h, v28.8h, #0x4
> + sshr v29.8h, v29.8h, #0x4
> +        shadd           v30.8h, v30.8h, v31.8h  // tmph
> +        shadd           v31.8h, v8.8h, v9.8h    // tmpv
> +        sub             v8.8h, v28.8h, v29.8h   // diff
> +
> + abs v28.8h, v30.8h
> + abs v29.8h, v31.8h
> +
> + bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
> + bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29
> +
> + bdof_vx_vy_sign v30, v9, v10, v9
> + bdof_vx_vy_sign v31, v10, v31, v31
> +
> + mul v30.8h, v31.8h, v30.8h
> + mul v9.8h, v9.8h, v8.8h
> + mul v8.8h, v31.8h, v8.8h
> +
> + bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
> + bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
> + bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8
> +
> + bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
> +
> + cmp w12, w6
> + b.ne 10f
> + cbnz x15, 10f
> +
> + // y == block_h && dy == 0, duplicate first line results
> + bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
> + add x15, x15, #0x1
> + b 9b
> +10:
> + cmp x15, #(BDOF_MIN_BLOCK_SIZE - 1)
> + b.eq 11f
> + cmp x15, #(BDOF_MIN_BLOCK_SIZE)
> + b.ne 12f
> + b 1b
> +11:
> + // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
> + // duplicate the results and break
> + cmp x12, #(BDOF_MIN_BLOCK_SIZE)
> + b.eq 13f
> + bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
> +12:
> + add x15, x15, #1
> + b 8b
> +13:
> + // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
> + // padding bottom then break
> + bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
> + b 2b
> +16:
> + ldp d9, d8, [sp, #0x10]
> + ldp d11, d10, [sp], #0x20
> ret
> +endfunc
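
To summarize my reading of the 8x function, the per-block sums it
accumulates are as below (untested scalar model of the asm, not the
committed C reference; strides follow the asm, MAX_PB_SIZE for src and
BDOF_BLOCK_SIZE for the gradients):

    static void derive_sums(const int16_t *src0, const int16_t *src1,
                            const int16_t *gh0, const int16_t *gh1,
                            const int16_t *gv0, const int16_t *gv1,
                            int bx, int by, int block_w, int block_h,
                            int s[5]) // sgx2, sgy2, sgx_gy, sgx_di, sgy_di
    {
        for (int dy = -1; dy <= 4; dy++) {
            int y = av_clip(by + dy, 0, block_h - 1);
            for (int dx = -1; dx <= 4; dx++) {
                int x    = av_clip(bx + dx, 0, block_w - 1);
                int g    = y * BDOF_BLOCK_SIZE + x;
                int diff = (src0[y * MAX_PB_SIZE + x] >> 4)
                         - (src1[y * MAX_PB_SIZE + x] >> 4);
                int th   = (gh0[g] + gh1[g]) >> 1;
                int tv   = (gv0[g] + gv1[g]) >> 1;
                s[0] += FFABS(th);
                s[1] += FFABS(tv);
                s[2] += bdof_sign(tv) * th;
                s[3] -= bdof_sign(th) * diff;
                s[4] -= bdof_sign(tv) * diff;
            }
        }
    }

followed by the vx/vy formulas sketched earlier for each block.
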
>
> -.unreq src0
> -.unreq src1
> -.unreq pad_mask
> -.unreq gh
> -.unreq gv
> -.unreq vx
> -.unreq vy
> -.unreq sgx2
> -.unreq sgy2
> -.unreq sgxgy
> -.unreq sgxdi
> -.unreq sgydi
> -.unreq sgx2_v
> -.unreq sgy2_v
> -.unreq sgxgy_v
> -.unreq sgxdi_v
> -.unreq sgydi_v
> -.unreq sgx2_v2
> -.unreq sgy2_v2
> -.unreq sgxgy_v2
> -.unreq sgxdi_v2
> -.unreq sgydi_v2
> -.unreq y
> +/*
> + * x0: const int16_t *_src0,
> + * x1: const int16_t *_src1,
> + * x2: int16_t *gradient_h[2],
> + * x3: int16_t *gradient_v[2],
> + * x4: int16_t vx[16],
> + * x5: int16_t vy[16],
> + * w6: int block_h
> + */
> +function ff_vvc_derive_bdof_vx_vy_16x_neon, export=1
> + sub sp, sp, #0x80
> + stp d15, d14, [sp, #0x30]
> + stp d13, d12, [sp, #0x40]
> + stp d11, d10, [sp, #0x50]
> + stp d9, d8, [sp, #0x60]
> + stp x29, x30, [sp, #0x70]
> +
> + ldp x8, x9, [x2] // gh0, gh1
> + ldp x10, x11, [x3] // gv0, gv1
> + movrel x12, bdof_vx_vy_16x_tbl
> + ldp q0, q1, [x12] // table
> + mov w13, w6 // y = block_h
> + b 4f
> +
> +1:
> + // save line4
> + bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
> +2:
> + clz v3.4s, v25.4s
> + mvni v5.4s, #0x1e
> + add v3.4s, v3.4s, v5.4s // -log2()
> + shl v4.4s, v28.4s, #0x2
> + sshl v3.4s, v4.4s, v3.4s
> +
> + movi v28.4s, #0xf // clip to 15
> + mvni v29.4s, #0xe // clip to -15
> + bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
> + sqxtn v4.4h, v3.4s
> + st1 {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)
> +
> +        subs            x13, x13, #(BDOF_MIN_BLOCK_SIZE) // y -= BDOF_MIN_BLOCK_SIZE
> +
> + clz v4.4s, v24.4s
> + add v4.4s, v4.4s, v5.4s // -log2()
> + shl v5.4s, v27.4s, #0x2
> + mul v3.4s, v3.4s, v26.4s
> + sshr v3.4s, v3.4s, #0x1
> + sub v3.4s, v5.4s, v3.4s
> + sshl v3.4s, v3.4s, v4.4s
> +
> + bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
> + sqxtn v3.4h, v3.4s
> + st1 {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)
> + b.eq 16f
> +4:
> +        mov             w14, #0x0               // dy, inner loop
> +
> + movi v25.2d, #0
> + movi v24.2d, #0
> + movi v26.2d, #0
> + movi v28.2d, #0
> + movi v27.2d, #0
> + b 8f
> +
> +5:
> + // add line(-1) and line0 from previous results
> + bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
> + bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
> + add w14, w14, #0x1
> +
> + 8:
> + cmp w13, w6
> + b.hs 9f
> + // y < block_h && dy == 0, reuse previous results
> + cbz w14, 5b
> +9:
> + ld1 {v29.8h, v30.8h}, [x0] // src0
> + sshr v31.8h, v29.8h, #0x4
> + ld1 {v8.8h, v9.8h}, [x1] // src1
> + sshr v10.8h, v8.8h, #0x4
> + ld1 {v11.8h, v12.8h}, [x8], #32 // gh0
> + sshr v29.8h, v30.8h, #0x4
> + sshr v30.8h, v9.8h, #0x4
> + ld1 {v8.8h, v9.8h}, [x9], #32 // gh1
> +        shadd           v13.8h, v11.8h, v8.8h   // (gh0 + gh1) >> 1, left half
> +        ld1             {v14.8h, v15.8h}, [x10], #32    // gv0
> +        ld1             {v3.8h, v4.8h}, [x11], #32      // gv1
> +        shadd           v5.8h, v14.8h, v3.8h    // (gv0 + gv1) >> 1, left half
> +        sub             v31.8h, v31.8h, v10.8h  // diff, left half
> +        shadd           v8.8h, v12.8h, v9.8h    // (gh0 + gh1) >> 1, right half
> +        shadd           v3.8h, v15.8h, v4.8h    // (gv0 + gv1) >> 1, right half
> +        sub             v4.8h, v29.8h, v30.8h   // diff, right half
> +
> + abs v29.8h, v13.8h
> + abs v30.8h, v8.8h
> + abs v9.8h, v5.8h
> + abs v10.8h, v3.8h
> +
> + add x0, x0, #(VVC_MAX_PB_SIZE * 2)
> + add x1, x1, #(VVC_MAX_PB_SIZE * 2)
> +
> + bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
> + bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30
> +
> + bdof_vx_vy_sign v13, v9, v10, v9
> + bdof_vx_vy_sign v8, v10, v11, v10
> + bdof_vx_vy_sign v5, v11, v5, v5
> + bdof_vx_vy_sign v3, v11, v3, v3
> +
> + mul v11.8h, v5.8h, v13.8h
> + mul v12.8h, v3.8h, v8.8h
> + mul v8.8h, v9.8h, v31.8h
> + mul v9.8h, v10.8h, v4.8h
> + mul v13.8h, v5.8h, v31.8h
> + mul v14.8h, v3.8h, v4.8h
> +
> + bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
> + bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
> + bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9
> +
> + bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
> +        // check whether to pad the top
> + cmp w13, w6
> + b.ne 10f
> + cbnz w14, 10f
> + // y == block_h && dy == 0, padding top
> + bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
> + add w14, w14, #0x1
> + b 9b
> +10:
> + cmp w14, #(BDOF_MIN_BLOCK_SIZE - 1)
> + b.eq 11f
> + cmp w14, #(BDOF_MIN_BLOCK_SIZE)
> + b.ne 12f
> + // save line4
> + b 1b
> + 11:
> +        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1, padding bottom
> + cmp x13, #(BDOF_MIN_BLOCK_SIZE)
> + b.eq 13f
> + // save line3
> + bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
> +12:
> + add w14, w14, #0x1 // dy++
> + b 8b
> +13:
> + // padding bottom
> + bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
> + b 2b
> +16:
> + // restore
> + ldp x29, x30, [sp, #0x70]
> + ldp d9, d8, [sp, #0x60]
> + ldp d11, d10, [sp, #0x50]
> + ldp d13, d12, [sp, #0x40]
> + ldp d15, d14, [sp, #0x30]
> + add sp, sp, #0x80
> + ret
> endfunc
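
(Register saves look right to me: AAPCS64 only requires preserving the
low 64 bits of v8-v15, which the stp/ldp of the d registers covers.)
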
> diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
> index ac6182b09d..d8ddaacb14 100644
> --- a/libavcodec/aarch64/vvc/of_template.c
> +++ b/libavcodec/aarch64/vvc/of_template.c
> @@ -41,6 +41,11 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
>      ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1], BDOF_BLOCK_SIZE,
>                                      _src1, MAX_PB_SIZE, block_w, block_h);
> + int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
> +    if (block_w == 8)
> +        ff_vvc_derive_bdof_vx_vy_8x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
> +    else
> +        ff_vvc_derive_bdof_vx_vy_16x_neon(_src0, _src1, gradient_h, gradient_v, vx, vy, block_h);
>
>      for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
>          for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE * 2) {
> @@ -50,14 +55,10 @@ static void FUNC(apply_bdof)(uint8_t *_dst, ptrdiff_t _dst_stride,
>              int idx = BDOF_BLOCK_SIZE * y + x;
>              const int16_t *gh[] = {gradient_h[0] + idx, gradient_h[1] + idx};
>              const int16_t *gv[] = {gradient_v[0] + idx, gradient_v[1] + idx};
> -            int16_t vx[2], vy[2];
> -            int pad_mask = !x | ((!y) << 1) |
> -                           ((x + 2 * BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
> -                           ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
> -            ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, vx, vy);
> + int idx1 = y + x / BDOF_MIN_BLOCK_SIZE;
>              FUNC2(ff_vvc_apply_bdof_block, BIT_DEPTH, _neon)(d, dst_stride,
> -                                                             src0, src1, gh, gv,
> -                                                             vx, vy);
> +                                                             src0, src1, gh, gv,
> +                                                             vx + idx1, vy + idx1);
>          }
>          dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
>      }
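
The flat vx/vy layout works out, if I follow it: each row of 4x4 blocks
occupies BDOF_BLOCK_SIZE / BDOF_MIN_BLOCK_SIZE = 4 entries (the 8x store
advances by 8 bytes even though it writes only two values per row), so
idx1 = y + x / BDOF_MIN_BLOCK_SIZE indexes the right slot for both
widths, e.g. y = 4, x = 8 in a 16x16 block gives vx[4 + 2] = vx[6].
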
> --
> 2.46.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>