[FFmpeg-devel] [PATCH 2/4] aarch64/vvc: Add apply_bdof
Zhao Zhili
quinkblack at foxmail.com
Mon Sep 23 12:09:55 EEST 2024
Drop patch 2/4 for now; it needs more polish. See patch v2:
https://ffmpeg.org/pipermail/ffmpeg-devel/2024-September/333800.html
> On Sep 22, 2024, at 01:41, Zhao Zhili <quinkblack at foxmail.com> wrote:
>
> From: Zhao Zhili <zhilizhao at tencent.com>
>
> apply_bdof_8_8x16_c: 18.7 ( 1.00x)
> apply_bdof_8_8x16_neon: 9.7 ( 1.93x)
> apply_bdof_8_16x8_c: 20.0 ( 1.00x)
> apply_bdof_8_16x8_neon: 9.5 ( 2.11x)
> apply_bdof_8_16x16_c: 36.7 ( 1.00x)
> apply_bdof_8_16x16_neon: 19.0 ( 1.94x)
> apply_bdof_10_8x16_c: 18.0 ( 1.00x)
> apply_bdof_10_8x16_neon: 10.0 ( 1.80x)
> apply_bdof_10_16x8_c: 18.0 ( 1.00x)
> apply_bdof_10_16x8_neon: 9.5 ( 1.90x)
> apply_bdof_10_16x16_c: 35.5 ( 1.00x)
> apply_bdof_10_16x16_neon: 19.0 ( 1.87x)
> apply_bdof_12_8x16_c: 17.5 ( 1.00x)
> apply_bdof_12_8x16_neon: 9.7 ( 1.80x)
> apply_bdof_12_16x8_c: 18.2 ( 1.00x)
> apply_bdof_12_16x8_neon: 9.5 ( 1.92x)
> apply_bdof_12_16x16_c: 34.5 ( 1.00x)
> apply_bdof_12_16x16_neon: 18.7 ( 1.84x)
> ---
> libavcodec/aarch64/vvc/dsp_init.c | 9 +
> libavcodec/aarch64/vvc/inter.S | 351 +++++++++++++++++++++++++++
> libavcodec/aarch64/vvc/of_template.c | 70 ++++++
> 3 files changed, 430 insertions(+)
> create mode 100644 libavcodec/aarch64/vvc/of_template.c
>
> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
> index b39ebb83fc..03a4c62310 100644
> --- a/libavcodec/aarch64/vvc/dsp_init.c
> +++ b/libavcodec/aarch64/vvc/dsp_init.c
> @@ -27,16 +27,22 @@
> #include "libavcodec/vvc/dec.h"
> #include "libavcodec/vvc/ctu.h"
>
> +#define BDOF_BLOCK_SIZE 16
> +#define BDOF_MIN_BLOCK_SIZE 4
> +
> #define BIT_DEPTH 8
> #include "alf_template.c"
> +#include "of_template.c"
> #undef BIT_DEPTH
>
> #define BIT_DEPTH 10
> #include "alf_template.c"
> +#include "of_template.c"
> #undef BIT_DEPTH
>
> #define BIT_DEPTH 12
> #include "alf_template.c"
> +#include "of_template.c"
> #undef BIT_DEPTH
>
> int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
> @@ -155,6 +161,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
>
> c->inter.avg = ff_vvc_avg_8_neon;
> c->inter.w_avg = vvc_w_avg_8;
> + c->inter.apply_bdof = apply_bdof_8;
>
> for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
> c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
> @@ -196,12 +203,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
> } else if (bd == 10) {
> c->inter.avg = ff_vvc_avg_10_neon;
> c->inter.w_avg = vvc_w_avg_10;
> + c->inter.apply_bdof = apply_bdof_10;
>
> c->alf.filter[LUMA] = alf_filter_luma_10_neon;
> c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
> } else if (bd == 12) {
> c->inter.avg = ff_vvc_avg_12_neon;
> c->inter.w_avg = vvc_w_avg_12;
> + c->inter.apply_bdof = apply_bdof_12;
>
> c->alf.filter[LUMA] = alf_filter_luma_12_neon;
> c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
> diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
> index 49e1050aee..8cfacef44f 100644
> --- a/libavcodec/aarch64/vvc/inter.S
> +++ b/libavcodec/aarch64/vvc/inter.S
> @@ -21,6 +21,8 @@
> #include "libavutil/aarch64/asm.S"
>
> #define VVC_MAX_PB_SIZE 128
> +#define BDOF_BLOCK_SIZE 16
> +#define BDOF_MIN_BLOCK_SIZE 4
>
> .macro vvc_avg type, bit_depth
>
> @@ -211,6 +213,13 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> 32:
> ret
> endfunc
> +
> +.unreq dst
> +.unreq dst_stride
> +.unreq src0
> +.unreq src1
> +.unreq width
> +.unreq height
> .endm
>
> vvc_avg avg, 8
> @@ -219,3 +228,345 @@ vvc_avg avg, 12
> vvc_avg w_avg, 8
> vvc_avg w_avg, 10
> vvc_avg w_avg, 12
> +
> +function ff_vvc_prof_grad_filter_8x_neon, export=1
> + gh .req x0
> + gv .req x1
> + gstride .req x2
> + src .req x3
> + src_stride .req x4
> + width .req w5
> + height .req w6
> +
> + lsl src_stride, src_stride, #1
> + neg x7, src_stride
> +1:
> + mov x10, src
> + mov w11, width
> + mov x12, gh
> + mov x13, gv
> +2:
> + ldur q0, [x10, #2]
> + ldur q1, [x10, #-2]
> + subs w11, w11, #8
> + ldr q2, [x10, src_stride]
> + ldr q3, [x10, x7]
> + sshr v0.8h, v0.8h, #6
> + sshr v1.8h, v1.8h, #6
> + sshr v2.8h, v2.8h, #6
> + sshr v3.8h, v3.8h, #6
> + sub v0.8h, v0.8h, v1.8h
> + sub v2.8h, v2.8h, v3.8h
> + st1 {v0.8h}, [x12], #16
> + st1 {v2.8h}, [x13], #16
> + add x10, x10, #16
> + b.ne 2b
> +
> + subs height, height, #1
> + add gh, gh, gstride, lsl #1
> + add gv, gv, gstride, lsl #1
> + add src, src, src_stride
> + b.ne 1b
> + ret
> +
> +.unreq gh
> +.unreq gv
> +.unreq gstride
> +.unreq src
> +.unreq src_stride
> +.unreq width
> +.unreq height
> +
> +endfunc
> +
> +.macro vvc_apply_bdof_min_block bit_depth
> + dst .req x0
> + dst_stride .req x1
> + src0 .req x2
> + src1 .req x3
> + gh .req x4
> + gv .req x5
> + vx .req w6
> + vy .req w7
> +
> + dup v0.4h, vx
> + dup v1.4h, vy
> + movi v7.4s, #(1 << (14 - \bit_depth))
> + ldp x8, x9, [gh]
> + ldp x10, x11, [gv]
> + mov x12, #(BDOF_BLOCK_SIZE * 2)
> + mov w13, #(BDOF_MIN_BLOCK_SIZE)
> + mov x14, #(VVC_MAX_PB_SIZE * 2)
> +.if \bit_depth >= 10
> + // clip pixel
> + mov w15, #((1 << \bit_depth) - 1)
> + movi v18.8h, #0
> + lsl dst_stride, dst_stride, #1
> + dup v17.8h, w15
> +.endif
> +1:
> + ld1 {v2.4h}, [x8], x12
> + ld1 {v3.4h}, [x9], x12
> + ld1 {v4.4h}, [x10], x12
> + ld1 {v5.4h}, [x11], x12
> + sub v2.4h, v2.4h, v3.4h
> + sub v4.4h, v4.4h, v5.4h
> + smull v2.4s, v0.4h, v2.4h
> + smlal v2.4s, v1.4h, v4.4h
> +
> + ld1 {v5.4h}, [src0], x14
> + ld1 {v6.4h}, [src1], x14
> + saddl v5.4s, v5.4h, v6.4h
> + add v5.4s, v5.4s, v7.4s
> + add v5.4s, v5.4s, v2.4s
> + sqshrn v5.4h, v5.4s, #(15 - \bit_depth)
> + subs w13, w13, #1
> +.if \bit_depth == 8
> + sqxtun v5.8b, v5.8h
> + str s5, [dst]
> + add dst, dst, dst_stride
> +.else
> + smin v5.4h, v5.4h, v17.4h
> + smax v5.4h, v5.4h, v18.4h
> + st1 {v5.4h}, [dst], dst_stride
> +.endif
> + b.ne 1b
> + ret
> +
> +.unreq dst
> +.unreq dst_stride
> +.unreq src0
> +.unreq src1
> +.unreq gh
> +.unreq gv
> +.unreq vx
> +.unreq vy
> +.endm
> +
> +function ff_vvc_apply_bdof_min_block_8_neon, export=1
> + vvc_apply_bdof_min_block 8
> +endfunc
> +
> +function ff_vvc_apply_bdof_min_block_10_neon, export=1
> + vvc_apply_bdof_min_block 10
> +endfunc
> +
> +function ff_vvc_apply_bdof_min_block_12_neon, export=1
> + vvc_apply_bdof_min_block 12
> +endfunc
> +
> +.macro derive_bdof_vx_vy_x_begin_end
> + ldrh w19, [x14, x16, lsl #1] // load from src0
> + ldrh w20, [x15, x16, lsl #1] // load from src1
> + sxth w19, w19
> + sxth w20, w20
> + asr w19, w19, #4
> + asr w20, w20, #4
> + sub w19, w19, w20 // diff
> + add x17, x16, x13, lsl #4 // idx
> + ldrh w3, [gh0, x17, lsl #1] // load from gh0
> + ldrh w4, [gh1, x17, lsl #1] // load from gh1
> + sxth w3, w3
> + sxth w4, w4
> + ldrh w22, [gv0, x17, lsl #1] // load from gv0
> + ldrh w23, [gv1, x17, lsl #1] // load from gv1
> + add w3, w3, w4
> + asr w21, w3, #1 // temph
> + sxth w3, w22
> + sxth w4, w23
> + add w3, w3, w4
> + cmp w21, #0
> + asr w22, w3, #1 // tempv
> + cneg w20, w21, mi
> + csetm w23, ne
> + csinc w23, w23, wzr, ge // -VVC_SIGN(temph)
> + cmp w22, #0
> + add sgx2, sgx2, w20
> + cneg w20, w22, mi
> + cset w24, ne
> + csinv w24, w24, wzr, ge // VVC_SIGN(tempv)
> + add sgy2, sgy2, w20
> + madd sgxgy, w24, w21, sgxgy
> + madd sgxdi, w23, w19, sgxdi
> + csetm w24, ne
> + csinc w24, w24, wzr, ge // -VVC_SIGN(tempv)
> + madd sgydi, w24, w19, sgydi
> +.endm
> +
> +function ff_vvc_derive_bdof_vx_vy_neon, export=1
> + src0 .req x0
> + src1 .req x1
> + pad_mask .req w2
> + gh .req x3
> + gv .req x4
> + gh0 .req x27
> + gh1 .req x28
> + gv0 .req x25
> + gv1 .req x26
> + vx .req x5
> + vy .req x6
> + sgx2 .req w7
> + sgy2 .req w8
> + sgxgy .req w9
> + sgxdi .req w10
> + sgydi .req w11
> + y .req x12
> +
> + stp x27, x28, [sp, #-80]!
> + stp x25, x26, [sp, #16]
> + stp x23, x24, [sp, #32]
> + stp x21, x22, [sp, #48]
> + stp x19, x20, [sp, #64]
> +
> + ldp gh0, gh1, [gh]
> + mov sgx2, #0
> + mov sgy2, #0
> + mov sgxgy, #0
> + mov sgxdi, #0
> + mov sgydi, #0
> + ldp gv0, gv1, [gv]
> +
> + mov y, #-1
> + mov x13, #-1 // dy
> + tst pad_mask, #2
> + b.eq 1f
> + mov x13, #0 // dy: pad top
> +1:
> + add x14, src0, x13, lsl #8 // local src0
> + add x15, src1, x13, lsl #8 // local src1
> +
> + // x = -1
> + mov x16, #-1 // dx
> + tst pad_mask, #1
> + b.eq 2f
> + mov x16, #0
> +2:
> + derive_bdof_vx_vy_x_begin_end
> +
> + // x = 0 to BDOF_MIN_BLOCK_SIZE - 1
> + ldr d0, [x14]
> + ldr d1, [x15]
> + lsl x19, x13, #5
> + ldr d2, [gh0, x19]
> + ldr d3, [gh1, x19]
> + sshr v0.4h, v0.4h, #4
> + sshr v1.4h, v1.4h, #4
> + ssubl v0.4s, v0.4h, v1.4h // diff
> + ldr d4, [gv0, x19]
> + ldr d5, [gv1, x19]
> + saddl v2.4s, v2.4h, v3.4h
> + saddl v4.4s, v4.4h, v5.4h
> + sshr v2.4s, v2.4s, #1 // temph
> + sshr v4.4s, v4.4s, #1 // tempv
> + abs v3.4s, v2.4s
> + abs v5.4s, v4.4s
> + addv s3, v3.4s
> + addv s5, v5.4s
> + mov w19, v3.s[0]
> + mov w20, v5.s[0]
> + add sgx2, sgx2, w19
> + add sgy2, sgy2, w20
> +
> + movi v5.4s, #1
> + cmgt v17.4s, v4.4s, #0 // mask > 0
> + cmlt v18.4s, v4.4s, #0 // mask < 0
> + and v17.16b, v17.16b, v5.16b
> + and v18.16b, v18.16b, v5.16b
> + neg v19.4s, v18.4s
> + add v20.4s, v17.4s, v19.4s // VVC_SIGN(tempv)
> + smull v21.2d, v20.2s, v2.2s
> + smlal2 v21.2d, v20.4s, v2.4s
> + addp d21, v21.2d
> + mov w19, v21.s[0]
> + add sgxgy, sgxgy, w19
> +
> + smull v16.2d, v20.2s, v0.2s
> + smlal2 v16.2d, v20.4s, v0.4s
> + addp d16, v16.2d
> + mov w19, v16.s[0]
> + sub sgydi, sgydi, w19
> +
> + cmgt v17.4s, v2.4s, #0
> + cmlt v18.4s, v2.4s, #0
> + and v17.16b, v17.16b, v5.16b
> + and v18.16b, v18.16b, v5.16b
> + neg v21.4s, v17.4s
> + add v16.4s, v21.4s, v18.4s // -VVC_SIGN(temph)
> + smull v20.2d, v16.2s, v0.2s
> + smlal2 v20.2d, v16.4s, v0.4s
> + addp d20, v20.2d
> + mov w19, v20.s[0]
> + add sgxdi, sgxdi, w19
> +
> + // x = BDOF_MIN_BLOCK_SIZE
> + mov x16, #BDOF_MIN_BLOCK_SIZE // dx
> + tst pad_mask, #4
> + b.eq 3f
> + mov x16, #(BDOF_MIN_BLOCK_SIZE - 1)
> +3:
> + derive_bdof_vx_vy_x_begin_end
> +
> + add y, y, #1
> + cmp y, #(BDOF_MIN_BLOCK_SIZE)
> + mov x13, y
> + b.gt 4f
> + b.lt 1b
> + tst pad_mask, #8
> + b.eq 1b
> + sub x13, x13, #1 // pad bottom
> + b 1b
> +4:
> + mov w3, #31
> + mov w14, #0
> + mov w16, #-15
> + mov w17, #15
> + cbz sgx2, 5f
> + clz w12, sgx2
> + lsl sgxdi, sgxdi, #2
> + sub w13, w3, w12 // log2(sgx2)
> + asr sgxdi, sgxdi, w13
> + cmp sgxdi, w16
> + csel w14, w16, sgxdi, lt // clip to -15
> + b.le 5f
> + cmp sgxdi, w17
> + csel w14, w17, sgxdi, gt // clip to 15
> +5:
> + str w14, [vx]
> +
> + mov w15, #0
> + cbz sgy2, 6f
> + lsl sgydi, sgydi, #2
> + smull x14, w14, sgxgy
> + asr w14, w14, #1
> + sub sgydi, sgydi, w14
> + clz w12, sgy2
> + sub w13, w3, w12 // log2(sgy2)
> + asr sgydi, sgydi, w13
> + cmp sgydi, w16
> + csel w15, w16, sgydi, lt // clip to -15
> + b.le 6f
> + cmp sgydi, w17
> + csel w15, w17, sgydi, gt // clip to 15
> +6:
> + str w15, [vy]
> + ldp x25, x26, [sp, #16]
> + ldp x23, x24, [sp, #32]
> + ldp x21, x22, [sp, #48]
> + ldp x19, x20, [sp, #64]
> + ldp x27, x28, [sp], #80
> + ret
> +.unreq src0
> +.unreq src1
> +.unreq pad_mask
> +.unreq gh
> +.unreq gv
> +.unreq vx
> +.unreq vy
> +.unreq sgx2
> +.unreq sgy2
> +.unreq sgxgy
> +.unreq sgxdi
> +.unreq sgydi
> +.unreq y
> +endfunc
> +
> diff --git a/libavcodec/aarch64/vvc/of_template.c b/libavcodec/aarch64/vvc/of_template.c
> new file mode 100644
> index 0000000000..508ea6d99d
> --- /dev/null
> +++ b/libavcodec/aarch64/vvc/of_template.c
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (c) 2024 Zhao Zhili <quinkblack at foxmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavcodec/bit_depth_template.c"
> +
> +void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h,
> + int16_t *gradient_v,
> + const ptrdiff_t gradient_stride,
> + const int16_t *_src,
> + const ptrdiff_t src_stride,
> + const int width, const int height);
> +
> +void ff_vvc_derive_bdof_vx_vy_neon(
> + const int16_t *_src0, const int16_t *_src1, int pad_mask,
> + const int16_t **gradient_h, const int16_t **gradient_v,
> + int *vx, int *vy);
> +
> +void FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(pixel* dst,
> + const ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
> + const int16_t **gh, const int16_t **gv, const int vx, const int vy);
> +
> +static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride,
> + const int16_t *_src0, const int16_t *_src1,
> + const int block_w, const int block_h)
> +{
> + int16_t gradient_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
> + int16_t gradient_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
> + int vx, vy;
> + const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel);
> + pixel* dst = (pixel*)_dst;
> +
> + ff_vvc_prof_grad_filter_8x_neon(gradient_h[0], gradient_v[0], BDOF_BLOCK_SIZE,
> + _src0, MAX_PB_SIZE, block_w, block_h);
> + ff_vvc_prof_grad_filter_8x_neon(gradient_h[1], gradient_v[1], BDOF_BLOCK_SIZE,
> + _src1, MAX_PB_SIZE, block_w, block_h);
> +
> + for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
> + for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE) {
> + const int16_t* src0 = _src0 + y * MAX_PB_SIZE + x;
> + const int16_t* src1 = _src1 + y * MAX_PB_SIZE + x;
> + pixel *d = dst + x;
> + const int idx = BDOF_BLOCK_SIZE * y + x;
> + const int16_t* gh[] = { gradient_h[0] + idx, gradient_h[1] + idx };
> + const int16_t* gv[] = { gradient_v[0] + idx, gradient_v[1] + idx };
> + const int pad_mask = !x | ((!y) << 1) |
> + ((x + BDOF_MIN_BLOCK_SIZE == block_w) << 2) |
> + ((y + BDOF_MIN_BLOCK_SIZE == block_h) << 3);
> + ff_vvc_derive_bdof_vx_vy_neon(src0, src1, pad_mask, gh, gv, &vx, &vy);
> + FUNC2(ff_vvc_apply_bdof_min_block, BIT_DEPTH, _neon)(d, dst_stride, src0, src1, gh, gv, vx, vy);
> + }
> + dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
> + }
> +}
> --
> 2.42.0
>
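
For anyone reviewing the assembly, here is a scalar C model of the two
kernels, reconstructed from the NEON code above. It is an illustrative
sketch only, not the FFmpeg C reference implementation: the helper names
are made up, the saturating narrows of the NEON path are simplified to
plain clips, and floor_log2 uses a GCC/Clang builtin.

#include <stdint.h>
#include <stdlib.h>
#include <stddef.h>

#define BDOF_BLOCK_SIZE     16
#define BDOF_MIN_BLOCK_SIZE 4
#define VVC_MAX_PB_SIZE     128

static int clip(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }
static int sign(int v) { return (v > 0) - (v < 0); }
static int floor_log2(unsigned v) { return 31 - __builtin_clz(v); }

/* Model of ff_vvc_derive_bdof_vx_vy_neon: derive (vx, vy) for one
 * 4x4 block from a 6x6 window (the block plus a one-sample border,
 * clamped at CU edges). pad_mask bits: 1 = left, 2 = top, 4 = right,
 * 8 = bottom. */
static void derive_vx_vy_sketch(const int16_t *src0, const int16_t *src1,
                                int pad_mask,
                                const int16_t *gh[2], const int16_t *gv[2],
                                int *vx, int *vy)
{
    int sgx2 = 0, sgy2 = 0, sgxgy = 0, sgxdi = 0, sgydi = 0;

    for (int y = -1; y <= BDOF_MIN_BLOCK_SIZE; y++) {
        int dy = y;
        if (y == -1 && (pad_mask & 2)) dy = 0;                      /* pad top */
        if (y == BDOF_MIN_BLOCK_SIZE && (pad_mask & 8)) dy = y - 1; /* pad bottom */
        for (int x = -1; x <= BDOF_MIN_BLOCK_SIZE; x++) {
            int dx = x;
            if (x == -1 && (pad_mask & 1)) dx = 0;                      /* pad left */
            if (x == BDOF_MIN_BLOCK_SIZE && (pad_mask & 4)) dx = x - 1; /* pad right */

            const int diff  = (src0[dy * VVC_MAX_PB_SIZE + dx] >> 4) -
                              (src1[dy * VVC_MAX_PB_SIZE + dx] >> 4);
            const int idx   = dy * BDOF_BLOCK_SIZE + dx;
            const int temph = (gh[0][idx] + gh[1][idx]) >> 1;
            const int tempv = (gv[0][idx] + gv[1][idx]) >> 1;

            sgx2  += abs(temph);                /* autocorrelations */
            sgy2  += abs(tempv);
            sgxgy += sign(tempv) * temph;       /* cross terms */
            sgxdi -= sign(temph) * diff;
            sgydi -= sign(tempv) * diff;
        }
    }

    *vx = sgx2 ? clip((sgxdi * 4) >> floor_log2(sgx2), -15, 15) : 0;
    *vy = sgy2 ? clip(((sgydi * 4) - ((*vx * sgxgy) >> 1)) >> floor_log2(sgy2),
                      -15, 15) : 0;
}

/* Model of ff_vvc_apply_bdof_min_block_*_neon: add the optical-flow
 * correction to one 4x4 block and store clipped pixels. */
static void apply_min_block_sketch(uint16_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *src0, const int16_t *src1,
                                   const int16_t *gh[2], const int16_t *gv[2],
                                   int vx, int vy, int bit_depth)
{
    const int shift  = 15 - bit_depth;
    const int offset = 1 << (14 - bit_depth);

    for (int y = 0; y < BDOF_MIN_BLOCK_SIZE; y++) {
        for (int x = 0; x < BDOF_MIN_BLOCK_SIZE; x++) {
            const int i   = y * BDOF_BLOCK_SIZE + x;
            const int ofs = vx * (gh[0][i] - gh[1][i]) +
                            vy * (gv[0][i] - gv[1][i]);
            const int val = (src0[y * VVC_MAX_PB_SIZE + x] +
                             src1[y * VVC_MAX_PB_SIZE + x] +
                             offset + ofs) >> shift;
            dst[y * dst_stride + x] = clip(val, 0, (1 << bit_depth) - 1);
        }
    }
}

The 6x6 window with edge clamping and the floor-log2 normalization follow
the BDOF derivation in the VVC spec; the NEON code vectorizes the middle
four columns of each row and handles the two border columns with the
scalar derive_bdof_vx_vy_x_begin_end macro.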