[FFmpeg-devel] [PATCH v2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12}
Martin Storsjö
martin at martin.st
Fri Mar 7 15:56:21 EET 2025
On Mon, 3 Mar 2025, Krzysztof Pyrkosz via ffmpeg-devel wrote:
> This patch replaces integer widening with halving addition, and replaces
> the multi-step "emulated" rounding shift with a single asm instruction
> that performs the rounding shift directly.
>
> Benchmarks before and after (for each CPU, the first group is before the patch, the second is after):
> A78
> avg_8_64x64_neon: 2686.2 ( 6.12x)
> avg_8_128x128_neon: 10734.2 ( 5.88x)
> avg_10_64x64_neon: 2536.8 ( 5.40x)
> avg_10_128x128_neon: 10079.0 ( 5.22x)
> avg_12_64x64_neon: 2548.2 ( 5.38x)
> avg_12_128x128_neon: 10133.8 ( 5.19x)
>
> avg_8_64x64_neon: 897.8 (18.26x)
> avg_8_128x128_neon: 3608.5 (17.37x)
> avg_10_32x32_neon: 444.2 ( 8.51x)
> avg_10_64x64_neon: 1711.8 ( 8.00x)
> avg_12_64x64_neon: 1706.2 ( 8.02x)
> avg_12_128x128_neon: 7010.0 ( 7.46x)
>
> A72
> avg_8_64x64_neon: 5823.4 ( 3.88x)
> avg_8_128x128_neon: 17430.5 ( 4.73x)
> avg_10_64x64_neon: 5228.1 ( 3.71x)
> avg_10_128x128_neon: 16722.2 ( 4.17x)
> avg_12_64x64_neon: 5379.1 ( 3.51x)
> avg_12_128x128_neon: 16715.7 ( 4.17x)
>
> avg_8_64x64_neon: 2006.5 (10.61x)
> avg_8_128x128_neon: 9158.7 ( 8.96x)
> avg_10_64x64_neon: 3357.7 ( 5.60x)
> avg_10_128x128_neon: 12411.7 ( 5.56x)
> avg_12_64x64_neon: 3317.5 ( 5.67x)
> avg_12_128x128_neon: 12358.5 ( 5.58x)
>
> A53
> avg_8_64x64_neon: 8327.8 ( 5.18x)
> avg_8_128x128_neon: 31631.3 ( 5.34x)
> avg_10_64x64_neon: 8783.5 ( 4.98x)
> avg_10_128x128_neon: 32617.0 ( 5.25x)
> avg_12_64x64_neon: 8686.0 ( 5.06x)
> avg_12_128x128_neon: 32487.5 ( 5.25x)
>
> avg_8_64x64_neon: 6032.3 ( 7.17x)
> avg_8_128x128_neon: 22008.5 ( 7.69x)
> avg_10_64x64_neon: 7738.0 ( 5.68x)
> avg_10_128x128_neon: 27813.8 ( 6.14x)
> avg_12_64x64_neon: 7844.5 ( 5.60x)
> avg_12_128x128_neon: 26999.5 ( 6.34x)
> ---
> libavcodec/aarch64/vvc/inter.S | 177 ++++++++++++++++++++++++---------
> 1 file changed, 130 insertions(+), 47 deletions(-)
>
> diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
> index 0edc861f97..b2f44697d3 100644
> --- a/libavcodec/aarch64/vvc/inter.S
> +++ b/libavcodec/aarch64/vvc/inter.S
> @@ -24,9 +24,9 @@
> #define BDOF_BLOCK_SIZE 16
> #define BDOF_MIN_BLOCK_SIZE 4
>
> -.macro vvc_avg type, bit_depth
> +.macro vvc_avg bit_depth
>
> -.macro vvc_\type\()_\bit_depth\()_2_4 tap
> +.macro vvc_w_avg_\bit_depth\()_2_4 tap
> .if \tap == 2
> ldr s0, [src0]
> ldr s2, [src1]
> @@ -34,18 +34,11 @@
> ldr d0, [src0]
> ldr d2, [src1]
> .endif
> -
> -.ifc \type, avg
> - saddl v4.4s, v0.4h, v2.4h
> - add v4.4s, v4.4s, v16.4s
> - sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
> -.else
> mov v4.16b, v16.16b
> smlal v4.4s, v0.4h, v19.4h
> smlal v4.4s, v2.4h, v20.4h
> sqshl v4.4s, v4.4s, v22.4s
> sqxtun v4.4h, v4.4s
> -.endif
>
> .if \bit_depth == 8
> sqxtun v4.8b, v4.8h
> @@ -68,7 +61,7 @@
> add dst, dst, dst_stride
> .endm
>
> -function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> +function ff_vvc_w_avg_\bit_depth\()_neon, export=1
> dst .req x0
> dst_stride .req x1
> src0 .req x2
> @@ -78,9 +71,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
>
> mov x10, #(VVC_MAX_PB_SIZE * 2)
> cmp width, #8
> -.ifc \type, avg
> - movi v16.4s, #(1 << (14 - \bit_depth))
> -.else
> lsr x11, x6, #32 // weight0
> mov w12, w6 // weight1
> lsr x13, x7, #32 // offset
> @@ -91,9 +81,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> dup v20.8h, w12
> dup v16.4s, w13
> dup v22.4s, w14
> -.endif // avg
>
> - .if \bit_depth >= 10
> +.if \bit_depth >= 10
> // clip pixel
> mov w6, #((1 << \bit_depth) - 1)
> dup v17.8h, w6
> @@ -105,25 +94,17 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> b.eq 4f
> 2: // width == 2
> subs height, height, #1
> - vvc_\type\()_\bit_depth\()_2_4 2
> + vvc_w_avg_\bit_depth\()_2_4 2
> b.ne 2b
> b 32f
> 4: // width == 4
> subs height, height, #1
> - vvc_\type\()_\bit_depth\()_2_4 4
> + vvc_w_avg_\bit_depth\()_2_4 4
> b.ne 4b
> b 32f
> 8: // width == 8
> ld1 {v0.8h}, [src0], x10
> ld1 {v2.8h}, [src1], x10
> -.ifc \type, avg
> - saddl v4.4s, v0.4h, v2.4h
> - saddl2 v5.4s, v0.8h, v2.8h
> - add v4.4s, v4.4s, v16.4s
> - add v5.4s, v5.4s, v16.4s
> - sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
> - sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
> -.else
> mov v4.16b, v16.16b
> mov v5.16b, v16.16b
> smlal v4.4s, v0.4h, v19.4h
> @@ -134,7 +115,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> sqshl v5.4s, v5.4s, v22.4s
> sqxtun v4.4h, v4.4s
> sqxtun2 v4.8h, v5.4s
> -.endif
> subs height, height, #1
> .if \bit_depth == 8
> sqxtun v4.8b, v4.8h
> @@ -153,20 +133,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> 17:
> ldp q0, q1, [x7], #32
> ldp q2, q3, [x8], #32
> -.ifc \type, avg
> - saddl v4.4s, v0.4h, v2.4h
> - saddl2 v5.4s, v0.8h, v2.8h
> - saddl v6.4s, v1.4h, v3.4h
> - saddl2 v7.4s, v1.8h, v3.8h
> - add v4.4s, v4.4s, v16.4s
> - add v5.4s, v5.4s, v16.4s
> - add v6.4s, v6.4s, v16.4s
> - add v7.4s, v7.4s, v16.4s
> - sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
> - sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
> - sqshrun v6.4h, v6.4s, #(15 - \bit_depth)
> - sqshrun2 v6.8h, v7.4s, #(15 - \bit_depth)
> -.else // avg
> mov v4.16b, v16.16b
> mov v5.16b, v16.16b
> mov v6.16b, v16.16b
> @@ -187,7 +153,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> sqxtun v6.4h, v6.4s
> sqxtun2 v4.8h, v5.4s
> sqxtun2 v6.8h, v7.4s
> -.endif // w_avg
> subs w6, w6, #16
> .if \bit_depth == 8
> sqxtun v4.8b, v4.8h
> @@ -217,12 +182,130 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> endfunc
> .endm
>
> -vvc_avg avg, 8
> -vvc_avg avg, 10
> -vvc_avg avg, 12
> -vvc_avg w_avg, 8
> -vvc_avg w_avg, 10
> -vvc_avg w_avg, 12
> +vvc_avg 8
> +vvc_avg 10
> +vvc_avg 12
> +
> +.macro vvc_avg2 bit_depth
Instead of naming this vvc_avg2, and the old one (which only produces the
w_avg function now) vvc_avg, we could rename the old one to vvc_w_avg, and
the new one to plain vvc_avg.
I made that change and have now pushed this patch, thanks!
// Martin
More information about the ffmpeg-devel
mailing list