[FFmpeg-devel] [PATCH] lavc/aarch64: add pred16x16 10-bit functions
Martin Storsjö
martin at martin.st
Thu Apr 8 23:55:48 EEST 2021
On Thu, 8 Apr 2021, Mikhail Nitenko wrote:
> here are the benchmarks https://0x1.st/kX.txt
> ---
> libavcodec/aarch64/h264pred_init.c | 75 +++++++++++-------
> libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
> 2 files changed, 168 insertions(+), 30 deletions(-)
>
> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
> index 213b40b3e7..633b401d59 100644
> --- a/libavcodec/aarch64/h264pred_neon.S
> +++ b/libavcodec/aarch64/h264pred_neon.S
> @@ -359,3 +359,126 @@ function ff_pred8x8_0l0_dc_neon, export=1
> dup v1.8b, v1.b[0]
> b .L_pred8x8_dc_end
> endfunc
> +
> +.macro ldcol.16 rd, rs, rt, n=4, hi=0
> +.if \n >= 4 || \hi == 0
> + ld1 {\rd\().h}[0], [\rs], \rt
> + ld1 {\rd\().h}[1], [\rs], \rt
> +.endif
> +.if \n >= 4 || \hi == 1
> + ld1 {\rd\().h}[2], [\rs], \rt
> + ld1 {\rd\().h}[3], [\rs], \rt
> +.endif
> +.if \n == 8
> + ld1 {\rd\().h}[4], [\rs], \rt
> + ld1 {\rd\().h}[5], [\rs], \rt
> + ld1 {\rd\().h}[6], [\rs], \rt
> + ld1 {\rd\().h}[7], [\rs], \rt
> +.endif
> +.endm
I believe this could be a bit faster by using two alternating registers
that are incremented - but as the existing code doesn't do that, it's not
necessary.
> +
> +function ff_pred16x16_128_dc_neon_10, export=1
> + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
> +
> + b .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_top_dc_neon_10, export=1
> + sub x2, x0, x1
> +
> + ld1 {v0.8h}, [x2], #16
> + ld1 {v1.8h}, [x2]
This can be one single instruction, ld1 {v0.8h, v1.8h}, [x2]
> + uaddlv s0, v0.8h
> + uaddlv s1, v1.8h
When adding up 8 elements that are 10 bit, they still fit in 16 bit (it
only requires 13 bit), so you don't need uaddlv here, addv would be
better. And when adding the two results, it still fits in 16 bit (then
it'd use 14 bits).
> +
> + add v0.2s, v0.2s, v1.2s
> +
> + rshrn v0.4h, v0.4s, #4
> + dup v0.8h, v0.h[0]
> + b .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_left_dc_neon_10, export=1
> + sub x2, x0, #2 // access to the "left" column
> + ldcol.16 v0, x2, x1, 8
> + ldcol.16 v1, x2, x1, 8 // load "left" column
> +
> + uaddlv s0, v0.8h
> + uaddlv s1, v1.8h
Same thing here, addv+addv should be enough
> +
> + add v0.2s, v0.2s, v1.2s
> +
> + rshrn v0.4h, v0.4s, #4
> + dup v0.8h, v0.h[0]
> + b .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_dc_neon_10, export=1
> + sub x2, x0, x1 // access to the "top" row
> + sub x3, x0, #2 // access to the "left" column
> +
> + ld1 {v0.8h}, [x2], #16
> + ld1 {v1.8h}, [x2]
One single ld1 {v0.8h, v1.8h}
> + ldcol.16 v2, x3, x1, 8
> + ldcol.16 v3, x3, x1, 8 // load pixels in "top" col and "left" row
> +
> + uaddlv s0, v0.8h
> + uaddlv s1, v1.8h
> + uaddlv s2, v2.8h // sum all pixels in the "top" row and "left" col
> + uaddlv s3, v3.8h // (sum stays in v0-v3 registers)
addv
> +
> + add v0.2s, v0.2s, v1.2s
> + add v0.2s, v0.2s, v2.2s
> + add v0.2s, v0.2s, v3.2s // sum registers v0-v3
> +
> + rshrn v0.4h, v0.4s, #5 // right shift vector
> + dup v0.8h, v0.h[0] // fill vector with 0th value (dcsplat)
These comments are kinda pointless here
> +.L_pred16x16_dc_10_end:
> + sub x1, x1, #16
> + mov w3, #8
> +6: st1 {v0.8h}, [x0], #16
> + st1 {v0.8h}, [x0], x1
This can be one single "st1 {v0.8h, v1.8h}, [x0], x1" if you make sure
that v1 contains the same
> + st1 {v0.8h}, [x0], #16
> + st1 {v0.8h}, [x0], x1
> +
> + subs w3, w3, #1
> + b.ne 6b
> + ret
> +endfunc
> +
> +function ff_pred16x16_hor_neon_10, export=1
> + sub x2, x0, #2
> + sub x3, x1, #16
> +
> + mov w4, #16
> +1: ld1r {v0.8h}, [x2], x1
> + st1 {v0.8h}, [x0], #16
> + st1 {v0.8h}, [x0], x3
This might be ok here, but also do check if copying the value to v1 and
doing one single "st1 {v0.8h, v1.8h}" is faster.
> +
> + subs w4, w4, #1
> + b.ne 1b
> + ret
> +endfunc
> +
> +function ff_pred16x16_vert_neon_10, export=1
> + sub x2, x0, x1
> + add x1, x1, x1
> + sub x1, x1, #16
> +
> + ld1 {v0.8h}, [x2], #16
> + ld1 {v1.8h}, [x2], x1
One single ld1
> +
> + mov w3, #8
> +1: st1 {v0.8h}, [x0], #16
> + st1 {v1.8h}, [x0], x1
One single st1
// Martin
More information about the ffmpeg-devel
mailing list