[FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: h264, add chroma loop filters for 10bit
Martin Storsjö
martin at martin.st
Tue Aug 17 13:52:12 EEST 2021
On Mon, 16 Aug 2021, Mikhail Nitenko wrote:
> Benchmarks: A53 A72
> h264_h_loop_filter_chroma422_10bpp_c: 277.5 114.2
> h264_h_loop_filter_chroma422_10bpp_neon: 109.7 81.7
> h264_h_loop_filter_chroma_10bpp_c: 165.0 75.5
> h264_h_loop_filter_chroma_10bpp_neon: 121.2 74.7
> h264_h_loop_filter_chroma_intra422_10bpp_c: 324.2 124.2
> h264_h_loop_filter_chroma_intra422_10bpp_neon: 155.2 99.5
> h264_h_loop_filter_chroma_intra_10bpp_c: 121.0 48.5
> h264_h_loop_filter_chroma_intra_10bpp_neon: 79.5 52.7
> h264_h_loop_filter_chroma_mbaff422_10bpp_c: 191.0 73.5
> h264_h_loop_filter_chroma_mbaff422_10bpp_neon: 121.2 75.5
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c: 117.0 51.5
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon: 79.5 53.7
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_c: 63.0 28.5
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon: 48.7 33.2
> h264_v_loop_filter_chroma_10bpp_c: 260.2 135.5
> h264_v_loop_filter_chroma_10bpp_neon: 72.2 49.2
> h264_v_loop_filter_chroma_intra_10bpp_c: 158.0 70.7
> h264_v_loop_filter_chroma_intra_10bpp_neon: 48.7 32.0
>
> Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
> ---
>
> removed leftover code, moved from 32bit and started loading with two
> alternating registers, code became quite a bit faster!
>
> libavcodec/aarch64/h264dsp_init_aarch64.c | 37 ++++
> libavcodec/aarch64/h264dsp_neon.S | 255 ++++++++++++++++++++++
> 2 files changed, 292 insertions(+)
>
> diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
> index 997082498f..80b7ed5ce1 100644
> --- a/libavcodec/aarch64/h264dsp_neon.S
> +++ b/libavcodec/aarch64/h264dsp_neon.S
> @@ -819,3 +819,258 @@ endfunc
> weight_func 16
> weight_func 8
> weight_func 4
> +
> +.macro h264_loop_filter_start_10
> + cmp w2, #0
> + ldr w6, [x4]
> + ccmp w3, #0, #0, ne
> + lsl w2, w2, #2
> + mov v24.S[0], w6
> + lsl w3, w3, #2
> + and w8, w6, w6, lsl #16
Nitpick: Align the third operand column on ccmp/lsl above with how it's
done here for the 'and'. (Yes the existing code here seems to have the
same misalignment.)
> + b.eq 1f
> + ands w8, w8, w8, lsl #8
> + b.ge 2f
> +1:
> + ret
> +2:
> +.endm
> +
> +.macro h264_loop_filter_start_intra_10
> + orr w4, w2, w3
> + cbnz w4, 1f
> + ret
> +1:
> + lsl w2, w2, #2
> + lsl w3, w3, #2
> + dup v30.8h, w2 // alpha
> + dup v31.8h, w3 // beta
> +.endm
> +
> +.macro h264_loop_filter_chroma_10
> + dup v22.8h, w2 // alpha
> + dup v23.8h, w3 // beta
> + uxtl v24.8h, v24.8b // tc0
> +
> + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0)
> + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0)
> + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0)
> + cmhi v26.8h, v22.8h, v26.8h // < alpha
> + cmhi v28.8h, v23.8h, v28.8h // < beta
> + cmhi v30.8h, v23.8h, v30.8h // < beta
> +
> + and v26.16b, v26.16b, v28.16b
> + mov v4.16b, v0.16b
> + sub v4.8h, v4.8h, v16.8h
> + and v26.16b, v26.16b, v30.16b
> + shl v4.8h, v4.8h, #2
> + mov x8, v26.d[0]
> + mov x9, v26.d[1]
> + sli v24.8H, v24.8H, #8
> + uxtl v24.8H, v24.8B
> + add v4.8h, v4.8h, v18.8h
> + shl v24.8h, v24.8h, #2
> +
> + adds x8, x8, x9
I think it would be better for in-order cores to do this 'adds' maybe a
couple instructions earlier (but the 'mov' from SIMD to GPR probably takes
a couple cycles, so not too far earlier), maybe one instruction earlier?
> + b.eq 9f
> +
> + movi v31.8h, #3 // (tc0 - 1) << (BIT_DEPTH - 8)) + 1
I guess this 'movi' could be done before the 'b.eq' too? If we branch out,
we'd have run it in vain, but it's probably essentially free in that case
anyway, and avoids having the next 'uqsub' stalling, waiting for it.
> + uqsub v24.8h, v24.8h, v31.8h
> + sub v4.8h , v4.8h, v2.8h
> + srshr v4.8h, v4.8h, #3
> + smin v4.8h, v4.8h, v24.8h
> + neg v25.8h, v24.8h
> + smax v4.8h, v4.8h, v25.8h
> + and v4.16B, v4.16B, v26.16B
> + add v16.8h, v16.8h, v4.8h
> + sub v0.8h, v0.8h, v4.8h
Nit: The vertical alignment is quite wobbly here, please try to make nice
vertical lines if possible.
> +
> + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping
> + movi v5.8h, #0
> + smin v0.8h, v0.8h, v4.8h
> + smax v16.8h, v16.8h, v5.8h
> + smax v0.8h, v0.8h, v5.8h
> + smin v16.8h, v16.8h, v4.8h
I think it'd be (marginally) better pipelining, and more consistent, if
you'd do first two smin v4, then two smax v5.
> +.endm
> +
> +function ff_h264_v_loop_filter_chroma_neon_10, export=1
> + h264_loop_filter_start_10
> +
> + mov x10, x0
> + sub x0, x0, x1, lsl #1
> + ld1 {v18.8h}, [x0 ], x1
> + ld1 {v0.8h}, [x10], x1
> + ld1 {v16.8h}, [x0 ], x1
> + ld1 {v2.8h}, [x10]
> +
> + h264_loop_filter_chroma_10
> +
> + sub x0, x10, x1, lsl #1
> + st1 {v16.8h}, [x0], x1
> + st1 {v0.8h}, [x0], x1
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_neon_10, export=1
> + h264_loop_filter_start_10
> +
> + sub x0, x0, #4 // access the 2nd left pixel
> +h_loop_filter_chroma420_10:
> + add x10, x0, x1, lsl #2
> + ld1 {v18.d}[0], [x0 ], x1
> + ld1 {v18.d}[1], [x10], x1
> + ld1 {v16.d}[0], [x0 ], x1
> + ld1 {v16.d}[1], [x10], x1
> + ld1 {v0.d}[0], [x0 ], x1
> + ld1 {v0.d}[1], [x10], x1
> + ld1 {v2.d}[0], [x0 ], x1
> + ld1 {v2.d}[1], [x10], x1
> +
> + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
> +
> + h264_loop_filter_chroma_10
> +
> + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31
> +
> + sub x0, x10, x1, lsl #3
> + st1 {v18.d}[0], [x0], x1
> + st1 {v16.d}[0], [x0], x1
> + st1 {v0.d}[0], [x0], x1
> + st1 {v2.d}[0], [x0], x1
> + st1 {v18.d}[1], [x0], x1
> + st1 {v16.d}[1], [x0], x1
> + st1 {v0.d}[1], [x0], x1
> + st1 {v2.d}[1], [x0], x1
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma422_neon_10, export=1
> + h264_loop_filter_start_10
> + add x5, x0, x1
> + sub x0, x0, #4
> + add x1, x1, x1
> + mov x7, x30
> + bl h_loop_filter_chroma420_10
> + mov x30, x7
> + sub x0, x5, #4
> + mov v24.s[0], w6
> + b h_loop_filter_chroma420_10
> +endfunc
> +
> +.macro h264_loop_filter_chroma_intra_10
> + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0)
> + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0)
> + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0)
> + cmhi v26.8h, v30.8h, v26.8h // < alpha
> + cmhi v27.8h, v31.8h, v27.8h // < beta
> + cmhi v28.8h, v31.8h, v28.8h // < beta
> + and v26.16b, v26.16b, v27.16b
> + and v26.16b, v26.16b, v28.16b
> + mov x2, v26.d[0]
> + mov x3, v26.d[1]
> +
> + shl v4.8h, v18.8h, #1
> + shl v6.8h, v19.8h, #1
> +
> + adds x2, x2, x3
> + b.eq 9f
> +
> + add v20.8h, v16.8h, v19.8h
> + add v22.8h, v17.8h, v18.8h
> + add v20.8h, v20.8h, v4.8h
> + add v22.8h, v22.8h, v6.8h
> + urshr v24.8h, v20.8h, #2
> + urshr v25.8h, v22.8h, #2
> + bit v16.16b, v24.16b, v26.16b
> + bit v17.16b, v25.16b, v26.16b
> +.endm
> +
> +function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
> + h264_loop_filter_start_intra_10
> + mov x9, x0
> + sub x0, x0, x1, lsl #1
> + ld1 {v18.8h}, [x0], x1
> + ld1 {v17.8h}, [x9], x1
> + ld1 {v16.8h}, [x0], x1
> + ld1 {v19.8h}, [x9]
> +
> + h264_loop_filter_chroma_intra_10
> +
> + sub x0, x9, x1, lsl #1
> + st1 {v16.8h}, [x0], x1
> + st1 {v17.8h}, [x0], x1
> +
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
> + h264_loop_filter_start_intra_10
> +
> + sub x4, x0, #4
> + sub x0, x0, #2
> + add x9, x4, x1, lsl #1
> + ld1 {v18.8h}, [x4], x1
> + ld1 {v17.8h}, [x9], x1
> + ld1 {v16.8h}, [x4], x1
> + ld1 {v19.8h}, [x9], x1
> +
> + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
> +
> + h264_loop_filter_chroma_intra_10
> +
> + st2 {v16.h,v17.h}[0], [x0], x1
> + st2 {v16.h,v17.h}[1], [x0], x1
> + st2 {v16.h,v17.h}[2], [x0], x1
> + st2 {v16.h,v17.h}[3], [x0], x1
> +
> +9:
> + ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
> + h264_loop_filter_start_intra_10
> + sub x4, x0, #4
> + sub x0, x0, #2
> +h_loop_filter_chroma420_intra_10:
> + add x9, x4, x1, lsl #2
> + ld1 {v18.4h}, [x4], x1
> + ld1 {v18.d}[1], [x9], x1
> + ld1 {v16.4h}, [x4], x1
> + ld1 {v16.d}[1], [x9], x1
> + ld1 {v17.4h}, [x4], x1
> + ld1 {v17.d}[1], [x9], x1
> + ld1 {v19.4h}, [x4], x1
> + ld1 {v19.d}[1], [x9], x1
Try to align things here to make this bit at least a bit more readable.
This looks pretty good now overall, just a bit minor cosmetic cleanup and
maybe some minor instruction scheduling left.
If you have access to run and benchmark things on an in-order core (like
Cortex A53), that'd be good to do while trying to do the instruction
scheduling tweaks to avoid stalls.
// Martin
More information about the ffmpeg-devel
mailing list