[FFmpeg-devel] [PATCH 5/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_hv
Martin Storsjö
martin at martin.st
Mon Jun 12 11:19:29 EEST 2023
On Sun, 4 Jun 2023, Logan.Lyu at myais.com.cn wrote:
> From: Logan Lyu <Logan.Lyu at myais.com.cn>
>
> Signed-off-by: Logan Lyu <Logan.Lyu at myais.com.cn>
> ---
> libavcodec/aarch64/hevcdsp_epel_neon.S | 703 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 7 +
> 2 files changed, 710 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index 32f052a7b1..24a74d2c7d 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -718,6 +718,709 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
> ret
> endfunc
>
> +.macro epel_uni_w_hv_start
> + mov x15, x5 //denom
> + mov x16, x6 //wx
> + mov x17, x7 //ox
> + add w15, w15, #6 //shift = denom+6
> +
> +
> + ldp x5, x6, [sp]
> + ldp x7, xzr, [sp, #16]
Why ldp into xzr? That seems pointless; a plain "ldr x7, [sp, #16]" loads the same value.
> +
> + sub sp, sp, #128
> + stp q12, q13, [sp]
This could be "stp q12, q13, [sp, #-128]!", folding the preceding "sub sp, sp, #128" into the store.
> + stp q14, q15, [sp, #32]
> + stp q8, q9, [sp, #64]
> + stp q10, q11, [sp, #96]
> +
> + dup v13.8h, w16 //wx
> + dup v14.4s, w17 //ox
> +
> + mov w17, #1
> + lsl w17, w17, w15
> + lsr w17, w17, #1
> + dup v15.4s, w17
> +
> + neg w15, w15 // -shift
> + dup v12.4s, w15 //shift
> +.endm
> +
> +.macro epel_uni_w_hv_end
> + smull v28.4s, v4.4h, v13.4h
> + smull2 v29.4s, v4.8h, v13.8h
> + add v28.4s, v28.4s, v15.4s
> + add v29.4s, v29.4s, v15.4s
> + sshl v28.4s, v28.4s, v12.4s
> + sshl v29.4s, v29.4s, v12.4s
> + add v28.4s, v28.4s, v14.4s
> + add v29.4s, v29.4s, v14.4s
> + sqxtn v4.4h, v28.4s
> + sqxtn2 v4.8h, v29.4s
> +.endm
> +
> +.macro epel_uni_w_hv_end2
> + smull v28.4s, v4.4h, v13.4h
> + smull2 v29.4s, v4.8h, v13.8h
> + smull v30.4s, v5.4h, v13.4h
> + smull2 v31.4s, v5.8h, v13.8h
> + add v28.4s, v28.4s, v15.4s
> + add v29.4s, v29.4s, v15.4s
> + add v30.4s, v30.4s, v15.4s
> + add v31.4s, v31.4s, v15.4s
> +
> + sshl v28.4s, v28.4s, v12.4s
> + sshl v29.4s, v29.4s, v12.4s
> + sshl v30.4s, v30.4s, v12.4s
> + sshl v31.4s, v31.4s, v12.4s
> +
> + add v28.4s, v28.4s, v14.4s
> + add v29.4s, v29.4s, v14.4s
> + add v30.4s, v30.4s, v14.4s
> + add v31.4s, v31.4s, v14.4s
> +
> + sqxtn v4.4h, v28.4s
> + sqxtn2 v4.8h, v29.4s
> + sqxtn v5.4h, v30.4s
> + sqxtn2 v5.8h, v31.4s
> +.endm
> +
> +.macro epel_uni_w_hv_end3
> + smull v1.4s, v4.4h, v13.4h
> + smull2 v2.4s, v4.8h, v13.8h
> + smull v28.4s, v5.4h, v13.4h
> + smull2 v29.4s, v5.8h, v13.8h
> + smull v30.4s, v6.4h, v13.4h
> + smull2 v31.4s, v6.8h, v13.8h
> + add v1.4s, v1.4s, v15.4s
> + add v2.4s, v2.4s, v15.4s
> + add v28.4s, v28.4s, v15.4s
> + add v29.4s, v29.4s, v15.4s
> + add v30.4s, v30.4s, v15.4s
> + add v31.4s, v31.4s, v15.4s
> +
> + sshl v1.4s, v1.4s, v12.4s
> + sshl v2.4s, v2.4s, v12.4s
> + sshl v28.4s, v28.4s, v12.4s
> + sshl v29.4s, v29.4s, v12.4s
> + sshl v30.4s, v30.4s, v12.4s
> + sshl v31.4s, v31.4s, v12.4s
> + add v1.4s, v1.4s, v14.4s
> + add v2.4s, v2.4s, v14.4s
> + add v28.4s, v28.4s, v14.4s
> + add v29.4s, v29.4s, v14.4s
> + add v30.4s, v30.4s, v14.4s
> + add v31.4s, v31.4s, v14.4s
> +
> + sqxtn v4.4h, v1.4s
> + sqxtn2 v4.8h, v2.4s
> + sqxtn v5.4h, v28.4s
> + sqxtn2 v5.8h, v29.4s
> + sqxtn v6.4h, v30.4s
> + sqxtn2 v6.8h, v31.4s
> +.endm
> +
> +.macro calc_epelh dst, src0, src1, src2, src3
> + smull \dst\().4s, \src0\().4h, v0.h[0]
> + smlal \dst\().4s, \src1\().4h, v0.h[1]
> + smlal \dst\().4s, \src2\().4h, v0.h[2]
> + smlal \dst\().4s, \src3\().4h, v0.h[3]
> + sqshrn \dst\().4h, \dst\().4s, #6
> +.endm
> +
> +.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
> + smull2 \tmp\().4s, \src0\().8h, v0.h[0]
> + smlal2 \tmp\().4s, \src1\().8h, v0.h[1]
> + smlal2 \tmp\().4s, \src2\().8h, v0.h[2]
> + smlal2 \tmp\().4s, \src3\().8h, v0.h[3]
> + sqshrn2 \dst\().8h, \tmp\().4s, #6
> +.endm
> +
> +.macro load_epel_filterh freg, xreg
> + movrel \xreg, epel_filters
> + add \xreg, \xreg, \freg, lsl #2
> + ld1 {v0.8b}, [\xreg]
> + sxtl v0.8h, v0.8b
> +.endm
> +
> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
> + epel_uni_w_hv_start
> + and x4, x4, 0xffffffff
What does this "and" do here? Is it a case where the argument is "int",
while the upper bits of the register are undefined? In those cases, you're
best off just using "w4", possibly "w4, uxtw" (or sxtw), instead of
manually doing such an "and" here.
> +
> + add x10, x4, #3
> + lsl x10, x10, #7
> + sub sp, sp, x10 // tmp_array
> + stp x0, x1, [sp, #-16]!
> + stp x4, x6, [sp, #-16]!
> + stp xzr, x30, [sp, #-16]!
Don't do consecutive decrements like this; instead, do one
"stp ..., [sp, #-48]!" followed by "stp ..., [sp, #16]" etc.
> + add x0, sp, #48
> + sub x1, x2, x3
> + mov x2, x3
> + add x3, x4, #3
> + mov x4, x5
> + bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
> + ldp xzr, x30, [sp], #16
> + ldp x4, x6, [sp], #16
> + ldp x0, x1, [sp], #16
> + load_epel_filterh x6, x5
> + mov x10, #(MAX_PB_SIZE * 2)
> + ld1 {v16.4h}, [sp], x10
> + ld1 {v17.4h}, [sp], x10
> + ld1 {v18.4h}, [sp], x10
> +1: ld1 {v19.4h}, [sp], x10
> + calc_epelh v4, v16, v17, v18, v19
> + epel_uni_w_hv_end
> + sqxtun v4.8b, v4.8h
> + str s4, [x0]
> + add x0, x0, x1
> + subs x4, x4, #1
> + b.eq 2f
> +
> + ld1 {v16.4h}, [sp], x10
> + calc_epelh v4, v17, v18, v19, v16
> + epel_uni_w_hv_end
> + sqxtun v4.8b, v4.8h
> + str s4, [x0]
> + add x0, x0, x1
> + subs x4, x4, #1
> + b.eq 2f
> +
> + ld1 {v17.4h}, [sp], x10
> + calc_epelh v4, v18, v19, v16, v17
> + epel_uni_w_hv_end
> + sqxtun v4.8b, v4.8h
> + str s4, [x0]
> + add x0, x0, x1
> + subs x4, x4, #1
> + b.eq 2f
> +
> + ld1 {v18.4h}, [sp], x10
> + calc_epelh v4, v19, v16, v17, v18
> + epel_uni_w_hv_end
> + sqxtun v4.8b, v4.8h
> + str s4, [x0]
> + add x0, x0, x1
> + subs x4, x4, #1
> + b.ne 1b
> +2:
> + ldp q12, q13, [sp]
> + ldp q14, q15, [sp, #32]
> + ldp q8, q9, [sp, #64]
> + ldp q10, q11, [sp, #96]
> + add sp, sp, #128
Fold the stack increment into ldp, like "ldp q12, q13, [sp], #128".
The same thing applies to all other functions in this patch too.
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 348497bbbe..fbbc4e6071 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
> int height, int denom, int wx, int ox,
> intptr_t mx, intptr_t my, int width), _i8mm);
>
> +NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride,
> + int height, int denom, int wx, int ox,
> + intptr_t mx, intptr_t my, int width), _i8mm);
> +
> NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
> const uint8_t *_src, ptrdiff_t _srcstride,
> int height, int denom, int wx, int ox,
> @@ -286,11 +291,13 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
> NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
>
> +
> if (have_i8mm(cpu_flags)) {
Stray whitespace change.
// Martin
More information about the ffmpeg-devel mailing list