[FFmpeg-devel] [PATCH 5/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_hv

Mon Jun 12 11:19:29 EEST 2023

On Sun, 4 Jun 2023, Logan.Lyu at myais.com.cn wrote:

> From: Logan Lyu <Logan.Lyu at myais.com.cn>
>
> Signed-off-by: Logan Lyu <Logan.Lyu at myais.com.cn>
> ---
> libavcodec/aarch64/hevcdsp_epel_neon.S    | 703 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +
> 2 files changed, 710 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index 32f052a7b1..24a74d2c7d 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -718,6 +718,709 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
>         ret
> endfunc
>
> +.macro epel_uni_w_hv_start
> +        mov             x15, x5         //denom
> +        mov             x16, x6         //wx
> +        mov             x17, x7         //ox
> +        add             w15, w15, #6    //shift = denom+6
> +
> +
> +        ldp             x5, x6, [sp]
> +        ldp             x7, xzr, [sp, #16]

Why ldp into xzr, that seems pointless?

> +
> +        sub             sp, sp, #128
> +        stp             q12, q13, [sp]

This could be "stp q12, q13, [sp, #-128]!"

> +        stp             q14, q15, [sp, #32]
> +        stp             q8, q9,   [sp, #64]
> +        stp             q10, q11, [sp, #96]
> +
> +        dup             v13.8h, w16     //wx
> +        dup             v14.4s, w17     //ox
> +
> +        mov             w17, #1
> +        lsl             w17, w17, w15
> +        lsr             w17, w17, #1
> +        dup             v15.4s, w17
> +
> +        neg             w15, w15        // -shift
> +        dup             v12.4s, w15     //shift
> +.endm
> +
> +.macro epel_uni_w_hv_end
> +        smull           v28.4s, v4.4h, v13.4h
> +        smull2          v29.4s, v4.8h, v13.8h
> +        add             v28.4s, v28.4s, v15.4s
> +        add             v29.4s, v29.4s, v15.4s
> +        sshl            v28.4s, v28.4s, v12.4s
> +        sshl            v29.4s, v29.4s, v12.4s
> +        add             v28.4s, v28.4s, v14.4s
> +        add             v29.4s, v29.4s, v14.4s
> +        sqxtn           v4.4h, v28.4s
> +        sqxtn2          v4.8h, v29.4s
> +.endm
> +
> +.macro epel_uni_w_hv_end2
> +        smull           v28.4s, v4.4h, v13.4h
> +        smull2          v29.4s, v4.8h, v13.8h
> +        smull           v30.4s, v5.4h, v13.4h
> +        smull2          v31.4s, v5.8h, v13.8h
> +        add             v28.4s, v28.4s, v15.4s
> +        add             v29.4s, v29.4s, v15.4s
> +        add             v30.4s, v30.4s, v15.4s
> +        add             v31.4s, v31.4s, v15.4s
> +
> +        sshl            v28.4s, v28.4s, v12.4s
> +        sshl            v29.4s, v29.4s, v12.4s
> +        sshl            v30.4s, v30.4s, v12.4s
> +        sshl            v31.4s, v31.4s, v12.4s
> +
> +        add             v28.4s, v28.4s, v14.4s
> +        add             v29.4s, v29.4s, v14.4s
> +        add             v30.4s, v30.4s, v14.4s
> +        add             v31.4s, v31.4s, v14.4s
> +
> +        sqxtn           v4.4h, v28.4s
> +        sqxtn2          v4.8h, v29.4s
> +        sqxtn           v5.4h, v30.4s
> +        sqxtn2          v5.8h, v31.4s
> +.endm
> +
> +.macro epel_uni_w_hv_end3
> +        smull           v1.4s,  v4.4h, v13.4h
> +        smull2          v2.4s,  v4.8h, v13.8h
> +        smull           v28.4s, v5.4h, v13.4h
> +        smull2          v29.4s, v5.8h, v13.8h
> +        smull           v30.4s, v6.4h, v13.4h
> +        smull2          v31.4s, v6.8h, v13.8h
> +        add             v1.4s, v1.4s, v15.4s
> +        add             v2.4s, v2.4s, v15.4s
> +        add             v28.4s, v28.4s, v15.4s
> +        add             v29.4s, v29.4s, v15.4s
> +        add             v30.4s, v30.4s, v15.4s
> +        add             v31.4s, v31.4s, v15.4s
> +
> +        sshl            v1.4s, v1.4s, v12.4s
> +        sshl            v2.4s, v2.4s, v12.4s
> +        sshl            v28.4s, v28.4s, v12.4s
> +        sshl            v29.4s, v29.4s, v12.4s
> +        sshl            v30.4s, v30.4s, v12.4s
> +        sshl            v31.4s, v31.4s, v12.4s
> +        add             v1.4s, v1.4s, v14.4s
> +        add             v2.4s, v2.4s, v14.4s
> +        add             v28.4s, v28.4s, v14.4s
> +        add             v29.4s, v29.4s, v14.4s
> +        add             v30.4s, v30.4s, v14.4s
> +        add             v31.4s, v31.4s, v14.4s
> +
> +        sqxtn           v4.4h, v1.4s
> +        sqxtn2          v4.8h, v2.4s
> +        sqxtn           v5.4h, v28.4s
> +        sqxtn2          v5.8h, v29.4s
> +        sqxtn           v6.4h, v30.4s
> +        sqxtn2          v6.8h, v31.4s
> +.endm
> +
> +.macro calc_epelh dst, src0, src1, src2, src3
> +        smull           \dst\().4s, \src0\().4h, v0.h[0]
> +        smlal           \dst\().4s, \src1\().4h, v0.h[1]
> +        smlal           \dst\().4s, \src2\().4h, v0.h[2]
> +        smlal           \dst\().4s, \src3\().4h, v0.h[3]
> +        sqshrn          \dst\().4h, \dst\().4s, #6
> +.endm
> +
> +.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
> +        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
> +        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
> +        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
> +        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
> +        sqshrn2         \dst\().8h, \tmp\().4s, #6
> +.endm
> +
> +.macro load_epel_filterh freg, xreg
> +        movrel          \xreg, epel_filters
> +        add             \xreg, \xreg, \freg, lsl #2
> +        ld1             {v0.8b}, [\xreg]
> +        sxtl            v0.8h, v0.8b
> +.endm
> +
> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
> +        epel_uni_w_hv_start
> +        and             x4, x4, 0xffffffff

What does this "and" do here? Is it a case where the argument is "int", 
while the upper bits of the register is undefined? In those cases, you're 
best off by just using "w4", possibly "w4, uxtw" (or sxtw) instead of 
manually doing such an "and" here.

> +
> +        add             x10, x4, #3
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10     // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        stp             xzr, x30, [sp, #-16]!

Don't do consecutive decrements like this, but do one "stp ..., [sp, 
#-48]!" followed by "stp ..., [sp, #16]" etc.

> +        add             x0, sp, #48
> +        sub             x1, x2, x3
> +        mov             x2, x3
> +        add             x3, x4, #3
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
> +        ldp             xzr, x30, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +        load_epel_filterh x6, x5
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        ld1             {v16.4h}, [sp], x10
> +        ld1             {v17.4h}, [sp], x10
> +        ld1             {v18.4h}, [sp], x10
> +1:      ld1             {v19.4h}, [sp], x10
> +        calc_epelh      v4, v16, v17, v18, v19
> +        epel_uni_w_hv_end
> +        sqxtun          v4.8b, v4.8h
> +        str             s4, [x0]
> +        add             x0, x0, x1
> +        subs            x4, x4, #1
> +        b.eq            2f
> +
> +        ld1             {v16.4h}, [sp], x10
> +        calc_epelh      v4, v17, v18, v19, v16
> +        epel_uni_w_hv_end
> +        sqxtun          v4.8b, v4.8h
> +        str             s4, [x0]
> +        add             x0, x0, x1
> +        subs            x4, x4, #1
> +        b.eq            2f
> +
> +        ld1             {v17.4h}, [sp], x10
> +        calc_epelh      v4, v18, v19, v16, v17
> +        epel_uni_w_hv_end
> +        sqxtun          v4.8b, v4.8h
> +        str             s4, [x0]
> +        add             x0, x0, x1
> +        subs            x4, x4, #1
> +        b.eq            2f
> +
> +        ld1             {v18.4h}, [sp], x10
> +        calc_epelh      v4, v19, v16, v17, v18
> +        epel_uni_w_hv_end
> +        sqxtun          v4.8b, v4.8h
> +        str             s4, [x0]
> +        add             x0, x0, x1
> +        subs            x4, x4, #1
> +        b.ne            1b
> +2:
> +        ldp             q12, q13, [sp]
> +        ldp             q14, q15, [sp, #32]
> +        ldp             q8, q9,   [sp, #64]
> +        ldp             q10, q11, [sp, #96]
> +        add             sp, sp, #128

Fold the stack increment into ldp, like "ldp q12, q13, [sp], #128".

The same thing applies to all other functions in this patch too.

> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 348497bbbe..fbbc4e6071 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
>         int height, int denom, int wx, int ox,
>         intptr_t mx, intptr_t my, int width), _i8mm);
>
> +NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, int denom, int wx, int ox,
> +        intptr_t mx, intptr_t my, int width), _i8mm);
> +
> NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
>         const uint8_t *_src, ptrdiff_t _srcstride,
>         int height, int denom, int wx, int ox,
> @@ -286,11 +291,13 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
>         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
>
> +
>         if (have_i8mm(cpu_flags)) {

Stray whitespace change.

// Martin