[FFmpeg-devel] [PATCH] codec/aarch64/hevc:add transform_luma_neon checkasm

Wed Apr 12 15:56:29 EEST 2023

Please don't send emails again.

On Wed, Apr 12, 2023, 5:54 PM Martin Storsjö <martin at martin.st> wrote:

> On Sun, 9 Apr 2023, xufuji456 wrote:
>
> > got 56% speed up (run_count=1000, CPU=Cortex A53)
> > transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
> > ---
> > libavcodec/aarch64/hevcdsp_idct_neon.S    | 51 ++++++++++++++++++++++-
> > libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> > tests/checkasm/hevc_idct.c                | 28 +++++++++++++
> > 3 files changed, 80 insertions(+), 1 deletion(-)
>
> When sending updates to your patches, please mention somewhere (outside of
> the main git commit message) what has changed. When I got back to looking
> at your patches now, I see a handful of revisions of the same patch with
> no explanation of what has changed.
>
> >
> > diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S
> b/libavcodec/aarch64/hevcdsp_idct_neon.S
> > index 74a96957bf..fc683a6396 100644
> > --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> > +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> > @@ -6,6 +6,7 @@
> >  * Ported from arm/hevcdsp_idct_neon.S by
> >  * Copyright (c) 2020 Reimar Döffinger
> >  * Copyright (c) 2023 J. Dekker <jdek at itanimul.li>
> > + * Copyright (c) 2023 xu fulong <839789740 at qq.com>
> >  *
> >  * This file is part of FFmpeg.
> >  *
> > @@ -656,4 +657,52 @@ idct_dc 16, 8
> > idct_dc 16, 10
> >
> > idct_dc 32, 8
> > -idct_dc 32, 10
> > \ No newline at end of file
>
> The patch still does not apply cleanly on git master due to this. Please
> rebase your patch on top of the actual public git master.
>
> > +idct_dc 32, 10
> > +
> > +.macro tr4_luma_shift r0, r1, r2, r3, shift
> > +        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2
> > +        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3
> > +        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3
> > +        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1
> > +
> > +        saddl       v7.4s, \r0, \r3         // src0 + src3
> > +        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
> > +        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2
> + src3)
> > +
> > +        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
> > +        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
> > +        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
> > +        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1
> + c3
> > +
> > +        mul         v1.4s, v1.4s, v19.4s    // 29 * c1
> > +        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
> > +        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
> > +        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1
> + c3
> > +
> > +        mul         v0.4s, v0.4s, v20.4s    // 55 * c0
> > +        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
> > +        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
> > +        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2
> - c3
> > +
> > +        sqrshrn     \r0, v5.4s, \shift
> > +        sqrshrn     \r1, v6.4s, \shift
> > +        sqrshrn     \r2, v7.4s, \shift
> > +        sqrshrn     \r3, v0.4s, \shift
> > +.endm
> > +
> > +function ff_hevc_transform_luma_4x4_neon_8, export=1
> > +        ld1            {v28.4h-v31.4h}, [x0]
> > +        movi           v18.4s, #74
> > +        movi           v19.4s, #29
> > +        movi           v20.4s, #55
> > +        movi           v21.4h, #74
> > +
> > +        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
> > +        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
> > +
> > +        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
> > +        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
> > +
> > +        st1            {v28.4h-v31.4h}, [x0]
> > +        ret
> > +endfunc
> > diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > index a923bae35c..6605a39973 100644
> > --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > @@ -75,6 +75,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
> > +void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
> > void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t
> *_src,
> >                                   ptrdiff_t stride_dst, ptrdiff_t
> stride_src,
> >                                   const int16_t *sao_offset_val, int
> sao_left_class,
> > @@ -142,6 +143,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext
> *c, const int bit_depth)
> >         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
> >         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
> >         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
> > +        c->transform_4x4_luma          =
> ff_hevc_transform_luma_4x4_neon_8;
> >         c->sao_band_filter[0]          =
> >         c->sao_band_filter[1]          =
> >         c->sao_band_filter[2]          =
> > diff --git a/tests/checkasm/hevc_idct.c b/tests/checkasm/hevc_idct.c
> > index 338b8a23e4..1c2b08d0f8 100644
> > --- a/tests/checkasm/hevc_idct.c
> > +++ b/tests/checkasm/hevc_idct.c
> > @@ -84,6 +84,27 @@ static void check_idct_dc(HEVCDSPContext h, int
> bit_depth)
> >     }
> > }
> >
> > +static void check_transform_luma(HEVCDSPContext h)
>
> Unrelated to your patch, I see that this test file already does this
> elsewhere, but it is uncommon to pass such a context by value here;
> normally we'd pass it as a pointer to the subfunctions. But that's a
> preexisting issue, so stick with the current convention and we can change
> them all in a separate patch.
>
> > +{
> > +    LOCAL_ALIGNED(32, int16_t, coeffs0, [32 * 32]);
> > +    LOCAL_ALIGNED(32, int16_t, coeffs1, [32 * 32]);
> > +
> > +    int block_size = 4;
> > +    int size = block_size * block_size;
> > +    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, int16_t *coeffs);
> > +
> > +    randomize_buffers(coeffs0, size);
> > +    memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
> > +
> > +    if (check_func(h.transform_4x4_luma, "hevc_transform_4x4_luma")) {
> > +        call_ref(coeffs0);
> > +        call_new(coeffs1);
> > +        if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size))
> > +            fail();
> > +        bench_new(coeffs1);
> > +    }
> > +}
> > +
> > void checkasm_check_hevc_idct(void)
> > {
> >     int bit_depth;
> > @@ -103,4 +124,11 @@ void checkasm_check_hevc_idct(void)
> >         check_idct(h, bit_depth);
> >     }
> >     report("idct");
> > +
> > +    bit_depth = 8;
> > +    HEVCDSPContext h;
>
> This causes warnings:
>
> src/tests/checkasm/hevc_idct.c: In function ‘checkasm_check_hevc_idct’:
> src/tests/checkasm/hevc_idct.c:129:5: warning: ISO C90 forbids mixed
> declaration
> s and code [-Wdeclaration-after-statement]
>       HEVCDSPContext h;
>       ^~~~~~~~~~~~~~
>
> And why not test all bitdepths like the other tests? Even if you might not
> be adding assembly for other bitdepths right now, it's best to make the
> test cover them all from the start.
>
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>