[FFmpeg-devel] [PATCH] codec/aarch64/hevc:add transform_luma_neon checkasm
Fahad Mustafa
fm03454323 at gmail.com
Wed Apr 12 15:56:29 EEST 2023
Please don't send emails again.
On Wed, Apr 12, 2023, 5:54 PM Martin Storsjö <martin at martin.st> wrote:
> On Sun, 9 Apr 2023, xufuji456 wrote:
>
> > got 56% speed up (run_count=1000, CPU=Cortex A53)
> > transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
> > ---
> > libavcodec/aarch64/hevcdsp_idct_neon.S | 51 ++++++++++++++++++++++-
> > libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
> > tests/checkasm/hevc_idct.c | 28 +++++++++++++
> > 3 files changed, 80 insertions(+), 1 deletion(-)
>
> When sending updates to your patches, please mention somewhere (outside of
> the main git commit message) what has changed. When I got back to looking
> at your patches now, I see a handful of revisions of the same patch with
> no explanation of what has changed.
>
> >
> > diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S
> b/libavcodec/aarch64/hevcdsp_idct_neon.S
> > index 74a96957bf..fc683a6396 100644
> > --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> > +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> > @@ -6,6 +6,7 @@
> > * Ported from arm/hevcdsp_idct_neon.S by
> > * Copyright (c) 2020 Reimar Döffinger
> > * Copyright (c) 2023 J. Dekker <jdek at itanimul.li>
> > + * Copyright (c) 2023 xu fulong <839789740 at qq.com>
> > *
> > * This file is part of FFmpeg.
> > *
> > @@ -656,4 +657,52 @@ idct_dc 16, 8
> > idct_dc 16, 10
> >
> > idct_dc 32, 8
> > -idct_dc 32, 10
> > \ No newline at end of file
>
> The patch still does not apply cleanly on git master due to this. Please
> rebase your patch on top of the actual public git master.
>
> > +idct_dc 32, 10
> > +
> > +.macro tr4_luma_shift r0, r1, r2, r3, shift
> > + saddl v0.4s, \r0, \r2 // c0 = src0 + src2
> > + saddl v1.4s, \r2, \r3 // c1 = src2 + src3
> > + ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
> > + smull v3.4s, \r1, v21.4h // c3 = 74 * src1
> > +
> > + saddl v7.4s, \r0, \r3 // src0 + src3
> > + ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
> > + mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2
> + src3)
> > +
> > + mul v5.4s, v0.4s, v19.4s // 29 * c0
> > + mul v6.4s, v1.4s, v20.4s // 55 * c1
> > + add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
> > + add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1
> + c3
> > +
> > + mul v1.4s, v1.4s, v19.4s // 29 * c1
> > + mul v6.4s, v2.4s, v20.4s // 55 * c2
> > + sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
> > + add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1
> + c3
> > +
> > + mul v0.4s, v0.4s, v20.4s // 55 * c0
> > + mul v2.4s, v2.4s, v19.4s // 29 * c2
> > + add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
> > + sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2
> - c3
> > +
> > + sqrshrn \r0, v5.4s, \shift
> > + sqrshrn \r1, v6.4s, \shift
> > + sqrshrn \r2, v7.4s, \shift
> > + sqrshrn \r3, v0.4s, \shift
> > +.endm
> > +
> > +function ff_hevc_transform_luma_4x4_neon_8, export=1
> > + ld1 {v28.4h-v31.4h}, [x0]
> > + movi v18.4s, #74
> > + movi v19.4s, #29
> > + movi v20.4s, #55
> > + movi v21.4h, #74
> > +
> > + tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
> > + transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
> > +
> > + tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
> > + transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
> > +
> > + st1 {v28.4h-v31.4h}, [x0]
> > + ret
> > +endfunc
> > diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > index a923bae35c..6605a39973 100644
> > --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> > @@ -75,6 +75,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
> > void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
> > +void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
> > void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t
> *_src,
> > ptrdiff_t stride_dst, ptrdiff_t
> stride_src,
> > const int16_t *sao_offset_val, int
> sao_left_class,
> > @@ -142,6 +143,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext
> *c, const int bit_depth)
> > c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
> > c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
> > c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
> > + c->transform_4x4_luma =
> ff_hevc_transform_luma_4x4_neon_8;
> > c->sao_band_filter[0] =
> > c->sao_band_filter[1] =
> > c->sao_band_filter[2] =
> > diff --git a/tests/checkasm/hevc_idct.c b/tests/checkasm/hevc_idct.c
> > index 338b8a23e4..1c2b08d0f8 100644
> > --- a/tests/checkasm/hevc_idct.c
> > +++ b/tests/checkasm/hevc_idct.c
> > @@ -84,6 +84,27 @@ static void check_idct_dc(HEVCDSPContext h, int
> bit_depth)
> > }
> > }
> >
> > +static void check_transform_luma(HEVCDSPContext h)
>
> Unrelated to your patch, I see that this test file already does this
> elsewhere, but it is uncommon to pass such a context by value here;
> normally we'd pass it as a pointer to the subfunctions. But that's a
> preexisting issue, so stick with the current convention and we can change
> them all in a separate patch.
>
> > +{
> > + LOCAL_ALIGNED(32, int16_t, coeffs0, [32 * 32]);
> > + LOCAL_ALIGNED(32, int16_t, coeffs1, [32 * 32]);
> > +
> > + int block_size = 4;
> > + int size = block_size * block_size;
> > + declare_func_emms(AV_CPU_FLAG_MMXEXT, void, int16_t *coeffs);
> > +
> > + randomize_buffers(coeffs0, size);
> > + memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
> > +
> > + if (check_func(h.transform_4x4_luma, "hevc_transform_4x4_luma")) {
> > + call_ref(coeffs0);
> > + call_new(coeffs1);
> > + if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size))
> > + fail();
> > + bench_new(coeffs1);
> > + }
> > +}
> > +
> > void checkasm_check_hevc_idct(void)
> > {
> > int bit_depth;
> > @@ -103,4 +124,11 @@ void checkasm_check_hevc_idct(void)
> > check_idct(h, bit_depth);
> > }
> > report("idct");
> > +
> > + bit_depth = 8;
> > + HEVCDSPContext h;
>
> This causes warnings:
>
> src/tests/checkasm/hevc_idct.c: In function ‘checkasm_check_hevc_idct’:
> src/tests/checkasm/hevc_idct.c:129:5: warning: ISO C90 forbids mixed
> declaration
> s and code [-Wdeclaration-after-statement]
> HEVCDSPContext h;
> ^~~~~~~~~~~~~~
>
> And why not test all bitdepths like the other tests? Even if you might not
> be adding assembly for other bitdepths right now, it's best to make the
> test cover them all from the start.
>
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>
More information about the ffmpeg-devel
mailing list