[FFmpeg-devel] [PATCH] lavc/aarch64/fdct: add neon-optimized fdct for aarch64

Wed Mar 6 18:26:00 EET 2024

ping

On Sun, Feb 4, 2024 at 3:42 PM Ramiro Polla <ramiro.polla at gmail.com> wrote:
>
> The code is imported from libjpeg-turbo-3.0.1. The neon registers used
> have been changed to avoid modifying v8-v15.
> ---
>  libavcodec/aarch64/Makefile               |   2 +
>  libavcodec/aarch64/fdct.h                 |  26 ++
>  libavcodec/aarch64/fdctdsp_init_aarch64.c |  39 +++
>  libavcodec/aarch64/fdctdsp_neon.S         | 369 ++++++++++++++++++++++
>  libavcodec/avcodec.h                      |   1 +
>  libavcodec/fdctdsp.c                      |   4 +-
>  libavcodec/fdctdsp.h                      |   2 +
>  libavcodec/options_table.h                |   1 +
>  libavcodec/tests/aarch64/dct.c            |   2 +
>  9 files changed, 445 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/aarch64/fdct.h
>  create mode 100644 libavcodec/aarch64/fdctdsp_init_aarch64.c
>  create mode 100644 libavcodec/aarch64/fdctdsp_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index beb6a02f5f..eebccbe4a5 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -1,4 +1,5 @@
>  # subsystems
> +OBJS-$(CONFIG_FDCTDSP)                  += aarch64/fdctdsp_init_aarch64.o
>  OBJS-$(CONFIG_FMTCONVERT)               += aarch64/fmtconvert_init.o
>  OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
>  OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
> @@ -35,6 +36,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o
>
>  # subsystems
>  NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/sbrdsp_neon.o
> +NEON-OBJS-$(CONFIG_FDCTDSP)             += aarch64/fdctdsp_neon.o
>  NEON-OBJS-$(CONFIG_FMTCONVERT)          += aarch64/fmtconvert_neon.o
>  NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
>  NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o              \
> diff --git a/libavcodec/aarch64/fdct.h b/libavcodec/aarch64/fdct.h
> new file mode 100644
> index 0000000000..0901b53a83
> --- /dev/null
> +++ b/libavcodec/aarch64/fdct.h
> @@ -0,0 +1,26 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_AARCH64_FDCT_H
> +#define AVCODEC_AARCH64_FDCT_H
> +
> +#include <stdint.h>
> +
> +void ff_fdct_neon(int16_t *block);
> +
> +#endif /* AVCODEC_AARCH64_FDCT_H */
> diff --git a/libavcodec/aarch64/fdctdsp_init_aarch64.c b/libavcodec/aarch64/fdctdsp_init_aarch64.c
> new file mode 100644
> index 0000000000..59d91bc8fc
> --- /dev/null
> +++ b/libavcodec/aarch64/fdctdsp_init_aarch64.c
> @@ -0,0 +1,39 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/aarch64/cpu.h"
> +#include "libavcodec/avcodec.h"
> +#include "libavcodec/fdctdsp.h"
> +#include "fdct.h"
> +
> +av_cold void ff_fdctdsp_init_aarch64(FDCTDSPContext *c, AVCodecContext *avctx,
> +                                     unsigned high_bit_depth)
> +{
> +    const int flags = av_get_cpu_flags();
> +
> +    /* Nothing to install without Neon or for high-bit-depth content. */
> +    if (!have_neon(flags) || high_bit_depth)
> +        return;
> +
> +    /* Use the Neon FDCT when the choice is automatic or Neon is requested. */
> +    if (avctx->dct_algo == FF_DCT_AUTO ||
> +        avctx->dct_algo == FF_DCT_NEON)
> +        c->fdct = ff_fdct_neon;
> +}
> diff --git a/libavcodec/aarch64/fdctdsp_neon.S b/libavcodec/aarch64/fdctdsp_neon.S
> new file mode 100644
> index 0000000000..978c8d3002
> --- /dev/null
> +++ b/libavcodec/aarch64/fdctdsp_neon.S
> @@ -0,0 +1,369 @@
> +/*
> + * Armv8 Neon optimizations for libjpeg-turbo
> + *
> + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
> + *                          All Rights Reserved.
> + * Author:  Siarhei Siamashka <siarhei.siamashka at nokia.com>
> + * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
> + * Author:  Ragesh Radhakrishnan <ragesh.r at linaro.org>
> + * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
> + * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
> + * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
> + *
> + * This software is provided 'as-is', without any express or implied
> + * warranty.  In no event will the authors be held liable for any damages
> + * arising from the use of this software.
> + *
> + * Permission is granted to anyone to use this software for any purpose,
> + * including commercial applications, and to alter it and redistribute it
> + * freely, subject to the following restrictions:
> + *
> + * 1. The origin of this software must not be misrepresented; you must not
> + *    claim that you wrote the original software. If you use this software
> + *    in a product, an acknowledgment in the product documentation would be
> + *    appreciated but is not required.
> + * 2. Altered source versions must be plainly marked as such, and must not be
> + *    misrepresented as being the original software.
> + * 3. This notice may not be removed or altered from any source distribution.
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +#include "neon.S"
> +
> +// #define EIGHT_BIT_SAMPLES
> +
> +/* Constants for jsimd_fdct_islow_neon() */
> +
> +#define F_0_298   2446  /* FIX(0.298631336) */
> +#define F_0_390   3196  /* FIX(0.390180644) */
> +#define F_0_541   4433  /* FIX(0.541196100) */
> +#define F_0_765   6270  /* FIX(0.765366865) */
> +#define F_0_899   7373  /* FIX(0.899976223) */
> +#define F_1_175   9633  /* FIX(1.175875602) */
> +#define F_1_501  12299  /* FIX(1.501321110) */
> +#define F_1_847  15137  /* FIX(1.847759065) */
> +#define F_1_961  16069  /* FIX(1.961570560) */
> +#define F_2_053  16819  /* FIX(2.053119869) */
> +#define F_2_562  20995  /* FIX(2.562915447) */
> +#define F_3_072  25172  /* FIX(3.072711026) */
> +
> +const jsimd_fdct_islow_neon_consts, align=4
> +  .short F_0_298    /* v0.h[0] once loaded by ff_fdct_neon */
> +  .short -F_0_390   /* v0.h[1] */
> +  .short F_0_541    /* v0.h[2] */
> +  .short F_0_765    /* v0.h[3] */
> +  .short - F_0_899  /* v0.h[4] */
> +  .short F_1_175    /* v0.h[5] */
> +  .short F_1_501    /* v0.h[6] */
> +  .short - F_1_847  /* v0.h[7] */
> +  .short - F_1_961  /* v1.h[0] */
> +  .short F_2_053    /* v1.h[1] */
> +  .short - F_2_562  /* v1.h[2] */
> +  .short F_3_072    /* v1.h[3] */
> +  .short 0          /* padding: fills the table to 16 halfwords (two .8h vectors) */
> +  .short 0
> +  .short 0
> +  .short 0
> +endconst
> +
> +#undef F_0_298
> +#undef F_0_390
> +#undef F_0_541
> +#undef F_0_765
> +#undef F_0_899
> +#undef F_1_175
> +#undef F_1_501
> +#undef F_1_847
> +#undef F_1_961
> +#undef F_2_053
> +#undef F_2_562
> +#undef F_3_072
> +
> +/*****************************************************************************/
> +
> +/*
> + * ff_fdct_neon (jsimd_fdct_islow_neon in libjpeg-turbo)
> + *
> + * This file contains a slower but more accurate integer implementation of the
> + * forward DCT (Discrete Cosine Transform). The following code is based
> + * directly on the original IJG jfdctint.c; see jfdctint.c for
> + * more details.
> + */
> +
> +#define CONST_BITS  13
> +#ifdef EIGHT_BIT_SAMPLES
> +#define PASS1_BITS  2
> +#else
> +#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
> +#endif
> +
> +#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
> +#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
> +
> +#define XFIX_P_0_298  v0.h[0]
> +#define XFIX_N_0_390  v0.h[1]
> +#define XFIX_P_0_541  v0.h[2]
> +#define XFIX_P_0_765  v0.h[3]
> +#define XFIX_N_0_899  v0.h[4]
> +#define XFIX_P_1_175  v0.h[5]
> +#define XFIX_P_1_501  v0.h[6]
> +#define XFIX_N_1_847  v0.h[7]
> +#define XFIX_N_1_961  v1.h[0]
> +#define XFIX_P_2_053  v1.h[1]
> +#define XFIX_N_2_562  v1.h[2]
> +#define XFIX_P_3_072  v1.h[3]
> +
> +function ff_fdct_neon, export=1
> +    /* In-place 8x8 forward DCT: x0 points to 64 int16_t coefficients. */
> +    DATA            .req x0
> +    TMP             .req x9
> +
> +    /* Load constants */
> +    movrel          TMP, jsimd_fdct_islow_neon_consts
> +    ld1             {v0.8h, v1.8h}, [TMP]
> +
> +    /* Load all DATA into Neon registers, one 8-coefficient row per vector:
> +     *       0 1 2 3 4 5 6 7
> +     *      -----------------
> +     *   0 |                 | v16.8h
> +     *   1 |                 | v17.8h
> +     *   2 |                 | v18.8h
> +     *   3 |                 | v19.8h
> +     *   4 |                 | v20.8h
> +     *   5 |                 | v21.8h
> +     *   6 |                 | v22.8h
> +     *   7 |                 | v23.8h
> +     */
> +
> +    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
> +    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
> +    sub             DATA, DATA, #64         /* rewind DATA to the block start for the final store */
> +
> +    /* Transpose: v16..v23 should now hold dataptr[0..7], one block row per lane */
> +    transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
> +    /* Pass 1: 1-D FDCT, outputs scaled up by 2^PASS1_BITS */
> +    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
> +    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
> +    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
> +    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
> +    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
> +    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
> +    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
> +    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
> +
> +    /* even part */
> +
> +    add             v4.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
> +    sub             v5.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
> +    add             v6.8h, v25.8h, v26.8h   /* tmp11 = tmp1 + tmp2; */
> +    sub             v7.8h, v25.8h, v26.8h   /* tmp12 = tmp1 - tmp2; */
> +
> +    add             v16.8h, v4.8h, v6.8h   /* tmp10 + tmp11 */
> +    sub             v20.8h, v4.8h, v6.8h   /* tmp10 - tmp11 */
> +
> +    add             v18.8h, v7.8h, v5.8h   /* tmp12 + tmp13 */
> +
> +    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
> +    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
> +
> +    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
> +    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
> +    mov             v22.16b, v18.16b              /* second copy of z1 lo, accumulated into dataptr[6] */
> +    mov             v25.16b, v24.16b              /* second copy of z1 hi, accumulated into dataptr[6] */
> +
> +    smlal           v18.4s, v5.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
> +    smlal2          v24.4s, v5.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
> +    smlal           v22.4s, v7.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
> +    smlal2          v25.4s, v7.8h, XFIX_N_1_847   /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
> +
> +    rshrn           v18.4h, v18.4s, #DESCALE_P1
> +    rshrn           v22.4h, v22.4s, #DESCALE_P1
> +    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
> +    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
> +
> +    /* Odd part */
> +
> +    add             v2.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
> +    add             v3.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
> +    add             v6.8h, v28.8h, v30.8h        /* z3 = tmp4 + tmp6; */
> +    add             v7.8h, v29.8h, v31.8h        /* z4 = tmp5 + tmp7; */
> +    smull           v4.4s, v6.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
> +    smull2          v5.4s, v6.8h, XFIX_P_1_175
> +    smlal           v4.4s, v7.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
> +    smlal2          v5.4s, v7.8h, XFIX_P_1_175
> +
> +    smull2          v24.4s, v28.8h, XFIX_P_0_298
> +    smull2          v25.4s, v29.8h, XFIX_P_2_053
> +    smull2          v26.4s, v30.8h, XFIX_P_3_072
> +    smull2          v27.4s, v31.8h, XFIX_P_1_501
> +    smull           v23.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
> +    smull           v21.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
> +    smull           v19.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
> +    smull           v17.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
> +
> +    smull2          v28.4s, v2.8h, XFIX_N_0_899
> +    smull2          v29.4s, v3.8h, XFIX_N_2_562
> +    smull2          v30.4s, v6.8h, XFIX_N_1_961
> +    smull2          v31.4s, v7.8h, XFIX_N_0_390
> +    smull           v2.4s, v2.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
> +    smull           v3.4s, v3.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
> +    smull           v6.4s, v6.4h, XFIX_N_1_961    /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
> +    smull           v7.4s, v7.4h, XFIX_N_0_390    /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
> +
> +    add             v6.4s, v6.4s, v4.4s    /* z3 += z5 */
> +    add             v30.4s, v30.4s, v5.4s
> +    add             v7.4s, v7.4s, v4.4s    /* z4 += z5 */
> +    add             v31.4s, v31.4s, v5.4s
> +
> +    add             v23.4s, v23.4s, v2.4s   /* tmp4 += z1 */
> +    add             v24.4s, v24.4s, v28.4s
> +    add             v21.4s, v21.4s, v3.4s   /* tmp5 += z2 */
> +    add             v25.4s, v25.4s, v29.4s
> +    add             v19.4s, v19.4s, v6.4s   /* tmp6 += z3 */
> +    add             v26.4s, v26.4s, v30.4s
> +    add             v17.4s, v17.4s, v7.4s   /* tmp7 += z4 */
> +    add             v27.4s, v27.4s, v31.4s
> +
> +    add             v23.4s, v23.4s, v6.4s   /* tmp4 += z3 */
> +    add             v24.4s, v24.4s, v30.4s
> +    add             v21.4s, v21.4s, v7.4s   /* tmp5 += z4 */
> +    add             v25.4s, v25.4s, v31.4s
> +    add             v19.4s, v19.4s, v3.4s   /* tmp6 += z2 */
> +    add             v26.4s, v26.4s, v29.4s
> +    add             v17.4s, v17.4s, v2.4s   /* tmp7 += z1 */
> +    add             v27.4s, v27.4s, v28.4s
> +
> +    rshrn           v23.4h, v23.4s, #DESCALE_P1
> +    rshrn           v21.4h, v21.4s, #DESCALE_P1
> +    rshrn           v19.4h, v19.4s, #DESCALE_P1
> +    rshrn           v17.4h, v17.4s, #DESCALE_P1
> +    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
> +    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
> +    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
> +    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
> +
> +    /* Transpose the pass 1 results back for the second pass */
> +    transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
> +
> +    /* Pass 2: 1-D FDCT, removes the 2^PASS1_BITS scaling from pass 1 */
> +    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
> +    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
> +    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
> +    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
> +    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
> +    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
> +    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
> +    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
> +
> +    /* even part */
> +    add             v4.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
> +    sub             v5.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
> +    add             v6.8h, v25.8h, v26.8h   /* tmp11 = tmp1 + tmp2; */
> +    sub             v7.8h, v25.8h, v26.8h   /* tmp12 = tmp1 - tmp2; */
> +
> +    add             v16.8h, v4.8h, v6.8h   /* tmp10 + tmp11 */
> +    sub             v20.8h, v4.8h, v6.8h   /* tmp10 - tmp11 */
> +
> +    add             v18.8h, v7.8h, v5.8h   /* tmp12 + tmp13 */
> +
> +    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
> +    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
> +
> +    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
> +    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
> +    mov             v22.16b, v18.16b              /* second copy of z1 lo, accumulated into dataptr[6] */
> +    mov             v25.16b, v24.16b              /* second copy of z1 hi, accumulated into dataptr[6] */
> +
> +    smlal           v18.4s, v5.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
> +    smlal2          v24.4s, v5.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
> +    smlal           v22.4s, v7.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
> +    smlal2          v25.4s, v7.8h, XFIX_N_1_847   /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
> +
> +    rshrn           v18.4h, v18.4s, #DESCALE_P2
> +    rshrn           v22.4h, v22.4s, #DESCALE_P2
> +    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
> +    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
> +
> +    /* Odd part */
> +    add             v2.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
> +    add             v3.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
> +    add             v6.8h, v28.8h, v30.8h   /* z3 = tmp4 + tmp6; */
> +    add             v7.8h, v29.8h, v31.8h   /* z4 = tmp5 + tmp7; */
> +
> +    smull           v4.4s, v6.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
> +    smull2          v5.4s, v6.8h, XFIX_P_1_175
> +    smlal           v4.4s, v7.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
> +    smlal2          v5.4s, v7.8h, XFIX_P_1_175
> +
> +    smull2          v24.4s, v28.8h, XFIX_P_0_298
> +    smull2          v25.4s, v29.8h, XFIX_P_2_053
> +    smull2          v26.4s, v30.8h, XFIX_P_3_072
> +    smull2          v27.4s, v31.8h, XFIX_P_1_501
> +    smull           v23.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
> +    smull           v21.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
> +    smull           v19.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
> +    smull           v17.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
> +
> +    smull2          v28.4s, v2.8h, XFIX_N_0_899
> +    smull2          v29.4s, v3.8h, XFIX_N_2_562
> +    smull2          v30.4s, v6.8h, XFIX_N_1_961
> +    smull2          v31.4s, v7.8h, XFIX_N_0_390
> +    smull           v2.4s, v2.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
> +    smull           v3.4s, v3.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
> +    smull           v6.4s, v6.4h, XFIX_N_1_961    /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
> +    smull           v7.4s, v7.4h, XFIX_N_0_390    /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
> +
> +    add             v6.4s, v6.4s, v4.4s    /* z3 += z5 */
> +    add             v30.4s, v30.4s, v5.4s
> +    add             v7.4s, v7.4s, v4.4s    /* z4 += z5 */
> +    add             v31.4s, v31.4s, v5.4s
> +
> +    add             v23.4s, v23.4s, v2.4s   /* tmp4 += z1 */
> +    add             v24.4s, v24.4s, v28.4s
> +    add             v21.4s, v21.4s, v3.4s   /* tmp5 += z2 */
> +    add             v25.4s, v25.4s, v29.4s
> +    add             v19.4s, v19.4s, v6.4s   /* tmp6 += z3 */
> +    add             v26.4s, v26.4s, v30.4s
> +    add             v17.4s, v17.4s, v7.4s   /* tmp7 += z4 */
> +    add             v27.4s, v27.4s, v31.4s
> +
> +    add             v23.4s, v23.4s, v6.4s   /* tmp4 += z3 */
> +    add             v24.4s, v24.4s, v30.4s
> +    add             v21.4s, v21.4s, v7.4s   /* tmp5 += z4 */
> +    add             v25.4s, v25.4s, v31.4s
> +    add             v19.4s, v19.4s, v3.4s   /* tmp6 += z2 */
> +    add             v26.4s, v26.4s, v29.4s
> +    add             v17.4s, v17.4s, v2.4s   /* tmp7 += z1 */
> +    add             v27.4s, v27.4s, v28.4s
> +
> +    rshrn           v23.4h, v23.4s, #DESCALE_P2
> +    rshrn           v21.4h, v21.4s, #DESCALE_P2
> +    rshrn           v19.4h, v19.4s, #DESCALE_P2
> +    rshrn           v17.4h, v17.4s, #DESCALE_P2
> +    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
> +    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
> +    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
> +    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
> +
> +    /* store results back over the input block */
> +    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
> +    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
> +
> +    ret
> +
> +    .unreq          DATA
> +    .unreq          TMP
> +endfunc
> +
> +#undef XFIX_P_0_298
> +#undef XFIX_N_0_390
> +#undef XFIX_P_0_541
> +#undef XFIX_P_0_765
> +#undef XFIX_N_0_899
> +#undef XFIX_P_1_175
> +#undef XFIX_P_1_501
> +#undef XFIX_N_1_847
> +#undef XFIX_N_1_961
> +#undef XFIX_P_2_053
> +#undef XFIX_N_2_562
> +#undef XFIX_P_3_072
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index 7fb44e28f4..f9b86f1d58 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -1477,6 +1477,7 @@ typedef struct AVCodecContext {
>  #define FF_DCT_MMX     3
>  #define FF_DCT_ALTIVEC 5
>  #define FF_DCT_FAAN    6
> +#define FF_DCT_NEON    7
>
>      /**
>       * IDCT algorithm, see FF_IDCT_* below.
> diff --git a/libavcodec/fdctdsp.c b/libavcodec/fdctdsp.c
> index f8ba17426c..d20558ce88 100644
> --- a/libavcodec/fdctdsp.c
> +++ b/libavcodec/fdctdsp.c
> @@ -42,7 +42,9 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx)
>          c->fdct248 = ff_fdct248_islow_8;
>      }
>
> -#if ARCH_PPC
> +#if ARCH_AARCH64
> +    ff_fdctdsp_init_aarch64(c, avctx, high_bit_depth);
> +#elif ARCH_PPC
>      ff_fdctdsp_init_ppc(c, avctx, high_bit_depth);
>  #elif ARCH_X86
>      ff_fdctdsp_init_x86(c, avctx, high_bit_depth);
> diff --git a/libavcodec/fdctdsp.h b/libavcodec/fdctdsp.h
> index 7378eab870..cad99ed7ca 100644
> --- a/libavcodec/fdctdsp.h
> +++ b/libavcodec/fdctdsp.h
> @@ -32,6 +32,8 @@ typedef struct FDCTDSPContext {
>
>  FF_VISIBILITY_PUSH_HIDDEN
>  void ff_fdctdsp_init(FDCTDSPContext *c, struct AVCodecContext *avctx);
> +void ff_fdctdsp_init_aarch64(FDCTDSPContext *c, struct AVCodecContext *avctx,
> +                             unsigned high_bit_depth);
>  void ff_fdctdsp_init_ppc(FDCTDSPContext *c, struct AVCodecContext *avctx,
>                           unsigned high_bit_depth);
>  void ff_fdctdsp_init_x86(FDCTDSPContext *c, struct AVCodecContext *avctx,
> diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> index ee243d9894..d9a3c92f28 100644
> --- a/libavcodec/options_table.h
> +++ b/libavcodec/options_table.h
> @@ -159,6 +159,7 @@ static const AVOption avcodec_options[] = {
>  {"mmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, "dct"},
>  {"altivec", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, INT_MAX, V|E, "dct"},
>  {"faan", "floating point AAN DCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN }, INT_MIN, INT_MAX, V|E, "dct"},
> +{"neon", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_NEON }, INT_MIN, INT_MAX, V|E, "dct"},
>  {"lumi_mask", "compresses bright areas stronger than medium ones", OFFSET(lumi_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
>  {"tcplx_mask", "temporal complexity masking", OFFSET(temporal_cplx_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
>  {"scplx_mask", "spatial complexity masking", OFFSET(spatial_cplx_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E},
> diff --git a/libavcodec/tests/aarch64/dct.c b/libavcodec/tests/aarch64/dct.c
> index 9e477328d5..e98a887cd5 100644
> --- a/libavcodec/tests/aarch64/dct.c
> +++ b/libavcodec/tests/aarch64/dct.c
> @@ -19,9 +19,11 @@
>  #include "config.h"
>
>  #include "libavutil/cpu.h"
> +#include "libavcodec/aarch64/fdct.h"
>  #include "libavcodec/aarch64/idct.h"
>
>  static const struct algo fdct_tab_arch[] = {
> +    { "neon", ff_fdct_neon, FF_IDCT_PERM_NONE, AV_CPU_FLAG_NEON },
>      { 0 }
>  };
>
> --
> 2.30.2
>