[FFmpeg-devel] [PATCH v2 4/4] avcodec/aarch64/hevcdsp: add sao_band NEON

Martin Storsjö martin at martin.st
Thu Feb 11 11:53:07 EET 2021


On Thu, 4 Feb 2021, Josh Dekker wrote:

> Only works for 8x8.
>
> Signed-off-by: Josh Dekker <josh at itanimul.li>
> ---
> libavcodec/aarch64/Makefile               |  3 +-
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  7 ++
> libavcodec/aarch64/hevcdsp_sao_neon.S     | 87 +++++++++++++++++++++++
> 3 files changed, 96 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_sao_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 2ea1d74a38..954461f81d 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -62,4 +62,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
>                                            aarch64/vp9mc_16bpp_neon.o          \
>                                            aarch64/vp9mc_neon.o
> NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
> -                                           aarch64/hevcdsp_init_aarch64.o
> +                                           aarch64/hevcdsp_init_aarch64.o      \
> +                                           aarch64/hevcdsp_sao_neon.o
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index fe111bd1ac..c785e46f79 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -53,6 +53,12 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
> void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
> +void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
> +                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +                                  int16_t *sao_offset_val, int sao_left_class,
> +                                  int width, int height);
> +
> +
> 
> av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> {
> @@ -69,6 +75,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
>         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
>         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
> +        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
>     }
>     if (bit_depth == 10) {
>         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
> new file mode 100644
> index 0000000000..f142c1e8c2
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
> @@ -0,0 +1,87 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * AArch64 NEON optimised SAO functions for HEVC decoding
> + *
> + * Copyright (c) 2020 Josh Dekker <josh at itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
> +//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +//                      int16_t *sao_offset_val, int sao_left_class,
> +//                      int width, int height)
> +function ff_hevc_sao_band_filter_8x8_8_neon, export=1
> +    sub sp, sp, #64
> +    stp xzr, xzr, [sp]
> +    stp xzr, xzr, [sp, #16]
> +    stp xzr, xzr, [sp, #32]
> +    stp xzr, xzr, [sp, #48]
> +    mov w8, #4
> +0:
> +    ldrsh x9, [x4, x8, lsl #1] // x9 = sao_offset_val[k+1]
> +    subs w8, w8, #1
> +    add w10, w8, w5 // x10 = k + sao_left_class
> +    and w10, w10, #0x1F
> +    strh w9, [sp, x10, lsl #1]
> +    bne 0b
> +    ld1 {v16.16b-v19.16b}, [sp], #64
> +    movi v20.8h, #1
> +1:  // beginning of line

No technical objections; it seems to build fine in all environments and
gives a consistent speedup over C, so that's good, even if things could
maybe be even faster. I haven't looked closer at the algorithm so far. But
the indentation is very different from all the other asm, so please fix that.

// Martin



More information about the ffmpeg-devel mailing list