[FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi

Martin Storsjö martin at martin.st
Mon Oct 24 15:01:36 EEST 2022


On Tue, 11 Oct 2022, J. Dekker wrote:

> checkasm benchmark on Ampere Altra (Neoverse N1):
>
> put_hevc_qpel_bi_h4_8_c: 170.7
> put_hevc_qpel_bi_h4_8_neon: 64.5
> put_hevc_qpel_bi_h6_8_c: 373.7
> put_hevc_qpel_bi_h6_8_neon: 130.2
> put_hevc_qpel_bi_h8_8_c: 662.0
> put_hevc_qpel_bi_h8_8_neon: 138.5
> put_hevc_qpel_bi_h12_8_c: 1529.5
> put_hevc_qpel_bi_h12_8_neon: 422.0
> put_hevc_qpel_bi_h16_8_c: 2735.5
> put_hevc_qpel_bi_h16_8_neon: 560.5
> put_hevc_qpel_bi_h24_8_c: 6015.7
> put_hevc_qpel_bi_h24_8_neon: 1636.0
> put_hevc_qpel_bi_h32_8_c: 10779.0
> put_hevc_qpel_bi_h32_8_neon: 2204.5
> put_hevc_qpel_bi_h48_8_c: 24375.0
> put_hevc_qpel_bi_h48_8_neon: 4984.0
> put_hevc_qpel_bi_h64_8_c: 42768.0
> put_hevc_qpel_bi_h64_8_neon: 8795.7
> put_hevc_qpel_h4_8_c: 149.0
> put_hevc_qpel_h4_8_neon: 55.7
> put_hevc_qpel_h6_8_c: 321.2
> put_hevc_qpel_h6_8_neon: 106.0
> put_hevc_qpel_h8_8_c: 578.7
> put_hevc_qpel_h8_8_neon: 133.2
> put_hevc_qpel_h12_8_c: 1279.0
> put_hevc_qpel_h12_8_neon: 391.7
> put_hevc_qpel_h16_8_c: 2286.2
> put_hevc_qpel_h16_8_neon: 519.7
> put_hevc_qpel_h24_8_c: 5100.7
> put_hevc_qpel_h24_8_neon: 1546.2
> put_hevc_qpel_h32_8_c: 9022.0
> put_hevc_qpel_h32_8_neon: 2060.2
> put_hevc_qpel_h48_8_c: 20293.5
> put_hevc_qpel_h48_8_neon: 4656.7
> put_hevc_qpel_h64_8_c: 36037.0
> put_hevc_qpel_h64_8_neon: 8262.7
> put_hevc_qpel_uni_h4_8_c: 162.2
> put_hevc_qpel_uni_h4_8_neon: 61.7
> put_hevc_qpel_uni_h6_8_c: 355.2
> put_hevc_qpel_uni_h6_8_neon: 114.2
> put_hevc_qpel_uni_h8_8_c: 651.0
> put_hevc_qpel_uni_h8_8_neon: 135.7
> put_hevc_qpel_uni_h12_8_c: 1412.5
> put_hevc_qpel_uni_h12_8_neon: 402.7
> put_hevc_qpel_uni_h16_8_c: 2551.0
> put_hevc_qpel_uni_h16_8_neon: 533.5
> put_hevc_qpel_uni_h24_8_c: 5782.2
> put_hevc_qpel_uni_h24_8_neon: 1578.7
> put_hevc_qpel_uni_h32_8_c: 10586.5
> put_hevc_qpel_uni_h32_8_neon: 2102.2
> put_hevc_qpel_uni_h48_8_c: 23812.0
> put_hevc_qpel_uni_h48_8_neon: 4739.5
> put_hevc_qpel_uni_h64_8_c: 42958.7
> put_hevc_qpel_uni_h64_8_neon: 8366.5
>
> Signed-off-by: J. Dekker <jdek at itanimul.li>
> ---
>
> Summary of changes since last iteration:
> - Interleaved stores
> - Changed tiling to loop more naturally
> - Increased code reuse (.text reduction by ~60%)
> - Simplified function variations through .req
>
> libavcodec/aarch64/Makefile               |   1 +
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  67 +++
> libavcodec/aarch64/hevcdsp_qpel_neon.S    | 484 ++++++++++++++++++++++
> 3 files changed, 552 insertions(+)
> create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 9ce21566c6..02fb51c3ab 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -67,4 +67,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
>                                            aarch64/vp9mc_neon.o
> NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
>                                            aarch64/hevcdsp_init_aarch64.o      \
> +                                           aarch64/hevcdsp_qpel_neon.o         \
>                                            aarch64/hevcdsp_sao_neon.o
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 644cc17715..44399b05d8 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -69,6 +69,46 @@ void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrd
>                                           const int16_t *sao_offset_val, int eo, int width, int height);
> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>                                         const int16_t *sao_offset_val, int eo, int width, int height);
> +void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> +                                     intptr_t mx, intptr_t my, int width);

The function pointers in the dsp context has gotten 'const' on the source 
pointers now, which makes it emit a lot of warnings with GCC, and fail 
with latest Clang. Please rebase and check that it builds without 
warnings.

> +void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> +                                     intptr_t mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> +                                     intptr_t mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> +                                      intptr_t mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> +                                      intptr_t mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                         ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
> +                                         int width);
> +void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                         ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
> +                                         int width);
> +void ff_hevc_put_hevc_qpel_uni_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                         ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
> +                                         int width);
> +void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                          ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
> +                                          my, int width);
> +void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                          ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
> +                                          my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                        ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> +                                        mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                        ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> +                                        mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                        ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> +                                        mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                         ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> +                                         mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> +                                         ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> +                                         mx, intptr_t my, int width);
>
> av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> {
> @@ -95,6 +135,33 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         c->sao_edge_filter[2]          =
>         c->sao_edge_filter[3]          =
>         c->sao_edge_filter[4]          = ff_hevc_sao_edge_filter_16x16_8_neon;
> +        c->put_hevc_qpel[1][0][1]      = ff_hevc_put_hevc_qpel_h4_8_neon;
> +        c->put_hevc_qpel[2][0][1]      = ff_hevc_put_hevc_qpel_h6_8_neon;
> +        c->put_hevc_qpel[3][0][1]      = ff_hevc_put_hevc_qpel_h8_8_neon;
> +        c->put_hevc_qpel[4][0][1]      =
> +        c->put_hevc_qpel[6][0][1]      = ff_hevc_put_hevc_qpel_h12_8_neon;
> +        c->put_hevc_qpel[5][0][1]      =
> +        c->put_hevc_qpel[7][0][1]      =
> +        c->put_hevc_qpel[8][0][1]      =
> +        c->put_hevc_qpel[9][0][1]      = ff_hevc_put_hevc_qpel_h16_8_neon;
> +        c->put_hevc_qpel_uni[1][0][1]  = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
> +        c->put_hevc_qpel_uni[2][0][1]  = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
> +        c->put_hevc_qpel_uni[3][0][1]  = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
> +        c->put_hevc_qpel_uni[4][0][1]  =
> +        c->put_hevc_qpel_uni[6][0][1]  = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
> +        c->put_hevc_qpel_uni[5][0][1]  =
> +        c->put_hevc_qpel_uni[7][0][1]  =
> +        c->put_hevc_qpel_uni[8][0][1]  =
> +        c->put_hevc_qpel_uni[9][0][1]  = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
> +        c->put_hevc_qpel_bi[1][0][1]   = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
> +        c->put_hevc_qpel_bi[2][0][1]   = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
> +        c->put_hevc_qpel_bi[3][0][1]   = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
> +        c->put_hevc_qpel_bi[4][0][1]   =
> +        c->put_hevc_qpel_bi[6][0][1]   = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
> +        c->put_hevc_qpel_bi[5][0][1]   =
> +        c->put_hevc_qpel_bi[7][0][1]   =
> +        c->put_hevc_qpel_bi[8][0][1]   =
> +        c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
>     }
>     if (bit_depth == 10) {
>         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> new file mode 100644
> index 0000000000..7974b8529e
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> @@ -0,0 +1,484 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * Copyright (c) 2022 J. Dekker <jdek at itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +#define MAX_PB_SIZE 64
> +
> +const qpel_filters, align=4
> +        .byte           0,  0,  0,  0,  0,  0, 0,  0
> +        .byte           -1, 4,-10, 58, 17, -5, 1,  0
> +        .byte           -1, 4,-11, 40, 40,-11, 4, -1
> +        .byte           0,  1, -5, 17, 58,-10, 4, -1
> +endconst
> +
> +.macro load_filter m
> +        movrel          x15, qpel_filters
> +        add             x15, x15, \m, lsl #3
> +        ld1             {v0.8b}, [x15]
> +        sxtl            v0.8h, v0.8b
> +.endm
> +
> +.macro put_hevc type
> +.ifc \type, qpel
> +        // void put_hevc_qpel_h(int16_t *dst,
> +        //                      uint8_t *_src, ptrdiff_t _srcstride,
> +        //                      int height, intptr_t mx, intptr_t my, int width)
> +        dst        .req x0
> +        dststride  .req x7
> +        src        .req x1
> +        srcstride  .req x2
> +        height     .req x3
> +        heightw    .req w3
> +        mx         .req x4
> +        width      .req w6
> +.endif
> +.ifc \type, qpel_uni
> +        // void put_hevc_qpel_uni_h(uint8_t *_dst,  ptrdiff_t _dststride,
> +        //                          uint8_t *_src, ptrdiff_t _srcstride,
> +        //                          int height, intptr_t mx, intptr_t my, int width)
> +        dst        .req x0
> +        dststride  .req x1
> +        src        .req x2
> +        srcstride  .req x3
> +        height     .req x4
> +        heightw    .req w4
> +        mx         .req x5
> +        width      .req w7
> +.endif
> +.ifc \type, qpel_bi
> +        // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
> +        //                         uint8_t *_src, ptrdiff_t _srcstride,
> +        //                         int16_t *src2, int height, intptr_t mx,
> +        //                         intptr_t my, int width)
> +        dst        .req x0
> +        dststride  .req x1
> +        src        .req x2
> +        srcstride  .req x3
> +        height     .req x5
> +        heightw    .req w5
> +        mx         .req x6
> +        width      .req w8
> +.endif
> +
> +.ifc \type, qpel
> +function ff_hevc_put_hevc_h4_8_neon, export=0
> +        uxtl            v16.8h,  v16.8b
> +        uxtl            v17.8h,  v17.8b
> +        uxtl            v18.8h,  v18.8b
> +        uxtl            v19.8h,  v19.8b
> +
> +        mul             v23.4h,  v16.4h, v0.h[0]
> +        mul             v24.4h,  v18.4h, v0.h[0]
> +
> +.irpc i, 1234567
> +        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
> +        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
> +        mla             v23.4h,  v20.4h, v0.h[\i]
> +        mla             v24.4h,  v21.4h, v0.h[\i]
> +.endr
> +        ret
> +endfunc
> +.endif
> +
> +function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
> +        load_filter     mx
> +.ifc \type, qpel_bi
> +        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
> +        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b

Beware that you can't in general rely on x16/x17 keeping their values for 
long. If you branch to a function which is implemented in a different 
object file, it may end up linked at a place in the address space which is 
too far away for a regular 'bl' branch, so the linker has to insert a 
range extension thunk, which clobbers x16/x17. But as long as everything 
here is branched within the same object file, it should be ok.

In general, if you need to use x16/x17, use it only for very short-lived 
temporaries.

> +.endif
> +        sub             src, src, #3
> +        mov             mx, lr

Please use literal 'x30' instead of 'lr' - older binutils don't support 
the 'lr' register name alias.


Other than that, the code seems to run correctly, and the code looks 
mostly reasonable now. (I didn't do a very deep read-through this time, 
but it looks like you've addressed my earlier concerns.)

// Martin



More information about the ffmpeg-devel mailing list