[FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi

Tue Oct 11 10:32:29 EEST 2022

checkasm benchmark on Ampere Altra (Neoverse N1):

put_hevc_qpel_bi_h4_8_c: 170.7
put_hevc_qpel_bi_h4_8_neon: 64.5
put_hevc_qpel_bi_h6_8_c: 373.7
put_hevc_qpel_bi_h6_8_neon: 130.2
put_hevc_qpel_bi_h8_8_c: 662.0
put_hevc_qpel_bi_h8_8_neon: 138.5
put_hevc_qpel_bi_h12_8_c: 1529.5
put_hevc_qpel_bi_h12_8_neon: 422.0
put_hevc_qpel_bi_h16_8_c: 2735.5
put_hevc_qpel_bi_h16_8_neon: 560.5
put_hevc_qpel_bi_h24_8_c: 6015.7
put_hevc_qpel_bi_h24_8_neon: 1636.0
put_hevc_qpel_bi_h32_8_c: 10779.0
put_hevc_qpel_bi_h32_8_neon: 2204.5
put_hevc_qpel_bi_h48_8_c: 24375.0
put_hevc_qpel_bi_h48_8_neon: 4984.0
put_hevc_qpel_bi_h64_8_c: 42768.0
put_hevc_qpel_bi_h64_8_neon: 8795.7
put_hevc_qpel_h4_8_c: 149.0
put_hevc_qpel_h4_8_neon: 55.7
put_hevc_qpel_h6_8_c: 321.2
put_hevc_qpel_h6_8_neon: 106.0
put_hevc_qpel_h8_8_c: 578.7
put_hevc_qpel_h8_8_neon: 133.2
put_hevc_qpel_h12_8_c: 1279.0
put_hevc_qpel_h12_8_neon: 391.7
put_hevc_qpel_h16_8_c: 2286.2
put_hevc_qpel_h16_8_neon: 519.7
put_hevc_qpel_h24_8_c: 5100.7
put_hevc_qpel_h24_8_neon: 1546.2
put_hevc_qpel_h32_8_c: 9022.0
put_hevc_qpel_h32_8_neon: 2060.2
put_hevc_qpel_h48_8_c: 20293.5
put_hevc_qpel_h48_8_neon: 4656.7
put_hevc_qpel_h64_8_c: 36037.0
put_hevc_qpel_h64_8_neon: 8262.7
put_hevc_qpel_uni_h4_8_c: 162.2
put_hevc_qpel_uni_h4_8_neon: 61.7
put_hevc_qpel_uni_h6_8_c: 355.2
put_hevc_qpel_uni_h6_8_neon: 114.2
put_hevc_qpel_uni_h8_8_c: 651.0
put_hevc_qpel_uni_h8_8_neon: 135.7
put_hevc_qpel_uni_h12_8_c: 1412.5
put_hevc_qpel_uni_h12_8_neon: 402.7
put_hevc_qpel_uni_h16_8_c: 2551.0
put_hevc_qpel_uni_h16_8_neon: 533.5
put_hevc_qpel_uni_h24_8_c: 5782.2
put_hevc_qpel_uni_h24_8_neon: 1578.7
put_hevc_qpel_uni_h32_8_c: 10586.5
put_hevc_qpel_uni_h32_8_neon: 2102.2
put_hevc_qpel_uni_h48_8_c: 23812.0
put_hevc_qpel_uni_h48_8_neon: 4739.5
put_hevc_qpel_uni_h64_8_c: 42958.7
put_hevc_qpel_uni_h64_8_neon: 8366.5

Signed-off-by: J. Dekker <jdek at itanimul.li>
---

 Summary of changes since last iteration:
 - Interleaved stores
 - Changed tiling to loop more naturally
 - Increased code reuse (.text reduction by ~60%)
 - Simplified function variations through .req

 libavcodec/aarch64/Makefile               |   1 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  67 +++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 484 ++++++++++++++++++++++
 3 files changed, 552 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 9ce21566c6..02fb51c3ab 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -67,4 +67,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
                                            aarch64/hevcdsp_init_aarch64.o      \
+                                           aarch64/hevcdsp_qpel_neon.o         \
                                            aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 644cc17715..44399b05d8 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -69,6 +69,46 @@ void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrd
                                           const int16_t *sao_offset_val, int eo, int width, int height);
 void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
                                         const int16_t *sao_offset_val, int eo, int width, int height);
+void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+                                     intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+                                     intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+                                     intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+                                      intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+                                      intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                         ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+                                         int width);
+void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                         ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+                                         int width);
+void ff_hevc_put_hevc_qpel_uni_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                         ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+                                         int width);
+void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                          ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
+                                          my, int width);
+void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                          ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
+                                          my, int width);
+void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                        ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+                                        mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                        ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+                                        mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                        ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+                                        mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                         ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+                                         mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+                                         ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+                                         mx, intptr_t my, int width);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -95,6 +135,33 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->sao_edge_filter[2]          =
         c->sao_edge_filter[3]          =
         c->sao_edge_filter[4]          = ff_hevc_sao_edge_filter_16x16_8_neon;
+        c->put_hevc_qpel[1][0][1]      = ff_hevc_put_hevc_qpel_h4_8_neon;
+        c->put_hevc_qpel[2][0][1]      = ff_hevc_put_hevc_qpel_h6_8_neon;
+        c->put_hevc_qpel[3][0][1]      = ff_hevc_put_hevc_qpel_h8_8_neon;
+        c->put_hevc_qpel[4][0][1]      =
+        c->put_hevc_qpel[6][0][1]      = ff_hevc_put_hevc_qpel_h12_8_neon;
+        c->put_hevc_qpel[5][0][1]      =
+        c->put_hevc_qpel[7][0][1]      =
+        c->put_hevc_qpel[8][0][1]      =
+        c->put_hevc_qpel[9][0][1]      = ff_hevc_put_hevc_qpel_h16_8_neon;
+        c->put_hevc_qpel_uni[1][0][1]  = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
+        c->put_hevc_qpel_uni[2][0][1]  = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
+        c->put_hevc_qpel_uni[3][0][1]  = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
+        c->put_hevc_qpel_uni[4][0][1]  =
+        c->put_hevc_qpel_uni[6][0][1]  = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
+        c->put_hevc_qpel_uni[5][0][1]  =
+        c->put_hevc_qpel_uni[7][0][1]  =
+        c->put_hevc_qpel_uni[8][0][1]  =
+        c->put_hevc_qpel_uni[9][0][1]  = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
+        c->put_hevc_qpel_bi[1][0][1]   = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
+        c->put_hevc_qpel_bi[2][0][1]   = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
+        c->put_hevc_qpel_bi[3][0][1]   = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
+        c->put_hevc_qpel_bi[4][0][1]   =
+        c->put_hevc_qpel_bi[6][0][1]   = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
+        c->put_hevc_qpel_bi[5][0][1]   =
+        c->put_hevc_qpel_bi[7][0][1]   =
+        c->put_hevc_qpel_bi[8][0][1]   =
+        c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
     }
     if (bit_depth == 10) {
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..7974b8529e
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,484 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * Copyright (c) 2022 J. Dekker <jdek at itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+const qpel_filters, align=4
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           -1, 4,-10, 58, 17, -5, 1,  0
+        .byte           -1, 4,-11, 40, 40,-11, 4, -1
+        .byte           0,  1, -5, 17, 58,-10, 4, -1
+endconst
+
+.macro load_filter m
+        movrel          x15, qpel_filters
+        add             x15, x15, \m, lsl #3
+        ld1             {v0.8b}, [x15]
+        sxtl            v0.8h, v0.8b
+.endm
+
+.macro put_hevc type
+.ifc \type, qpel
+        // void put_hevc_qpel_h(int16_t *dst,
+        //                      uint8_t *_src, ptrdiff_t _srcstride,
+        //                      int height, intptr_t mx, intptr_t my, int width)
+        dst        .req x0
+        dststride  .req x7
+        src        .req x1
+        srcstride  .req x2
+        height     .req x3
+        heightw    .req w3
+        mx         .req x4
+        width      .req w6
+.endif
+.ifc \type, qpel_uni
+        // void put_hevc_qpel_uni_h(uint8_t *_dst,  ptrdiff_t _dststride,
+        //                          uint8_t *_src, ptrdiff_t _srcstride,
+        //                          int height, intptr_t mx, intptr_t my, int width)
+        dst        .req x0
+        dststride  .req x1
+        src        .req x2
+        srcstride  .req x3
+        height     .req x4
+        heightw    .req w4
+        mx         .req x5
+        width      .req w7
+.endif
+.ifc \type, qpel_bi
+        // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
+        //                         uint8_t *_src, ptrdiff_t _srcstride,
+        //                         int16_t *src2, int height, intptr_t mx,
+        //                         intptr_t my, int width)
+        dst        .req x0
+        dststride  .req x1
+        src        .req x2
+        srcstride  .req x3
+        height     .req x5
+        heightw    .req w5
+        mx         .req x6
+        width      .req w8
+.endif
+
+.ifc \type, qpel
+function ff_hevc_put_hevc_h4_8_neon, export=0
+        uxtl            v16.8h,  v16.8b
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v19.8h,  v19.8b
+
+        mul             v23.4h,  v16.4h, v0.h[0]
+        mul             v24.4h,  v18.4h, v0.h[0]
+
+.irpc i, 1234567
+        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
+        mla             v23.4h,  v20.4h, v0.h[\i]
+        mla             v24.4h,  v21.4h, v0.h[\i]
+.endr
+        ret
+endfunc
+.endif
+
+function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
+        load_filter     mx
+.ifc \type, qpel_bi
+        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
+        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+        sub             src, src, #3
+        mov             mx, lr
+.ifc \type, qpel
+        mov             dststride, #(MAX_PB_SIZE << 1)
+        lsl             x13, srcstride, #1 // srcstridel
+        mov             x14, #(MAX_PB_SIZE << 2)
+.else
+        lsl             x14, dststride, #1 // dststridel
+        lsl             x13, srcstride, #1 // srcstridel
+.endif
+        add             x10, dst, dststride // dstb
+        add             x12, src, srcstride // srcb
+0:      ld1             {v16.8b, v17.8b}, [src], x13
+        ld1             {v18.8b, v19.8b}, [x12], x13
+.ifc \type, qpel_bi
+        ld1             {v25.8h}, [ x4], x16
+        ld1             {v26.8h}, [x15], x16
+.endif
+
+        bl              ff_hevc_put_hevc_h4_8_neon
+        subs            heightw, heightw, #2
+
+.ifc \type, qpel
+        st1             {v23.4h}, [dst], x14
+        st1             {v24.4h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+        sqadd           v23.4h, v23.4h, v25.4h
+        sqadd           v24.4h, v24.4h, v26.4h
+        sqrshrun        v23.8b, v23.8h, #7
+        sqrshrun        v24.8b, v24.8h, #7
+.else
+        sqrshrun        v23.8b, v23.8h, #6
+        sqrshrun        v24.8b, v24.8h, #6
+.endif
+        st1             {v23.s}[0], [dst], x14
+        st1             {v24.s}[0], [x10], x14
+.endif
+        b.gt            0b // double line
+        ret             mx
+endfunc
+
+.ifc \type, qpel
+function ff_hevc_put_hevc_h8_8_neon, export=0
+        uxtl            v16.8h,  v16.8b
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v19.8h,  v19.8b
+
+        mul             v23.8h,  v16.8h, v0.h[0]
+        mul             v24.8h,  v18.8h, v0.h[0]
+
+.irpc i, 1234567
+        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
+        mla             v23.8h,  v20.8h, v0.h[\i]
+        mla             v24.8h,  v21.8h, v0.h[\i]
+.endr
+        ret
+endfunc
+.endif
+
+function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
+        load_filter     mx
+.ifc \type, qpel_bi
+        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
+        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+        sub             src, src, #3
+        mov             mx, lr
+.ifc \type, qpel
+        mov             dststride, #(MAX_PB_SIZE << 1)
+        lsl             x13, srcstride, #1 // srcstridel
+        mov             x14, #((MAX_PB_SIZE << 2) - 8)
+.else
+        lsl             x14, dststride, #1 // dststridel
+        lsl             x13, srcstride, #1 // srcstridel
+        sub             x14, x14, #4
+.endif
+        add             x10, dst, dststride // dstb
+        add             x12, src, srcstride // srcb
+0:      ld1             {v16.8b, v17.8b}, [src], x13
+        ld1             {v18.8b, v19.8b}, [x12], x13
+.ifc \type, qpel_bi
+        ld1             {v25.8h}, [ x4], x16
+        ld1             {v26.8h}, [x15], x16
+.endif
+
+        bl              ff_hevc_put_hevc_h8_8_neon
+        subs            heightw, heightw, #2
+
+.ifc \type, qpel
+        st1             {v23.4h},   [dst], #8
+        st1             {v24.4h},   [x10], #8
+        st1             {v23.s}[2], [dst], x14
+        st1             {v24.s}[2], [x10], x14
+.else
+.ifc \type, qpel_bi
+        sqadd           v23.8h, v23.8h, v25.8h
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqrshrun        v23.8b, v23.8h, #7
+        sqrshrun        v24.8b, v24.8h, #7
+.else
+        sqrshrun        v23.8b, v23.8h, #6
+        sqrshrun        v24.8b, v24.8h, #6
+.endif
+        st1             {v23.s}[0], [dst], #4
+        st1             {v24.s}[0], [x10], #4
+        st1             {v23.h}[2], [dst], x14
+        st1             {v24.h}[2], [x10], x14
+.endif
+        b.gt            0b // double line
+        ret             mx
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
+        load_filter     mx
+.ifc \type, qpel_bi
+        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
+        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+        sub             src, src, #3
+        mov             mx, lr
+.ifc \type, qpel
+        mov             dststride, #(MAX_PB_SIZE << 1)
+        lsl             x13, srcstride, #1 // srcstridel
+        mov             x14, #(MAX_PB_SIZE << 2)
+.else
+        lsl             x14, dststride, #1 // dststridel
+        lsl             x13, srcstride, #1 // srcstridel
+.endif
+        add             x10, dst, dststride // dstb
+        add             x12, src, srcstride // srcb
+0:      ld1             {v16.8b, v17.8b}, [src], x13
+        ld1             {v18.8b, v19.8b}, [x12], x13
+.ifc \type, qpel_bi
+        ld1             {v25.8h}, [ x4], x16
+        ld1             {v26.8h}, [x15], x16
+.endif
+
+        bl              ff_hevc_put_hevc_h8_8_neon
+        subs            heightw, heightw, #2
+
+.ifc \type, qpel
+        st1             {v23.8h}, [dst], x14
+        st1             {v24.8h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+        sqadd           v23.8h, v23.8h, v25.8h
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqrshrun        v23.8b, v23.8h, #7
+        sqrshrun        v24.8b, v24.8h, #7
+.else
+        sqrshrun        v23.8b, v23.8h, #6
+        sqrshrun        v24.8b, v24.8h, #6
+.endif
+        st1             {v23.8b}, [dst], x14
+        st1             {v24.8b}, [x10], x14
+.endif
+        b.gt            0b // double line
+        ret             mx
+endfunc
+
+.ifc \type, qpel
+function ff_hevc_put_hevc_h16_8_neon, export=0
+        uxtl            v16.8h,  v16.8b
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+
+        uxtl            v19.8h,  v19.8b
+        uxtl            v20.8h,  v20.8b
+        uxtl            v21.8h,  v21.8b
+
+        mul             v26.8h,  v16.8h, v0.h[0]
+        mul             v27.8h,  v17.8h, v0.h[0]
+        mul             v28.8h,  v19.8h, v0.h[0]
+        mul             v29.8h,  v20.8h, v0.h[0]
+.irpc i, 1234567
+        ext             v22.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v23.16b, v17.16b, v18.16b, #(2*\i)
+
+        ext             v24.16b, v19.16b, v20.16b, #(2*\i)
+        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
+
+        mla             v26.8h,  v22.8h, v0.h[\i]
+        mla             v27.8h,  v23.8h, v0.h[\i]
+
+        mla             v28.8h,  v24.8h, v0.h[\i]
+        mla             v29.8h,  v25.8h, v0.h[\i]
+.endr
+        subs            x9, x9, #2
+        ret
+endfunc
+.endif
+
+function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
+        load_filter     mx
+        sxtw            height, heightw
+.ifc \type, qpel_bi
+        ldrh            w8, [sp] // width
+        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
+        lsl             x17, height, #7 // src2b reset (height * (MAX_PB_SIZE << 1))
+        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+        sub             src, src, #3
+        mov             mx, lr
+.ifc \type, qpel
+        mov             dststride, #(MAX_PB_SIZE << 1)
+        lsl             x13, srcstride, #1 // srcstridel
+        mov             x14, #((MAX_PB_SIZE << 2) - 16)
+.else
+        lsl             x14, dststride, #1 // dststridel
+        lsl             x13, srcstride, #1 // srcstridel
+        sub             x14, x14, #8
+.endif
+        add             x10, dst, dststride // dstb
+        add             x12, src, srcstride // srcb
+0:      mov             x9, height
+1:      ld1             {v16.8b-v18.8b}, [src], x13
+        ld1             {v19.8b-v21.8b}, [x12], x13
+
+        bl              ff_hevc_put_hevc_h16_8_neon
+
+.ifc \type, qpel
+        st1             {v26.8h}, [dst], #16
+        st1             {v28.8h}, [x10], #16
+        st1             {v27.4h}, [dst], x14
+        st1             {v29.4h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+        ld1             {v16.8h, v17.8h}, [ x4], x16
+        ld1             {v18.8h, v19.8h}, [x15], x16
+        sqadd           v26.8h, v26.8h, v16.8h
+        sqadd           v27.8h, v27.8h, v17.8h
+        sqadd           v28.8h, v28.8h, v18.8h
+        sqadd           v29.8h, v29.8h, v19.8h
+        sqrshrun        v26.8b, v26.8h, #7
+        sqrshrun        v27.8b, v27.8h, #7
+        sqrshrun        v28.8b, v28.8h, #7
+        sqrshrun        v29.8b, v29.8h, #7
+.else
+        sqrshrun        v26.8b, v26.8h, #6
+        sqrshrun        v27.8b, v27.8h, #6
+        sqrshrun        v28.8b, v28.8h, #6
+        sqrshrun        v29.8b, v29.8h, #6
+.endif
+        st1             {v26.8b},   [dst], #8
+        st1             {v28.8b},   [x10], #8
+        st1             {v27.s}[0], [dst], x14
+        st1             {v29.s}[0], [x10], x14
+.endif
+        b.gt            1b // double line
+        subs            width, width, #12
+        // reset src
+        msub            src, srcstride, height, src
+        msub            x12, srcstride, height, x12
+        // reset dst
+        msub            dst, dststride, height, dst
+        msub            x10, dststride, height, x10
+.ifc \type, qpel_bi
+        // reset xsrc
+        sub             x4,  x4,  x17
+        sub             x15, x15, x17
+        add             x4,  x4,  #24
+        add             x15, x15, #24
+.endif
+        add             src, src, #12
+        add             x12, x12, #12
+.ifc \type, qpel
+        add             dst, dst, #24
+        add             x10, x10, #24
+.else
+        add             dst, dst, #12
+        add             x10, x10, #12
+.endif
+        b.gt            0b
+        ret             mx
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
+        load_filter     mx
+        sxtw            height, heightw
+        mov             mx, lr
+.ifc \type, qpel_bi
+        ldrh            w8, [sp] // width
+        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
+        lsl             x17, x5, #7 // src2b reset
+        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+        sub             src, src, #3
+        mov             mx, lr
+.ifc \type, qpel
+        mov             dststride, #(MAX_PB_SIZE << 1)
+        lsl             x13, srcstride, #1 // srcstridel
+        mov             x14, #((MAX_PB_SIZE << 2) - 16)
+.else
+        lsl             x14, dststride, #1 // dststridel
+        lsl             x13, srcstride, #1 // srcstridel
+        sub             x14, x14, #8
+.endif
+        add             x10, dst, dststride // dstb
+        add             x12, src, srcstride // srcb
+0:      mov             x9, height
+1:      ld1             {v16.8b-v18.8b}, [src], x13
+        ld1             {v19.8b-v21.8b}, [x12], x13
+
+        bl              ff_hevc_put_hevc_h16_8_neon
+
+.ifc \type, qpel
+        st1             {v26.8h}, [dst], #16
+        st1             {v28.8h}, [x10], #16
+        st1             {v27.8h}, [dst], x14
+        st1             {v29.8h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+        ld1             {v16.8h, v17.8h}, [ x4], x16
+        ld1             {v18.8h, v19.8h}, [x15], x16
+        sqadd           v26.8h, v26.8h, v16.8h
+        sqadd           v27.8h, v27.8h, v17.8h
+        sqadd           v28.8h, v28.8h, v18.8h
+        sqadd           v29.8h, v29.8h, v19.8h
+        sqrshrun        v26.8b, v26.8h, #7
+        sqrshrun        v27.8b, v27.8h, #7
+        sqrshrun        v28.8b, v28.8h, #7
+        sqrshrun        v29.8b, v29.8h, #7
+.else
+        sqrshrun        v26.8b, v26.8h, #6
+        sqrshrun        v27.8b, v27.8h, #6
+        sqrshrun        v28.8b, v28.8h, #6
+        sqrshrun        v29.8b, v29.8h, #6
+.endif
+        st1             {v26.8b}, [dst], #8
+        st1             {v28.8b}, [x10], #8
+        st1             {v27.8b}, [dst], x14
+        st1             {v29.8b}, [x10], x14
+.endif
+        b.gt            1b // double line
+        subs            width, width, #16
+        // reset src
+        msub            src, srcstride, height, src
+        msub            x12, srcstride, height, x12
+        // reset dst
+        msub            dst, dststride, height, dst
+        msub            x10, dststride, height, x10
+.ifc \type, qpel_bi
+        // reset xsrc
+        sub             x4,  x4,  x17
+        sub             x15, x15, x17
+        add             x4,  x4,  #32
+        add             x15, x15, #32
+.endif
+        add             src, src, #16
+        add             x12, x12, #16
+.ifc \type, qpel
+        add             dst, dst, #32
+        add             x10, x10, #32
+.else
+        add             dst, dst, #16
+        add             x10, x10, #16
+.endif
+        b.gt            0b
+        ret             mx
+endfunc
+
+.unreq height
+.unreq heightw
+.unreq width
+.unreq src
+.unreq dst
+.unreq srcstride
+.unreq dststride
+.unreq mx
+.endm
+
+put_hevc qpel
+put_hevc qpel_uni
+put_hevc qpel_bi
-- 
2.32.0 (Apple Git-132)