[FFmpeg-devel] [PATCH v2 1/2] lavc/aarch64: add hevc qpel assembly
J. Dekker
jdek at itanimul.li
Thu Feb 3 15:51:50 EET 2022
Thanks: Rafal Dabrowa <fatwildcat at gmail.com>
---
libavcodec/aarch64/Makefile | 1 +
libavcodec/aarch64/hevcdsp_init_aarch64.c | 67 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 2799 +++++++++++++++++++++
3 files changed, 2867 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
Had trouble testing on a Linux machine as well, but I have a workflow set
up for that now, so it should be easier in the future. Passes FATE on both
macOS and Linux.
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..8592692479 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -63,4 +63,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9mc_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
+ aarch64/hevcdsp_qpel_neon.o \
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..3e5d85247e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,63 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
int16_t *sao_offset_val, int sao_left_class,
int width, int height);

+#define NEON8_FNPROTO(fn, args) \
+ void ff_hevc_put_hevc_##fn##4_8_neon args; \
+ void ff_hevc_put_hevc_##fn##6_8_neon args; \
+ void ff_hevc_put_hevc_##fn##8_8_neon args; \
+ void ff_hevc_put_hevc_##fn##12_8_neon args; \
+ void ff_hevc_put_hevc_##fn##16_8_neon args; \
+ void ff_hevc_put_hevc_##fn##24_8_neon args; \
+ void ff_hevc_put_hevc_##fn##32_8_neon args; \
+ void ff_hevc_put_hevc_##fn##48_8_neon args; \
+ void ff_hevc_put_hevc_##fn##64_8_neon args; \
+
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width));
+
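+/* Indices 1..9 of the dsp function tables correspond to block widths
+ * 4, 6, 8, 12, 16, 24, 32, 48 and 64 respectively. */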
+#define NEON8_FNASSIGN(member, v, h, fn) \
+ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon; \
+ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon; \
+ member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon; \
+ member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+ member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+ member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+ member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+ member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+ member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;

av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -80,6 +136,17 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
// for the current size, but if enabled for bigger sizes, the cases
// of non-multiple of 8 seem to arise.
// c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon;
+
+ NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
+ NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);
+
}
if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..e8cc6f5f25
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,2799 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
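+// HEVC 8-tap luma (qpel) interpolation filters, one row per fractional
+// position mx/my = 0..3; row 0 (full-pel) is all zeros.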
+.Lqpel_filters:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0
+ .byte -1, 4,-10, 58, 17, -5, 1, 0
+ .byte -1, 4,-11, 40, 40,-11, 4, -1
+ .byte 0, 1, -5, 17, 58,-10, 4, -1
+
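+// First-pass (8-bit source) filter setup: replicate each tap across a vector
+// in v0-v7. Taps 0, 2, 5 and 7 are never positive, so they are stored negated
+// and subtracted with umlsl in calc_qpelb/calc_qpelb2.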
+.macro load_qpel_filterb freg, xreg
+ adr \xreg, .Lqpel_filters
+ add \xreg, \xreg, \freg, lsl #3
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
+ ld4r {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
+ neg v0.16b, v0.16b
+ neg v2.16b, v2.16b
+ neg v5.16b, v5.16b
+ neg v7.16b, v7.16b
+.endm
+
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+ umlsl \dst\().8h, \src0\().8b, v0.8b
+ umlal \dst\().8h, \src1\().8b, v1.8b
+ umlsl \dst\().8h, \src2\().8b, v2.8b
+ umlal \dst\().8h, \src3\().8b, v3.8b
+ umlal \dst\().8h, \src4\().8b, v4.8b
+ umlsl \dst\().8h, \src5\().8b, v5.8b
+ umlal \dst\().8h, \src6\().8b, v6.8b
+ umlsl \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+ umlsl2 \dst\().8h, \src0\().16b, v0.16b
+ umlal2 \dst\().8h, \src1\().16b, v1.16b
+ umlsl2 \dst\().8h, \src2\().16b, v2.16b
+ umlal2 \dst\().8h, \src3\().16b, v3.16b
+ umlal2 \dst\().8h, \src4\().16b, v4.16b
+ umlsl2 \dst\().8h, \src5\().16b, v5.16b
+ umlal2 \dst\().8h, \src6\().16b, v6.16b
+ umlsl2 \dst\().8h, \src7\().16b, v7.16b
+.endm
+
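+// Second-pass filter setup: the eight taps as signed 16-bit coefficients in
+// v0.8h, for filtering the 16-bit intermediate values.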
+.macro load_qpel_filterh freg, xreg
+ adr \xreg, .Lqpel_filters
+ add \xreg, \xreg, \freg, lsl #3
+ ld1 {v0.8b}, [\xreg]
+ sxtl v0.8h, v0.8b
+.endm
+
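+// Apply the 8-tap filter to 16-bit intermediates, accumulating in 32 bits and
+// narrowing/shifting the result with the given op (sqshrn, sqrshrn, sshr, ...).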
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+ smull \dst\().4s, \src0\().4h, v0.h[0]
+ smlal \dst\().4s, \src1\().4h, v0.h[1]
+ smlal \dst\().4s, \src2\().4h, v0.h[2]
+ smlal \dst\().4s, \src3\().4h, v0.h[3]
+ smlal \dst\().4s, \src4\().4h, v0.h[4]
+ smlal \dst\().4s, \src5\().4h, v0.h[5]
+ smlal \dst\().4s, \src6\().4h, v0.h[6]
+ smlal \dst\().4s, \src7\().4h, v0.h[7]
+.ifc \op, sshr
+ sshr \dst\().4s, \dst\().4s, \shift
+.else
+ \op \dst\().4h, \dst\().4s, \shift
+.endif
+.endm
+
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+ smull2 \dstt\().4s, \src0\().8h, v0.h[0]
+ smlal2 \dstt\().4s, \src1\().8h, v0.h[1]
+ smlal2 \dstt\().4s, \src2\().8h, v0.h[2]
+ smlal2 \dstt\().4s, \src3\().8h, v0.h[3]
+ smlal2 \dstt\().4s, \src4\().8h, v0.h[4]
+ smlal2 \dstt\().4s, \src5\().8h, v0.h[5]
+ smlal2 \dstt\().4s, \src6\().8h, v0.h[6]
+ smlal2 \dstt\().4s, \src7\().8h, v0.h[7]
+.ifc \op, sshr
+ sshr \dst\().4s, \dstt\().4s, \shift
+.else
+ \op \dst\().8h, \dstt\().4s, \shift
+.endif
+.endm
+
+function ff_hevc_put_hevc_qpel_h4_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #8
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b}, [x1], #8
+ ld1 {v17.s}[0], [x1], x2
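+ // calc: shift the previous source window down one byte and insert the next
+ // input byte from v17, building the byte-offset windows for the 8-tap filter.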
+.macro calc src0, src1, idx
+ ushr \src0\().2d, \src1\().2d, #8
+ mov \src0\().b[7], v17.b[\idx]
+.endm
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ ushr v21.2d, v20.2d, #8
+ ushr v22.2d, v21.2d, #8
+ ushr v23.2d, v22.2d, #8
+ ushr v24.2d, v23.2d, #8
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st1 {v28.4h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ mov x14, #(MAX_PB_SIZE * 2 - 8)
+1: ld1 {v16.8b, v17.8b}, [x1], x2
+ // reuses the calc macro defined in qpel_h4 above
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ calc v21, v20, 3
+ calc v22, v21, 4
+ ushr v23.2d, v22.2d, #8
+ ushr v24.2d, v23.2d, #8
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ st1 {v28.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v28.s}[2], [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon, export=1
+ sxtw x4, w4
+ sxtw x7, w7
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x1], x2
+ // reuses the calc macro defined in qpel_h4 above
+ calc v18, v16, 0
+ calc v19, v18, 1
+ calc v20, v19, 2
+ calc v21, v20, 3
+ calc v22, v21, 4
+ calc v23, v22, 5
+ calc v24, v23, 6
+.purgem calc
+ movi v28.8h, #0
+ calc_qpelb v28, v16, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st1 {v28.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h12_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #16
+ mov x14, #(MAX_PB_SIZE * 2 - 16)
+1: ld2 {v16.8b, v17.8b}, [x1], #16
+ ld1 {v27.s}[0], [x1], x2
+ ushr v18.2d, v16.2d, #8
+ ushr v19.2d, v17.2d, #8
+ mov v18.b[7], v27.b[0]
+ mov v19.b[7], v27.b[1]
+ ushr v20.2d, v18.2d, #8
+ ushr v21.2d, v19.2d, #8
+ mov v20.b[7], v27.b[2]
+ mov v21.b[7], v27.b[3]
+ ushr v22.2d, v20.2d, #8
+ ushr v23.2d, v21.2d, #8
+ ushr v24.2d, v22.2d, #8
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ zip1 v16.8h, v28.8h, v29.8h
+ zip2 v17.8h, v28.8h, v29.8h
+ st1 {v16.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v17.4h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #16
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x1], #16
+ ld1 {v27.8b}, [x1], x2
+ ushr v18.2d, v16.2d, #8
+ ushr v19.2d, v17.2d, #8
+ mov v18.b[7], v27.b[0]
+ mov v19.b[7], v27.b[1]
+ ushr v20.2d, v18.2d, #8
+ ushr v21.2d, v19.2d, #8
+ mov v20.b[7], v27.b[2]
+ mov v21.b[7], v27.b[3]
+ ushr v22.2d, v20.2d, #8
+ ushr v23.2d, v21.2d, #8
+ mov v22.b[7], v27.b[4]
+ mov v23.b[7], v27.b[5]
+ ushr v24.2d, v22.2d, #8
+ mov v24.b[7], v27.b[6]
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ subs w3, w3, #1
+ st2 {v28.8h, v29.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #24
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld3 {v16.8b, v17.8b, v18.8b}, [x1], #24
+ ld1 {v27.8b}, [x1], x2
+ ushr v19.2d, v16.2d, #8
+ ushr v20.2d, v17.2d, #8
+ ushr v21.2d, v18.2d, #8
+ mov v19.b[7], v27.b[0]
+ mov v20.b[7], v27.b[1]
+ mov v21.b[7], v27.b[2]
+ ushr v22.2d, v19.2d, #8
+ ushr v23.2d, v20.2d, #8
+ ushr v24.2d, v21.2d, #8
+ mov v22.b[7], v27.b[3]
+ mov v23.b[7], v27.b[4]
+ mov v24.b[7], v27.b[5]
+ ushr v25.2d, v22.2d, #8
+ mov v25.b[7], v27.b[6]
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ subs w3, w3, #1
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], x14
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #32
+ mov x14, #(MAX_PB_SIZE * 2)
+1: ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
+ movi v28.8h, #0
+ movi v29.8h, #0
+ ld1 {v27.8b}, [x1], x2
+ movi v30.8h, #0
+ movi v31.8h, #0
+ ushr v20.2d, v16.2d, #8
+ ushr v21.2d, v17.2d, #8
+ ushr v22.2d, v18.2d, #8
+ ushr v23.2d, v19.2d, #8
+ mov v20.b[7], v27.b[0]
+ mov v21.b[7], v27.b[1]
+ mov v22.b[7], v27.b[2]
+ mov v23.b[7], v27.b[3]
+ ushr v24.2d, v20.2d, #8
+ ushr v25.2d, v21.2d, #8
+ ushr v26.2d, v22.2d, #8
+ mov v24.b[7], v27.b[4]
+ mov v25.b[7], v27.b[5]
+ mov v26.b[7], v27.b[6]
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ calc_qpelb v31, v19, v20, v21, v22, v23, v24, v25, v26
+ st4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #48
+ mov x7, #24
+ mov x14, #80
+1: ld3 {v16.16b, v17.16b, v18.16b}, [x1], x7
+ movi v28.8h, #0
+ ld1 {v26.8b}, [x1], x7
+ movi v29.8h, #0
+ ld1 {v27.8b}, [x1], x2
+ movi v30.8h, #0
+ ushr v19.2d, v16.2d, #8
+ ushr v20.2d, v17.2d, #8
+ ushr v21.2d, v18.2d, #8
+ mov v19.b[7], v26.b[0]
+ mov v19.b[15], v27.b[0]
+ mov v20.b[7], v26.b[1]
+ mov v20.b[15], v27.b[1]
+ mov v21.b[7], v26.b[2]
+ mov v21.b[15], v27.b[2]
+ ushr v22.2d, v19.2d, #8
+ ushr v23.2d, v20.2d, #8
+ ushr v24.2d, v21.2d, #8
+ mov v22.b[7], v26.b[3]
+ mov v22.b[15], v27.b[3]
+ mov v23.b[7], v26.b[4]
+ mov v23.b[15], v27.b[4]
+ mov v24.b[7], v26.b[5]
+ mov v24.b[15], v27.b[5]
+ ushr v25.2d, v22.2d, #8
+ mov v25.b[7], v26.b[6]
+ mov v25.b[15], v27.b[6]
+ calc_qpelb v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb v30, v18, v19, v20, v21, v22, v23, v24, v25
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], #48
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ calc_qpelb2 v28, v16, v17, v18, v19, v20, v21, v22, v23
+ calc_qpelb2 v29, v17, v18, v19, v20, v21, v22, v23, v24
+ calc_qpelb2 v30, v18, v19, v20, v21, v22, v23, v24, v25
+ st3 {v28.8h, v29.8h, v30.8h}, [x0], x14
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
+ load_qpel_filterb x4, x5
+ sub x1, x1, #3
+ sub x2, x2, #64
+ mov x7, #32
+1: ld4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x7
+ ld1 {v27.8b}, [x1], x7
+ ld1 {v28.8b}, [x1], x2
+ ushr v20.2d, v16.2d, #8
+ ushr v21.2d, v17.2d, #8
+ ushr v22.2d, v18.2d, #8
+ ushr v23.2d, v19.2d, #8
+ mov v20.b[7], v27.b[0]
+ mov v21.b[7], v27.b[1]
+ mov v22.b[7], v27.b[2]
+ mov v23.b[7], v27.b[3]
+ mov v20.b[15], v28.b[0]
+ mov v21.b[15], v28.b[1]
+ mov v22.b[15], v28.b[2]
+ mov v23.b[15], v28.b[3]
+ ushr v24.2d, v20.2d, #8
+ ushr v25.2d, v21.2d, #8
+ ushr v26.2d, v22.2d, #8
+ mov v24.b[7], v27.b[4]
+ mov v25.b[7], v27.b[5]
+ mov v26.b[7], v27.b[6]
+ mov v24.b[15], v28.b[4]
+ mov v25.b[15], v28.b[5]
+ mov v26.b[15], v28.b[6]
+.macro calc fn
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ \fn v28, v16, v17, v18, v19, v20, v21, v22, v23
+ \fn v29, v17, v18, v19, v20, v21, v22, v23, v24
+ \fn v30, v18, v19, v20, v21, v22, v23, v24, v25
+ \fn v31, v19, v20, v21, v22, v23, v24, v25, v26
+ st4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
+.endm
+ calc calc_qpelb
+ calc calc_qpelb2
+.purgem calc
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
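+// calc_all (and calc_all2 for register pairs) unroll the vertical loop eight
+// rows deep, rotating the roles of v16-v23 (v16-v31) so only one new row is
+// loaded per iteration; the per-function calc macro loads, filters, stores and
+// updates the height counter whose flags drive the b.eq/b.hi exits.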
+.macro calc_all
+ calc v23, v16, v17, v18, v19, v20, v21, v22, v23
+ b.eq 2f
+ calc v16, v17, v18, v19, v20, v21, v22, v23, v16
+ b.eq 2f
+ calc v17, v18, v19, v20, v21, v22, v23, v16, v17
+ b.eq 2f
+ calc v18, v19, v20, v21, v22, v23, v16, v17, v18
+ b.eq 2f
+ calc v19, v20, v21, v22, v23, v16, v17, v18, v19
+ b.eq 2f
+ calc v20, v21, v22, v23, v16, v17, v18, v19, v20
+ b.eq 2f
+ calc v21, v22, v23, v16, v17, v18, v19, v20, v21
+ b.eq 2f
+ calc v22, v23, v16, v17, v18, v19, v20, v21, v22
+ b.hi 1b
+.endm
+
+.macro calc_all2
+ calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+ b.eq 2f
+ calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+ b.eq 2f
+ calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+ b.eq 2f
+ calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+ b.eq 2f
+ calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+ b.eq 2f
+ calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+ b.eq 2f
+ calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+ b.eq 2f
+ calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+ b.hi 1b
+.endm
+
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.s}[0], [x1], x2
+ ld1 {v17.s}[0], [x1], x2
+ ld1 {v18.s}[0], [x1], x2
+ ld1 {v19.s}[0], [x1], x2
+ ld1 {v20.s}[0], [x1], x2
+ ld1 {v21.s}[0], [x1], x2
+ ld1 {v22.s}[0], [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], x9
+ subs w3, w3, #1
+ b.eq 2f
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 8)
+ sub x1, x1, x2
+ ld1 {v16.8b}, [x1], x2
+ ld1 {v17.8b}, [x1], x2
+ ld1 {v18.8b}, [x1], x2
+ ld1 {v19.8b}, [x1], x2
+ ld1 {v20.8b}, [x1], x2
+ ld1 {v21.8b}, [x1], x2
+ ld1 {v22.8b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], #8
+ st1 {v24.s}[2], [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.8b}, [x1], x2
+ ld1 {v17.8b}, [x1], x2
+ ld1 {v18.8b}, [x1], x2
+ ld1 {v19.8b}, [x1], x2
+ ld1 {v20.8b}, [x1], x2
+ ld1 {v21.8b}, [x1], x2
+ ld1 {v22.8b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 16)
+ sub x1, x1, x2
+ ld1 {v16.16b}, [x1], x2
+ ld1 {v17.16b}, [x1], x2
+ ld1 {v18.16b}, [x1], x2
+ ld1 {v19.16b}, [x1], x2
+ ld1 {v20.16b}, [x1], x2
+ ld1 {v21.16b}, [x1], x2
+ ld1 {v22.16b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v25.4h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b}, [x1], x2
+ ld1 {v17.16b}, [x1], x2
+ ld1 {v18.16b}, [x1], x2
+ ld1 {v19.16b}, [x1], x2
+ ld1 {v20.16b}, [x1], x2
+ ld1 {v21.16b}, [x1], x2
+ ld1 {v22.16b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ subs w3, w3, #1
+ st1 {v24.8h, v25.8h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+// TODO: loads 32 bytes per row although the block is only 24 wide
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+ sub sp, sp, #48
+ st1 {v8.16b, v9.16b, v10.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h, v9.8h, v10.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.16b, v9.16b, v10.16b}, [sp], #48
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
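+// qpel_v48 processes the block as two 24-column halves.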
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+ stp x5, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x5, [sp]
+ add x0, x0, #48
+ add x1, x1, #24
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+0: mov x8, x1 // src
+ ld1 {v16.16b, v17.16b}, [x8], x2
+ mov w11, w3 // height
+ ld1 {v18.16b, v19.16b}, [x8], x2
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x2
+ ld1 {v22.16b, v23.16b}, [x8], x2
+ ld1 {v24.16b, v25.16b}, [x8], x2
+ ld1 {v26.16b, v27.16b}, [x8], x2
+ ld1 {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x8], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs x11, x11, #1
+ st1 {v8.8h-v11.8h}, [x10], x9
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #64
+ add x1, x1, #32
+ subs w6, w6, #32
+ b.hi 0b
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
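+// The hv variants run the matching h function into a temporary buffer on the
+// stack (row stride MAX_PB_SIZE * 2 bytes) and then filter the 16-bit
+// intermediates vertically with calc_qpelh/calc_qpelh2.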
+function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x5, x30, [sp], #16
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.4h}, [sp], x7
+ ld1 {v17.4h}, [sp], x7
+ ld1 {v18.4h}, [sp], x7
+ ld1 {v19.4h}, [sp], x7
+ ld1 {v20.4h}, [sp], x7
+ ld1 {v21.4h}, [sp], x7
+ ld1 {v22.4h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ subs w3, w3, #1
+ st1 {v1.4h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x8, #120
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h}, [sp], x7
+ ld1 {v17.8h}, [sp], x7
+ ld1 {v18.8h}, [sp], x7
+ ld1 {v19.8h}, [sp], x7
+ ld1 {v20.8h}, [sp], x7
+ ld1 {v21.8h}, [sp], x7
+ ld1 {v22.8h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ st1 {v1.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v1.s}[2], [x0], x8
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h}, [sp], x7
+ ld1 {v17.8h}, [sp], x7
+ ld1 {v18.8h}, [sp], x7
+ ld1 {v19.8h}, [sp], x7
+ ld1 {v20.8h}, [sp], x7
+ ld1 {v21.8h}, [sp], x7
+ ld1 {v22.8h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ mov x8, #112
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ st1 {v1.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v2.4h}, [x0], x8
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x3, x3, #7
+ add x0, sp, #32
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h, v2.8h}, [x0], x7
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, #64
+ add w10, w3, #7
+ st1 {v12.16b-v15.16b}, [sp]
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h24_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ ld1 {v26.8h-v28.8h}, [sp], x7
+1: ld1 {v29.8h-v31.8h}, [sp], x7
+ calc_qpelh v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+ calc_qpelh2 v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+ calc_qpelh v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+ calc_qpelh2 v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+ calc_qpelh v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+ calc_qpelh2 v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v8.8h-v10.8h}, [sp], x7
+ calc_qpelh v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+ calc_qpelh2 v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+ calc_qpelh v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+ calc_qpelh2 v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+ calc_qpelh v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+ calc_qpelh2 v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v11.8h-v13.8h}, [sp], x7
+ calc_qpelh v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+ calc_qpelh2 v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+ calc_qpelh v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+ calc_qpelh2 v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+ calc_qpelh v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+ calc_qpelh2 v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v14.8h-v16.8h}, [sp], x7
+ calc_qpelh v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+ calc_qpelh2 v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+ calc_qpelh v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+ calc_qpelh2 v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+ calc_qpelh v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+ calc_qpelh2 v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v17.8h-v19.8h}, [sp], x7
+ calc_qpelh v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+ calc_qpelh2 v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+ calc_qpelh v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+ calc_qpelh2 v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+ calc_qpelh v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+ calc_qpelh2 v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v20.8h-v22.8h}, [sp], x7
+ calc_qpelh v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+ calc_qpelh2 v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+ calc_qpelh v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+ calc_qpelh2 v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+ calc_qpelh v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+ calc_qpelh2 v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v23.8h-v25.8h}, [sp], x7
+ calc_qpelh v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+ calc_qpelh2 v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+ calc_qpelh v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+ calc_qpelh2 v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+ calc_qpelh v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+ calc_qpelh2 v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.eq 2f
+
+ ld1 {v26.8h-v28.8h}, [sp], x7
+ calc_qpelh v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+ calc_qpelh2 v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+ calc_qpelh v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+ calc_qpelh2 v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+ calc_qpelh v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+ calc_qpelh2 v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h-v3.8h}, [x0], x7
+ b.hi 1b
+2: ld1 {v12.16b-v15.16b}, [sp], #64
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
+ add w10, w3, #7
+ sub x1, x1, x2, lsl #1
+ lsl x10, x10, #7
+ sub x1, x1, x2
+ sub sp, sp, x10 // tmp_array
+ stp x0, x3, [sp, #-16]!
+ add x3, x3, #7
+ stp x5, x30, [sp, #-16]!
+ add x0, sp, #32
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x5, x30, [sp], #16
+ mov x7, #128
+ ldp x0, x3, [sp], #16
+ load_qpel_filterh x5, x4
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x7
+ mov w9, w3 // height
+ ld1 {v18.8h, v19.8h}, [x8], x7
+ mov x5, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x7
+ ld1 {v22.8h, v23.8h}, [x8], x7
+ ld1 {v24.8h, v25.8h}, [x8], x7
+ ld1 {v26.8h, v27.8h}, [x8], x7
+ ld1 {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs x9, x9, #1
+ st1 {v1.8h, v2.8h}, [x5], x7
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32
+ add sp, sp, #32
+ subs w6, w6, #16
+ b.hi 0b
+ add w10, w3, #6
+ add sp, sp, #64 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x1, x1, #24
+ add x0, x0, #48
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x1, x1, #32
+ add x0, x0, #64
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
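+// The uni variants write 8-bit pixels directly to dst instead of 16-bit
+// intermediates; the h and v versions round with sqrshrun #6.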
+function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+.macro calc op, src
+ \op v20.8h, v16.8b, v\src\().8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], v17.b[\src]
+.endm
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+// calc is deliberately not purged here; qpel_uni_h6/h8 below reuse it
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ subs w4, w4, #1
+ st1 {v20.s}[0], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+ sub x1, x1, #4
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+// reuses the calc macro defined in qpel_uni_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ st1 {v20.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v20.h}[2], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+// reuses the calc macro defined in qpel_uni_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+.purgem calc
+ umlsl v20.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ subs w4, w4, #1
+ st1 {v20.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+ sub x1, x1, #8
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr w12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+.if \tail-1
+ ushr \r1\().2d, \r1\().2d, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ mov v16.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ mov v17.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ mov v16.b[7], w12
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7
+ calc umlal, umlsl, v21, v17, v16, v6, v7, 1
+.purgem calc
+ zip1 v16.8h, v20.8h, v21.8h
+ zip2 v17.8h, v20.8h, v21.8h
+ sqrshrun v20.8b, v16.8h, #6
+ sqrshrun2 v20.16b, v17.8h, #6
+ st1 {v20.8b}, [x0], #8
+ add x2, x2, x3
+ st1 {v20.s}[2], [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld2 {v16.8b, v17.8b}, [x2]
+ ldr x12, [x2, #16]
+ movi v20.8h, #0
+ movi v21.8h, #0
+.macro calc op1, op2, dst, r0, r1, src0, src1, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7, 1
+.purgem calc
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ st2 {v20.8b, v21.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld3 {v16.8b-v18.8b}, [x2]
+ ldr x12, [x2, #24]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, src0, src1, src2
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst\().8h, \r2\().8b, \src2\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ lsr x12, x12, #8
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ st3 {v20.8b-v22.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld4 {v16.8b-v19.8b}, [x2]
+ ldr x12, [x2, #32]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst\().8h, \r3\().8b, \src3\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
+.purgem calc
+ umlal v23.8h, v19.8b, v4.8b
+ sqrshrun v20.8b, v20.8h, #6
+ umlsl v23.8h, v16.8b, v5.8b
+ sqrshrun v21.8b, v21.8h, #6
+ umlal v23.8h, v17.8b, v6.8b
+ sqrshrun v22.8b, v22.8h, #6
+ umlsl v23.8h, v18.8b, v7.8b
+ sqrshrun v23.8b, v23.8h, #6
+ st4 {v20.8b-v23.8b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld3 {v16.16b-v18.16b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ ldr x12, [x2, #24]
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+ ldr x13, [x2, #48]
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst0\().8h, \r2\().8b, \src2\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ umlsl2 \dst1\().8h, \r2\().16b, \src2\().16b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endm
+ calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
+.purgem calc
+.macro calc r0, r1, r2, r3
+ umlal \r0\().8h, \r2\().8b, v6.8b
+ umlsl \r0\().8h, \r3\().8b, v7.8b
+ umlal2 \r1\().8h, \r2\().16b, v6.16b
+ umlsl2 \r1\().8h, \r3\().16b, v7.16b
+.endm
+ calc v20, v23, v16, v17
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ mov v16.b[15], w13
+ calc v21, v24, v17, v18
+ calc v22, v25, v18, v16
+.purgem calc
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun2 v20.16b, v23.8h, #6
+ sqrshrun2 v21.16b, v24.8h, #6
+ sqrshrun2 v22.16b, v25.8h, #6
+ st3 {v20.16b-v22.16b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
+ load_qpel_filterb x5, x6
+ sub x2, x2, #3
+1: ld4 {v16.16b-v19.16b}, [x2]
+ ldr x12, [x2, #32]
+ ldr x13, [x2, #64]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+ movi v26.8h, #0
+ movi v27.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst0\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst0\().8h, \r3\().8b, \src3\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ \op1\()2 \dst1\().8h, \r2\().16b, \src2\().16b
+ \op2\()2 \dst1\().8h, \r3\().16b, \src3\().16b
+.if \tail-1
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
+ calc umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
+.purgem calc
+ sqrshrun v20.8b, v20.8h, #6
+ sqrshrun v21.8b, v21.8h, #6
+ sqrshrun v22.8b, v22.8h, #6
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun2 v20.16b, v24.8h, #6
+ sqrshrun2 v21.16b, v25.8h, #6
+ sqrshrun2 v22.16b, v26.8h, #6
+ sqrshrun2 v23.16b, v27.8h, #6
+ st4 {v20.16b-v23.16b}, [x0], x1
+ add x2, x2, x3
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v17.s}[0], [x2], x3
+ ld1 {v18.s}[0], [x2], x3
+ ld1 {v19.s}[0], [x2], x3
+ ld1 {v20.s}[0], [x2], x3
+ ld1 {v21.s}[0], [x2], x3
+ ld1 {v22.s}[0], [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ subs w4, w4, #1
+ st1 {v24.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x1, x1, #4
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ st1 {v24.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v24.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ subs w4, w4, #1
+ st1 {v24.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x1, x1, #8
+ sub x2, x2, x3
+0: mov x8, x2 // src
+ ld1 {v16.16b}, [x8], x3
+ mov w11, w4 // height
+ ld1 {v17.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v18.16b}, [x8], x3
+ ld1 {v19.16b}, [x8], x3
+ ld1 {v20.16b}, [x8], x3
+ ld1 {v21.16b}, [x8], x3
+ ld1 {v22.16b}, [x8], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x8], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ sqrshrun2 v24.16b, v25.8h, #6
+ st1 {v24.8b}, [x10], #8
+ subs x11, x11, #1
+ st1 {v24.s}[2], [x10], x1
+.endm
+1: calc_all
+.purgem calc
+2: add x0, x0, #12
+ add x2, x2, #12
+ subs w7, w7, #12
+ b.ne 0b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+ load_qpel_filterb x6, x5
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+0: mov x8, x2 // src
+ ld1 {v16.16b}, [x8], x3
+ mov w11, w4 // height
+ ld1 {v17.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v18.16b}, [x8], x3
+ ld1 {v19.16b}, [x8], x3
+ ld1 {v20.16b}, [x8], x3
+ ld1 {v21.16b}, [x8], x3
+ ld1 {v22.16b}, [x8], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x8], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ sqrshrun v24.8b, v24.8h, #6
+ sqrshrun2 v24.16b, v25.8h, #6
+ subs x11, x11, #1
+ st1 {v24.16b}, [x10], x1
+.endm
+1: calc_all
+.purgem calc
+2: add x0, x0, #16
+ add x2, x2, #16
+ subs w7, w7, #16
+ b.ne 0b
+ ret
+endfunc
+
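+// Widths 24 and up reuse the 12/16-column loops above, which iterate across
+// the full width passed in w7.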
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ ld1 {v16.4h}, [sp], x9
+ ld1 {v17.4h}, [sp], x9
+ ld1 {v18.4h}, [sp], x9
+ ld1 {v19.4h}, [sp], x9
+ ld1 {v20.4h}, [sp], x9
+ ld1 {v21.4h}, [sp], x9
+ ld1 {v22.4h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ sqxtun v1.8b, v1.8h
+ subs w4, w4, #1
+ st1 {v1.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ sub x1, x1, #4
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ st1 {v1.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v1.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x30, xzr, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x30, xzr, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ subs w4, w4, #1
+ st1 {v1.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ mov x2, x3
+ add x0, sp, #48
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x6, x5
+ sub x1, x1, #8
+ ld1 {v16.8h, v17.8h}, [sp], x9
+ ld1 {v18.8h, v19.8h}, [sp], x9
+ ld1 {v20.8h, v21.8h}, [sp], x9
+ ld1 {v22.8h, v23.8h}, [sp], x9
+ ld1 {v24.8h, v25.8h}, [sp], x9
+ ld1 {v26.8h, v27.8h}, [sp], x9
+ ld1 {v28.8h, v29.8h}, [sp], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
+ sqxtun v1.8b, v1.8h
+ sqxtun2 v1.16b, v2.8h
+ st1 {v1.8b}, [x0], #8
+ subs w4, w4, #1
+ st1 {v1.s}[2], [x0], x1
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
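+// Vertical pass shared with the uni_hv 32/48/64 widths: filters the 16-bit
+// tmp array in 16-column strips, iterating across the width in w7.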
+.Lqpel_uni_hv16_loop:
+ mov x9, #(MAX_PB_SIZE * 2)
+ load_qpel_filterh x6, x5
+ sub w12, w9, w7, lsl #1
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x9
+ mov w11, w4 // height
+ ld1 {v18.8h, v19.8h}, [x8], x9
+ mov x10, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x9
+ ld1 {v22.8h, v23.8h}, [x8], x9
+ ld1 {v24.8h, v25.8h}, [x8], x9
+ ld1 {v26.8h, v27.8h}, [x8], x9
+ ld1 {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
+ sqxtun v1.8b, v1.8h
+ subs x11, x11, #1
+ sqxtun2 v1.16b, v2.8h
+ st1 {v1.16b}, [x10], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #16
+ add sp, sp, #32
+ subs w7, w7, #16
+ b.ne 0b
+ add w10, w4, #6
+ add sp, sp, x12 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
+ stp x6, x30, [sp, #-16]!
+ mov x7, #16
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ add x2, x2, #16
+ ldp x0, x1, [sp], #16
+ mov x7, #8
+ add x0, x0, #16
+ ldr x6, [sp]
+ bl X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ add x0, sp, #48
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ mov x2, x3
+ add x0, sp, #48
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h48_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
+ add w10, w4, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w4, #7
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_qpel_h64_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ b .Lqpel_uni_hv16_loop
+endfunc
+
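+// The qpel_bi_h functions filter the 8-bit source horizontally, add the
+// 16-bit bi-prediction plane from src2 (x4, stride MAX_PB_SIZE * 2) with
+// saturation, and narrow to 8-bit with a rounding shift of 7.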
+function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+.macro calc op, idx
+ \op v20.8h, v16.8b, v\idx\().8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], v17.b[\idx]
+.endm
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ subs w5, w5, #1
+ st1 {v16.s}[0], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ sub x1, x1, #4
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+        // reuse the calc macro defined in qpel_bi_h4 above
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ st1 {v16.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v16.h}[2], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld1 {v16.8b, v17.8b}, [x2], x3
+ movi v20.8h, #0
+        // reuse the calc macro from qpel_bi_h4; it is purged after this use
+ calc umlsl, 0
+ calc umlal, 1
+ calc umlsl, 2
+ calc umlal, 3
+ calc umlal, 4
+ calc umlsl, 5
+ calc umlal, 6
+ umlsl v20.8h, v16.8b, v7.8b
+.purgem calc
+ ld1 {v24.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqrshrun v16.8b, v16.8h, #7
+ subs w5, w5, #1
+ st1 {v16.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
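+// From width 12 upwards the source is loaded de-interleaved (ld2/ld3/ld4),
+// so whole vectors are multiplied by single filter taps; the bytes beyond
+// the vector load are fetched into w12 (and w13 for the 16-byte variants)
+// and shifted in through the top lane (ushr #8, then mov into b[7]/b[15]).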
+function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ sub x1, x1, #8
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr w12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+.if \tail-1
+ ushr \r1\().2d, \r1\().2d, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ mov v16.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ mov v17.b[7], w12
+ lsr x12, x12, #8
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ mov v16.b[7], w12
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7
+ calc umlal, umlsl, v21, v17, v16, v6, v7, 1
+.purgem calc
+ ld2 {v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ zip1 v16.16b, v16.16b, v17.16b
+ st1 {v16.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v16.s}[2], [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld2 {v16.8b, v17.8b}, [x2]
+ movi v20.8h, #0
+ ldr x12, [x2, #16]
+ movi v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+ ushr \r1\().2d, \r1\().2d, #8
+ mov \r1\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v0, v1
+ calc umlsl, umlal, v21, v17, v16, v0, v1
+ calc umlsl, umlal, v20, v16, v17, v2, v3
+ calc umlsl, umlal, v21, v17, v16, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v4, v5
+ calc umlal, umlsl, v21, v17, v16, v4, v5
+ calc umlal, umlsl, v20, v16, v17, v6, v7, 1
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v16.8b, v7.8b
+.purgem calc
+ ld2 {v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ subs w5, w5, #1
+ st2 {v16.8b, v17.8b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h24_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld3 {v16.8b-v18.8b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ ldr x12, [x2, #24]
+ movi v22.8h, #0
+.macro calc op1, op2, r0, r1, r2, r3, src0, src1, src2, tail=0
+ \op1 \r0\().8h, \r1\().8b, \src0\().8b
+ \op2 \r0\().8h, \r2\().8b, \src1\().8b
+ umlsl \r0\().8h, \r3\().8b, \src2\().8b
+ ushr \r1\().2d, \r1\().2d, #8
+ mov \r1\().b[7], w12
+ lsr x12, x12, #8
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ ld3 {v23.8h, v24.8h, v25.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v23.8h
+ sqadd v17.8h, v21.8h, v24.8h
+ sqadd v18.8h, v22.8h, v25.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ subs w5, w5, #1
+ st3 {v16.8b, v17.8b, v18.8b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h32_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+ mov x10, #(MAX_PB_SIZE * 2)
+1: ld4 {v16.8b-v19.8b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ ldr x12, [x2, #32]
+ movi v23.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst\().8h, \r3\().8b, \src3\().8b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+.if \tail-1
+ lsr x12, x12, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
+.purgem calc
+ umlal v23.8h, v19.8b, v4.8b
+ umlsl v23.8h, v16.8b, v5.8b
+ umlal v23.8h, v17.8b, v6.8b
+ umlsl v23.8h, v18.8b, v7.8b
+ ld4 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+ sqadd v16.8h, v20.8h, v24.8h
+ sqadd v17.8h, v21.8h, v25.8h
+ sqadd v18.8h, v22.8h, v26.8h
+ sqadd v19.8h, v23.8h, v27.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ sqrshrun v19.8b, v19.8h, #7
+ st4 {v16.8b-v19.8b}, [x0], x1
+ add x2, x2, x3
+ subs w5, w5, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h48_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
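+        // src2 row stride is MAX_PB_SIZE * 2 = 128 bytes; 48 bytes are
+        // consumed by the first ld3 below, so advance by the remaining 80.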
+ mov x10, #80
+1: ld3 {v16.16b-v18.16b}, [x2]
+ ldr x12, [x2, #24]
+ ldr x13, [x2, #48]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ movi v24.8h, #0
+ movi v25.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ umlsl \dst0\().8h, \r2\().8b, \src2\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ umlsl2 \dst1\().8h, \r2\().16b, \src2\().16b
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ mov \r0\().b[15], w13
+ lsr x12, x12, #8
+ lsr x13, x13, #8
+.endm
+ calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
+ calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
+ calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
+ calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
+ calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
+ calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
+.purgem calc
+ umlal v20.8h, v16.8b, v6.8b
+ umlsl v20.8h, v17.8b, v7.8b
+ umlal2 v23.8h, v16.16b, v6.16b
+ umlsl2 v23.8h, v17.16b, v7.16b
+ ushr v16.2d, v16.2d, #8
+ mov v16.b[7], w12
+ mov v16.b[15], w13
+ umlal v21.8h, v17.8b, v6.8b
+ umlsl v21.8h, v18.8b, v7.8b
+ umlal2 v24.8h, v17.16b, v6.16b
+ umlsl2 v24.8h, v18.16b, v7.16b
+ umlal v22.8h, v18.8b, v6.8b
+ umlsl v22.8h, v16.8b, v7.8b
+ umlal2 v25.8h, v18.16b, v6.16b
+ umlsl2 v25.8h, v16.16b, v7.16b
+ ld3 {v26.8h, v27.8h, v28.8h}, [x4], #48
+ sqadd v16.8h, v20.8h, v26.8h
+ sqadd v17.8h, v21.8h, v27.8h
+ sqadd v18.8h, v22.8h, v28.8h
+ ld3 {v26.8h, v27.8h, v28.8h}, [x4], x10
+ sqadd v19.8h, v23.8h, v26.8h
+ sqadd v20.8h, v24.8h, v27.8h
+ sqadd v21.8h, v25.8h, v28.8h
+ sqrshrun v16.8b, v16.8h, #7
+ sqrshrun v17.8b, v17.8h, #7
+ sqrshrun v18.8b, v18.8h, #7
+ sqrshrun2 v16.16b, v19.8h, #7
+ sqrshrun2 v17.16b, v20.8h, #7
+ sqrshrun2 v18.16b, v21.8h, #7
+ subs w5, w5, #1
+ st3 {v16.16b-v18.16b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h64_8_neon, export=1
+ load_qpel_filterb x6, x7
+ sub x2, x2, #3
+1: ld4 {v16.16b-v19.16b}, [x2]
+ movi v20.8h, #0
+ movi v21.8h, #0
+ movi v22.8h, #0
+ movi v23.8h, #0
+ ldr x12, [x2, #32]
+ movi v24.8h, #0
+ movi v25.8h, #0
+ ldr x13, [x2, #64]
+ movi v26.8h, #0
+ movi v27.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+ \op1 \dst0\().8h, \r0\().8b, \src0\().8b
+ \op2 \dst0\().8h, \r1\().8b, \src1\().8b
+ \op1 \dst0\().8h, \r2\().8b, \src2\().8b
+ \op2 \dst0\().8h, \r3\().8b, \src3\().8b
+ \op1\()2 \dst1\().8h, \r0\().16b, \src0\().16b
+ \op2\()2 \dst1\().8h, \r1\().16b, \src1\().16b
+ \op1\()2 \dst1\().8h, \r2\().16b, \src2\().16b
+ \op2\()2 \dst1\().8h, \r3\().16b, \src3\().16b
+.if \tail-1
+ ushr \r0\().2d, \r0\().2d, #8
+ mov \r0\().b[7], w12
+ lsr x12, x12, #8
+ mov \r0\().b[15], w13
+ lsr x13, x13, #8
+.endif
+.endm
+ calc umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
+ calc umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
+ calc umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
+ calc umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
+ calc umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
+ calc umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
+ calc umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
+ calc umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
+.purgem calc
+ ld4 {v28.8h-v31.8h}, [x4], #64
+ sqadd v20.8h, v20.8h, v28.8h
+ sqadd v21.8h, v21.8h, v29.8h
+ sqadd v22.8h, v22.8h, v30.8h
+ sqadd v23.8h, v23.8h, v31.8h
+ ld4 {v28.8h-v31.8h}, [x4], #64
+ sqadd v24.8h, v24.8h, v28.8h
+ sqadd v25.8h, v25.8h, v29.8h
+ sqadd v26.8h, v26.8h, v30.8h
+ sqadd v27.8h, v27.8h, v31.8h
+ sqrshrun v16.8b, v20.8h, #7
+ sqrshrun v17.8b, v21.8h, #7
+ sqrshrun v18.8b, v22.8h, #7
+ sqrshrun v19.8b, v23.8h, #7
+ sqrshrun2 v16.16b, v24.8h, #7
+ sqrshrun2 v17.16b, v25.8h, #7
+ sqrshrun2 v18.16b, v26.8h, #7
+ sqrshrun2 v19.16b, v27.8h, #7
+ subs w5, w5, #1
+ st4 {v16.16b-v19.16b}, [x0], x1
+ add x2, x2, x3
+ b.ne 1b
+ ret
+endfunc
+
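+// The qpel_bi_v functions apply the 8-tap filter vertically on 8-bit rows:
+// seven rows are preloaded, the calc macro loads one more row per output
+// line, and the result is combined with src2 exactly as in the bi_h case
+// (saturating add, then a rounding shift of 7).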
+function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v17.s}[0], [x2], x3
+ ld1 {v18.s}[0], [x2], x3
+ ld1 {v19.s}[0], [x2], x3
+ ld1 {v20.s}[0], [x2], x3
+ ld1 {v21.s}[0], [x2], x3
+ ld1 {v22.s}[0], [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.4h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ subs w5, w5, #1
+ st1 {v25.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v16.8b}, [x2], x3
+ sub x1, x1, #4
+ ld1 {v17.8b}, [x2], x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ st1 {v25.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v25.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v19.8b}, [x2], x3
+ ld1 {v20.8b}, [x2], x3
+ ld1 {v21.8b}, [x2], x3
+ ld1 {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x2], x3
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v25.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v25.8h
+ sqrshrun v25.8b, v24.8h, #7
+ subs w5, w5, #1
+ st1 {v25.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ sub x1, x1, #8
+ ld1 {v16.16b}, [x2], x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+ ld1 {v19.16b}, [x2], x3
+ ld1 {v20.16b}, [x2], x3
+ ld1 {v21.16b}, [x2], x3
+ ld1 {v22.16b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x2], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v26.8h, v27.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v26.8h
+ sqadd v25.8h, v25.8h, v27.8h
+ sqrshrun v26.8b, v24.8h, #7
+ sqrshrun2 v26.16b, v25.8h, #7
+ st1 {v26.8b}, [x0], #8
+ subs w5, w5, #1
+ st1 {v26.s}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
+ load_qpel_filterb x7, x6
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ mov x12, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x2], x3
+ ld1 {v19.16b}, [x2], x3
+ ld1 {v20.16b}, [x2], x3
+ ld1 {v21.16b}, [x2], x3
+ ld1 {v22.16b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x2], x3
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ ld1 {v26.8h, v27.8h}, [x4], x12 // src2
+ sqadd v24.8h, v24.8h, v26.8h
+ sqadd v25.8h, v25.8h, v27.8h
+ sqrshrun v26.8b, v24.8h, #7
+ subs w5, w5, #1
+ sqrshrun2 v26.16b, v25.8h, #7
+ st1 {v26.16b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
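+// Width 24: a 16-wide call followed by an 8-wide call with dst/src advanced
+// by 16 pixels and src2 by 32 bytes.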
+function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
+ stp x7, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x7, [sp]
+ add x0, x0, #16
+ add x2, x2, #16
+ add x4, x4, #32
+ bl X(ff_hevc_put_hevc_qpel_bi_v8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
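+// Width 32 (also used for 48 and 64) works in strips of 32 columns. The
+// callee-saved v8-v15 are spilled and then used for the accumulators and
+// the src2 rows; the width is read from the stack (the ninth argument),
+// offset by the 128 bytes just pushed.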
+function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
+ sub sp, sp, #64
+ st1 {v12.16b-v15.16b}, [sp]
+ sub x2, x2, x3, lsl #1
+ sub sp, sp, #64
+ st1 {v8.16b-v11.16b}, [sp]
+ sub x2, x2, x3
+ load_qpel_filterb x7, x6
+ ldr w6, [sp, #128]
+ mov x12, #(MAX_PB_SIZE * 2)
+0: mov x8, x2 // src
+ ld1 {v16.16b, v17.16b}, [x8], x3
+ mov w11, w5 // height
+ ld1 {v18.16b, v19.16b}, [x8], x3
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x3
+ mov x9, x4 // src2
+ ld1 {v22.16b, v23.16b}, [x8], x3
+ ld1 {v24.16b, v25.16b}, [x8], x3
+ ld1 {v26.16b, v27.16b}, [x8], x3
+ ld1 {v28.16b, v29.16b}, [x8], x3
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x3
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12 // src2
+ sqadd v8.8h, v8.8h, v12.8h
+ sqadd v9.8h, v9.8h, v13.8h
+ sqadd v10.8h, v10.8h, v14.8h
+ sqadd v11.8h, v11.8h, v15.8h
+ sqrshrun v12.8b, v8.8h, #7
+ sqrshrun2 v12.16b, v9.8h, #7
+ sqrshrun v13.8b, v10.8h, #7
+ sqrshrun2 v13.16b, v11.8h, #7
+ subs x11, x11, #1
+ st1 {v12.16b, v13.16b}, [x10], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32 // dst
+ add x2, x2, #32 // src
+ add x4, x4, #64 // src2
+ subs w6, w6, #32
+ b.ne 0b
+ ld1 {v8.16b-v11.16b}, [sp], #64
+ ld1 {v12.16b-v15.16b}, [sp], #64
+ ret
+endfunc
+
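+// Width 48: run the 32-column strip loop with a width of 32 passed on the
+// stack (the stp below becomes its stack argument), then a 16-wide call for
+// the rest. Width 64 tail-calls the strip loop directly, so the caller's
+// stack width of 64 makes it iterate twice.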
+function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
+ stp x7, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ mov x8, #32
+ stp x8, x8, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+ ldp x8, xzr, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ ldr x7, [sp]
+ add x0, x0, #32
+ add x2, x2, #32
+ add x4, x4, #64
+ bl X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
+ b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+endfunc
+
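+// The qpel_bi_hv functions mirror the uni_hv layout: a horizontal pass into
+// a stack temporary of (height + 7) rows, then a vertical pass that keeps
+// 32-bit precision, widens and adds the src2 row (saddw), and narrows with
+// a rounding shift of 7 before saturating to 8-bit.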
+function ff_hevc_put_hevc_qpel_bi_hv4_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ ld1 {v16.4h}, [sp], x9
+ ld1 {v17.4h}, [sp], x9
+ ld1 {v18.4h}, [sp], x9
+ ld1 {v19.4h}, [sp], x9
+ ld1 {v20.4h}, [sp], x9
+ ld1 {v21.4h}, [sp], x9
+ ld1 {v22.4h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.4h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ rshrn v1.4h, v1.4s, #7
+ sqxtun v1.8b, v1.8h
+ subs w5, w5, #1
+ st1 {v1.s}[0], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv6_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ sub x1, x1, #4
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.8h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ sqxtun v1.8b, v1.8h
+ st1 {v1.s}[0], [x0], #4
+ subs w5, w5, #1
+ st1 {v1.h}[2], [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv8_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ sub x1, x2, x3, lsl #1
+ stp x7, x30, [sp, #-16]!
+ sub x1, x1, x3
+ add x0, sp, #48
+ mov x2, x3
+ add x3, x5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ mov x9, #(MAX_PB_SIZE * 2)
+ ldp x0, x1, [sp], #16
+ load_qpel_filterh x7, x6
+ ld1 {v16.8h}, [sp], x9
+ ld1 {v17.8h}, [sp], x9
+ ld1 {v18.8h}, [sp], x9
+ ld1 {v19.8h}, [sp], x9
+ ld1 {v20.8h}, [sp], x9
+ ld1 {v21.8h}, [sp], x9
+ ld1 {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ ld1 {v5.8h}, [x4], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ sqxtun v1.8b, v1.8h
+ subs w5, w5, #1
+ st1 {v1.8b}, [x0], x1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv12_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x6, x7, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+ ldp x6, x7, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ add x4, x4, #16
+ ldp x0, x1, [sp], #16
+ add x2, x2, #8
+ add x0, x0, #8
+ bl X(ff_hevc_put_hevc_qpel_bi_hv4_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
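+// Widths 16, 32, 48 and 64 share the strip loop at .Lqpel_bi_hv16_loop
+// (24 is handled as 16 + 8); x6 carries the width into the loop.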
+function ff_hevc_put_hevc_qpel_bi_hv16_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ sub x1, x1, x3
+ mov x2, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #16 // width
+.Lqpel_bi_hv16_loop:
+ load_qpel_filterh x7, x8
+ mov x9, #(MAX_PB_SIZE * 2)
+ mov x10, x6
+
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x9
+ mov w11, w5 // height
+ ld1 {v18.8h, v19.8h}, [x8], x9
+ mov x12, x4 // src2
+ ld1 {v20.8h, v21.8h}, [x8], x9
+ mov x7, x0 // dst
+ ld1 {v22.8h, v23.8h}, [x8], x9
+ ld1 {v24.8h, v25.8h}, [x8], x9
+ ld1 {v26.8h, v27.8h}, [x8], x9
+ ld1 {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+ calc_qpelh v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
+ calc_qpelh2 v4, v4, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
+ ld1 {v5.8h, v6.8h}, [x12], x9 // src2
+ saddw v1.4s, v1.4s, v5.4h
+ saddw2 v2.4s, v2.4s, v5.8h
+ saddw v3.4s, v3.4s, v6.4h
+ saddw2 v4.4s, v4.4s, v6.8h
+ rshrn v1.4h, v1.4s, #7
+ rshrn2 v1.8h, v2.4s, #7
+ rshrn v2.4h, v3.4s, #7
+ rshrn2 v2.8h, v4.4s, #7
+ sqxtun v1.8b, v1.8h
+ sqxtun2 v1.16b, v2.8h
+ subs x11, x11, #1
+ st1 {v1.16b}, [x7], x1
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #16
+ add sp, sp, #32
+ subs x10, x10, #16
+ add x4, x4, #32
+ b.ne 0b
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub x10, x10, x6, lsl #1 // part of first line
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv24_8_neon, export=1
+ stp xzr, x30, [sp, #-16]!
+ stp x0, x1, [sp, #-16]!
+ stp x2, x3, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x6, x7, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_qpel_bi_hv16_8_neon)
+ ldp x6, x7, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x2, x3, [sp], #16
+ ldp x0, x1, [sp], #16
+ add x4, x4, #32
+ add x2, x2, #16
+ add x0, x0, #16
+ bl X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv32_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #32 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv48_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h48_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #48 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv64_8_neon, export=1
+ add w10, w5, #7
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x5, [sp, #-16]!
+ stp x7, x30, [sp, #-16]!
+ add x0, sp, #48
+ sub x1, x2, x3, lsl #1
+ mov x2, x3
+ sub x1, x1, x3
+ add w3, w5, #7
+ mov x4, x6
+ bl X(ff_hevc_put_hevc_qpel_h64_8_neon)
+ ldp x7, x30, [sp], #16
+ ldp x4, x5, [sp], #16
+ ldp x0, x1, [sp], #16
+ mov x6, #64 // width
+ b .Lqpel_bi_hv16_loop
+endfunc
--
2.32.0 (Apple Git-132)