[FFmpeg-devel] [PATCH] lavc/aarch64: add hevc horizontal qpel/uni/bi
Martin Storsjö
martin at martin.st
Wed May 25 13:17:27 EEST 2022
On Tue, 24 May 2022, J. Dekker wrote:
> libavcodec/aarch64/Makefile | 1 +
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 43 +-
> libavcodec/aarch64/hevcdsp_qpel_neon.S | 520 ++++++++++++++++++++++
> 3 files changed, 563 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
Overall comment: this now looks much more straightforward than before,
that's good! Some inline comments below.
> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> new file mode 100644
> index 0000000000..bbaa32a9d9
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> @@ -0,0 +1,520 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +#define MAX_PB_SIZE 64
> +
> +const qpel_filters, align=4
> + .byte 0, 0, 0, 0, 0, 0, 0, 0
> + .byte -1, 4,-10, 58, 17, -5, 1, 0
> + .byte -1, 4,-11, 40, 40,-11, 4, -1
> + .byte 0, 1, -5, 17, 58,-10, 4, -1
> +endconst
> +
> +.macro load_qpel_filter m
> + movrel x15, qpel_filters
> + add x15, x15, \m, lsl #3
> + ld1 {v0.8b}, [x15]
> + sxtl v0.8h, v0.8b
> +.endm
> +
> +// void put_hevc_qpel_h(int16_t *dst,
> +// uint8_t *_src, ptrdiff_t _srcstride,
> +// int height, intptr_t mx, intptr_t my, int width)
> +
> +// void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
> +// uint8_t *_src, ptrdiff_t _srcstride,
> +// int height, intptr_t mx, intptr_t my, int width)
> +
> +// void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
> +// uint8_t *_src, ptrdiff_t _srcstride,
> +// int16_t *src2, int height, intptr_t mx,
> +// intptr_t my, int width)
> +
> +.macro put_hevc type
> +function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
> +.ifc \type, qpel
> + load_qpel_filter x4
> + lsl x10, x2, #1 // src stride * 2
> + sub x13, x1, #3 // src1 = src - 3
> + mov x15, #(MAX_PB_SIZE << 2) // dst stride
> + add x14, x13, x2 // src2 = src1 + src stride
> + add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
> +.else
> +.ifc \type, qpel_bi
> + load_qpel_filter x6
> + mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
> + add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
> +.else
> + load_qpel_filter x5
> +.endif
> + lsl x10, x3, #1 // src stride * 2
> + sub x13, x2, #3 // src1 = src - 3
> + lsl x15, x1, #1 // dst stride * 2
> + add x14, x13, x3 // src2 = src1 + src stride
> + add x17, x0, x1 // dst2 = dst1 + dst stride
> +.endif
> +0: ld1 {v16.8b, v17.8b}, [x13], x10
> + ld1 {v18.8b, v19.8b}, [x14], x10
> +.ifc \type, qpel_bi
> + ld1 {v25.8h}, [x4], x6
> + ld1 {v26.8h}, [x7], x6
> +.endif
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> + uxtl v19.8h, v19.8b
> +
> + mul v23.8h, v16.8h, v0.h[0]
> + mul v24.8h, v18.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v20.16b, v16.16b, v17.16b, #(2*\i)
> + ext v21.16b, v18.16b, v19.16b, #(2*\i)
> + mla v23.8h, v20.8h, v0.h[\i]
> + mla v24.8h, v21.8h, v0.h[\i]
> +.endr
As we're only interested in .4h output here, we can do all the mul/mla
with .4h too, which should give a bit of extra speedup.
(Theoretically, one could consider packing two .4h halves into one
register and making do with only one mul/mla .8h, but I think two separate
.4h operations are quicker than the extra gymnastics it would require to
shuffle the inputs for that.)
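I.e. for the plain two-register version, something like this (untested;
the ext still has to operate on .16b, since the shifted-in elements come
from the upper halves of v16/v18):

        mul             v23.4h, v16.4h, v0.h[0]
        mul             v24.4h, v18.4h, v0.h[0]

.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v23.4h, v20.4h, v0.h[\i]
        mla             v24.4h, v21.4h, v0.h[\i]
.endr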
> +
> +.ifc \type, qpel
> + subs w3, w3, #2
> + st1 {v23.4h}, [ x0], x15
> + st1 {v24.4h}, [x17], x15
> +.else
> +.ifc \type, qpel_bi
> + subs w5, w5, #2
> + sqadd v23.8h, v23.8h, v25.8h
> + sqadd v24.8h, v24.8h, v26.8h
These could also be plain .4h then.
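I.e. (the following sqrshrun can stay as it is; only the low 4 bytes end
up being stored anyway):

        sqadd           v23.4h, v23.4h, v25.4h
        sqadd           v24.4h, v24.4h, v26.4h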
> + sqrshrun v23.8b, v23.8h, #7
> + sqrshrun v24.8b, v24.8h, #7
> +.else
> + subs w4, w4, #2
> + sqrshrun v23.8b, v23.8h, #6
> + sqrshrun v24.8b, v24.8h, #6
> +.endif
> + st1 {v23.s}[0], [ x0], x15
> + st1 {v24.s}[0], [x17], x15
> +.endif
> + b.gt 0b // double line
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
> +.ifc \type, qpel
> + load_qpel_filter x4
> + lsl x10, x2, #1 // width * 2
> + sub x13, x1, #3 // src1 = src - 3
> + mov x15, #(MAX_PB_SIZE * 4 - 8) // dst stride
> + add x14, x13, x2 // src2 = src1 + src stride
> + add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
> +.else
> +.ifc \type, qpel_bi
> + load_qpel_filter x6
> + mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
> + add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
> +.else
> + load_qpel_filter x5
> +.endif
> + lsl x10, x3, #1 // src stride * 2
> + sub x13, x2, #3 // src1 = src - 3
> + lsl x15, x1, #1 // dst stride * 2
> + subs x15, x15, #4
> + add x14, x13, x3 // src2 = src1 + src stride
> + add x17, x0, x1 // dst2 = dst1 + dst stride
> +.endif
> +0: ld1 {v16.8b, v17.8b}, [x13], x10
> + ld1 {v18.8b, v19.8b}, [x14], x10
> +.ifc \type, qpel_bi
> + ld1 {v25.8h}, [x4], x6
> + ld1 {v26.8h}, [x7], x6
> +.endif
> +
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> + uxtl v19.8h, v19.8b
> +
> + mul v23.8h, v16.8h, v0.h[0]
> + mul v24.8h, v18.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v20.16b, v16.16b, v17.16b, #(2*\i)
> + ext v21.16b, v18.16b, v19.16b, #(2*\i)
> + mla v23.8h, v20.8h, v0.h[\i]
> + mla v24.8h, v21.8h, v0.h[\i]
> +.endr
> +
> +.ifc \type, qpel
> + subs w3, w3, #2
> + st1 {v23.4h}, [ x0], #8
> + st1 {v23.s}[2], [ x0], x15
> + st1 {v24.4h}, [x17], #8
> + st1 {v24.s}[2], [x17], x15
As the first st1 updates x0, there's some latency before the next
instruction can start, so here it's better to interleave the stores as x0,
x17, x0, x17. Same thing below, and in the h12 function.
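I.e. something like:

        st1             {v23.4h},   [ x0], #8
        st1             {v24.4h},   [x17], #8
        st1             {v23.s}[2], [ x0], x15
        st1             {v24.s}[2], [x17], x15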
> +.else
> +.ifc \type, qpel_bi
> + subs w5, w5, #2
> + sqadd v23.8h, v23.8h, v25.8h
> + sqadd v24.8h, v24.8h, v26.8h
> + sqrshrun v23.8b, v23.8h, #7
> + sqrshrun v24.8b, v24.8h, #7
> +.else
> + subs w4, w4, #2
> + sqrshrun v23.8b, v23.8h, #6
> + sqrshrun v24.8b, v24.8h, #6
> +.endif
> + st1 {v23.s}[0], [ x0], #4
> + st1 {v23.h}[2], [ x0], x15
> + st1 {v24.s}[0], [x17], #4
> + st1 {v24.h}[2], [x17], x15
> +.endif
> + b.gt 0b // double line
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
> +.ifc \type, qpel
> + load_qpel_filter x4
> + lsl x10, x2, #1 // width * 2
> + sub x13, x1, #3 // src1 = src - 3
> + mov x15, #(MAX_PB_SIZE << 2) // dst stride
> + add x14, x13, x2 // src2 = src1 + src stride
> + add x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
> +.else
> +.ifc \type, qpel_bi
> + load_qpel_filter x6
> + mov x6, #(MAX_PB_SIZE << 2) // rsrc stride << 1
> + add x7, x4, #(MAX_PB_SIZE << 1) // rsrc2
> +.else
> + load_qpel_filter x5
> +.endif
> + lsl x10, x3, #1 // src stride * 2
> + sub x13, x2, #3 // src1 = src - 3
> + lsl x15, x1, #1 // dst stride * 2
> + add x14, x13, x3 // src2 = src1 + src stride
> + add x17, x0, x1 // dst2 = dst1 + dst stride
> +.endif
> +0: ld1 {v16.8b, v17.8b}, [x13], x10
> + ld1 {v18.8b, v19.8b}, [x14], x10
> +.ifc \type, qpel_bi
> + ld1 {v25.8h}, [x4], x6
> + ld1 {v26.8h}, [x7], x6
> +.endif
> +
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> + uxtl v19.8h, v19.8b
> +
> + mul v23.8h, v16.8h, v0.h[0]
> + mul v24.8h, v18.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v20.16b, v16.16b, v17.16b, #(2*\i)
> + ext v21.16b, v18.16b, v19.16b, #(2*\i)
> + mla v23.8h, v20.8h, v0.h[\i]
> + mla v24.8h, v21.8h, v0.h[\i]
> +.endr
> +
> +.ifc \type, qpel
> + subs w3, w3, #2
> + st1 {v23.8h}, [ x0], x15
> + st1 {v24.8h}, [x17], x15
> +.else
> +.ifc \type, qpel_bi
> + subs w5, w5, #2
> + sqadd v23.8h, v23.8h, v25.8h
> + sqadd v24.8h, v24.8h, v26.8h
> + sqrshrun v23.8b, v23.8h, #7
> + sqrshrun v24.8b, v24.8h, #7
> +.else
> + subs w4, w4, #2
> + sqrshrun v23.8b, v23.8h, #6
> + sqrshrun v24.8b, v24.8h, #6
> +.endif
> + st1 {v23.8b}, [ x0], x15
> + st1 {v24.8b}, [x17], x15
> +.endif
> + b.gt 0b // double line
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
> +.ifc \type, qpel
> + load_qpel_filter x4
> + // blocks
> + mov w8, #0xAAAB
> + movk w8, #0x2AAA, lsl #16
> + smull x15, w8, w6
> + asr x15, x15, #33
> + sub w6, w15, w6, asr #31
> + // fast divide by 12, thank gcc for this one...
> +
> + // src constants
> + lsl x10, x2, #1 // width * 2
> + sub x1, x1, #3 // src = src - 3
> +
> + // dst constants
> + mov x15, #(MAX_PB_SIZE * 4 - 16) // dst stride
> +
> + // loop
> + mov x8, xzr // hblock
> +0: mov w7, w3
> +
> + // 12 * hblock
> + lsl x12, x8, #3
> + add x12, x12, x8, lsl #2
> +
> + add x13, x1, x12 // src1 = src0 + 12 * hblock
> + add x14, x13, x2 // src2 = src1 + src stride
> +
> + add x16, x0, x12, lsl #1 // dst1 = dst0 + 12 * hblock * 2
> + add x17, x16, #(MAX_PB_SIZE << 1) // dst2 = dst1 + dst stride
> +.else
> + // blocks
> +.ifc \type, qpel_bi
> + ldrh w7, [sp]
> + load_qpel_filter x6
> +.else
> + load_qpel_filter x5
> +.endif
> + mov w9, #0xAAAB
> + movk w9, #0x2AAA, lsl #16
> + smull x15, w9, w7
> + asr x15, x15, #33
> + sub w6, w15, w7, asr #31
> +
> + // src constants
> + lsl x10, x3, #1 // src stride * 2
> + sub x2, x2, #3 // src = src - 3
> +
> + // dst constants
> + lsl x15, x1, #1 // dst stride * 2
> +.ifc \type, qpel_bi
> + mov x9, #(MAX_PB_SIZE << 2)
> +.endif
> + sub x15, x15, #8
> + // loop
> + mov x8, xzr // hblock
> +0:
> +.ifc \type, qpel_bi // height
> + mov w7, w5
> +.else
> + mov w7, w4
> +.endif
> + // 12 * hblock
> + lsl x12, x8, #3
> + add x12, x12, x8, lsl #2
> +
> + add x13, x2, x12 // src1 = src0 + 12 * hblock
> + add x14, x13, x3 // src2 = src1 + src stride
> +
> + add x16, x0, x12 // dst1 = dst0 + 12 * hblock
> + add x17, x16, x1 // dst2 = dst1 + dst stride
> +.ifc \type, qpel_bi
> + add x11, x4, x12, lsl #1 // rsrc1 = rsrc0 + 12 * hblock * 2
> + add x12, x11, #(MAX_PB_SIZE << 1) // rsrc2 = rsrc1 + rsrc stride
> +.endif
> +.endif
> +1: ld1 {v16.8b-v18.8b}, [x13], x10
> + ld1 {v19.8b-v21.8b}, [x14], x10
> +
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> +
> + uxtl v19.8h, v19.8b
> + uxtl v20.8h, v20.8b
> + uxtl v21.8h, v21.8b
> +
> + mul v26.8h, v16.8h, v0.h[0]
> + mul v27.8h, v17.8h, v0.h[0]
> + mul v28.8h, v19.8h, v0.h[0]
> + mul v29.8h, v20.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v22.16b, v16.16b, v17.16b, #(2*\i)
> + ext v23.16b, v17.16b, v18.16b, #(2*\i)
> +
> + ext v24.16b, v19.16b, v20.16b, #(2*\i)
> + ext v25.16b, v20.16b, v21.16b, #(2*\i)
> +
> + mla v26.8h, v22.8h, v0.h[\i]
> + mla v27.8h, v23.8h, v0.h[\i]
> +
> + mla v28.8h, v24.8h, v0.h[\i]
> + mla v29.8h, v25.8h, v0.h[\i]
> +.endr
> + subs w7, w7, #2
> +.ifc \type, qpel
> + st1 {v26.8h}, [x16], #16
> + st1 {v27.4h}, [x16], x15
> + st1 {v28.8h}, [x17], #16
> + st1 {v29.4h}, [x17], x15
> +.else
> +.ifc \type, qpel_bi
> + ld1 {v16.8h, v17.8h}, [x11], x9
> + ld1 {v18.8h, v19.8h}, [x12], x9
> + sqadd v26.8h, v26.8h, v16.8h
> + sqadd v27.8h, v27.8h, v17.8h
> + sqadd v28.8h, v28.8h, v18.8h
> + sqadd v29.8h, v29.8h, v19.8h
> + sqrshrun v26.8b, v26.8h, #7
> + sqrshrun v27.8b, v27.8h, #7
> + sqrshrun v28.8b, v28.8h, #7
> + sqrshrun v29.8b, v29.8h, #7
> +.else
> + sqrshrun v26.8b, v26.8h, #6
> + sqrshrun v27.8b, v27.8h, #6
> + sqrshrun v28.8b, v28.8h, #6
> + sqrshrun v29.8b, v29.8h, #6
> +.endif
> + st1 {v26.8b}, [x16], #8
> + st1 {v27.s}[0], [x16], x15
> + st1 {v28.8b}, [x17], #8
> + st1 {v29.s}[0], [x17], x15
> +.endif
> + b.gt 1b // double line
> + add x8, x8, #1
> + cmp x8, x6
> + b.lt 0b // line of blocks
> + ret
> +endfunc
> +
> +function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
> + mov x8, xzr // hblock
> +.ifc \type, qpel
> + load_qpel_filter x4
> + // blocks
> + lsr w6, w6, #4 // horizontal block count
> + // src constants
> + lsl x10, x2, #1 // width * 2
> + sub x1, x1, #3 // src = src - 3
> + // dst constants
> + mov x15, #(MAX_PB_SIZE * 4 - 16) // dst stride
> + // loop
> +0: mov w7, w3 // reset height
> +
> + add x13, x1, x8, lsl #4
> + add x14, x13, x2 // src2 = src1 + src stride
> +
> + add x16, x0, x8, lsl #5 // dst1 = dst0 + hblock * 16 * 2
> + add x17, x16, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
Alternatively, instead of doing "src = src_base + hblock*16" at the start
of each loop here, you could consider doing "src -= height*stride; src +=
16" at the end of each loop iteration. Overall I think it amounts to
essentially the same amount of instructions (although that would end up
needing an msub), so it doesn't make any difference in that aspect.
But it would free up the x8 register, so that instead of counting x8 from
0 up to x6, you could just count down x6 and quit the loop when it reaches
zero.
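Roughly, the end of each outer loop iteration could then look something
like this (untested sketch for the qpel variant; x9 is assumed to hold
the sign-extended height and is picked purely for illustration, and the
dst pointers would want the same kind of rewind), with w6 then simply
counted down to zero:

        msub            x13, x9, x2, x13        // src1 -= height * srcstride
        msub            x14, x9, x2, x14        // src2 -= height * srcstride
        add             x13, x13, #16           // step to the next block column
        add             x14, x14, #16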
> +.else
> +.ifc \type, qpel_bi
> + mov x9, #(MAX_PB_SIZE << 2)
> + ldrh w7, [sp]
> + load_qpel_filter x6
> +.else
> + load_qpel_filter x5
> +.endif
> + // blocks
> + lsr w6, w7, #4 // horizontal block count
> + // src constants
> + lsl x10, x3, #1 // src stride * 2
> + sub x2, x2, #3 // src = src - 3
> + // dst constants
> + lsl x15, x1, #1 // dst stride * 2
> + sub x15, x15, #8
> + // loop
> +0:
> +.ifc \type, qpel_bi // height
> + mov w7, w5
> +.else
> + mov w7, w4
> +.endif
> +
> + add x13, x2, x8, lsl #4 // src1 = src0 + hblock * 16
> + add x14, x13, x3 // src2 = src1 + src stride
> +
> + add x16, x0, x8, lsl #4 // dst1 = dst0 + hblock * 16
> + add x17, x16, x1 // dst2 = dst1 + dst stride
> +.ifc \type, qpel_bi
> + add x11, x4, x8, lsl #5 // rsrc1 = rsrc0 + 16 * hblock * 2
> + add x12, x11, #(MAX_PB_SIZE << 1) // rsrc2 = rsrc1 + rsrc stride
> +.endif
> +.endif
> +1: ld1 {v16.8b-v18.8b}, [x13], x10
> + ld1 {v19.8b-v21.8b}, [x14], x10
> +
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> +
> + uxtl v19.8h, v19.8b
> + uxtl v20.8h, v20.8b
> + uxtl v21.8h, v21.8b
> +
> + mul v26.8h, v16.8h, v0.h[0]
> + mul v27.8h, v17.8h, v0.h[0]
> + mul v28.8h, v19.8h, v0.h[0]
> + mul v29.8h, v20.8h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v22.16b, v16.16b, v17.16b, #(2*\i)
> + ext v23.16b, v17.16b, v18.16b, #(2*\i)
> +
> + ext v24.16b, v19.16b, v20.16b, #(2*\i)
> + ext v25.16b, v20.16b, v21.16b, #(2*\i)
> +
> + mla v26.8h, v22.8h, v0.h[\i]
> + mla v27.8h, v23.8h, v0.h[\i]
> +
> + mla v28.8h, v24.8h, v0.h[\i]
> + mla v29.8h, v25.8h, v0.h[\i]
> +.endr
> + subs w7, w7, #2
> +.ifc \type, qpel
> + st1 {v26.8h}, [x16], #16
> + st1 {v27.8h}, [x16], x15
> + st1 {v28.8h}, [x17], #16
> + st1 {v29.8h}, [x17], x15
> +.else
> +.ifc \type, qpel_bi
> + ld1 {v16.8h, v17.8h}, [x11], x9
> + ld1 {v18.8h, v19.8h}, [x12], x9
> + sqadd v26.8h, v26.8h, v16.8h
> + sqadd v27.8h, v27.8h, v17.8h
> + sqadd v28.8h, v28.8h, v18.8h
> + sqadd v29.8h, v29.8h, v19.8h
> + sqrshrun v26.8b, v26.8h, #7
> + sqrshrun v27.8b, v27.8h, #7
> + sqrshrun v28.8b, v28.8h, #7
> + sqrshrun v29.8b, v29.8h, #7
> +.else
> + sqrshrun v26.8b, v26.8h, #6
> + sqrshrun v27.8b, v27.8h, #6
> + sqrshrun v28.8b, v28.8h, #6
> + sqrshrun v29.8b, v29.8h, #6
> +.endif
> + st1 {v26.8b}, [x16], #8
> + st1 {v27.8b}, [x16], x15
> + st1 {v28.8b}, [x17], #8
> + st1 {v29.8b}, [x17], x15
> +.endif
> + b.gt 1b // double line
> + add x8, x8, #1
> + cmp x8, x6
> + b.lt 0b // horizontal tiling
If you restructure the loop counting, you could do "subs x8, x8, #1" first
here, then do the resetting/incrementing of the src pointers, then a "b.gt
0b", hiding the latency between the subs and the branch (because here,
there's a tight dependency chain between add, cmp and b.lt).
I think this might show a little difference if benchmarked on an in-order
core, with large widths. (But this is not a big deal.)
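I.e., together with the counting-down idea above, the loop tail could
look roughly like this (untested):

        subs            w6, w6, #1              // decrement the block count first
        msub            x13, x9, x2, x13        // the pointer rewinding then hides
        msub            x14, x9, x2, x14        // the latency between the subs
        add             x13, x13, #16           // and the conditional branch
        add             x14, x14, #16
        b.gt            0b                      // horizontal tiling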
// Martin