[FFmpeg-devel] [PATCH 4/6] aarch64/vvc: Add put_pel/put_pel_uni/put_pel_uni_w
Martin Storsjö
martin at martin.st
Wed Sep 11 15:19:01 EEST 2024
On Sun, 8 Sep 2024, Zhao Zhili wrote:
> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
> index f72746ce03..076d01b477 100644
> --- a/libavcodec/aarch64/h26x/dsp.h
> +++ b/libavcodec/aarch64/h26x/dsp.h
> @@ -248,4 +248,26 @@ NEON8_FNPROTO_PARTIAL_4(qpel, (int16_t *dst, const uint8_t *_src, ptrdiff_t _src
> NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
> ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width),)
>
> +#undef NEON8_FNPROTO_PARTIAL_6
> +#define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \
> + void ff_vvc_put_##fn##4_8_neon##ext args; \
> + void ff_vvc_put_##fn##8_8_neon##ext args; \
> + void ff_vvc_put_##fn##16_8_neon##ext args; \
> + void ff_vvc_put_##fn##32_8_neon##ext args; \
> + void ff_vvc_put_##fn##64_8_neon##ext args; \
> + void ff_vvc_put_##fn##128_8_neon##ext args
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_pixels, (int16_t *dst,
> + const uint8_t *src, ptrdiff_t srcstride, int height,
> + const int8_t *hf, const int8_t *vf, int width),);
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride, int height,
> + const int8_t *hf, const int8_t *vf, int width),);
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride,
> + int height, int denom, int wx, int ox,
> + const int8_t *hf, const int8_t *vf, int width),);
> +
> #endif
> diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
> index 378b0f7fb2..729395f2f0 100644
> --- a/libavcodec/aarch64/h26x/epel_neon.S
> +++ b/libavcodec/aarch64/h26x/epel_neon.S
> @@ -19,7 +19,8 @@
> */
>
> #include "libavutil/aarch64/asm.S"
> -#define MAX_PB_SIZE 64
> +#define HEVC_MAX_PB_SIZE 64
> +#define VVC_MAX_PB_SIZE 128
>
> const epel_filters, align=4
> .byte 0, 0, 0, 0
> @@ -131,8 +132,13 @@ endconst
> b.ne 1b
> .endm
>
> +function ff_vvc_put_pel_pixels4_8_neon, export=1
> + mov x7, #(VVC_MAX_PB_SIZE * 2)
> + b 1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.s}[0], [x1], x2
> ushll v4.8h, v0.8b, #6
> subs w3, w3, #1
> @@ -142,7 +148,7 @@ function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2 - 8)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2 - 8)
> 1: ld1 {v0.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> st1 {v4.d}[0], [x0], #8
> @@ -152,8 +158,13 @@ function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
> ret
> endfunc
>
> +function ff_vvc_put_pel_pixels8_8_neon, export=1
> + mov x7, #(VVC_MAX_PB_SIZE * 2)
> + b 1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> subs w3, w3, #1
> @@ -163,7 +174,7 @@ function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2 - 16)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2 - 16)
> 1: ld1 {v0.8b, v1.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> st1 {v4.8h}, [x0], #16
> @@ -174,8 +185,13 @@ function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
> ret
> endfunc
>
> +function ff_vvc_put_pel_pixels16_8_neon, export=1
> + mov x7, #(VVC_MAX_PB_SIZE * 2)
> + b 1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.8b, v1.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> ushll v5.8h, v1.8b, #6
> @@ -186,7 +202,7 @@ function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.8b-v2.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> ushll v5.8h, v1.8b, #6
> @@ -197,8 +213,13 @@ function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
> ret
> endfunc
>
> +function ff_vvc_put_pel_pixels32_8_neon, export=1
> + mov x7, #(VVC_MAX_PB_SIZE * 2)
> + b 1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.8b-v3.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> ushll v5.8h, v1.8b, #6
> @@ -211,7 +232,7 @@ function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE)
> + mov x7, #(HEVC_MAX_PB_SIZE)
> 1: ld1 {v0.16b-v2.16b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> ushll2 v5.8h, v0.16b, #6
> @@ -226,26 +247,50 @@ function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
> ret
> endfunc
>
> -function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
> -1: ld1 {v0.16b-v3.16b}, [x1], x2
> +.macro put_pel_pixels64_8_neon
> ushll v4.8h, v0.8b, #6
> ushll2 v5.8h, v0.16b, #6
> ushll v6.8h, v1.8b, #6
> ushll2 v7.8h, v1.16b, #6
> - st1 {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE)
> + st1 {v4.8h-v7.8h}, [x0], #64
> ushll v16.8h, v2.8b, #6
> ushll2 v17.8h, v2.16b, #6
> ushll v18.8h, v3.8b, #6
> ushll2 v19.8h, v3.16b, #6
> - subs w3, w3, #1
> - st1 {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE)
> - b.ne 1b
> + st1 {v16.8h-v19.8h}, [x0], x7
> +.endm
> +
> +function ff_vvc_put_pel_pixels64_8_neon, export=1
> + mov x7, #(2 * VVC_MAX_PB_SIZE - 64)
> + b 1f
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
> + mov x7, #(HEVC_MAX_PB_SIZE)
> +1:
> + ld1 {v0.16b-v3.16b}, [x1], x2
> + sub w3, w3, #1
> + put_pel_pixels64_8_neon
> + cbnz w3, 1b
We'd typically use subs + b.ne, rather than sub+cbnz, for loops like these.
Or is there anything inside the macros that clobbers the condition flags?
The same thing applies in most of the functions you're touching in this patch.
> +function ff_vvc_put_pel_uni_pixels128_8_neon, export=1
> +1:
> + mov x5, x2
> + mov x6, x0
> + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
> + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x5]
> + sub w4, w4, #1
> + add x2, x2, x3
> + add x0, x0, x1
> + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], #64
> + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x6]
> + cbnz w4, 1b
> + ret
> +endfunc
subs+b.ne rather than sub+cbnz, for consistency if nothing else.
The copying of values back and forth between x2/x5 and x0/x6 seems
wasteful here. I'd suggest this instead:
sub x1, x1, #64
sub x3, x3, #64
1:
ld1 [x2], #64
subs w4, w4, #1
ld1 [x2], x3
...
st1 [x0], #64
st1 [x0], x1
b.ne 1b
The same goes for ff_vvc_put_pel_uni_w_pixels128_8_neon below as well.
// Martin
More information about the ffmpeg-devel
mailing list