[FFmpeg-devel] [PATCH 4/6] aarch64/vvc: Add put_pel/put_pel_uni/put_pel_uni_w
Martin Storsjö
martin at martin.st
Wed Sep 11 15:19:01 EEST 2024
On Sun, 8 Sep 2024, Zhao Zhili wrote:
> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
> index f72746ce03..076d01b477 100644
> --- a/libavcodec/aarch64/h26x/dsp.h
> +++ b/libavcodec/aarch64/h26x/dsp.h
> @@ -248,4 +248,26 @@ NEON8_FNPROTO_PARTIAL_4(qpel, (int16_t *dst, const uint8_t *_src, ptrdiff_t _src
> NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
> ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width),)
>
> +#undef NEON8_FNPROTO_PARTIAL_6
> +#define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \
> + void ff_vvc_put_##fn##4_8_neon##ext args; \
> + void ff_vvc_put_##fn##8_8_neon##ext args; \
> + void ff_vvc_put_##fn##16_8_neon##ext args; \
> + void ff_vvc_put_##fn##32_8_neon##ext args; \
> + void ff_vvc_put_##fn##64_8_neon##ext args; \
> + void ff_vvc_put_##fn##128_8_neon##ext args
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_pixels, (int16_t *dst,
> + const uint8_t *src, ptrdiff_t srcstride, int height,
> + const int8_t *hf, const int8_t *vf, int width),);
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride, int height,
> + const int8_t *hf, const int8_t *vf, int width),);
> +
> +NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> + const uint8_t *_src, ptrdiff_t _srcstride,
> + int height, int denom, int wx, int ox,
> + const int8_t *hf, const int8_t *vf, int width),);
> +
> #endif
> diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
> index 378b0f7fb2..729395f2f0 100644
> --- a/libavcodec/aarch64/h26x/epel_neon.S
> +++ b/libavcodec/aarch64/h26x/epel_neon.S
> @@ -19,7 +19,8 @@
> */
>
> #include "libavutil/aarch64/asm.S"
> -#define MAX_PB_SIZE 64
> +#define HEVC_MAX_PB_SIZE 64
> +#define VVC_MAX_PB_SIZE 128
>
> const epel_filters, align=4
> .byte 0, 0, 0, 0
> @@ -131,8 +132,13 @@ endconst
> b.ne 1b
> .endm
>
> +function ff_vvc_put_pel_pixels4_8_neon, export=1
> + mov x7, #(VVC_MAX_PB_SIZE * 2)
> + b 1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.s}[0], [x1], x2
> ushll v4.8h, v0.8b, #6
> subs w3, w3, #1
> @@ -142,7 +148,7 @@ function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2 - 8)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2 - 8)
> 1: ld1 {v0.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> st1 {v4.d}[0], [x0], #8
> @@ -152,8 +158,13 @@ function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
> ret
> endfunc
>
> +function ff_vvc_put_pel_pixels8_8_neon, export=1
> + mov x7, #(VVC_MAX_PB_SIZE * 2)
> + b 1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> subs w3, w3, #1
> @@ -163,7 +174,7 @@ function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2 - 16)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2 - 16)
> 1: ld1 {v0.8b, v1.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> st1 {v4.8h}, [x0], #16
> @@ -174,8 +185,13 @@ function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
> ret
> endfunc
>
> +function ff_vvc_put_pel_pixels16_8_neon, export=1
> + mov x7, #(VVC_MAX_PB_SIZE * 2)
> + b 1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.8b, v1.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> ushll v5.8h, v1.8b, #6
> @@ -186,7 +202,7 @@ function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.8b-v2.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> ushll v5.8h, v1.8b, #6
> @@ -197,8 +213,13 @@ function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
> ret
> endfunc
>
> +function ff_vvc_put_pel_pixels32_8_neon, export=1
> + mov x7, #(VVC_MAX_PB_SIZE * 2)
> + b 1f
> +endfunc
> +
> function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE * 2)
> + mov x7, #(HEVC_MAX_PB_SIZE * 2)
> 1: ld1 {v0.8b-v3.8b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> ushll v5.8h, v1.8b, #6
> @@ -211,7 +232,7 @@ function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
> endfunc
>
> function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
> - mov x7, #(MAX_PB_SIZE)
> + mov x7, #(HEVC_MAX_PB_SIZE)
> 1: ld1 {v0.16b-v2.16b}, [x1], x2
> ushll v4.8h, v0.8b, #6
> ushll2 v5.8h, v0.16b, #6
> @@ -226,26 +247,50 @@ function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
> ret
> endfunc
>
> -function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
> -1: ld1 {v0.16b-v3.16b}, [x1], x2
> +.macro put_pel_pixels64_8_neon
> ushll v4.8h, v0.8b, #6
> ushll2 v5.8h, v0.16b, #6
> ushll v6.8h, v1.8b, #6
> ushll2 v7.8h, v1.16b, #6
> - st1 {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE)
> + st1 {v4.8h-v7.8h}, [x0], #64
> ushll v16.8h, v2.8b, #6
> ushll2 v17.8h, v2.16b, #6
> ushll v18.8h, v3.8b, #6
> ushll2 v19.8h, v3.16b, #6
> - subs w3, w3, #1
> - st1 {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE)
> - b.ne 1b
> + st1 {v16.8h-v19.8h}, [x0], x7
> +.endm
> +
> +function ff_vvc_put_pel_pixels64_8_neon, export=1
> + mov x7, #(2 * VVC_MAX_PB_SIZE - 64)
> + b 1f
> +endfunc
> +
> +function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
> + mov x7, #(HEVC_MAX_PB_SIZE)
> +1:
> + ld1 {v0.16b-v3.16b}, [x1], x2
> + sub w3, w3, #1
> + put_pel_pixels64_8_neon
> + cbnz w3, 1b
We'd typically use subs + b.ne, rather than sub+cbnz, for loops like these.
Or is there anything inside the macros that clobbers the condition flags?
The same thing applies in most of the functions you're touching in this patch.
> +function ff_vvc_put_pel_uni_pixels128_8_neon, export=1
> +1:
> + mov x5, x2
> + mov x6, x0
> + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
> + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x5]
> + sub w4, w4, #1
> + add x2, x2, x3
> + add x0, x0, x1
> + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], #64
> + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x6]
> + cbnz w4, 1b
> + ret
> +endfunc
subs+b.ne rather than sub+cbnz, for consistency if nothing else.
The copying of values back and forth between x2/x5 and x0/x6 seems
wasteful here. I'd suggest this instead:
sub x1, x1, #64
sub x3, x3, #64
1:
ld1 [x2], #64
subs w4, w4, #1
ld1 [x2], x3
...
st1 [x0], #64
st1 [x0], x1
b.ne 1b
The same goes for ff_vvc_put_pel_uni_w_pixels128_8_neon below as well.
// Martin
More information about the ffmpeg-devel
mailing list