[FFmpeg-devel] [PATCH 3/7] avcodec/aarch64/mpegvideoencdsp: add dotprod implementation for pix_norm1
Martin Storsjö
martin at martin.st
Sun Aug 18 23:44:35 EEST 2024
On Sun, 18 Aug 2024, Ramiro Polla wrote:
> A76
> pix_norm1_c: 231.5
> pix_norm1_neon: 44.2 ( 5.24x)
> pix_norm1_dotprod: 20.7 (11.18x)
> ---
> libavcodec/aarch64/mpegvideoencdsp_init.c | 10 ++++++++
> libavcodec/aarch64/mpegvideoencdsp_neon.S | 28 +++++++++++++++++++++++
> 2 files changed, 38 insertions(+)
>
> diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
> index 7eb632ed1b..d0ce07e178 100644
> --- a/libavcodec/aarch64/mpegvideoencdsp_init.c
> +++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
> @@ -27,6 +27,10 @@
> int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
> int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
>
> +#if HAVE_DOTPROD
> +int ff_pix_norm1_neon_dotprod(const uint8_t *pix, int line_size);
> +#endif
> +
> av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
> AVCodecContext *avctx)
> {
> @@ -36,4 +40,10 @@ av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
> c->pix_sum = ff_pix_sum16_neon;
> c->pix_norm1 = ff_pix_norm1_neon;
> }
> +
> +#if HAVE_DOTPROD
> + if (have_dotprod(cpu_flags)) {
> + c->pix_norm1 = ff_pix_norm1_neon_dotprod;
> + }
> +#endif
> }
> diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> index 89e50e29b3..eccbdd850f 100644
> --- a/libavcodec/aarch64/mpegvideoencdsp_neon.S
> +++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> @@ -65,3 +65,31 @@ function ff_pix_norm1_neon, export=1
>
> ret
> endfunc
> +
> +#if HAVE_DOTPROD
> +ENABLE_DOTPROD
> +
> +function ff_pix_norm1_neon_dotprod, export=1
> +// x0 const uint8_t *pix
> +// x1 int line_size
> +
> + sxtw x1, w1
> + movi v0.16b, #0
> + mov w2, #16
> +
> +1:
> + ld1 { v1.16b }, [x0], x1
> + ld1 { v2.16b }, [x0], x1
Nit: there are spaces inside the {} on these two ld1 lines.
> + udot v0.4s, v1.16b, v1.16b
> + subs w2, w2, #2
> + udot v0.4s, v2.16b, v2.16b
> + b.ne 1b
> +
> + uaddlv d0, v0.4s
> + fmov w0, s0
> +
> + ret
> +endfunc
Otherwise, this implementation LGTM.
// Martin
More information about the ffmpeg-devel mailing list