[FFmpeg-devel] [PATCH v2 1/1] lavc/aarch64: add some neon pix_abs functions

Sat Apr 16 00:13:43 EEST 2022

On Thu, 14 Apr 2022, Swinney, Jonathan wrote:

> - ff_pix_abs16_neon
> - ff_pix_abs16_xy2_neon
>
> In direct micro benchmarks of these ff functions verses their C implementations,
> these functions performed as follows on AWS Graviton 2:
>
> ff_pix_abs16_neon:
> c:  benchmark ran 100000 iterations in 0.955383 seconds
> ff: benchmark ran 100000 iterations in 0.097669 seconds
>
> ff_pix_abs16_xy2_neon:
> c:  benchmark ran 100000 iterations in 1.916759 seconds
> ff: benchmark ran 100000 iterations in 0.370729 seconds

It's generally preferred to include the numbers from checkasm --bench for 
these functions. You can execute it with e.g. "checkasm --bench=pix_fmt 
--test=motion" to run only the relevant tests and benchmark some specific 
function.

Also for the checkasm test; generally I'd suggest looking closer at some 
existing test as a good example. I think e.g. vp8dsp is a decent testcase 
to use as model.

> Signed-off-by: Jonathan Swinney <jswinney at amazon.com>
> ---
> libavcodec/aarch64/Makefile              |   2 +
> libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
> libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
> libavcodec/me_cmp.c                      |   2 +
> libavcodec/me_cmp.h                      |   1 +
> libavcodec/x86/me_cmp.asm                |   7 +
> libavcodec/x86/me_cmp_init.c             |   3 +
> tests/checkasm/Makefile                  |   2 +-
> tests/checkasm/checkasm.c                |   1 +
> tests/checkasm/checkasm.h                |   1 +
> tests/checkasm/motion.c                  | 155 +++++++++++++++++
> 11 files changed, 421 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
> create mode 100644 libavcodec/aarch64/me_cmp_neon.S
> create mode 100644 tests/checkasm/motion.c

> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 954461f81d..18869da1b4 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
> OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
> OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
> OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
> +OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
> OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
> OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
> OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o

If this is gated behind a CONFIG_ME_CMP here, we should use the same 
CONFIG_ME_CMP for conditionals in checkasm too.

> +++ b/libavcodec/me_cmp.c
> @@ -1062,6 +1062,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
>
>     if (ARCH_ALPHA)
>         ff_me_cmp_init_alpha(c, avctx);
> +    if (ARCH_AARCH64)
> +        ff_me_cmp_init_aarch64(c, avctx);

Please add this in alphabetical order, aarch64 comes before alpha.

>     if (ARCH_ARM)
>         ff_me_cmp_init_arm(c, avctx);
>     if (ARCH_PPC)
> diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
> index e9b5161c9a..2c13bb9d3b 100644
> --- a/libavcodec/me_cmp.h
> +++ b/libavcodec/me_cmp.h
> @@ -81,6 +81,7 @@ typedef struct MECmpContext {
>
> void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
> +void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);

Ditto about alphabetical order

> void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
> void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
> diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
> index ad06d485ab..f73b9f9161 100644
> --- a/libavcodec/x86/me_cmp.asm
> +++ b/libavcodec/x86/me_cmp.asm
> @@ -255,6 +255,7 @@ hadamard8x8_diff %+ SUFFIX:
>
>     HSUM                         m0, m1, eax
>     and                         rax, 0xFFFF
> +    emms
>     ret
>

I think we shouldn't be changing the existing x86 functions here. Let's 
originally assume that the existing x86 functions are correct - they're 
expected to not call emms (as the code expects that to be done at a higher 
level somewhere). Therefore, the new checkasm test needs to check the emms 
handling in a way which acecpts the current x86 code. Lots of checkasm 
tests uses "declare_func_emms(AV_CPU_FLAG_MMX, ..." which I think implies 
this intent.

> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index f768b1144e..f542ce0768 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -30,7 +30,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)      += v210dec.o
> AVCODECOBJS-$(CONFIG_V210_ENCODER)      += v210enc.o
> AVCODECOBJS-$(CONFIG_VP9_DECODER)       += vp9dsp.o
>
> -CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes)
> +CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes) motion.o
>

I guess this should use CONFIG_ME_CMP?

> # libavfilter tests
> AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index f74125e810..bbfc38636c 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -155,6 +155,7 @@ static const struct {
>     #if CONFIG_VIDEODSP
>         { "videodsp", checkasm_check_videodsp },
>     #endif
> +        { "motion", checkasm_check_motion },

Ditto about a CONFIG_ME_CMP condition?

> diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
> new file mode 100644
> index 0000000000..9191a35c01
> --- /dev/null
> +++ b/tests/checkasm/motion.c
> @@ -0,0 +1,155 @@
> +/*
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +
> +#include "libavutil/common.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem_internal.h"
> +
> +#include "libavcodec/me_cmp.h"
> +#include "libavutil/cpu.h"
> +
> +#include "checkasm.h"
> +
> +int dummy;

This looks unused?

> +
> +#define WIDTH 64
> +#define HEIGHT 64
> +
> +static uint8_t img1[WIDTH * HEIGHT];
> +static uint8_t img2[WIDTH * HEIGHT];

These are usually stack allocated. See e.g. vp8dsp.c - they are also 
normally allocated aligned, e.g. LOCAL_ALIGNED_16(...).

> +
> +
> +static void fill_random(uint8_t *tab, int size)
> +{
> +    int i;
> +    AVLFG prng;
> +
> +    av_lfg_init(&prng, 1);

Don't use your own PRNG here, use the rnd() macro from checkasm.h. This is 
hooked up to the test seed that is printed when checkasm is started, which 
allows covering different test combinations each time the test is executed 
(and can be replayed by rerunning checkasm with --seed=1234).

Overall, check other examples, e.g. vp8dsp.c, for use of the rnd() macro.

> +    for(i=0;i<size;i++) {

Please add spaces around operators.

> +        tab[i] = av_lfg_get(&prng) % 256;
> +    }
> +}
> +
> +static void test_motion(const char *name,
> +                 me_cmp_func test_func, me_cmp_func ref_func)
> +{
> +    int x, y, d1, d2, it;
> +    uint8_t *ptr;
> +
> +declare_func(int, struct MpegEncContext *c,
> +             uint8_t *blk1 /* align width (8 or 16) */,
> +             uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
> +             int h);

Maybe declare_func_emms would avoid the need for the emms changes.

> +
> +    if (test_func == ref_func || test_func == NULL || ref_func == NULL) {
> +        return;
> +    }
> +
> +    /* test correctness */
> +    for(it=0;it<20;it++) {
> +
> +        fill_random(img1, WIDTH * HEIGHT);
> +        fill_random(img2, WIDTH * HEIGHT);

Here, fill_random reruns things deterministically as it always reinits the 
PRNG in the same way, so running 20 iterations doesn't increase test 
coverage. With proper use of rnd() it could work though.

> +
> +        if (check_func(test_func, "%s", name)) {

Don't rerun check_func() many times when testing multiple iterations; use 
check_func() once outermost, and while testing that specific function, do 
as many iterations as are relevant

> +            for(y=0;y<HEIGHT-17;y++) {
> +                for(x=0;x<WIDTH-17;x++) {

This is indeed a very exhaustive test, that's great. But if we already 
have such an exhaustive test we probably don't need to do 20 iterations of 
it. If the test instead only tests a couple randomly chosen dimensions, 
doing e.g. 20 iterations sounds like a good idea.

> +                    ptr = img2 + y * WIDTH + x;
> +                    d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
> +                    d1 = call_new(NULL, img1, ptr, WIDTH, 8);
> +
> +                    if (d1 != d2) {
> +                        fail();
> +                        printf("error: mmx=%d c=%d\n", d1, d2);
> +                    }
> +                    bench_new(NULL, img1, ptr, WIDTH, 8);

If doing multiple iterations of the same, I would suggest not running 
bench_new each of them; normally you'd have exhaustive testing using 
call_ref/call_new and checking their outputs, and then just a couple calls 
with bench_new to benchmark things. (In this setup, the benchmark score 
ends up an average of all input size combinations. In some cases, the 
benchmark is only done on the biggest dimension, or on a couple relevant 
cases.)

> +                }
> +            }
> +        }
> +    }
> +    emms_c();
> +}
> +
> +#define sizeof_array(ar) (sizeof(ar)/sizeof((ar)[0]))

Use FF_ARRAY_ELEMS

> +
> +#define ME_CMP_1D_ARRAYS(XX)                                                   \
> +    XX(sad)                                                                    \
> +    XX(sse)                                                                    \
> +    XX(hadamard8_diff)                                                         \
> +    XX(dct_sad)                                                                \
> +    XX(quant_psnr)                                                             \
> +    XX(bit)                                                                    \
> +    XX(rd)                                                                     \
> +    XX(vsad)                                                                   \
> +    XX(vsse)                                                                   \
> +    XX(nsse)                                                                   \
> +    XX(w53)                                                                    \
> +    XX(w97)                                                                    \
> +    XX(dct_max)                                                                \
> +    XX(dct264_sad)                                                             \
> +    XX(me_pre_cmp)                                                             \
> +    XX(me_cmp)                                                                 \
> +    XX(me_sub_cmp)                                                             \
> +    XX(mb_cmp)                                                                 \
> +    XX(ildct_cmp)                                                              \
> +    XX(frame_skip_cmp)                                                         \
> +    XX(median_sad)
> +
> +
> +static void check_motion(void)
> +{
> +    char buf[64];
> +    AVCodecContext *ctx;
> +    MECmpContext c_ctx, ff_ctx;
> +
> +    memset(&c_ctx, 0, sizeof(c_ctx));
> +    memset(&ff_ctx, 0, sizeof(ff_ctx));
> +
> +    /* allocate AVCodecContext */
> +    ctx = avcodec_alloc_context3(NULL);
> +    ctx->flags |= AV_CODEC_FLAG_BITEXACT;
> +    /* clear cpu flags to get C versions of functions */
> +    ff_me_cmp_init(&ff_ctx, ctx);
> +    av_force_cpu_flags(0);
> +    ff_me_cmp_init(&c_ctx, ctx);

No, this isn't how you do it. A test shouldn't touch the cpu flags 
manually. (This probably is what causes the surprising difference in test 
counts that Michael noticed.)

On a high level, the checkasm test framework runs your test multiple 
times, with the cpu mask set to all intermediate levels. So first your 
test gets the C-only version of the function. On the next time around, it 
gets e.g. the MMX version of the function (if any). Then later it gets an 
SSE2, SSSE3, etc version of the function.

The check_func() macro does the magic - it looks up (using the string key) 
the previous function implementation for the same key, so that that 
function gets used as reference. So within your test you should only init 
your DSP functions once, using the cpu feature mask that the test 
framework sets up for you.

> +
> +    for (int i = 0; i < sizeof_array(c_ctx.pix_abs); i++) {
> +        for (int j = 0; j < sizeof_array(c_ctx.pix_abs[0]); j++) {
> +            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
> +            test_motion(buf, ff_ctx.pix_abs[i][j], c_ctx.pix_abs[i][j]);
> +        }
> +    }
> +
> +#define XX(me_cmp_array)                                                        \
> +    for (int i = 0; i < sizeof_array(c_ctx.me_cmp_array); i++) {                \
> +        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                     \
> +        test_motion(buf, ff_ctx.me_cmp_array[i], c_ctx.me_cmp_array[i]);        \
> +    }
> +    ME_CMP_1D_ARRAYS(XX)
> +#undef XX
> +
> +}
> +
> +void checkasm_check_motion(void)
> +{
> +    check_motion();
> +    report("motion");
> +}
> -- 
> 2.32.0

In addition to the test setup you've done, you also need to add the test 
to tests/fate/checkasm.mak, so that "make fate-checkasm" (and make fate) 
includes this new test.

// Martin