[FFmpeg-devel] [PATCH v2 1/1] lavc/aarch64: add some neon pix_abs functions
Swinney, Jonathan
jswinney at amazon.com
Thu Apr 14 19:22:58 EEST 2022
- ff_pix_abs16_neon
- ff_pix_abs16_xy2_neon
In direct micro benchmarks of these ff functions versus their C implementations,
these functions performed as follows on AWS Graviton 2:
ff_pix_abs16_neon:
c: benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds
ff_pix_abs16_xy2_neon:
c: benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.370729 seconds
Signed-off-by: Jonathan Swinney <jswinney at amazon.com>
---
libavcodec/aarch64/Makefile | 2 +
libavcodec/aarch64/me_cmp_init_aarch64.c | 39 +++++
libavcodec/aarch64/me_cmp_neon.S | 209 +++++++++++++++++++++++
libavcodec/me_cmp.c | 2 +
libavcodec/me_cmp.h | 1 +
libavcodec/x86/me_cmp.asm | 7 +
libavcodec/x86/me_cmp_init.c | 3 +
tests/checkasm/Makefile | 2 +-
tests/checkasm/checkasm.c | 1 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/motion.c | 155 +++++++++++++++++
11 files changed, 421 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
create mode 100644 libavcodec/aarch64/me_cmp_neon.S
create mode 100644 tests/checkasm/motion.c
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_init_aarch64.o
@@ -46,6 +47,7 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..9fb63e9973
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+ ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+ const int flags = av_get_cpu_flags();
+
+ if (!have_neon(flags))
+ return; /* no NEON: leave the table exactly as the caller set it up */
+ c->pix_abs[0][0] = ff_pix_abs16_neon;
+ c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..3b48cb156d
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney at amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_abs16_neon, export=1
+ // Sum of absolute differences of two 16-pixel-wide blocks over h rows.
+ // x0 unused; w0 returns the sum
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride (applied to both pix1 and pix2)
+ // w4 int h (h == 0 is not handled: the completion loop would still run)
+ cmp w4, #4 // if h < 4, jump to completion section
+ movi v18.4S, #0 // clear result accumulator
+ b.lt 2f
+1:
+ movi v16.8h, #0 // clear uabal accumulator (restarted for every group of 4 rows)
+ ld1 {v0.16b}, [x1], x3 // load pix1 row 0, advance by stride
+ ld1 {v4.16b}, [x2], x3 // load pix2 row 0, advance by stride
+ ld1 {v1.16b}, [x1], x3 // load pix1 row 1
+ ld1 {v5.16b}, [x2], x3 // load pix2 row 1
+ uabal v16.8h, v0.8b, v4.8b // row 0, bytes 0..7: accumulate |pix1 - pix2| into 16-bit lanes
+ uabal2 v16.8h, v0.16b, v4.16b // row 0, bytes 8..15
+ ld1 {v2.16b}, [x1], x3 // load pix1 row 2
+ ld1 {v6.16b}, [x2], x3 // load pix2 row 2
+ uabal v16.8h, v1.8b, v5.8b // row 1, bytes 0..7
+ uabal2 v16.8h, v1.16b, v5.16b // row 1, bytes 8..15
+ ld1 {v3.16b}, [x1], x3 // load pix1 row 3
+ ld1 {v7.16b}, [x2], x3 // load pix2 row 3
+ uabal v16.8h, v2.8b, v6.8b // row 2, bytes 0..7
+ uabal2 v16.8h, v2.16b, v6.16b // row 2, bytes 8..15
+ sub w4, w4, #4 // h -= 4
+ uabal v16.8h, v3.8b, v7.8b // row 3, bytes 0..7
+ uabal2 v16.8h, v3.16b, v7.16b // row 3, bytes 8..15
+ cmp w4, #4 // if h >= 4, loop (branch is taken below)
+ uaddlv s17, v16.8h // widening sum of the eight 16-bit lanes into s17
+ add d18, d17, d18 // add to the end result register
+
+ b.ge 1b // flags still come from the cmp above; the vector ops leave them alone
+ cbnz w4, 2f // if iterations remain, jump to completion section
+
+ fmov w0, s18 // copy result to general purpose register
+ ret
+
+2:
+ movi v16.8h, #0 // clear the uabal accumulator (one row per pass here)
+ ld1 {v0.16b}, [x1], x3 // load pix1
+ ld1 {v4.16b}, [x2], x3 // load pix2
+ uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate, bytes 0..7
+ uabal2 v16.8h, v0.16b, v4.16b // bytes 8..15
+ addv h17, v16.8h // one row sums to at most 16 * 255 = 4080, so a 16-bit result suffices
+ add d18, d17, d18 // add to result
+ subs w4, w4, #1 // h -= 1, setting flags for the branch
+ b.ne 2b
+
+ fmov w0, s18 // copy result to general purpose register
+ ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+ // SAD over 16 x h pixels of pix1 against the rounded average of the 2x2
+ // neighbourhood pix2[i], pix2[i+1], pix3[i], pix3[i+1]; sum returned in w0.
+ // x0 unused, x1 uint8_t *pix1, x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h (h == 0 is not handled: the completion loop would still run)
+ // x5 uint8_t *pix3
+ add x5, x2, x3 // create a pointer for pix3 (the row below pix2)
+ movi v0.2d, #0 // initialize the result register
+
+ // Load initial pix2 values for either the unrolled version or completion version.
+ ldr q4, [x2, #1] // load pix2+1
+ ldr q3, [x2] // load pix2
+ uaddl v2.8h, v4.8b, v3.8b // pix2 + pix2+1 0..7
+ uaddl2 v3.8h, v4.16b, v3.16b // pix2 + pix2+1 8..15
+ cmp w4, #4 // if h < 4 jump to the completion version
+ b.lt 2f
+1:
+ // This is an unrolled implementation. It completes 4 iterations of the C loop for each branch.
+ // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+ // plus two at the beginning to start.
+ ldr q5, [x5, #1] // load pix3+1
+ ld1 {v4.16b}, [x5], x3 // load pix3
+ ld1 {v1.16b}, [x1], x3 // load pix1
+
+ ldr q7, [x5, #1] // load pix3+1
+ ld1 {v6.16b}, [x5], x3 // load pix3
+ ld1 {v16.16b}, [x1], x3 // load pix1
+
+ ldr q19, [x5, #1] // load pix3+1
+ ld1 {v18.16b}, [x5], x3 // load pix3
+ ld1 {v17.16b}, [x1], x3 // load pix1
+
+ ldr q22, [x5, #1] // load pix3+1
+ ld1 {v21.16b}, [x5], x3 // load pix3
+ ld1 {v20.16b}, [x1], x3 // load pix1
+
+ // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+ uaddl v30.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
+ uaddl2 v31.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
+ add v23.8h, v2.8h, v30.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
+ add v24.8h, v3.8h, v31.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
+ urshr v23.8h, v23.8h, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v24.8h, v24.8h, #2 // shift right 2 8..15
+
+ uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
+ uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
+ add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ urshr v26.8h, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v27.8h, v27.8h, #2 // shift right 2 8..15
+
+ uaddl v4.8h, v18.8b, v19.8b // pix3 + pix3+1 0..7
+ uaddl2 v5.8h, v18.16b, v19.16b // pix3 + pix3+1 8..15
+ add v28.8h, v2.8h, v4.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v29.8h, v3.8h, v5.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ urshr v28.8h, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v29.8h, v29.8h, #2 // shift right 2 8..15
+
+ uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7 (kept in v2 for the next iteration)
+ uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15 (kept in v3 for the next iteration)
+ add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ urshr v30.8h, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
+ urshr v31.8h, v31.8h, #2 // shift right 2 8..15
+
+ // Averages are now stored in these registers:
+ // v23, v24
+ // v26, v27
+ // v28, v29
+ // v30, v31
+ // pix1 values in these registers:
+ // v1, v16, v17, v20
+ // free for scratch (v2/v3 carry row sums, v0 is the result):
+ // v4, v5, v6, v7, v18, v19, v21, v22, v25
+
+ uxtl2 v4.8h, v1.16b // 8->16 bits pix1 8..15
+ uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
+ uxtl2 v7.8h, v16.16b // 8->16 bits pix1 8..15
+ uxtl v6.8h, v16.8b // 8->16 bits pix1 0..7
+ uxtl2 v18.8h, v17.16b // 8->16 bits pix1 8..15
+ uxtl v17.8h, v17.8b // 8->16 bits pix1 0..7
+ uxtl2 v25.8h, v20.16b // 8->16 bits pix1 8..15
+ uxtl v20.8h, v20.8b // 8->16 bits pix1 0..7
+
+ uabd v5.8h, v1.8h, v23.8h // absolute difference 0..7
+ uaba v5.8h, v4.8h, v24.8h // absolute difference accumulate 8..15
+ uaba v5.8h, v6.8h, v26.8h // absolute difference accumulate 0..7
+ uaba v5.8h, v7.8h, v27.8h // absolute difference accumulate 8..15
+ uaba v5.8h, v17.8h, v28.8h // absolute difference accumulate 0..7
+ uaba v5.8h, v18.8h, v29.8h // absolute difference accumulate 8..15
+ uaba v5.8h, v20.8h, v30.8h // absolute difference accumulate 0..7
+ uaba v5.8h, v25.8h, v31.8h // absolute difference accumulate 8..15
+
+ uaddlv s5, v5.8h // add up accumulated values
+ sub w4, w4, #4 // h -= 4
+ add d0, d0, d5 // add to final result
+ cmp w4, #4 // loop if h >= 4
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain jump to completion section
+
+ fmov w0, s0 // copy result to general purpose register
+ ret
+2:
+ // v2 and v3 are set either at the end of this loop or by the unrolled version,
+ // which branches here to complete iterations when h % 4 != 0.
+ ldr q5, [x5, #1] // load pix3+1
+ ld1 {v4.16b}, [x5], x3 // load pix3
+ ld1 {v1.16b}, [x1], x3 // load pix1
+ sub w4, w4, #1 // decrement h
+
+ uaddl v18.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
+ uaddl2 v19.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
+ add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
+ add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
+ // divide by 4 to compute the average of values summed above
+ urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
+ urshr v17.8h, v17.8h, #2 // shift right by 2 8..15
+
+ uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15; v7 is scratch (v8-v15 are callee-saved and must not be written)
+ uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
+
+ uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7
+ uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15
+ mov v2.16b, v18.16b // this row's sums 0..7 become the next row's pix2 sums
+ mov v3.16b, v19.16b // this row's sums 8..15 become the next row's pix2 sums
+ addv h6, v6.8h // add up accumulator in v6
+ add d0, d0, d6 // add to the final result
+
+ cbnz w4, 2b // loop if h > 0
+ fmov w0, s0 // copy result to general purpose register
+ ret
+endfunc
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index b2f87d2e1b..2bda7c030c 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1062,6 +1062,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
if (ARCH_ALPHA)
ff_me_cmp_init_alpha(c, avctx);
+ if (ARCH_AARCH64)
+ ff_me_cmp_init_aarch64(c, avctx);
if (ARCH_ARM)
ff_me_cmp_init_arm(c, avctx);
if (ARCH_PPC)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..2c13bb9d3b 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -81,6 +81,7 @@ typedef struct MECmpContext {
void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index ad06d485ab..f73b9f9161 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -255,6 +255,7 @@ hadamard8x8_diff %+ SUFFIX:
HSUM m0, m1, eax
and rax, 0xFFFF
+ emms
ret
hadamard8_16_wrapper 0, 14
@@ -345,6 +346,7 @@ cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
HADDD m7, m1
movd eax, m7 ; return value
+ emms
RET
%endmacro
@@ -463,6 +465,7 @@ cglobal hf_noise%1, 3,3,0, pix1, lsize, h
psrlq m6, 32
paddd m0, m6
movd eax, m0 ; eax = result of hf_noise8;
+ emms
REP_RET ; return eax;
%endmacro
@@ -516,6 +519,7 @@ align 16
paddw m2, m0
%endif
movd eax, m2
+ emms
RET
%endmacro
@@ -593,6 +597,7 @@ align 16
paddw m0, m1
%endif
movd eax, m0
+ emms
RET
%endmacro
@@ -663,6 +668,7 @@ align 16
paddw m0, m1
%endif
movd eax, m0
+ emms
RET
%endmacro
@@ -825,6 +831,7 @@ cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
paddd m0, m1
%endif
movd eax, m0
+ emms
RET
%endmacro
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 9af911bb88..b330868a38 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -186,6 +186,8 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
: "r" (stride), "m" (h)
: "%ecx");
+ emms_c();
+
return tmp & 0xFFFF;
}
#undef SUM
@@ -418,6 +420,7 @@ static inline int sum_mmx(void)
"paddw %%mm0, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
: "=r" (ret));
+ emms_c();
return ret & 0xFFFF;
}
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index f768b1144e..f542ce0768 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -30,7 +30,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o
AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o
AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o
-CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
+CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) motion.o
# libavfilter tests
AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index f74125e810..bbfc38636c 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -155,6 +155,7 @@ static const struct {
#if CONFIG_VIDEODSP
{ "videodsp", checkasm_check_videodsp },
#endif
+ { "motion", checkasm_check_motion },
#endif
#if CONFIG_AVFILTER
#if CONFIG_AFIR_FILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index c3192d8c23..1269ab7cc0 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -67,6 +67,7 @@ void checkasm_check_huffyuvdsp(void);
void checkasm_check_jpeg2000dsp(void);
void checkasm_check_llviddsp(void);
void checkasm_check_llviddspenc(void);
+void checkasm_check_motion(void);
void checkasm_check_nlmeans(void);
void checkasm_check_opusdsp(void);
void checkasm_check_pixblockdsp(void);
diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
new file mode 100644
index 0000000000..9191a35c01
--- /dev/null
+++ b/tests/checkasm/motion.c
@@ -0,0 +1,155 @@
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/me_cmp.h"
+#include "libavutil/cpu.h"
+
+#include "checkasm.h"
+
+int dummy;
+
+#define WIDTH 64
+#define HEIGHT 64
+
+static uint8_t img1[WIDTH * HEIGHT];
+static uint8_t img2[WIDTH * HEIGHT];
+
+
+static void fill_random(uint8_t *tab, int size)
+{
+ /* One generator for all calls: reseeding per call gave every buffer the same bytes. */
+ static AVLFG prng;
+ static int seeded;
+
+ if (!seeded) { av_lfg_init(&prng, 1); seeded = 1; }
+ for (int i = 0; i < size; i++)
+ tab[i] = av_lfg_get(&prng) % 256; /* keep the low byte of each draw */
+}
+
+static void test_motion(const char *name,
+ me_cmp_func test_func, me_cmp_func ref_func)
+{
+ int x, y, d1, d2, it;
+ uint8_t *ptr;
+ /* Compare test_func against the reference over a grid of img2 offsets, with h fixed at 8. */
+declare_func(int, struct MpegEncContext *c,
+ uint8_t *blk1 /* align width (8 or 16) */,
+ uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
+ int h);
+ /* Skip table slots that are empty or have no distinct optimized version. */
+ if (test_func == ref_func || test_func == NULL || ref_func == NULL) {
+ return;
+ }
+
+ /* test correctness: 20 rounds, each on freshly filled images */
+ for(it=0;it<20;it++) {
+ /* NOTE(review): check_func/call_ref/call_new/bench_new come from checkasm.h - confirm semantics there */
+ fill_random(img1, WIDTH * HEIGHT);
+ fill_random(img2, WIDTH * HEIGHT);
+
+ if (check_func(test_func, "%s", name)) {
+ for(y=0;y<HEIGHT-17;y++) { /* presumably 17 = 16-wide block + the extra row/column xy2 reads */
+ for(x=0;x<WIDTH-17;x++) {
+ ptr = img2 + y * WIDTH + x;
+ d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
+ d1 = call_new(NULL, img1, ptr, WIDTH, 8);
+
+ if (d1 != d2) {
+ fail();
+ printf("error: %s: new=%d ref=%d\n", name, d1, d2);
+ }
+ bench_new(NULL, img1, ptr, WIDTH, 8);
+ }
+ }
+ }
+ }
+ emms_c();
+}
+
+#define sizeof_array(ar) (sizeof(ar)/sizeof((ar)[0])) /* element count; ar must be a real array, not a pointer */
+/* X-macro: expands XX(member) once for each MECmpContext member that check_motion() indexes as a 1-D array. */
+#define ME_CMP_1D_ARRAYS(XX) \
+ XX(sad) \
+ XX(sse) \
+ XX(hadamard8_diff) \
+ XX(dct_sad) \
+ XX(quant_psnr) \
+ XX(bit) \
+ XX(rd) \
+ XX(vsad) \
+ XX(vsse) \
+ XX(nsse) \
+ XX(w53) \
+ XX(w97) \
+ XX(dct_max) \
+ XX(dct264_sad) \
+ XX(me_pre_cmp) \
+ XX(me_cmp) \
+ XX(me_sub_cmp) \
+ XX(mb_cmp) \
+ XX(ildct_cmp) \
+ XX(frame_skip_cmp) \
+ XX(median_sad)
+
+
+static void check_motion(void)
+{
+ char buf[64];
+ AVCodecContext *ctx;
+ MECmpContext c_ctx, ff_ctx;
+ const int saved_flags = av_get_cpu_flags(); /* put back once c_ctx is built */
+
+ memset(&c_ctx, 0, sizeof(c_ctx));
+ memset(&ff_ctx, 0, sizeof(ff_ctx));
+ ctx = avcodec_alloc_context3(NULL);
+ if (!ctx)
+ return; /* allocation failed: nothing can be set up or compared */
+ ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+ /* ff_ctx: built with the current CPU flags; c_ctx: flags forced to 0 to get the C versions. */
+ ff_me_cmp_init(&ff_ctx, ctx);
+ av_force_cpu_flags(0);
+ ff_me_cmp_init(&c_ctx, ctx);
+ av_force_cpu_flags(saved_flags); /* do not leak the override to code that runs later */
+ for (int i = 0; i < sizeof_array(c_ctx.pix_abs); i++) {
+ for (int j = 0; j < sizeof_array(c_ctx.pix_abs[0]); j++) {
+ snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
+ test_motion(buf, ff_ctx.pix_abs[i][j], c_ctx.pix_abs[i][j]);
+ }
+ }
+#define XX(me_cmp_array) \
+ for (int i = 0; i < sizeof_array(c_ctx.me_cmp_array); i++) { \
+ snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i); \
+ test_motion(buf, ff_ctx.me_cmp_array[i], c_ctx.me_cmp_array[i]); \
+ }
+ ME_CMP_1D_ARRAYS(XX)
+#undef XX
+ avcodec_free_context(&ctx); /* the context was previously never released */
+}
+
+void checkasm_check_motion(void)
+{
+ check_motion(); /* run the comparison for every pix_abs and me_cmp table entry */
+ report("motion"); /* report() comes from checkasm.h; presumably emits the result line for this group */
+}
--
2.32.0
More information about the ffmpeg-devel
mailing list