[FFmpeg-devel] [PATCH v2 1/2] checkasm: sw_rgb: Add a test for interleaveBytes
Martin Storsjö
martin at martin.st
Fri May 15 21:11:09 EEST 2020
---
This depends on "checkasm: Add functions for printing pixel buffers".
The existing x86 implementations of interleaveBytes seem to slow
down significantly for unaligned copies (GCC 7.5, Sandy Bridge):
interleave_bytes_c: 36251.6
interleave_bytes_mmx: 10038.8
interleave_bytes_mmxext: 58450.3
interleave_bytes_sse2: 57746.3
In the properly aligned case, they behave better:
interleave_bytes_aligned_c: 36109.8
interleave_bytes_aligned_mmx: 6033.8
interleave_bytes_aligned_mmxext: 6473.1
interleave_bytes_aligned_sse2: 6163.1
But with Clang (Xcode 11.3, run on Kaby Lake), the (autovectorized?) C
version seems to beat all the asm implementations:
interleave_bytes_c: 9893.0
interleave_bytes_mmx: 23153.5
interleave_bytes_mmxext: 43693.8
interleave_bytes_sse2: 55894.8
interleave_bytes_aligned_c: 3456.0
interleave_bytes_aligned_mmx: 5780.0
interleave_bytes_aligned_mmxext: 4913.8
interleave_bytes_aligned_sse2: 4154.3
v2: Extended the test further to test all combinations of negative
strides for all three buffers.
---
tests/checkasm/sw_rgb.c | 70 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 70 insertions(+)
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index 000420d8f7..1e8ea151c0 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -111,6 +111,73 @@ static void check_uyvy_to_422p(void)
}
}
+// Compare the call_ref and call_new outputs of interleaveBytes on offset
+// buffers: one random width plus widths 1..16, covering every combination
+// of negative strides, then benchmark an unaligned and an aligned call.
+static void check_interleave_bytes(void)
+{
+    LOCAL_ALIGNED_16(uint8_t, src0_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, src1_buf, [MAX_STRIDE*MAX_HEIGHT+1]);
+    LOCAL_ALIGNED_16(uint8_t, dst0_buf, [2*MAX_STRIDE*MAX_HEIGHT+2]);
+    LOCAL_ALIGNED_16(uint8_t, dst1_buf, [2*MAX_STRIDE*MAX_HEIGHT+2]);
+    // Offset on purpose: this function has no alignment requirements.
+    uint8_t *src0 = src0_buf + 1;
+    uint8_t *src1 = src1_buf + 1;
+    uint8_t *dst0 = dst0_buf + 2;
+    uint8_t *dst1 = dst1_buf + 2;
+
+    declare_func_emms(AV_CPU_FLAG_MMX, void, const uint8_t *, const uint8_t *,
+                      uint8_t *, int, int, int, int, int);
+
+    randomize_buffers(src0, MAX_STRIDE * MAX_HEIGHT);
+    randomize_buffers(src1, MAX_STRIDE * MAX_HEIGHT);
+
+    if (check_func(interleaveBytes, "interleave_bytes")) {
+        for (int i = 0; i <= 16; i++) {
+            // i == 0 uses one random width; i in [1,16] is used as the width.
+            int w = i > 0 ? i : (1 + (rnd() % (MAX_STRIDE-2)));
+            int h = 1 + (rnd() % (MAX_HEIGHT-2));
+            int src0_offset = 0, src0_stride = MAX_STRIDE;
+            int src1_offset = 0, src1_stride = MAX_STRIDE;
+            int dst_offset  = 0, dst_stride  = 2 * MAX_STRIDE;
+
+            memset(dst0, 0, 2 * MAX_STRIDE * MAX_HEIGHT);
+            memset(dst1, 0, 2 * MAX_STRIDE * MAX_HEIGHT);
+
+            // Bits 0-2 of i select which buffers get a negative stride
+            if (i & 1) {
+                src0_offset = (h-1)*src0_stride;
+                src0_stride = -src0_stride;
+            }
+            if (i & 2) {
+                src1_offset = (h-1)*src1_stride;
+                src1_stride = -src1_stride;
+            }
+            if (i & 4) {
+                dst_offset = (h-1)*dst_stride;
+                dst_stride = -dst_stride;
+            }
+
+            call_ref(src0 + src0_offset, src1 + src1_offset, dst0 + dst_offset,
+                     w, h, src0_stride, src1_stride, dst_stride);
+            call_new(src0 + src0_offset, src1 + src1_offset, dst1 + dst_offset,
+                     w, h, src0_stride, src1_stride, dst_stride);
+            // Also compare one extra pixel pair to the right and one extra
+            // row below the destination area, to catch overwrites past the end.
+            checkasm_check(uint8_t, dst0, 2*MAX_STRIDE, dst1, 2*MAX_STRIDE,
+                           2 * w + 2, h + 1, "dst");
+        }
+
+        bench_new(src0, src1, dst1, 127, MAX_HEIGHT,
+                  MAX_STRIDE, MAX_STRIDE, 2*MAX_STRIDE);
+    }
+    if (check_func(interleaveBytes, "interleave_bytes_aligned")) {
+        // Bench a more typical case, with aligned buffers and widths.
+        bench_new(src0_buf, src1_buf, dst1_buf, 128, MAX_HEIGHT,
+                  MAX_STRIDE, MAX_STRIDE, 2*MAX_STRIDE);
+    }
+}
+
void checkasm_check_sw_rgb(void)
{
ff_sws_rgb2rgb_init();
@@ -132,4 +199,7 @@ void checkasm_check_sw_rgb(void)
check_uyvy_to_422p();
report("uyvytoyuv422");
+
+ check_interleave_bytes();
+ report("interleave_bytes");
}
--
2.17.1
More information about the ffmpeg-devel
mailing list