[FFmpeg-devel] [PATCH 13/17] swscale/ops_memcpy: add 'memcpy' backend for plane->plane copies
Niklas Haas
ffmpeg at haasn.xyz
Sun May 18 17:59:49 EEST 2025
From: Niklas Haas <git at haasn.dev>
Provides a generic fast path for any operation list that can be decomposed
into a series of memcpy and memset operations.
25% faster than the x86 backend for yuv444p -> yuva444p
33% faster than the x86 backend for gray -> yuvj444p
---
libswscale/Makefile | 1 +
libswscale/ops.c | 2 +
libswscale/ops_memcpy.c | 132 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 135 insertions(+)
create mode 100644 libswscale/ops_memcpy.c
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 6e5696c5a6..136d33f6bc 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -18,6 +18,7 @@ OBJS = alphablend.o \
ops.o \
ops_backend.o \
ops_chain.o \
+ ops_memcpy.o \
ops_optimizer.o \
options.o \
output.o \
diff --git a/libswscale/ops.c b/libswscale/ops.c
index ca6a5aacac..7e6590ec14 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -28,8 +28,10 @@
#include "ops_internal.h"
extern SwsOpBackend backend_c;
+extern SwsOpBackend backend_murder;
const SwsOpBackend * const ff_sws_op_backends[] = {
+ &backend_murder,
&backend_c,
NULL
};
diff --git a/libswscale/ops_memcpy.c b/libswscale/ops_memcpy.c
new file mode 100644
index 0000000000..dcdc6767a3
--- /dev/null
+++ b/libswscale/ops_memcpy.c
@@ -0,0 +1,132 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "ops_backend.h"
+
+/**
+ * Per-plane plan built by compile() and executed by process().
+ * All arrays are indexed by output plane.
+ */
+typedef struct MemcpyPriv {
+ int num_planes; /* number of output planes process() writes */
+ int index[4]; /* input plane to copy from, or -1 to clear plane */
+ uint8_t clear_value[4]; /* byte handed to memset() when index is -1 */
+} MemcpyPriv;
+
+/* Memcpy backend for trivial cases */
+
+/**
+ * Run a compiled copy/fill plan over the lines y_start .. y_end - 1.
+ *
+ * Each output plane is either filled with a constant byte or copied from the
+ * input plane selected by the plan. When input and output strides match, the
+ * whole span is handled by a single memcpy(); otherwise it is copied line by
+ * line.
+ *
+ * @param exec    execution state providing plane pointers and strides
+ * @param priv    the MemcpyPriv plan produced by compile()
+ * @param x_start must be 0 (asserted)
+ * @param y_start first line to process
+ * @param x_end   must equal exec->width (asserted)
+ * @param y_end   one past the last line to process
+ */
+static void process(const SwsOpExec *exec, const void *priv,
+                    int x_start, int y_start, int x_end, int y_end)
+{
+    const MemcpyPriv *plan = priv;
+    const int num_lines = y_end - y_start;
+    av_assert1(x_start == 0 && x_end == exec->width);
+
+    for (int plane = 0; plane < plan->num_planes; plane++) {
+        const int src_plane = plan->index[plane];
+        uint8_t *dst = exec->out[plane];
+
+        /* Negative index: this plane is a constant fill */
+        if (src_plane < 0) {
+            memset(dst, plan->clear_value[plane],
+                   exec->out_stride[plane] * num_lines);
+            continue;
+        }
+
+        /* Identical strides: the whole span can be copied in one go */
+        if (exec->out_stride[plane] == exec->in_stride[src_plane]) {
+            memcpy(dst, exec->in[src_plane],
+                   exec->out_stride[plane] * num_lines);
+            continue;
+        }
+
+        /* Differing strides: copy only the payload bytes of every line */
+        const int line_bytes = (x_end * exec->pixel_bits_out) >> 3;
+        const uint8_t *src = exec->in[src_plane];
+        for (int line = 0; line < num_lines; line++) {
+            memcpy(dst, src, line_bytes);
+            dst += exec->out_stride[plane];
+            src += exec->in_stride[src_plane];
+        }
+    }
+}
+
+/**
+ * Try to express an operation list as per-plane copies and constant fills.
+ *
+ * Walks the list while tracking, for each of the four components, either the
+ * input plane it currently refers to or the byte it has been cleared to.
+ * Accepted operations are READ and WRITE that are neither packed nor
+ * fractional, SWIZZLE without duplicate sources, and CLEAR with an integer
+ * constant whose bytes are all identical. Anything else rejects the list.
+ *
+ * @param ctx unused
+ * @param ops operation list to translate
+ * @param out receives the compiled operation; its priv is a heap copy of the
+ *            plan, released through av_free
+ * @return 0 on success, AVERROR(ENOTSUP) if the list cannot be expressed by
+ *         this backend, AVERROR(ENOMEM) if duplicating the plan fails
+ */
+static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
+{
+    MemcpyPriv p = {0};
+
+    for (int n = 0; n < ops->num_ops; n++) {
+        const SwsOp *op = &ops->ops[n];
+        switch (op->op) {
+        case SWS_OP_READ:
+            if (op->rw.packed || op->rw.frac)
+                return AVERROR(ENOTSUP);
+            /* Component i initially refers to input plane i */
+            for (int i = 0; i < op->rw.elems; i++)
+                p.index[i] = i;
+            break;
+
+        case SWS_OP_SWIZZLE: {
+            const MemcpyPriv orig = p;
+            for (int i = 0; i < 4; i++) {
+                /* Explicitly exclude swizzle masks that contain duplicates,
+                 * because these are wasteful to implement as a memcpy */
+                for (int j = 0; j < i; j++) {
+                    if (op->swizzle.in[i] == op->swizzle.in[j])
+                        return AVERROR(ENOTSUP);
+                }
+                /* Move the complete per-component state. The fill byte has
+                 * to travel together with the -1 marker, otherwise a CLEAR
+                 * that precedes this swizzle would end up filling the plane
+                 * with whatever byte was stored in the destination slot */
+                p.index[i]       = orig.index[op->swizzle.in[i]];
+                p.clear_value[i] = orig.clear_value[op->swizzle.in[i]];
+            }
+            break;
+        }
+
+        case SWS_OP_CLEAR:
+            for (int i = 0; i < 4; i++) {
+                /* Components with a zero denominator are left as they are */
+                if (!op->c.q4[i].den)
+                    continue;
+                if (op->c.q4[i].den != 1)
+                    return AVERROR(ENOTSUP);
+
+                /* Ensure all bytes to be cleared are the same, because we
+                 * can't memset on multi-byte sequences */
+                uint8_t val = op->c.q4[i].num & 0xFF;
+                uint32_t ref = val;
+                switch (ff_sws_pixel_type_size(op->type)) {
+                case 2: ref *= 0x101; break;
+                case 4: ref *= 0x1010101; break;
+                }
+                if (ref != op->c.q4[i].num)
+                    return AVERROR(ENOTSUP);
+                p.clear_value[i] = val;
+                p.index[i] = -1;
+            }
+            break;
+
+        case SWS_OP_WRITE:
+            if (op->rw.packed || op->rw.frac)
+                return AVERROR(ENOTSUP);
+            p.num_planes = op->rw.elems;
+            break;
+
+        default:
+            return AVERROR(ENOTSUP);
+        }
+    }
+
+    *out = (SwsCompiledOp) {
+        .block_size = 1,
+        .func = process,
+        .priv = av_memdup(&p, sizeof(p)),
+        .free = av_free,
+    };
+    return out->priv ? 0 : AVERROR(ENOMEM);
+}
+
+/* Backend descriptor for the memcpy/memset fast path. ops.c declares it
+ * extern and lists it in ff_sws_op_backends[] ahead of backend_c. */
+SwsOpBackend backend_murder = {
+ .name = "memcpy",
+ .compile = compile,
+};
--
2.49.0
More information about the ffmpeg-devel
mailing list