[FFmpeg-devel] [PATCH 08/17] swscale/ops: introduce new low level framework
Niklas Haas
ffmpeg at haasn.xyz
Sat Apr 26 20:41:12 EEST 2025
From: Niklas Haas <git at haasn.dev>
See the design document added in the previous commit for an in-depth
introduction to the new approach.
This commit merely introduces the common dispatch code, the SwsOpList
boilerplate, and the high-level optimizer. The subsequent commits will
add the underlying implementations.
---
libswscale/Makefile | 2 +
libswscale/ops.c | 843 +++++++++++++++++++++++++++++++++++++
libswscale/ops.h | 265 ++++++++++++
libswscale/ops_internal.h | 103 +++++
libswscale/ops_optimizer.c | 810 +++++++++++++++++++++++++++++++++++
5 files changed, 2023 insertions(+)
create mode 100644 libswscale/ops.c
create mode 100644 libswscale/ops.h
create mode 100644 libswscale/ops_internal.h
create mode 100644 libswscale/ops_optimizer.c
diff --git a/libswscale/Makefile b/libswscale/Makefile
index d5e10d17dc..810c9dee78 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -15,6 +15,8 @@ OBJS = alphablend.o \
graph.o \
input.o \
lut3d.o \
+ ops.o \
+ ops_optimizer.o \
options.o \
output.o \
rgb2rgb.o \
diff --git a/libswscale/ops.c b/libswscale/ops.c
new file mode 100644
index 0000000000..6d9a844e06
--- /dev/null
+++ b/libswscale/ops.c
@@ -0,0 +1,843 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/bswap.h"
+#include "libavutil/mem.h"
+#include "libavutil/rational.h"
+#include "libavutil/refstruct.h"
+
+#include "ops.h"
+#include "ops_internal.h"
+
+const SwsOpBackend * const ff_sws_op_backends[] = {
+ NULL
+};
+
+const int ff_sws_num_op_backends = FF_ARRAY_ELEMS(ff_sws_op_backends) - 1;
+
+#define Q(N) ((AVRational) { N, 1 })
+
+#define RET(x) \
+ do { \
+ if ((ret = (x)) < 0) \
+ return ret; \
+ } while (0)
+
+const char *ff_sws_pixel_type_name(SwsPixelType type)
+{
+ switch (type) {
+ case SWS_PIXEL_U8: return "u8";
+ case SWS_PIXEL_U16: return "u16";
+ case SWS_PIXEL_U32: return "u32";
+ case SWS_PIXEL_F32: return "f32";
+ case SWS_PIXEL_NONE: return "none";
+ case SWS_PIXEL_TYPE_NB: break;
+ }
+
+ av_assert0(!"Invalid pixel type!");
+ return "ERR";
+}
+
+int ff_sws_pixel_type_size(SwsPixelType type)
+{
+ switch (type) {
+ case SWS_PIXEL_U8: return sizeof(uint8_t);
+ case SWS_PIXEL_U16: return sizeof(uint16_t);
+ case SWS_PIXEL_U32: return sizeof(uint32_t);
+ case SWS_PIXEL_F32: return sizeof(float);
+ case SWS_PIXEL_NONE: break;
+ case SWS_PIXEL_TYPE_NB: break;
+ }
+
+ av_assert0(!"Invalid pixel type!");
+ return 0;
+}
+
+bool ff_sws_pixel_type_is_int(SwsPixelType type)
+{
+ switch (type) {
+ case SWS_PIXEL_U8:
+ case SWS_PIXEL_U16:
+ case SWS_PIXEL_U32:
+ return true;
+ case SWS_PIXEL_F32:
+ return false;
+ case SWS_PIXEL_NONE:
+ case SWS_PIXEL_TYPE_NB: break;
+ }
+
+ av_assert0(!"Invalid pixel type!");
+ return false;
+}
+
+SwsPixelType ff_sws_pixel_type_to_uint(SwsPixelType type)
+{
+ if (!type)
+ return type;
+
+ switch (ff_sws_pixel_type_size(type)) {
+ case 1: return SWS_PIXEL_U8;
+ case 2: return SWS_PIXEL_U16;
+ case 4: return SWS_PIXEL_U32;
+ }
+
+ av_assert0(!"Invalid pixel type!");
+ return SWS_PIXEL_NONE;
+}
+
+/* biased towards `a` */
+static AVRational av_min_q(AVRational a, AVRational b)
+{
+ return av_cmp_q(a, b) == 1 ? b : a;
+}
+
+static AVRational av_max_q(AVRational a, AVRational b)
+{
+ return av_cmp_q(a, b) == -1 ? b : a;
+}
+
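+/* e.g. expanding u8 -> u16 multiplies by 0x0101 (0xAB becomes 0xABAB),
+ * and u8 -> u32 multiplies by 0x01010101 */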
+static AVRational expand_factor(SwsPixelType from, SwsPixelType to)
+{
+ const int src = ff_sws_pixel_type_size(from);
+ const int dst = ff_sws_pixel_type_size(to);
+ int scale = 0;
+ for (int i = 0; i < dst / src; i++)
+ scale = scale << src * 8 | 1;
+ return Q(scale);
+}
+
+void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
+{
+ switch (op->op) {
+ case SWS_OP_READ:
+ case SWS_OP_WRITE:
+ return;
+ case SWS_OP_UNPACK: {
+ unsigned val = x[0].num;
+ int shift = ff_sws_pixel_type_size(op->type) * 8;
+ for (int i = 0; i < 4; i++) {
+ const unsigned mask = (1 << op->pack.pattern[i]) - 1;
+ shift -= op->pack.pattern[i];
+ x[i] = Q((val >> shift) & mask);
+ }
+ return;
+ }
+ case SWS_OP_PACK: {
+ unsigned val = 0;
+ int shift = ff_sws_pixel_type_size(op->type) * 8;
+ for (int i = 0; i < 4; i++) {
+ const unsigned mask = (1 << op->pack.pattern[i]) - 1;
+ shift -= op->pack.pattern[i];
+ val |= (x[i].num & mask) << shift;
+ }
+ x[0] = Q(val);
+ return;
+ }
+ case SWS_OP_SWAP_BYTES:
+ switch (ff_sws_pixel_type_size(op->type)) {
+ case 2:
+ for (int i = 0; i < 4; i++)
+ x[i].num = av_bswap16(x[i].num);
+ break;
+ case 4:
+ for (int i = 0; i < 4; i++)
+ x[i].num = av_bswap32(x[i].num);
+ break;
+ }
+ return;
+ case SWS_OP_CLEAR:
+ for (int i = 0; i < 4; i++) {
+ if (op->c.q4[i].den)
+ x[i] = op->c.q4[i];
+ }
+ return;
+ case SWS_OP_LSHIFT: {
+ AVRational mult = Q(1 << op->c.u);
+ for (int i = 0; i < 4; i++)
+ x[i] = x[i].den ? av_mul_q(x[i], mult) : x[i];
+ return;
+ }
+ case SWS_OP_RSHIFT: {
+ AVRational mult = Q(1 << op->c.u);
+ for (int i = 0; i < 4; i++)
+ x[i] = x[i].den ? av_div_q(x[i], mult) : x[i];
+ return;
+ }
+ case SWS_OP_SWIZZLE: {
+ const AVRational orig[4] = { x[0], x[1], x[2], x[3] };
+ for (int i = 0; i < 4; i++)
+ x[i] = orig[op->swizzle.in[i]];
+ return;
+ }
+ case SWS_OP_CONVERT:
+ if (ff_sws_pixel_type_is_int(op->convert.to)) {
+ const AVRational scale = expand_factor(op->type, op->convert.to);
+ for (int i = 0; i < 4; i++) {
+ x[i] = x[i].den ? Q(x[i].num / x[i].den) : x[i];
+ if (op->convert.expand)
+ x[i] = av_mul_q(x[i], scale);
+ }
+ }
+ return;
+ case SWS_OP_DITHER:
+ for (int i = 0; i < 4; i++)
+ x[i] = x[i].den ? av_add_q(x[i], av_make_q(1, 2)) : x[i];
+ return;
+ case SWS_OP_MIN:
+ for (int i = 0; i < 4; i++)
+ x[i] = av_min_q(x[i], op->c.q4[i]);
+ return;
+ case SWS_OP_MAX:
+ for (int i = 0; i < 4; i++)
+ x[i] = av_max_q(x[i], op->c.q4[i]);
+ return;
+ case SWS_OP_LINEAR: {
+ const AVRational orig[4] = { x[0], x[1], x[2], x[3] };
+ for (int i = 0; i < 4; i++) {
+ AVRational sum = op->lin.m[i][4];
+ for (int j = 0; j < 4; j++)
+ sum = av_add_q(sum, av_mul_q(orig[j], op->lin.m[i][j]));
+ x[i] = sum;
+ }
+ return;
+ }
+ case SWS_OP_SCALE:
+ for (int i = 0; i < 4; i++)
+ x[i] = x[i].den ? av_mul_q(x[i], op->c.q) : x[i];
+ return;
+ }
+
+ av_assert0(!"Invalid operation type!");
+}
+
+void ff_sws_op_uninit(SwsOp *op)
+{
+ switch (op->op) {
+ case SWS_OP_DITHER:
+ av_refstruct_unref(&op->dither.matrix);
+ break;
+ }
+
+ *op = (SwsOp) {0};
+}
+
+SwsOpList *ff_sws_op_list_alloc(void)
+{
+ return av_mallocz(sizeof(SwsOpList));
+}
+
+void ff_sws_op_list_free(SwsOpList **p_ops)
+{
+ SwsOpList *ops = *p_ops;
+ if (!ops)
+ return;
+
+ for (int i = 0; i < ops->num_ops; i++)
+ ff_sws_op_uninit(&ops->ops[i]);
+
+ av_freep(&ops->ops);
+ av_free(ops);
+ *p_ops = NULL;
+}
+
+SwsOpList *ff_sws_op_list_duplicate(const SwsOpList *ops)
+{
+ SwsOpList *copy = av_malloc(sizeof(*copy));
+ if (!copy)
+ return NULL;
+
+ *copy = *ops;
+ copy->ops = av_memdup(ops->ops, ops->num_ops * sizeof(ops->ops[0]));
+ if (!copy->ops) {
+ av_free(copy);
+ return NULL;
+ }
+
+ for (int i = 0; i < ops->num_ops; i++) {
+ const SwsOp *op = &ops->ops[i];
+ switch (op->op) {
+ case SWS_OP_DITHER:
+ av_refstruct_ref(copy->ops[i].dither.matrix);
+ break;
+ }
+ }
+
+ return copy;
+}
+
+void ff_sws_op_list_remove_at(SwsOpList *ops, int index, int count)
+{
+ const int end = ops->num_ops - count;
+ av_assert2(index >= 0 && count >= 0 && index + count <= ops->num_ops);
+ for (int i = index; i < end; i++)
+ ops->ops[i] = ops->ops[i + count];
+ ops->num_ops = end;
+}
+
+int ff_sws_op_list_insert_at(SwsOpList *ops, int index, SwsOp *op)
+{
+ void *ret;
+ ret = av_dynarray2_add((void **) &ops->ops, &ops->num_ops, sizeof(*op),
+ (const void *) op);
+ if (!ret) {
+ ff_sws_op_uninit(op);
+ return AVERROR(ENOMEM);
+ }
+
+ for (int i = ops->num_ops - 1; i > index; i--)
+ ops->ops[i] = ops->ops[i - 1];
+ ops->ops[index] = *op;
+ *op = (SwsOp) {0};
+ return 0;
+}
+
+int ff_sws_op_list_append(SwsOpList *ops, SwsOp *op)
+{
+ return ff_sws_op_list_insert_at(ops, ops->num_ops, op);
+}
+
+int ff_sws_op_list_max_size(const SwsOpList *ops)
+{
+ int max_size = 0;
+ for (int i = 0; i < ops->num_ops; i++) {
+ const int size = ff_sws_pixel_type_size(ops->ops[i].type);
+ max_size = FFMAX(max_size, size);
+ }
+
+ return max_size;
+}
+
+uint32_t ff_sws_linear_mask(const SwsLinearOp c)
+{
+ uint32_t mask = 0;
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 5; j++) {
+ if (av_cmp_q(c.m[i][j], Q(i == j)))
+ mask |= SWS_MASK(i, j);
+ }
+ }
+ return mask;
+}
+
+static const char *describe_lin_mask(uint32_t mask)
+{
+ /* Try to be fairly descriptive without assuming too much */
+ static const struct {
+ const char *name;
+ uint32_t mask;
+ } patterns[] = {
+ { "noop", 0 },
+ { "luma", SWS_MASK_LUMA },
+ { "alpha", SWS_MASK_ALPHA },
+ { "luma+alpha", SWS_MASK_LUMA | SWS_MASK_ALPHA },
+ { "dot3", 0b111 },
+ { "dot4", 0b1111 },
+ { "row0", SWS_MASK_ROW(0) },
+ { "row0+alpha", SWS_MASK_ROW(0) | SWS_MASK_ALPHA },
+ { "col0", SWS_MASK_COL(0) },
+ { "col0+off3", SWS_MASK_COL(0) | SWS_MASK_OFF3 },
+ { "off3", SWS_MASK_OFF3 },
+ { "off3+alpha", SWS_MASK_OFF3 | SWS_MASK_ALPHA },
+ { "diag3", SWS_MASK_DIAG3 },
+ { "diag4", SWS_MASK_DIAG4 },
+ { "diag3+alpha", SWS_MASK_DIAG3 | SWS_MASK_ALPHA },
+ { "diag3+off3", SWS_MASK_DIAG3 | SWS_MASK_OFF3 },
+ { "diag3+off3+alpha", SWS_MASK_DIAG3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA },
+ { "diag4+off4", SWS_MASK_DIAG4 | SWS_MASK_OFF4 },
+ { "matrix3", SWS_MASK_MAT3 },
+ { "matrix3+off3", SWS_MASK_MAT3 | SWS_MASK_OFF3 },
+ { "matrix3+off3+alpha", SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA },
+ { "matrix4", SWS_MASK_MAT4 },
+ { "matrix4+off4", SWS_MASK_MAT4 | SWS_MASK_OFF4 },
+ };
+
+ for (int i = 0; i < FF_ARRAY_ELEMS(patterns); i++) {
+ if (!(mask & ~patterns[i].mask))
+ return patterns[i].name;
+ }
+
+ return "full";
+}
+
+static char describe_comp_flags(unsigned flags)
+{
+ if (flags & SWS_COMP_GARBAGE)
+ return 'X';
+ else if (flags & SWS_COMP_ZERO)
+ return '0';
+ else if (flags & SWS_COMP_EXACT)
+ return '+';
+ else
+ return '.';
+}
+
+static const char *print_q(const AVRational q, char buf[], int buf_len)
+{
+ if (!q.den) {
+ switch (q.num) {
+ case 1: return "inf";
+ case -1: return "-inf";
+ default: return "nan";
+ }
+ }
+
+ if (q.den == 1) {
+ snprintf(buf, buf_len, "%d", q.num);
+ return buf;
+ }
+
+ if (abs(q.num) > 1000 || abs(q.den) > 1000) {
+ snprintf(buf, buf_len, "%f", av_q2d(q));
+ return buf;
+ }
+
+ snprintf(buf, buf_len, "%d/%d", q.num, q.den);
+ return buf;
+}
+
+#define PRINTQ(q) print_q(q, (char[32]){0}, sizeof(char[32]) - 1)
+
+
+void ff_sws_op_list_print(void *log, int lev, const SwsOpList *ops)
+{
+ if (!ops->num_ops) {
+ av_log(log, lev, " (empty)\n");
+ return;
+ }
+
+ for (int i = 0; i < ops->num_ops; i++) {
+ const SwsOp *op = &ops->ops[i];
+ av_log(log, lev, " [%3s %c%c%c%c -> %c%c%c%c] ",
+ ff_sws_pixel_type_name(op->type),
+ op->comps.unused[0] ? 'X' : '.',
+ op->comps.unused[1] ? 'X' : '.',
+ op->comps.unused[2] ? 'X' : '.',
+ op->comps.unused[3] ? 'X' : '.',
+ describe_comp_flags(op->comps.flags[0]),
+ describe_comp_flags(op->comps.flags[1]),
+ describe_comp_flags(op->comps.flags[2]),
+ describe_comp_flags(op->comps.flags[3]));
+
+ switch (op->op) {
+ case SWS_OP_INVALID:
+ av_log(log, lev, "SWS_OP_INVALID\n");
+ break;
+ case SWS_OP_READ:
+ case SWS_OP_WRITE:
+ av_log(log, lev, "%-20s: %d elem(s) %s >> %d\n",
+ op->op == SWS_OP_READ ? "SWS_OP_READ"
+ : "SWS_OP_WRITE",
+ op->rw.elems, op->rw.packed ? "packed" : "planar",
+ op->rw.frac);
+ break;
+ case SWS_OP_SWAP_BYTES:
+ av_log(log, lev, "SWS_OP_SWAP_BYTES\n");
+ break;
+ case SWS_OP_LSHIFT:
+ av_log(log, lev, "%-20s: << %u\n", "SWS_OP_LSHIFT", op->c.u);
+ break;
+ case SWS_OP_RSHIFT:
+ av_log(log, lev, "%-20s: >> %u\n", "SWS_OP_RSHIFT", op->c.u);
+ break;
+ case SWS_OP_PACK:
+ case SWS_OP_UNPACK:
+ av_log(log, lev, "%-20s: {%d %d %d %d}\n",
+ op->op == SWS_OP_PACK ? "SWS_OP_PACK"
+ : "SWS_OP_UNPACK",
+ op->pack.pattern[0], op->pack.pattern[1],
+ op->pack.pattern[2], op->pack.pattern[3]);
+ break;
+ case SWS_OP_CLEAR:
+ av_log(log, lev, "%-20s: {%s %s %s %s}\n", "SWS_OP_CLEAR",
+ op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
+ op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
+ op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
+ op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_");
+ break;
+ case SWS_OP_SWIZZLE:
+ av_log(log, lev, "%-20s: %d%d%d%d\n", "SWS_OP_SWIZZLE",
+ op->swizzle.x, op->swizzle.y, op->swizzle.z, op->swizzle.w);
+ break;
+ case SWS_OP_CONVERT:
+ av_log(log, lev, "%-20s: %s -> %s%s\n", "SWS_OP_CONVERT",
+ ff_sws_pixel_type_name(op->type),
+ ff_sws_pixel_type_name(op->convert.to),
+ op->convert.expand ? " (expand)" : "");
+ break;
+ case SWS_OP_DITHER:
+ av_log(log, lev, "%-20s: %dx%d matrix\n", "SWS_OP_DITHER",
+ 1 << op->dither.size_log2, 1 << op->dither.size_log2);
+ break;
+ case SWS_OP_MIN:
+ av_log(log, lev, "%-20s: x <= {%s %s %s %s}\n", "SWS_OP_MIN",
+ op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
+ op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
+ op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
+ op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_");
+ break;
+ case SWS_OP_MAX:
+ av_log(log, lev, "%-20s: {%s %s %s %s} <= x\n", "SWS_OP_MAX",
+ op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
+ op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
+ op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
+ op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_");
+ break;
+ case SWS_OP_LINEAR:
+ av_log(log, lev, "%-20s: %s [[%s %s %s %s %s] "
+ "[%s %s %s %s %s] "
+ "[%s %s %s %s %s] "
+ "[%s %s %s %s %s]]\n",
+ "SWS_OP_LINEAR", describe_lin_mask(op->lin.mask),
+ PRINTQ(op->lin.m[0][0]), PRINTQ(op->lin.m[0][1]), PRINTQ(op->lin.m[0][2]), PRINTQ(op->lin.m[0][3]), PRINTQ(op->lin.m[0][4]),
+ PRINTQ(op->lin.m[1][0]), PRINTQ(op->lin.m[1][1]), PRINTQ(op->lin.m[1][2]), PRINTQ(op->lin.m[1][3]), PRINTQ(op->lin.m[1][4]),
+ PRINTQ(op->lin.m[2][0]), PRINTQ(op->lin.m[2][1]), PRINTQ(op->lin.m[2][2]), PRINTQ(op->lin.m[2][3]), PRINTQ(op->lin.m[2][4]),
+ PRINTQ(op->lin.m[3][0]), PRINTQ(op->lin.m[3][1]), PRINTQ(op->lin.m[3][2]), PRINTQ(op->lin.m[3][3]), PRINTQ(op->lin.m[3][4]));
+ break;
+ case SWS_OP_SCALE:
+ av_log(log, lev, "%-20s: * %s\n", "SWS_OP_SCALE",
+ PRINTQ(op->c.q));
+ break;
+ case SWS_OP_TYPE_NB:
+ break;
+ }
+
+ if (op->comps.min[0].den || op->comps.min[1].den ||
+ op->comps.min[2].den || op->comps.min[3].den ||
+ op->comps.max[0].den || op->comps.max[1].den ||
+ op->comps.max[2].den || op->comps.max[3].den)
+ {
+ av_log(log, AV_LOG_TRACE, " min: {%s, %s, %s, %s}, max: {%s, %s, %s, %s}\n",
+ PRINTQ(op->comps.min[0]), PRINTQ(op->comps.min[1]),
+ PRINTQ(op->comps.min[2]), PRINTQ(op->comps.min[3]),
+ PRINTQ(op->comps.max[0]), PRINTQ(op->comps.max[1]),
+ PRINTQ(op->comps.max[2]), PRINTQ(op->comps.max[3]));
+ }
+
+ }
+
+ av_log(log, lev, " (X = unused, + = exact, 0 = zero)\n");
+}
+
+typedef struct SwsOpPass {
+ SwsCompiledOp comp;
+ SwsOpExec exec_base;
+ int num_blocks;
+ int tail_off_in;
+ int tail_off_out;
+ int tail_size_in;
+ int tail_size_out;
+ bool memcpy_in;
+ bool memcpy_out;
+} SwsOpPass;
+
+static void op_pass_free(void *ptr)
+{
+ SwsOpPass *p = ptr;
+ if (!p)
+ return;
+
+ if (p->comp.free)
+ p->comp.free(p->comp.priv);
+
+ av_free(p);
+}
+
+static void op_pass_setup(const SwsImg *out, const SwsImg *in, const SwsPass *pass)
+{
+ const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->fmt);
+ const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->fmt);
+
+ SwsOpPass *p = pass->priv;
+ SwsOpExec *exec = &p->exec_base;
+ const SwsCompiledOp *comp = &p->comp;
+ const int block_size = comp->block_size;
+ p->num_blocks = (pass->width + block_size - 1) / block_size;
+
+ /* Set up main loop parameters */
+ const int aligned_w = p->num_blocks * block_size;
+ const int safe_width = (p->num_blocks - 1) * block_size;
+ const int tail_size = pass->width - safe_width;
+ p->tail_off_in = safe_width * exec->pixel_bits_in >> 3;
+ p->tail_off_out = safe_width * exec->pixel_bits_out >> 3;
+ p->tail_size_in = tail_size * exec->pixel_bits_in >> 3;
+ p->tail_size_out = tail_size * exec->pixel_bits_out >> 3;
+ p->memcpy_in = false;
+ p->memcpy_out = false;
+
+ for (int i = 0; i < 4 && in->data[i]; i++) {
+ const int sub_x = (i == 1 || i == 2) ? indesc->log2_chroma_w : 0;
+ const int plane_w = (aligned_w + sub_x) >> sub_x;
+ const int plane_pad = (comp->over_read + sub_x) >> sub_x;
+ const int plane_size = plane_w * exec->pixel_bits_in >> 3;
+ p->memcpy_in |= plane_size + plane_pad > in->linesize[i];
+ exec->in_stride[i] = in->linesize[i];
+ }
+
+ for (int i = 0; i < 4 && out->data[i]; i++) {
+ const int sub_x = (i == 1 || i == 2) ? outdesc->log2_chroma_w : 0;
+ const int plane_w = (aligned_w + sub_x) >> sub_x;
+ const int plane_pad = (comp->over_write + sub_x) >> sub_x;
+ const int plane_size = plane_w * exec->pixel_bits_out >> 3;
+ p->memcpy_out |= plane_size + plane_pad > out->linesize[i];
+ exec->out_stride[i] = out->linesize[i];
+ }
+}
+
+/* Dispatch kernel over the last column of the image using memcpy */
+static av_always_inline void
+handle_tail(const SwsOpPass *p, SwsOpExec *exec,
+ const SwsImg *out_base, const bool copy_out,
+ const SwsImg *in_base, const bool copy_in,
+ const int y, const int y_end)
+{
+ DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];
+
+ const SwsCompiledOp *comp = &p->comp;
+ const int block_size = comp->block_size;
+ const int tail_size_in = p->tail_size_in;
+ const int tail_size_out = p->tail_size_out;
+
+ SwsImg in = ff_sws_img_shift(*in_base, y);
+ SwsImg out = ff_sws_img_shift(*out_base, y);
+ for (int i = 0; i < 4 && in.data[i]; i++) {
+ in.data[i] += p->tail_off_in;
+ if (copy_in) {
+ exec->in[i] = (void *) tmp[0][i];
+ exec->in_stride[i] = sizeof(tmp[0][i]);
+ } else {
+ exec->in[i] = in.data[i];
+ }
+ }
+
+ for (int i = 0; i < 4 && out.data[i]; i++) {
+ out.data[i] += p->tail_off_out;
+ if (copy_out) {
+ exec->out[i] = (void *) tmp[1][i];
+ exec->out_stride[i] = sizeof(tmp[1][i]);
+ } else {
+ exec->out[i] = out.data[i];
+ }
+ }
+
+ exec->x = (p->num_blocks - 1) * block_size;
+ for (exec->y = y; exec->y < y_end; exec->y++) {
+ if (copy_in) {
+ for (int i = 0; i < 4 && in.data[i]; i++) {
+ av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]);
+ memcpy(tmp[0][i], in.data[i], tail_size_in);
+ in.data[i] += in.linesize[i];
+ }
+ }
+
+ comp->func(exec, comp->priv, 1);
+
+ if (copy_out) {
+ for (int i = 0; i < 4 && out.data[i]; i++) {
+ av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]);
+ memcpy(out.data[i], tmp[1][i], tail_size_out);
+ out.data[i] += out.linesize[i];
+ }
+ }
+
+ for (int i = 0; i < 4; i++) {
+ if (!copy_in)
+ exec->in[i] += in.linesize[i];
+ if (!copy_out)
+ exec->out[i] += out.linesize[i];
+ }
+ }
+}
+
+static av_always_inline void
+op_pass_run(const SwsImg *out_base, const SwsImg *in_base,
+ const int y, const int h, const SwsPass *pass)
+{
+ const SwsOpPass *p = pass->priv;
+ const SwsCompiledOp *comp = &p->comp;
+
+ /* Fill exec metadata for this slice */
+ const SwsImg in = ff_sws_img_shift(*in_base, y);
+ const SwsImg out = ff_sws_img_shift(*out_base, y);
+ SwsOpExec exec = p->exec_base;
+ exec.slice_y = y;
+ exec.slice_h = h;
+ for (int i = 0; i < 4; i++) {
+ exec.in[i] = in.data[i];
+ exec.out[i] = out.data[i];
+ }
+
+ /**
+ * To ensure safety, we need to consider the following:
+ *
+ * 1. We can overread the input, unless this is the last line of an
+ * unpadded buffer. All operation chains must be able to handle
+ * arbitrary pixel input, so arbitrary overread is fine.
+ *
+ * 2. We can overwrite the output, as long as we don't write more than the
+ * amount of pixels that fit into one linesize. So we always need to
+ * memcpy the last column on the output side if unpadded.
+ *
+ * 3. For the last row, we also need to memcpy the remainder of the input,
+ * to avoid reading past the end of the buffer. Note that since we know
+ * the run() function is called on stripes of the same buffer, we don't
+ * need to worry about this for the end of a slice.
+ */
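+ /* Worked example (illustrative): width = 1000 with block_size = 16 gives
+ * num_blocks = 63; the first 62 blocks (992 pixels) are processed
+ * directly, and the trailing 8-pixel column is bounced through
+ * handle_tail() when a memcpy is required. */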
+
+ const int last_slice = y + h == pass->height;
+ const bool memcpy_in = last_slice && p->memcpy_in;
+ const bool memcpy_out = p->memcpy_out;
+ const int num_blocks = p->num_blocks;
+ const int blocks_main = num_blocks - memcpy_out;
+ const int y_end = y + h - memcpy_in;
+
+ /* Handle main section */
+ for (exec.y = y; exec.y < y_end; exec.y++) {
+ comp->func(&exec, comp->priv, blocks_main);
+ for (int i = 0; i < 4; i++) {
+ exec.in[i] += in.linesize[i];
+ exec.out[i] += out.linesize[i];
+ }
+ }
+
+ if (memcpy_in)
+ comp->func(&exec, comp->priv, num_blocks - 1); /* safe part of last row */
+
+ /* Handle last column via memcpy, takes over `exec` so call these last */
+ if (memcpy_out)
+ handle_tail(p, &exec, out_base, true, in_base, false, y, y_end);
+ if (memcpy_in)
+ handle_tail(p, &exec, out_base, memcpy_out, in_base, true, y_end, y + h);
+}
+
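+/* e.g. packed rgba (4x u8) gives 4 * 1 * 8 = 32 bits per pixel, while
+ * monow (1x u8, frac 3) gives 1 * 1 * (8 >> 3) = 1 bit per pixel */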
+static int rw_pixel_bits(const SwsOp *op)
+{
+ const int elems = op->rw.packed ? op->rw.elems : 1;
+ const int size = ff_sws_pixel_type_size(op->type);
+ const int bits = 8 >> op->rw.frac;
+ av_assert1(bits >= 1);
+ return elems * size * bits;
+}
+
+int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
+ const SwsOpList *ops, SwsCompiledOp *out)
+{
+ SwsOpList *copy, rest;
+ int ret = 0;
+
+ copy = ff_sws_op_list_duplicate(ops);
+ if (!copy)
+ return AVERROR(ENOMEM);
+
+ /* Ensure these are always set during compilation */
+ ff_sws_op_list_update_comps(copy);
+
+ /* Make an on-stack copy of the list header, so that the backend can
+ * consume `rest` freely while `copy` still tracks the original
+ * allocations for cleanup */
+ rest = *copy;
+
+ ret = backend->compile(ctx, &rest, out);
+ if (ret == AVERROR(ENOTSUP)) {
+ av_log(ctx, AV_LOG_DEBUG, "Backend '%s' does not support operations:\n", backend->name);
+ ff_sws_op_list_print(ctx, AV_LOG_DEBUG, &rest);
+ } else if (ret < 0) {
+ av_log(ctx, AV_LOG_ERROR, "Failed to compile operations: %s\n", av_err2str(ret));
+ ff_sws_op_list_print(ctx, AV_LOG_ERROR, &rest);
+ }
+
+ ff_sws_op_list_free(&copy);
+ return ret;
+}
+
+int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
+{
+ for (int n = 0; ff_sws_op_backends[n]; n++) {
+ const SwsOpBackend *backend = ff_sws_op_backends[n];
+ if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0)
+ continue;
+
+ av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
+ "block size = %d, over-read = %d, over-write = %d\n",
+ backend->name, out->block_size, out->over_read, out->over_write);
+ return 0;
+ }
+
+ av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n");
+ ff_sws_op_list_print(ctx, AV_LOG_WARNING, ops);
+ return AVERROR(ENOTSUP);
+}
+
+int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst,
+ SwsPass *input, SwsPass **output)
+{
+ SwsContext *ctx = graph->ctx;
+ SwsOpPass *p = NULL;
+ const SwsOp *read = &ops->ops[0];
+ const SwsOp *write = &ops->ops[ops->num_ops - 1];
+ SwsPass *pass;
+ int ret;
+
+ if (ops->num_ops < 2) {
+ av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n");
+ return AVERROR(EINVAL);
+ }
+
+ if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) {
+ av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
+ "and write, respectively.\n");
+ return AVERROR(EINVAL);
+ }
+
+ if (flags & SWS_OP_FLAG_OPTIMIZE)
+ RET(ff_sws_op_list_optimize(ops));
+ else
+ ff_sws_op_list_update_comps(ops);
+
+ p = av_mallocz(sizeof(*p));
+ if (!p)
+ return AVERROR(ENOMEM);
+
+ p->exec_base = (SwsOpExec) {
+ .width = dst.width,
+ .height = dst.height,
+ .pixel_bits_in = rw_pixel_bits(read),
+ .pixel_bits_out = rw_pixel_bits(write),
+ };
+
+ ret = ff_sws_ops_compile(ctx, ops, &p->comp);
+ if (ret < 0)
+ goto fail;
+
+ pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height, input,
+ 1, p, op_pass_run);
+ if (!pass) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ pass->setup = op_pass_setup;
+ pass->free = op_pass_free;
+
+ *output = pass;
+ return 0;
+
+fail:
+ op_pass_free(p);
+ return ret;
+}
diff --git a/libswscale/ops.h b/libswscale/ops.h
new file mode 100644
index 0000000000..c9c5706cbf
--- /dev/null
+++ b/libswscale/ops.h
@@ -0,0 +1,265 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_OPS_H
+#define SWSCALE_OPS_H
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdalign.h>
+
+#include "graph.h"
+
+typedef enum SwsPixelType {
+ SWS_PIXEL_NONE = 0,
+ SWS_PIXEL_U8,
+ SWS_PIXEL_U16,
+ SWS_PIXEL_U32,
+ SWS_PIXEL_F32,
+ SWS_PIXEL_TYPE_NB
+} SwsPixelType;
+
+const char *ff_sws_pixel_type_name(SwsPixelType type);
+int ff_sws_pixel_type_size(SwsPixelType type) av_const;
+bool ff_sws_pixel_type_is_int(SwsPixelType type) av_const;
+SwsPixelType ff_sws_pixel_type_to_uint(SwsPixelType type) av_const;
+
+typedef enum SwsOpType {
+ SWS_OP_INVALID = 0,
+
+ /* Input/output handling */
+ SWS_OP_READ, /* gather raw pixels from planes */
+ SWS_OP_WRITE, /* write raw pixels to planes */
+ SWS_OP_SWAP_BYTES, /* swap byte order (for differing endianness) */
+ SWS_OP_UNPACK, /* split tightly packed data into components */
+ SWS_OP_PACK, /* compress components into tightly packed data */
+
+ /* Pixel manipulation */
+ SWS_OP_CLEAR, /* clear pixel values */
+ SWS_OP_LSHIFT, /* logical left shift of raw pixel values by (u8) */
+ SWS_OP_RSHIFT, /* right shift of raw pixel values by (u8) */
+ SWS_OP_SWIZZLE, /* rearrange channel order, or duplicate channels */
+ SWS_OP_CONVERT, /* convert (cast) between formats */
+ SWS_OP_DITHER, /* add dithering noise */
+
+ /* Arithmetic operations */
+ SWS_OP_LINEAR, /* generalized linear affine transform */
+ SWS_OP_SCALE, /* multiplication by scalar (q) */
+ SWS_OP_MIN, /* numeric minimum (q4) */
+ SWS_OP_MAX, /* numeric maximum (q4) */
+
+ SWS_OP_TYPE_NB,
+} SwsOpType;
+
+enum SwsCompFlags {
+ SWS_COMP_GARBAGE = 1 << 0, /* contents are undefined / garbage data */
+ SWS_COMP_EXACT = 1 << 1, /* value is an in-range, exact, integer */
+ SWS_COMP_ZERO = 1 << 2, /* known to be a constant zero */
+};
+
+typedef union SwsConst {
+ /* Generic constant value */
+ AVRational q;
+ AVRational q4[4];
+ unsigned u;
+} SwsConst;
+
+typedef struct SwsComps {
+ unsigned flags[4]; /* knowledge about (output) component contents */
+ bool unused[4]; /* which input components are definitely unused */
+
+ /* Keeps track of the known possible value range, or {0, 0} for undefined
+ * or (unknown range) floating point inputs */
+ AVRational min[4], max[4];
+} SwsComps;
+
+typedef struct SwsReadWriteOp {
+ /* Note: Unread pixel data is explicitly cleared to {0} for sanity */
+
+ int elems; /* number of elements (of type `op.type`) to read/write */
+ bool packed; /* read multiple elements from a single plane */
+ int frac; /* fractional pixel step factor (log2) */
+
+ /** Examples:
+ * rgba = 4x u8 packed
+ * yuv444p = 3x u8
+ * rgb565 = 1x u16 <- use SWS_OP_UNPACK to unpack
+ * monow = 1x u8 (frac 3)
+ * rgb4 = 1x u8 (frac 1)
+ */
+} SwsReadWriteOp;
+
+typedef struct SwsPackOp {
+ int pattern[4]; /* bit depth pattern, from MSB to LSB */
+} SwsPackOp;
+
+typedef struct SwsSwizzleOp {
+ /**
+ * Input component for each output component:
+ * Out[x] := In[swizzle.in[x]]
+ */
+ union {
+ uint32_t mask;
+ uint8_t in[4];
+ struct { uint8_t x, y, z, w; };
+ };
+} SwsSwizzleOp;
+
+#define SWS_SWIZZLE(X,Y,Z,W) ((SwsSwizzleOp) { .in = {X, Y, Z, W} })
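+/* e.g. BGRA -> RGBA channel swap: SWS_SWIZZLE(2, 1, 0, 3) */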
+
+typedef struct SwsConvertOp {
+ SwsPixelType to; /* type of pixel to convert to */
+ bool expand; /* if true, integers are expanded to the full range */
+} SwsConvertOp;
+
+typedef struct SwsDitherOp {
+ AVRational *matrix; /* tightly packed dither matrix (refstruct) */
+ int size_log2; /* size (in bits) of the dither matrix */
+} SwsDitherOp;
+
+typedef struct SwsLinearOp {
+ /**
+ * Generalized 5x5 affine transformation:
+ * [ Out.x ] = [ A B C D E ]
+ * [ Out.y ] = [ F G H I J ] * [ x y z w 1 ]
+ * [ Out.z ] = [ K L M N O ]
+ * [ Out.w ] = [ P Q R S T ]
+ *
+ * The mask keeps track of which components differ from an identity matrix.
+ * There may be more efficient implementations of particular subsets, for
+ * example the common subset of {A, E, G, J, M, O} can be implemented with
+ * just three fused multiply-add operations.
+ */
+ AVRational m[4][5];
+ uint32_t mask; /* m[i][j] <-> 1 << (5 * i + j) */
+} SwsLinearOp;
+
+#define SWS_MASK(I, J) (1 << (5 * (I) + (J)))
+#define SWS_MASK_OFF(I) SWS_MASK(I, 4)
+#define SWS_MASK_ROW(I) (0b11111 << (5 * (I)))
+#define SWS_MASK_COL(J) (0b1000010000100001 << (J))
+
+enum {
+ SWS_MASK_ALL = (1 << 20) - 1,
+ SWS_MASK_LUMA = SWS_MASK(0, 0) | SWS_MASK_OFF(0),
+ SWS_MASK_ALPHA = SWS_MASK(3, 3) | SWS_MASK_OFF(3),
+
+ SWS_MASK_DIAG3 = SWS_MASK(0, 0) | SWS_MASK(1, 1) | SWS_MASK(2, 2),
+ SWS_MASK_OFF3 = SWS_MASK_OFF(0) | SWS_MASK_OFF(1) | SWS_MASK_OFF(2),
+ SWS_MASK_MAT3 = SWS_MASK(0, 0) | SWS_MASK(0, 1) | SWS_MASK(0, 2) |
+ SWS_MASK(1, 0) | SWS_MASK(1, 1) | SWS_MASK(1, 2) |
+ SWS_MASK(2, 0) | SWS_MASK(2, 1) | SWS_MASK(2, 2),
+
+ SWS_MASK_DIAG4 = SWS_MASK_DIAG3 | SWS_MASK(3, 3),
+ SWS_MASK_OFF4 = SWS_MASK_OFF3 | SWS_MASK_OFF(3),
+ SWS_MASK_MAT4 = SWS_MASK_ALL & ~SWS_MASK_OFF4,
+};
+
+/* Helper function to compute the correct mask */
+uint32_t ff_sws_linear_mask(SwsLinearOp);
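+
+/* Example (illustrative): an RGB -> gray conversion using BT.709 weights
+ * only writes the first component, so only row 0 deviates from identity:
+ *
+ *   SwsLinearOp lin = {
+ *       .m[0] = { {2126, 10000}, {7152, 10000}, {722, 10000}, {0, 1}, {0, 1} },
+ *       .m[1] = { {0, 1}, {1, 1}, {0, 1}, {0, 1}, {0, 1} },
+ *       .m[2] = { {0, 1}, {0, 1}, {1, 1}, {0, 1}, {0, 1} },
+ *       .m[3] = { {0, 1}, {0, 1}, {0, 1}, {1, 1}, {0, 1} },
+ *   };
+ *   lin.mask = ff_sws_linear_mask(lin); // == 0b111, the "dot3" pattern
+ */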
+
+typedef struct SwsOp {
+ SwsOpType op; /* operation to perform */
+ SwsPixelType type; /* pixel type to operate on */
+ union {
+ SwsReadWriteOp rw;
+ SwsPackOp pack;
+ SwsSwizzleOp swizzle;
+ SwsConvertOp convert;
+ SwsDitherOp dither;
+ SwsLinearOp lin;
+ SwsConst c;
+ };
+
+ /* For internal use inside ff_sws_*() functions */
+ SwsComps comps;
+} SwsOp;
+
+/**
+ * Frees any allocations associated with an SwsOp and sets it to {0}.
+ */
+void ff_sws_op_uninit(SwsOp *op);
+
+/**
+ * Apply an operation to an AVRational. No-op for read/write operations.
+ */
+void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4]);
+
+/**
+ * Helper struct for representing a list of operations.
+ */
+typedef struct SwsOpList {
+ SwsOp *ops;
+ int num_ops;
+} SwsOpList;
+
+SwsOpList *ff_sws_op_list_alloc(void);
+void ff_sws_op_list_free(SwsOpList **ops);
+
+/**
+ * Returns a duplicate of `ops`, or NULL on OOM.
+ */
+SwsOpList *ff_sws_op_list_duplicate(const SwsOpList *ops);
+
+/**
+ * Returns the size of the largest pixel type used in `ops`.
+ */
+int ff_sws_op_list_max_size(const SwsOpList *ops);
+
+/**
+ * These will take over ownership of `op` and set it to {0}, even on failure.
+ */
+int ff_sws_op_list_append(SwsOpList *ops, SwsOp *op);
+int ff_sws_op_list_insert_at(SwsOpList *ops, int index, SwsOp *op);
+
+void ff_sws_op_list_remove_at(SwsOpList *ops, int index, int count);
+
+/**
+ * Print out the contents of an operation list.
+ */
+void ff_sws_op_list_print(void *log_ctx, int log_level, const SwsOpList *ops);
+
+/**
+ * Infer + propagate known information about components. Called automatically
+ * when needed by the optimizer and compiler.
+ */
+void ff_sws_op_list_update_comps(SwsOpList *ops);
+
+/**
+ * Fuse compatible operations, eliminate redundant ones, and replace some
+ * operations with more efficient alternatives.
+ */
+int ff_sws_op_list_optimize(SwsOpList *ops);
+
+enum SwsOpCompileFlags {
+ /* Automatically optimize the operations when compiling */
+ SWS_OP_FLAG_OPTIMIZE = 1 << 0,
+};
+
+/**
+ * Resolves an operation list to a graph pass. The first and last operations
+ * must be a read/write respectively. `flags` is a list of SwsOpCompileFlags.
+ *
+ * Note: `ops` may be modified by this function.
+ */
+int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst,
+ SwsPass *input, SwsPass **output);
+
+#endif
diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
new file mode 100644
index 0000000000..ac0319321e
--- /dev/null
+++ b/libswscale/ops_internal.h
@@ -0,0 +1,103 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_OPS_INTERNAL_H
+#define SWSCALE_OPS_INTERNAL_H
+
+#include "libavutil/mem_internal.h"
+
+#include "ops.h"
+
+/**
+ * Global execution context for all compiled functions.
+ *
+ * Note: This struct is hard-coded in assembly, so do not change the layout
+ * without updating the corresponding assembly definitions.
+ */
+typedef struct SwsOpExec {
+ /* The data pointers point to the first pixel to process */
+ const uint8_t *in[4];
+ uint8_t *out[4];
+
+ /* Separation between lines in bytes */
+ ptrdiff_t in_stride[4];
+ ptrdiff_t out_stride[4];
+
+ /* Extra metadata, may or may not be useful */
+ int32_t x, y; /* Starting pixel coordinates */
+ int32_t width, height; /* Overall image dimensions */
+ int32_t slice_y, slice_h; /* Start and height of current slice */
+ int32_t pixel_bits_in; /* Bits per input pixel */
+ int32_t pixel_bits_out; /* Bits per output pixel */
+} SwsOpExec;
+
+static_assert(sizeof(SwsOpExec) == 16 * sizeof(void *) + 8 * sizeof(int32_t),
+ "SwsOpExec layout mismatch");
+
+/* Process a given number of pixel blocks */
+typedef void (*SwsOpFunc)(const SwsOpExec *exec, const void *priv, int blocks);
+
+#define SWS_DECL_FUNC(NAME) \
+ void NAME(const SwsOpExec *, const void *, int)
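+
+/* Illustrative sketch only: a trivial kernel for a single packed u8 plane,
+ * assuming a block size of 16 pixels. Real kernels are generated by the
+ * backends added in subsequent commits.
+ *
+ *   static void copy_gray8(const SwsOpExec *exec, const void *priv,
+ *                          int blocks)
+ *   {
+ *       memcpy(exec->out[0], exec->in[0], 16 * blocks);
+ *   }
+ */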
+
+typedef struct SwsCompiledOp {
+ SwsOpFunc func;
+
+ int block_size; /* number of pixels processed per iteration */
+ int over_read; /* implementation over-reads input by this many bytes */
+ int over_write; /* implementation over-writes output by this many bytes */
+
+ /* Arbitrary private data */
+ void *priv;
+ void (*free)(void *priv);
+} SwsCompiledOp;
+
+typedef struct SwsOpBackend {
+ const char *name; /* Descriptive name for this backend */
+
+ /**
+ * Compile an operation list to an implementation chain. May modify `ops`
+ * freely; the original list will be freed automatically by the caller.
+ *
+ * Returns 0 or a negative error code.
+ */
+ int (*compile)(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out);
+} SwsOpBackend;
+
+/* List of all backends, terminated by NULL */
+extern const SwsOpBackend *const ff_sws_op_backends[];
+extern const int ff_sws_num_op_backends; /* excludes terminating NULL */
+
+/**
+ * Attempt to compile a list of operations using a specific backend.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
+ const SwsOpList *ops, SwsCompiledOp *out);
+
+/**
+ * Compile a list of operations using the best available backend.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out);
+
+#endif
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
new file mode 100644
index 0000000000..9f509085ba
--- /dev/null
+++ b/libswscale/ops_optimizer.c
@@ -0,0 +1,810 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/rational.h"
+
+#include "ops.h"
+#include "ops_internal.h"
+
+#define Q(N) ((AVRational) { N, 1 })
+
+#define RET(x) \
+ do { \
+ if ((ret = (x)) < 0) \
+ return ret; \
+ } while (0)
+
+/* Returns true for operations that are independent per channel. These can
+ * usually be commuted freely with other such operations. */
+static bool op_type_is_independent(SwsOpType op)
+{
+ switch (op) {
+ case SWS_OP_SWAP_BYTES:
+ case SWS_OP_LSHIFT:
+ case SWS_OP_RSHIFT:
+ case SWS_OP_CONVERT:
+ case SWS_OP_DITHER:
+ case SWS_OP_MIN:
+ case SWS_OP_MAX:
+ case SWS_OP_SCALE:
+ return true;
+ case SWS_OP_INVALID:
+ case SWS_OP_READ:
+ case SWS_OP_WRITE:
+ case SWS_OP_SWIZZLE:
+ case SWS_OP_CLEAR:
+ case SWS_OP_LINEAR:
+ case SWS_OP_PACK:
+ case SWS_OP_UNPACK:
+ return false;
+ case SWS_OP_TYPE_NB:
+ break;
+ }
+
+ av_assert0(!"Invalid operation type!");
+ return false;
+}
+
+static AVRational expand_factor(SwsPixelType from, SwsPixelType to)
+{
+ const int src = ff_sws_pixel_type_size(from);
+ const int dst = ff_sws_pixel_type_size(to);
+ int scale = 0;
+ for (int i = 0; i < dst / src; i++)
+ scale = scale << src * 8 | 1;
+ return Q(scale);
+}
+
+/* merge_comp_flags() forms a monoid with flags_identity as the identity element */
+static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
+static unsigned merge_comp_flags(unsigned a, unsigned b)
+{
+ const unsigned flags_or = SWS_COMP_GARBAGE;
+ const unsigned flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT;
+ return ((a & b) & flags_and) | ((a | b) & flags_or);
+}
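+
+/* e.g. merging ZERO|EXACT with EXACT yields EXACT (the "and" part), while
+ * merging anything with GARBAGE yields GARBAGE (the "or" part) */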
+
+/* Infer + propagate known information about components */
+void ff_sws_op_list_update_comps(SwsOpList *ops)
+{
+ SwsComps next = { .unused = {true, true, true, true} };
+ SwsComps prev = { .flags = {
+ SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE,
+ }};
+
+ /* Forwards pass, propagates knowledge about the incoming pixel values */
+ for (int n = 0; n < ops->num_ops; n++) {
+ SwsOp *op = &ops->ops[n];
+
+ /* Prefill min/max values automatically; may have to be fixed in
+ * special cases */
+ memcpy(op->comps.min, prev.min, sizeof(prev.min));
+ memcpy(op->comps.max, prev.max, sizeof(prev.max));
+ ff_sws_apply_op_q(op, op->comps.min);
+ ff_sws_apply_op_q(op, op->comps.max);
+
+ switch (op->op) {
+ case SWS_OP_READ:
+ for (int i = 0; i < op->rw.elems; i++) {
+ if (ff_sws_pixel_type_is_int(op->type)) {
+ const int size = ff_sws_pixel_type_size(op->type);
+ const uint64_t max_val = (1ULL << 8 * size) - 1;
+ op->comps.flags[i] |= SWS_COMP_EXACT;
+ op->comps.min[i] = Q(0);
+ op->comps.max[i] = Q(max_val);
+ }
+ }
+ for (int i = op->rw.elems; i < 4; i++)
+ op->comps.flags[i] |= prev.flags[i];
+ break;
+ case SWS_OP_WRITE:
+ for (int i = 0; i < op->rw.elems; i++)
+ av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE));
+ /* fall through */
+ case SWS_OP_SWAP_BYTES:
+ case SWS_OP_LSHIFT:
+ case SWS_OP_RSHIFT:
+ case SWS_OP_MIN:
+ case SWS_OP_MAX:
+ /* Linearly propagate flags per component */
+ for (int i = 0; i < 4; i++)
+ op->comps.flags[i] |= prev.flags[i];
+ break;
+ case SWS_OP_DITHER:
+ /* Strip zero flag because of the nonzero dithering offset */
+ for (int i = 0; i < 4; i++)
+ op->comps.flags[i] |= prev.flags[i] & ~SWS_COMP_ZERO;
+ break;
+ case SWS_OP_UNPACK:
+ for (int i = 0; i < 4; i++) {
+ if (op->pack.pattern[i])
+ op->comps.flags[i] |= prev.flags[0];
+ else
+ op->comps.flags[i] = SWS_COMP_GARBAGE;
+ }
+ break;
+ case SWS_OP_PACK: {
+ unsigned flags = flags_identity;
+ for (int i = 0; i < 4; i++) {
+ if (op->pack.pattern[i])
+ flags = merge_comp_flags(flags, prev.flags[i]);
+ if (i > 0) /* clear remaining comps for sanity */
+ op->comps.flags[i] = SWS_COMP_GARBAGE;
+ }
+ op->comps.flags[0] |= flags;
+ break;
+ }
+ case SWS_OP_CLEAR:
+ for (int i = 0; i < 4; i++) {
+ if (op->c.q4[i].den) {
+ if (op->c.q4[i].num == 0)
+ op->comps.flags[i] |= SWS_COMP_ZERO | SWS_COMP_EXACT;
+ if (op->c.q4[i].den == 1)
+ op->comps.flags[i] |= SWS_COMP_EXACT;
+ }
+ else
+ op->comps.flags[i] |= prev.flags[i];
+ }
+ break;
+ case SWS_OP_SWIZZLE:
+ for (int i = 0; i < 4; i++)
+ op->comps.flags[i] |= prev.flags[op->swizzle.in[i]];
+ break;
+ case SWS_OP_CONVERT:
+ for (int i = 0; i < 4; i++) {
+ op->comps.flags[i] |= prev.flags[i];
+ if (ff_sws_pixel_type_is_int(op->convert.to))
+ op->comps.flags[i] |= SWS_COMP_EXACT;
+ }
+ break;
+ case SWS_OP_LINEAR:
+ for (int i = 0; i < 4; i++) {
+ unsigned flags = flags_identity;
+ AVRational min = Q(0), max = Q(0);
+ for (int j = 0; j < 4; j++) {
+ const AVRational k = op->lin.m[i][j];
+ AVRational mink = av_mul_q(prev.min[j], k);
+ AVRational maxk = av_mul_q(prev.max[j], k);
+ if (k.num) {
+ flags = merge_comp_flags(flags, prev.flags[j]);
+ if (k.den != 1) /* fractional coefficient */
+ flags &= ~SWS_COMP_EXACT;
+ if (k.num < 0)
+ FFSWAP(AVRational, mink, maxk);
+ min = av_add_q(min, mink);
+ max = av_add_q(max, maxk);
+ }
+ }
+ if (op->lin.m[i][4].num) { /* nonzero offset */
+ flags &= ~SWS_COMP_ZERO;
+ if (op->lin.m[i][4].den != 1) /* fractional offset */
+ flags &= ~SWS_COMP_EXACT;
+ min = av_add_q(min, op->lin.m[i][4]);
+ max = av_add_q(max, op->lin.m[i][4]);
+ }
+ op->comps.flags[i] |= flags;
+ op->comps.min[i] = min;
+ op->comps.max[i] = max;
+ }
+ break;
+ case SWS_OP_SCALE:
+ for (int i = 0; i < 4; i++) {
+ op->comps.flags[i] |= prev.flags[i];
+ if (op->c.q.den != 1) /* fractional scale */
+ op->comps.flags[i] &= ~SWS_COMP_EXACT;
+ if (op->c.q.num < 0)
+ FFSWAP(AVRational, op->comps.min[i], op->comps.max[i]);
+ }
+ break;
+
+ case SWS_OP_INVALID:
+ case SWS_OP_TYPE_NB:
+ av_assert0(!"Invalid operation type!");
+ }
+
+ prev = op->comps;
+ }
+
+ /* Backwards pass, solves for component dependencies */
+ for (int n = ops->num_ops - 1; n >= 0; n--) {
+ SwsOp *op = &ops->ops[n];
+
+ switch (op->op) {
+ case SWS_OP_READ:
+ case SWS_OP_WRITE:
+ for (int i = 0; i < op->rw.elems; i++)
+ op->comps.unused[i] = op->op == SWS_OP_READ;
+ for (int i = op->rw.elems; i < 4; i++)
+ op->comps.unused[i] |= next.unused[i];
+ break;
+ case SWS_OP_SWAP_BYTES:
+ case SWS_OP_LSHIFT:
+ case SWS_OP_RSHIFT:
+ case SWS_OP_CONVERT:
+ case SWS_OP_DITHER:
+ case SWS_OP_MIN:
+ case SWS_OP_MAX:
+ case SWS_OP_SCALE:
+ for (int i = 0; i < 4; i++)
+ op->comps.unused[i] |= next.unused[i];
+ break;
+ case SWS_OP_UNPACK: {
+ bool unused = true;
+ for (int i = 0; i < 4; i++) {
+ if (op->pack.pattern[i])
+ unused &= next.unused[i];
+ op->comps.unused[i] |= i > 0;
+ }
+ op->comps.unused[0] = unused;
+ break;
+ }
+ case SWS_OP_PACK:
+ for (int i = 0; i < 4; i++) {
+ if (op->pack.pattern[i])
+ op->comps.unused[i] |= next.unused[0];
+ else
+ op->comps.unused[i] = true;
+ }
+ break;
+ case SWS_OP_CLEAR:
+ for (int i = 0; i < 4; i++) {
+ if (op->c.q4[i].den)
+ op->comps.unused[i] = true;
+ else
+ op->comps.unused[i] |= next.unused[i];
+ }
+ break;
+ case SWS_OP_SWIZZLE: {
+ bool unused[4] = { true, true, true, true };
+ for (int i = 0; i < 4; i++)
+ unused[op->swizzle.in[i]] &= next.unused[i];
+ for (int i = 0; i < 4; i++)
+ op->comps.unused[i] = unused[i];
+ break;
+ }
+ case SWS_OP_LINEAR:
+ for (int j = 0; j < 4; j++) {
+ bool unused = true;
+ for (int i = 0; i < 4; i++) {
+ if (op->lin.m[i][j].num)
+ unused &= next.unused[i];
+ }
+ op->comps.unused[j] = unused;
+ }
+ break;
+ }
+
+ next = op->comps;
+ }
+}
+
+/* returns log2(x) only if x is a power of two, or 0 otherwise */
+static int exact_log2(const int x)
+{
+ int p;
+ if (x <= 0)
+ return 0;
+ p = av_log2(x);
+ return (1 << p) == x ? p : 0;
+}
+
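+/* e.g. exact_log2_q(Q(8)) == 3, exact_log2_q((AVRational) {1, 8}) == -3,
+ * and exact_log2_q(Q(3)) == 0, since 3 is not a power of two */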
+static int exact_log2_q(const AVRational x)
+{
+ if (x.den == 1)
+ return exact_log2(x.num);
+ else if (x.num == 1)
+ return -exact_log2(x.den);
+ else
+ return 0;
+}
+
+/**
+ * If a linear operation can be reduced to a scalar multiplication, returns
+ * true and writes the corresponding scaling factor to `out_scale`.
+ */
+static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next,
+ SwsConst *out_scale)
+{
+ SwsConst scale = {0};
+
+ /* There are components not on the main diagonal */
+ if (c->mask & ~SWS_MASK_DIAG4)
+ return false;
+
+ for (int i = 0; i < 4; i++) {
+ const AVRational s = c->m[i][i];
+ if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i])
+ continue;
+ if (scale.q.den && av_cmp_q(s, scale.q))
+ return false;
+ scale.q = s;
+ }
+
+ if (scale.q.den)
+ *out_scale = scale;
+ return scale.q.den;
+}
+
+/* Extracts an integer clear operation (subset) from the given linear op. */
+static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev,
+ SwsConst *out_clear)
+{
+ SwsConst clear = {0};
+ bool ret = false;
+
+ for (int i = 0; i < 4; i++) {
+ bool const_row = c->m[i][4].den == 1; /* offset is integer */
+ for (int j = 0; j < 4; j++) {
+ const_row &= c->m[i][j].num == 0 || /* scalar is zero */
+ (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */
+ }
+ if (const_row && (c->mask & SWS_MASK_ROW(i))) {
+ clear.q4[i] = c->m[i][4];
+ for (int j = 0; j < 5; j++)
+ c->m[i][j] = Q(i == j);
+ c->mask &= ~SWS_MASK_ROW(i);
+ ret = true;
+ }
+ }
+
+ if (ret)
+ *out_clear = clear;
+ return ret;
+}
+
+/* Unswizzle a linear operation by aligning single-input rows with
+ * their corresponding diagonal */
+static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz)
+{
+ SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
+ SwsLinearOp c = *op;
+
+ for (int i = 0; i < 4; i++) {
+ int idx = -1;
+ for (int j = 0; j < 4; j++) {
+ if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO))
+ continue;
+ if (idx >= 0)
+ return false; /* multiple inputs */
+ idx = j;
+ }
+
+ if (idx >= 0 && idx != i) {
+ /* Move coefficient to the diagonal */
+ c.m[i][i] = c.m[i][idx];
+ c.m[i][idx] = Q(0);
+ swiz.in[i] = idx;
+ }
+ }
+
+ if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask)
+ return false; /* no swizzle was identified */
+
+ c.mask = ff_sws_linear_mask(c);
+ *out_swiz = swiz;
+ *op = c;
+ return true;
+}
+
+static void op_copy_flags(SwsOp *op, const SwsOp *op2)
+{
+ for (int i = 0; i < 4; i++)
+ op->comps.flags[i] = op2->comps.flags[i];
+}
+
+/* Should only be used on ops that commute with each other, and only after
+ * applying the necessary adjustments
+ */
+static void swap_ops(SwsOp *op, SwsOp *next)
+{
+ /* Clear all inferred flags */
+ op->comps = next->comps = (SwsComps) {0};
+ FFSWAP(SwsOp, *op, *next);
+}
+
+int ff_sws_op_list_optimize(SwsOpList *ops)
+{
+ int prev_num_ops, ret;
+ bool progress;
+
+ do {
+ prev_num_ops = ops->num_ops;
+ progress = false;
+
+ ff_sws_op_list_update_comps(ops);
+
+ for (int n = 0; n < ops->num_ops;) {
+ SwsOp dummy = {0};
+ SwsOp *op = &ops->ops[n];
+ SwsOp *prev = n ? &ops->ops[n - 1] : &dummy;
+ SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy;
+
+ /* common helper variables */
+ bool changed = false;
+ bool noop = true;
+
+ switch (op->op) {
+ case SWS_OP_READ:
+ /* Optimized further into refcopy / memcpy */
+ if (next->op == SWS_OP_WRITE &&
+ next->rw.elems == op->rw.elems &&
+ next->rw.packed == op->rw.packed &&
+ next->rw.frac == op->rw.frac)
+ {
+ ff_sws_op_list_remove_at(ops, n, 2);
+ av_assert1(ops->num_ops == 0);
+ return 0;
+ }
+
+ /* Skip reading extra unneeded components */
+ if (!op->rw.packed) {
+ int needed = op->rw.elems;
+ while (needed > 0 && next->comps.unused[needed - 1])
+ needed--;
+ if (op->rw.elems != needed) {
+ op->rw.elems = needed;
+ op->rw.packed &= op->rw.elems > 1;
+ progress = true;
+ continue;
+ }
+ }
+ break;
+
+ case SWS_OP_SWAP_BYTES:
+ /* Redundant (double) swap */
+ if (next->op == SWS_OP_SWAP_BYTES) {
+ ff_sws_op_list_remove_at(ops, n, 2);
+ continue;
+ }
+ break;
+
+ case SWS_OP_UNPACK:
+ /* Redundant unpack+pack */
+ if (next->op == SWS_OP_PACK && next->type == op->type &&
+ next->pack.pattern[0] == op->pack.pattern[0] &&
+ next->pack.pattern[1] == op->pack.pattern[1] &&
+ next->pack.pattern[2] == op->pack.pattern[2] &&
+ next->pack.pattern[3] == op->pack.pattern[3])
+ {
+ ff_sws_op_list_remove_at(ops, n, 2);
+ continue;
+ }
+
+ /* Skip unpacking components that are not used */
+ for (int i = 3; i > 0 && next->comps.unused[i]; i--)
+ op->pack.pattern[i] = 0;
+ break;
+
+ case SWS_OP_PACK:
+ /* Skip packing known-to-be-zero components */
+ for (int i = 3; i > 0; i--) {
+ if (!(prev->comps.flags[i] & SWS_COMP_ZERO))
+ break;
+ op->pack.pattern[i] = 0;
+ }
+ break;
+
+ case SWS_OP_LSHIFT:
+ case SWS_OP_RSHIFT:
+ /* Two shifts in the same direction */
+ if (next->op == op->op) {
+ op->c.u += next->c.u;
+ ff_sws_op_list_remove_at(ops, n + 1, 1);
+ continue;
+ }
+
+ /* No-op shift */
+ if (!op->c.u) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+ break;
+
+ case SWS_OP_CLEAR:
+ for (int i = 0; i < 4; i++) {
+ if (!op->c.q4[i].den)
+ continue;
+
+ if ((prev->comps.flags[i] & SWS_COMP_ZERO) &&
+ !(prev->comps.flags[i] & SWS_COMP_GARBAGE) &&
+ op->c.q4[i].num == 0)
+ {
+ /* Redundant clear-to-zero of zero component */
+ op->c.q4[i].den = 0;
+ } else if (next->comps.unused[i]) {
+ /* Unnecessary clear of unused component */
+ op->c.q4[i] = (AVRational) {0, 0};
+ } else if (op->c.q4[i].den) {
+ noop = false;
+ }
+ }
+
+ if (noop) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+
+ /* Transitive clear */
+ if (next->op == SWS_OP_CLEAR) {
+ for (int i = 0; i < 4; i++) {
+ if (next->c.q4[i].den)
+ op->c.q4[i] = next->c.q4[i];
+ }
+ ff_sws_op_list_remove_at(ops, n + 1, 1);
+ continue;
+ }
+
+ /* Prefer to clear as late as possible, to avoid doing
+ * redundant work */
+ if ((op_type_is_independent(next->op) && next->op != SWS_OP_SWAP_BYTES) ||
+ next->op == SWS_OP_SWIZZLE)
+ {
+ if (next->op == SWS_OP_CONVERT)
+ op->type = next->convert.to;
+ ff_sws_apply_op_q(next, op->c.q4);
+ swap_ops(op, next);
+ progress = true;
+ continue;
+ }
+ break;
+
+ case SWS_OP_SWIZZLE: {
+ bool seen[4] = {0};
+ bool has_duplicates = false;
+ for (int i = 0; i < 4; i++) {
+ if (next->comps.unused[i])
+ continue;
+ if (op->swizzle.in[i] != i)
+ noop = false;
+ has_duplicates |= seen[op->swizzle.in[i]];
+ seen[op->swizzle.in[i]] = true;
+ }
+
+ /* Identity swizzle */
+ if (noop) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+
+ /* Transitive swizzle */
+ if (next->op == SWS_OP_SWIZZLE) {
+ const SwsSwizzleOp orig = op->swizzle;
+ for (int i = 0; i < 4; i++)
+ op->swizzle.in[i] = orig.in[next->swizzle.in[i]];
+ op_copy_flags(op, next);
+ ff_sws_op_list_remove_at(ops, n + 1, 1);
+ continue;
+ }
+
+ /* Try to push swizzles with duplicates towards the output */
+ if (has_duplicates && op_type_is_independent(next->op)) {
+ if (next->op == SWS_OP_CONVERT)
+ op->type = next->convert.to;
+ if (next->op == SWS_OP_MIN || next->op == SWS_OP_MAX) {
+ /* Un-swizzle the next operation */
+ const SwsConst c = next->c;
+ for (int i = 0; i < 4; i++) {
+ if (!next->comps.unused[i])
+ next->c.q4[op->swizzle.in[i]] = c.q4[i];
+ }
+ }
+ swap_ops(op, next);
+ progress = true;
+ continue;
+ }
+ break;
+ }
+
+ case SWS_OP_CONVERT:
+ /* No-op conversion */
+ if (op->type == op->convert.to) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+
+ /* Transitive conversion */
+ if (next->op == SWS_OP_CONVERT &&
+ op->convert.expand == next->convert.expand)
+ {
+ av_assert1(op->convert.to == next->type);
+ op->convert.to = next->convert.to;
+ op_copy_flags(op, next);
+ ff_sws_op_list_remove_at(ops, n + 1, 1);
+ continue;
+ }
+
+ /* Conversion followed by integer expansion */
+ if (next->op == SWS_OP_SCALE &&
+ !av_cmp_q(next->c.q, expand_factor(op->type, op->convert.to)))
+ {
+ op->convert.expand = true;
+ ff_sws_op_list_remove_at(ops, n + 1, 1);
+ continue;
+ }
+ break;
+
+ case SWS_OP_MIN:
+ for (int i = 0; i < 4; i++) {
+ if (next->comps.unused[i] || !op->c.q4[i].den)
+ continue;
+ if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0)
+ noop = false;
+ }
+
+ if (noop) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+ break;
+
+ case SWS_OP_MAX:
+ for (int i = 0; i < 4; i++) {
+ if (next->comps.unused[i] || !op->c.q4[i].den)
+ continue;
+ if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0)
+ noop = false;
+ }
+
+ if (noop) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+ break;
+
+ case SWS_OP_DITHER:
+ for (int i = 0; i < 4; i++) {
+ noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) ||
+ next->comps.unused[i];
+ }
+
+ if (noop) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+ break;
+
+ case SWS_OP_LINEAR: {
+ SwsSwizzleOp swizzle;
+ SwsConst c;
+
+ /* No-op (identity) linear operation */
+ if (!op->lin.mask) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+
+ if (next->op == SWS_OP_LINEAR) {
+ /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */
+ const SwsLinearOp m1 = op->lin;
+ const SwsLinearOp m2 = next->lin;
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 5; j++) {
+ AVRational sum = Q(0);
+ for (int k = 0; k < 4; k++)
+ sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j]));
+ if (j == 4) /* m1.m[4][j] == 1 */
+ sum = av_add_q(sum, m2.m[i][4]);
+ op->lin.m[i][j] = sum;
+ }
+ }
+ op_copy_flags(op, next);
+ op->lin.mask = ff_sws_linear_mask(op->lin);
+ ff_sws_op_list_remove_at(ops, n + 1, 1);
+ continue;
+ }
+
+ /* Optimize away zero columns */
+ for (int j = 0; j < 4; j++) {
+ const uint32_t col = SWS_MASK_COL(j);
+ if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col))
+ continue;
+ for (int i = 0; i < 4; i++)
+ op->lin.m[i][j] = Q(i == j);
+ op->lin.mask &= ~col;
+ changed = true;
+ }
+
+ /* Optimize away unused rows */
+ for (int i = 0; i < 4; i++) {
+ const uint32_t row = SWS_MASK_ROW(i);
+ if (!next->comps.unused[i] || !(op->lin.mask & row))
+ continue;
+ for (int j = 0; j < 5; j++)
+ op->lin.m[i][j] = Q(i == j);
+ op->lin.mask &= ~row;
+ changed = true;
+ }
+
+ if (changed) {
+ progress = true;
+ continue;
+ }
+
+ /* Convert constant rows to explicit clear instruction */
+ if (extract_constant_rows(&op->lin, prev->comps, &c)) {
+ RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
+ .op = SWS_OP_CLEAR,
+ .type = op->type,
+ .comps = op->comps,
+ .c = c,
+ }));
+ continue;
+ }
+
+ /* Multiplication by scalar constant */
+ if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) {
+ op->op = SWS_OP_SCALE;
+ op->c = c;
+ progress = true;
+ continue;
+ }
+
+ /* Swizzle by fixed pattern */
+ if (extract_swizzle(&op->lin, prev->comps, &swizzle)) {
+ RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) {
+ .op = SWS_OP_SWIZZLE,
+ .type = op->type,
+ .swizzle = swizzle,
+ }));
+ continue;
+ }
+ break;
+ }
+
+ case SWS_OP_SCALE: {
+ const int factor2 = exact_log2_q(op->c.q);
+
+ /* No-op scaling */
+ if (op->c.q.num == 1 && op->c.q.den == 1) {
+ ff_sws_op_list_remove_at(ops, n, 1);
+ continue;
+ }
+
+ /* Scaling by integer before conversion to int */
+ if (op->c.q.den == 1 &&
+ next->op == SWS_OP_CONVERT &&
+ ff_sws_pixel_type_is_int(next->convert.to))
+ {
+ op->type = next->convert.to;
+ swap_ops(op, next);
+ progress = true;
+ continue;
+ }
+
+ /* Scaling by exact power of two */
+ if (factor2 && ff_sws_pixel_type_is_int(op->type)) {
+ op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT;
+ op->c.u = FFABS(factor2);
+ progress = true;
+ continue;
+ }
+ break;
+ }
+ }
+
+ /* No optimization triggered, move on to next operation */
+ n++;
+ }
+ } while (prev_num_ops != ops->num_ops || progress);
+
+ return 0;
+}
--
2.49.0