[FFmpeg-devel] [PATCH 10/17] swscale/optimizer: add packed shuffle solver

Sun May 18 17:59:46 EEST 2025

From: Niklas Haas <git at haasn.dev>

This can turn any compatible sequence of operations into a single packed
shuffle, including packed swizzling, grayscale->RGB conversion, endianness
swapping, RGB bit depth conversions, rgb24->rgb0 alpha clearing and more.
---
 libswscale/ops_internal.h  | 17 +++++++
 libswscale/ops_optimizer.c | 93 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)

diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index 9fd866430b..ab957b0837 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -105,4 +105,21 @@ int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
  */
 int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out);
 
+/**
+ * "Solve" an op list into a fixed shuffle mask, with an optional ability to
+ * also directly clear the output value (for e.g. rgb24 -> rgb0).
+ *
+ * @param ops         The operation list to decompose.
+ * @param shuffle     The output shuffle mask.
+ * @param size        The size (in bytes) of the output shuffle mask.
+ * @param clear_val   If nonzero, this index will be used to clear the output.
+ * @param read_bytes  Returns the number of bytes read per shuffle iteration.
+ * @param write_bytes Returns the number of bytes written per shuffle iteration.
+ *
+ * @return  The number of pixels processed per iteration, or a negative error
+            code; in particular AVERROR(ENOTSUP) for unsupported operations.
+ */
+int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size,
+                         uint8_t clear_val, int *read_bytes, int *write_bytes);
+
 #endif
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 829e691b3f..17c15859c7 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -19,9 +19,11 @@
  */
 
 #include "libavutil/avassert.h"
+#include <libavutil/bswap.h>
 #include "libavutil/rational.h"
 
 #include "ops.h"
+#include "ops_internal.h"
 
 #define Q(N) ((AVRational) { N, 1 })
 
@@ -779,3 +781,94 @@ retry:
 
     return 0;
 }
+
+int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
+                         int shuffle_size, uint8_t clear_val,
+                         int *out_read_bytes, int *out_write_bytes)
+{
+    const SwsOp read = ops->ops[0];
+    const int read_size = ff_sws_pixel_type_size(read.type);
+    uint32_t mask[4] = {0};
+
+    if (!ops->num_ops || read.op != SWS_OP_READ)
+        return AVERROR(EINVAL);
+    if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
+        return AVERROR(ENOTSUP);
+
+    for (int i = 0; i < read.rw.elems; i++)
+        mask[i] = 0x01010101 * i * read_size + 0x03020100;
+
+    for (int opidx = 1; opidx < ops->num_ops; opidx++) {
+        const SwsOp *op = &ops->ops[opidx];
+        switch (op->op) {
+        case SWS_OP_SWIZZLE: {
+            uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
+            for (int i = 0; i < 4; i++)
+                mask[i] = orig[op->swizzle.in[i]];
+            break;
+        }
+
+        case SWS_OP_SWAP_BYTES:
+            for (int i = 0; i < 4; i++) {
+                switch (ff_sws_pixel_type_size(op->type)) {
+                case 2: mask[i] = av_bswap16(mask[i]); break;
+                case 4: mask[i] = av_bswap32(mask[i]); break;
+                }
+            }
+            break;
+
+        case SWS_OP_CLEAR:
+            for (int i = 0; i < 4; i++) {
+                if (!op->c.q4[i].den)
+                    continue;
+                if (op->c.q4[i].num != 0 || !clear_val)
+                    return AVERROR(ENOTSUP);
+                mask[i] = 0x1010101ul * clear_val;
+            }
+            break;
+
+        case SWS_OP_CONVERT: {
+            if (!op->convert.expand)
+                return AVERROR(ENOTSUP);
+            for (int i = 0; i < 4; i++) {
+                switch (ff_sws_pixel_type_size(op->type)) {
+                case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF);   break;
+                case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
+                }
+            }
+            break;
+        }
+
+        case SWS_OP_WRITE: {
+            if (op->rw.frac || !op->rw.packed)
+                return AVERROR(ENOTSUP);
+
+            /* Initialize to no-op */
+            memset(shuffle, clear_val, shuffle_size);
+
+            const int write_size  = ff_sws_pixel_type_size(op->type);
+            const int read_chunk  = read.rw.elems * read_size;
+            const int write_chunk = op->rw.elems * write_size;
+            const int num_groups  = shuffle_size / FFMAX(read_chunk, write_chunk);
+            for (int n = 0; n < num_groups; n++) {
+                const int base_in  = n * read_chunk;
+                const int base_out = n * write_chunk;
+                for (int i = 0; i < op->rw.elems; i++) {
+                    const int offset = base_out + i * write_size;
+                    for (int b = 0; b < write_size; b++)
+                        shuffle[offset + b] = base_in + (mask[i] >> (b * 8));
+                }
+            }
+
+            *out_read_bytes  = num_groups * read_chunk;
+            *out_write_bytes = num_groups * write_chunk;
+            return num_groups;
+        }
+
+        default:
+            return AVERROR(ENOTSUP);
+        }
+    }
+
+    return AVERROR(EINVAL);
+}
-- 
2.49.0