[FFmpeg-devel] [PATCH 10/17] swscale/optimizer: add packed shuffle solver
Niklas Haas
ffmpeg at haasn.xyz
Sun May 18 17:59:46 EEST 2025
From: Niklas Haas <git at haasn.dev>
This can turn any compatible sequence of operations into a single packed
shuffle, including packed swizzling, grayscale->RGB conversion, endianness
swapping, RGB bit depth conversions, rgb24->rgb0 alpha clearing and more.
---
libswscale/ops_internal.h | 17 +++++++
libswscale/ops_optimizer.c | 93 ++++++++++++++++++++++++++++++++++++++
2 files changed, 110 insertions(+)
diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index 9fd866430b..ab957b0837 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -105,4 +105,21 @@ int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
*/
int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out);
+/**
+ * "Solve" an op list into a fixed shuffle mask, with an optional ability to
+ * also directly clear the output value (for e.g. rgb24 -> rgb0).
+ *
+ * @param ops The operation list to decompose; must begin with a read op.
+ * @param shuffle The output shuffle mask.
+ * @param size The size (in bytes) of the output shuffle mask.
+ * @param clear_val Shuffle index that clears an output byte; 0 = unsupported.
+ * @param read_bytes Returns the number of bytes read per shuffle iteration.
+ * @param write_bytes Returns the number of bytes written per shuffle iteration.
+ *
+ * @return The number of pixels processed per iteration, or a negative error
+ *         code; in particular AVERROR(ENOTSUP) for unsupported operations.
+ */
+int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size,
+ uint8_t clear_val, int *read_bytes, int *write_bytes);
+
#endif
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 829e691b3f..17c15859c7 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -19,9 +19,11 @@
*/
#include "libavutil/avassert.h"
+#include <libavutil/bswap.h>
#include "libavutil/rational.h"
#include "ops.h"
+#include "ops_internal.h"
#define Q(N) ((AVRational) { N, 1 })
@@ -779,3 +781,94 @@ retry:
return 0;
}
+
+/**
+ * Reduce an op list to a single byte shuffle mask.
+ *
+ * The list must start with a read op and is processed up to its first write
+ * op. Only swizzle, byte swap, clear-to-zero, expanding convert and packed
+ * write ops are understood; anything else yields AVERROR(ENOTSUP).
+ *
+ * @param ops             Operation list to solve.
+ * @param shuffle         Receives the shuffle mask; each byte is the index of
+ *                        the input byte to copy, or clear_val.
+ * @param shuffle_size    Size of shuffle in bytes.
+ * @param clear_val       Index value emitted for cleared bytes and for bytes
+ *                        not covered by any group. If zero, a clear op in the
+ *                        list is rejected with AVERROR(ENOTSUP).
+ * @param out_read_bytes  Set to the number of input bytes the mask consumes.
+ * @param out_write_bytes Set to the number of output bytes the mask produces.
+ *
+ * @return The number of read/write groups packed into the mask, or a negative
+ *         error code: AVERROR(EINVAL) if the list is empty, does not start
+ *         with a read or contains no write; AVERROR(ENOTSUP) for operations
+ *         that cannot be expressed as a shuffle.
+ */
+int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
+                         int shuffle_size, uint8_t clear_val,
+                         int *out_read_bytes, int *out_write_bytes)
+{
+    /* Validate the length before touching the first element */
+    if (!ops->num_ops)
+        return AVERROR(EINVAL);
+
+    const SwsOp read = ops->ops[0];
+    if (read.op != SWS_OP_READ)
+        return AVERROR(EINVAL);
+    if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
+        return AVERROR(ENOTSUP);
+
+    const int read_size = ff_sws_pixel_type_size(read.type);
+
+    /* mask[i] packs one source byte offset per output byte of component i,
+     * lowest byte first. cleared[i] marks components replaced by a clear op;
+     * it is kept apart from the mask so the clear index is never mistaken
+     * for (or offset like) a real source index. */
+    uint32_t mask[4] = {0};
+    uint8_t cleared[4] = {0};
+
+    /* Component i of a packed pixel starts i * read_size bytes in */
+    for (int i = 0; i < read.rw.elems; i++)
+        mask[i] = 0x01010101u * (i * read_size) + 0x03020100u;
+
+    for (int opidx = 1; opidx < ops->num_ops; opidx++) {
+        const SwsOp *op = &ops->ops[opidx];
+        switch (op->op) {
+        case SWS_OP_SWIZZLE: {
+            const uint32_t orig_mask[4] = { mask[0], mask[1], mask[2], mask[3] };
+            const uint8_t orig_cleared[4] = {
+                cleared[0], cleared[1], cleared[2], cleared[3]
+            };
+            for (int i = 0; i < 4; i++) {
+                mask[i]    = orig_mask[op->swizzle.in[i]];
+                cleared[i] = orig_cleared[op->swizzle.in[i]];
+            }
+            break;
+        }
+
+        case SWS_OP_SWAP_BYTES: {
+            /* Swapping the offsets inside the mask swaps the bytes fetched;
+             * other element sizes leave the mask untouched */
+            const int size = ff_sws_pixel_type_size(op->type);
+            for (int i = 0; i < 4; i++) {
+                switch (size) {
+                case 2: mask[i] = av_bswap16(mask[i]); break;
+                case 4: mask[i] = av_bswap32(mask[i]); break;
+                }
+            }
+            break;
+        }
+
+        case SWS_OP_CLEAR:
+            for (int i = 0; i < 4; i++) {
+                if (!op->c.q4[i].den)
+                    continue; /* zero denominator: component is skipped */
+                /* Only clearing to zero, and only with a usable index */
+                if (op->c.q4[i].num != 0 || !clear_val)
+                    return AVERROR(ENOTSUP);
+                mask[i]    = 0x01010101u * clear_val;
+                cleared[i] = 1;
+            }
+            break;
+
+        case SWS_OP_CONVERT: {
+            if (!op->convert.expand)
+                return AVERROR(ENOTSUP);
+            /* Replicate the source offsets across the wider element */
+            const int size = ff_sws_pixel_type_size(op->type);
+            for (int i = 0; i < 4; i++) {
+                switch (size) {
+                case 1: mask[i] = 0x01010101u * (mask[i] & 0xFF);   break;
+                case 2: mask[i] = 0x00010001u * (mask[i] & 0xFFFF); break;
+                }
+            }
+            break;
+        }
+
+        case SWS_OP_WRITE: {
+            if (op->rw.frac || !op->rw.packed)
+                return AVERROR(ENOTSUP);
+
+            /* Every byte not assigned below keeps the clear index */
+            memset(shuffle, clear_val, shuffle_size);
+
+            const int write_size  = ff_sws_pixel_type_size(op->type);
+            const int read_chunk  = read.rw.elems * read_size;
+            const int write_chunk = op->rw.elems * write_size;
+            const int num_groups  = shuffle_size / FFMAX(read_chunk, write_chunk);
+            for (int n = 0; n < num_groups; n++) {
+                const int base_in  = n * read_chunk;
+                const int base_out = n * write_chunk;
+                for (int i = 0; i < op->rw.elems; i++) {
+                    /* A cleared component must stay exactly clear_val in
+                     * every group, so it is not offset by base_in */
+                    if (cleared[i])
+                        continue;
+                    const int offset = base_out + i * write_size;
+                    for (int b = 0; b < write_size; b++) {
+                        const uint8_t idx = (mask[i] >> (b * 8)) & 0xFF;
+                        shuffle[offset + b] = (uint8_t) (base_in + idx);
+                    }
+                }
+            }
+
+            *out_read_bytes  = num_groups * read_chunk;
+            *out_write_bytes = num_groups * write_chunk;
+            return num_groups;
+        }
+
+        default:
+            return AVERROR(ENOTSUP);
+        }
+    }
+
+    /* Ran off the end of the list without encountering a write op */
+    return AVERROR(EINVAL);
+}
--
2.49.0
More information about the ffmpeg-devel
mailing list