[FFmpeg-devel] [PATCH 11/17] swscale/x86: add SIMD backend
Niklas Haas
ffmpeg at haasn.xyz
Sat Apr 26 20:41:15 EEST 2025
From: Niklas Haas <git at haasn.dev>
This covers most 8-bit and 16-bit ops, and some 32-bit ops. It also covers all
floating-point operations. While this is not yet 100% coverage, it's good
enough for the vast majority of formats out there.

Of special note is the packed shuffle solver, which can reduce any compatible
series of operations down to a single pshufb loop. This handles not only any
sort of packed swizzle, but also e.g. grayscale to packed RGB expansion, RGB
bit depth conversions, endianness swapping and so on.
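
For illustration (this snippet is not part of the patch; the helper name and
mask values are made up), the solver's output is conceptually just a 16-byte
per-lane index table with pshufb semantics:

    #include <stdint.h>

    /* Scalar model of one 16-byte pshufb lane: an index byte with the high
     * bit set writes zero, otherwise its low 4 bits select a source byte. */
    static void apply_shuffle16(const uint8_t mask[16], const uint8_t *src,
                                uint8_t *dst)
    {
        for (int i = 0; i < 16; i++)
            dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
    }

    /* Example mask: reorder 5 packed RGB24 pixels (15 bytes) into BGR24. */
    static const uint8_t rgb24_to_bgr24[16] = {
        2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 0x80
    };

The real solver builds such masks directly from the op list (see solve_shuffle
in libswscale/x86/ops.c below) and dispatches to fixed-size asm shuffle loops.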
---
libswscale/ops.c | 4 +
libswscale/x86/Makefile | 3 +
libswscale/x86/ops.c | 735 ++++++++++++++++++++++++++++
libswscale/x86/ops_common.asm | 208 ++++++++
libswscale/x86/ops_float.asm | 376 +++++++++++++++
libswscale/x86/ops_int.asm | 882 ++++++++++++++++++++++++++++++++++
6 files changed, 2208 insertions(+)
create mode 100644 libswscale/x86/ops.c
create mode 100644 libswscale/x86/ops_common.asm
create mode 100644 libswscale/x86/ops_float.asm
create mode 100644 libswscale/x86/ops_int.asm
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 9600e3c9df..e408d7ca42 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -27,9 +27,13 @@
#include "ops.h"
#include "ops_internal.h"
+extern SwsOpBackend backend_x86;
extern SwsOpBackend backend_c;
const SwsOpBackend * const ff_sws_op_backends[] = {
+#if ARCH_X86
+ &backend_x86,
+#endif
&backend_c,
NULL
};
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index f00154941d..a04bc8336f 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -10,6 +10,9 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
X86ASM-OBJS += x86/input.o \
x86/output.o \
+ x86/ops_int.o \
+ x86/ops_float.o \
+ x86/ops.o \
x86/scale.o \
x86/scale_avx2.o \
x86/range_convert.o \
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
new file mode 100644
index 0000000000..d37edb72f1
--- /dev/null
+++ b/libswscale/x86/ops.c
@@ -0,0 +1,735 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <float.h>
+
+#include <libavutil/avassert.h>
+#include <libavutil/bswap.h>
+#include <libavutil/mem.h>
+
+#include "../ops_chain.h"
+
+#define DECL_ENTRY(TYPE, NAME, ...) \
+ static const SwsOpEntry op_##NAME = { \
+ .op.type = SWS_PIXEL_##TYPE, \
+ __VA_ARGS__ \
+ }
+
+#define DECL_ASM(TYPE, NAME, ...) \
+ void ff_##NAME(void); \
+ DECL_ENTRY(TYPE, NAME, \
+ .func = ff_##NAME, \
+ __VA_ARGS__)
+
+#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
+ DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME, \
+ .op.comps.unused = { !X, !Y, !Z, !W }, \
+ __VA_ARGS__ \
+ )
+
+#define REF_PATTERN(NAME, X, Y, Z, W) \
+ op_p##X##Y##Z##W##_##NAME
+
+#define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
+ DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
+ DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
+ DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
+ DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \
+
+#define REF_COMMON_PATTERNS(NAME) \
+ REF_PATTERN(NAME, 1, 0, 0, 0), \
+ REF_PATTERN(NAME, 1, 0, 0, 1), \
+ REF_PATTERN(NAME, 1, 1, 1, 0), \
+ REF_PATTERN(NAME, 1, 1, 1, 1)
+
+#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
+ DECL_ASM(TYPE, NAME##ELEMS##EXT, \
+ .op.op = SWS_OP_##OP, \
+ .op.rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
+ );
+
+#define DECL_PACKED_RW(EXT, DEPTH) \
+ DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
+ DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
+ DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
+ DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
+ DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
+ DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \
+
+static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
+{
+ const int mask = ff_sws_pixel_type_size(op->type) - 1;
+ for (int i = 0; i < 16; i++)
+ out->u8[i] = (i & ~mask) | (mask - (i & mask));
+ return 0;
+}
+
+#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
+ DECL_PATTERN(TYPE, swap_bytes_##TYPE##EXT, X, Y, Z, W, \
+ .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
+ .op.op = SWS_OP_SWAP_BYTES, \
+ .setup = setup_swap_bytes, \
+ );
+
+#define DECL_CLEAR_ALPHA(EXT, IDX) \
+ DECL_ASM(U8, clear_alpha##IDX##EXT, \
+ .op.op = SWS_OP_CLEAR, \
+ .op.c.q4[IDX] = { .num = -1, .den = 1 }, \
+ .op.comps.unused[IDX] = true, \
+ ); \
+
+#define DECL_CLEAR_ZERO(EXT, IDX) \
+ DECL_ASM(U8, clear_zero##IDX##EXT, \
+ .op.op = SWS_OP_CLEAR, \
+ .op.c.q4[IDX] = { .num = 0, .den = 1 }, \
+ .op.comps.unused[IDX] = true, \
+ );
+
+static int setup_clear(const SwsOp *op, SwsOpPriv *out)
+{
+ for (int i = 0; i < 4; i++)
+ out->u32[i] = (uint32_t) op->c.q4[i].num;
+ return 0;
+}
+
+#define DECL_CLEAR(EXT, X, Y, Z, W) \
+ DECL_PATTERN(U8, clear##EXT, X, Y, Z, W, \
+ .op.op = SWS_OP_CLEAR, \
+ .setup = setup_clear, \
+ .flexible = true, \
+ );
+
+#define DECL_SWIZZLE(EXT, X, Y, Z, W) \
+ DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT, \
+ .op.op = SWS_OP_SWIZZLE, \
+ .op.swizzle = SWS_SWIZZLE( X, Y, Z, W ), \
+ );
+
+#define DECL_CONVERT(EXT, FROM, TO) \
+ DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
+ .op.op = SWS_OP_CONVERT, \
+ .op.convert.to = SWS_PIXEL_##TO, \
+ );
+
+#define DECL_EXPAND(EXT, FROM, TO) \
+ DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
+ .op.op = SWS_OP_CONVERT, \
+ .op.convert.to = SWS_PIXEL_##TO, \
+ .op.convert.expand = true, \
+ );
+
+static int setup_shift(const SwsOp *op, SwsOpPriv *out)
+{
+ out->u16[0] = op->c.u;
+ return 0;
+}
+
+#define DECL_SHIFT16(EXT) \
+ DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
+ .op.op = SWS_OP_LSHIFT, \
+ .setup = setup_shift, \
+ ); \
+ \
+ DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
+ .op.op = SWS_OP_RSHIFT, \
+ .setup = setup_shift, \
+ );
+
+#define DECL_MIN_MAX(EXT) \
+ DECL_COMMON_PATTERNS(F32, min##EXT, \
+ .op.op = SWS_OP_MIN, \
+ .setup = ff_sws_setup_q4, \
+ .flexible = true, \
+ ); \
+ \
+ DECL_COMMON_PATTERNS(F32, max##EXT, \
+ .op.op = SWS_OP_MAX, \
+ .setup = ff_sws_setup_q4, \
+ .flexible = true, \
+ );
+
+#define DECL_SCALE(EXT) \
+ DECL_COMMON_PATTERNS(F32, scale##EXT, \
+ .op.op = SWS_OP_SCALE, \
+ .setup = ff_sws_setup_q, \
+ );
+
+/* A 2x2 matrix fits inside SwsOpPriv directly, saving an indirection in this case */
+static_assert(sizeof(SwsOpPriv) >= sizeof(float[2][2]), "2x2 dither matrix too large");
+static int setup_dither(const SwsOp *op, SwsOpPriv *out)
+{
+ const int size = 1 << op->dither.size_log2;
+ float *matrix = out->f32;
+ if (size > 2) {
+ matrix = out->ptr = av_mallocz(size * size * sizeof(*matrix));
+ if (!matrix)
+ return AVERROR(ENOMEM);
+ }
+
+ for (int i = 0; i < size * size; i++)
+ matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
+
+ return 0;
+}
+
+#define DECL_DITHER(EXT, SIZE) \
+ DECL_COMMON_PATTERNS(F32, dither##SIZE##EXT, \
+ .op.op = SWS_OP_DITHER, \
+ .op.dither.size_log2 = SIZE, \
+ .setup = setup_dither, \
+ .free = SIZE > 2 ? av_free : NULL, \
+ );
+
+static int setup_linear(const SwsOp *op, SwsOpPriv *out)
+{
+ float *matrix = out->ptr = av_mallocz(sizeof(float[4][5]));
+ if (!matrix)
+ return AVERROR(ENOMEM);
+
+ for (int y = 0; y < 4; y++) {
+ for (int x = 0; x < 5; x++)
+ matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
+ }
+
+ return 0;
+}
+
+#define DECL_LINEAR(EXT, NAME, MASK) \
+ DECL_ASM(F32, NAME##EXT, \
+ .op.op = SWS_OP_LINEAR, \
+ .op.lin.mask = (MASK), \
+ .setup = setup_linear, \
+ .free = av_free, \
+ );
+
+#define DECL_FUNCS_8(SIZE, EXT, FLAG) \
+ DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
+ DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
+ DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
+ DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
+ DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
+ DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
+ DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
+ DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
+ DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
+ DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
+ DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
+ DECL_PACKED_RW(EXT, 8) \
+ void ff_p1000_shuffle##EXT(void); \
+ void ff_p1001_shuffle##EXT(void); \
+ void ff_p1110_shuffle##EXT(void); \
+ void ff_p1111_shuffle##EXT(void); \
+ DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
+ DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
+ DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
+ DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
+ DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
+ DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
+ DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
+ DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
+ DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
+ DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
+ DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
+ DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
+ DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
+ DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
+ DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
+ DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
+ DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
+ DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
+ DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
+ DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
+ DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
+ DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
+ DECL_CLEAR_ALPHA(EXT, 0) \
+ DECL_CLEAR_ALPHA(EXT, 1) \
+ DECL_CLEAR_ALPHA(EXT, 3) \
+ DECL_CLEAR_ZERO(EXT, 0) \
+ DECL_CLEAR_ZERO(EXT, 1) \
+ DECL_CLEAR_ZERO(EXT, 3) \
+ DECL_CLEAR(EXT, 1, 1, 1, 0) \
+ DECL_CLEAR(EXT, 0, 1, 1, 1) \
+ DECL_CLEAR(EXT, 0, 0, 1, 1) \
+ DECL_CLEAR(EXT, 1, 0, 0, 1) \
+ DECL_CLEAR(EXT, 1, 1, 0, 0) \
+ DECL_CLEAR(EXT, 0, 1, 0, 1) \
+ DECL_CLEAR(EXT, 1, 0, 1, 0) \
+ DECL_CLEAR(EXT, 1, 0, 0, 0) \
+ DECL_CLEAR(EXT, 0, 1, 0, 0) \
+ DECL_CLEAR(EXT, 0, 0, 1, 0) \
+ \
+static const SwsOpTable ops8##EXT = { \
+ .cpu_flags = AV_CPU_FLAG_##FLAG, \
+ .block_size = SIZE, \
+ .entries = { \
+ op_read_planar1##EXT, \
+ op_read_planar2##EXT, \
+ op_read_planar3##EXT, \
+ op_read_planar4##EXT, \
+ op_write_planar1##EXT, \
+ op_write_planar2##EXT, \
+ op_write_planar3##EXT, \
+ op_write_planar4##EXT, \
+ op_read8_packed2##EXT, \
+ op_read8_packed3##EXT, \
+ op_read8_packed4##EXT, \
+ op_write8_packed2##EXT, \
+ op_write8_packed3##EXT, \
+ op_write8_packed4##EXT, \
+ op_read_nibbles1##EXT, \
+ op_read_bits1##EXT, \
+ op_write_bits1##EXT, \
+ op_swizzle_3012##EXT, \
+ op_swizzle_3021##EXT, \
+ op_swizzle_2103##EXT, \
+ op_swizzle_3210##EXT, \
+ op_swizzle_3102##EXT, \
+ op_swizzle_3201##EXT, \
+ op_swizzle_1203##EXT, \
+ op_swizzle_1023##EXT, \
+ op_swizzle_2013##EXT, \
+ op_swizzle_2310##EXT, \
+ op_swizzle_2130##EXT, \
+ op_swizzle_1230##EXT, \
+ op_swizzle_1320##EXT, \
+ op_swizzle_0213##EXT, \
+ op_swizzle_0231##EXT, \
+ op_swizzle_0312##EXT, \
+ op_swizzle_3120##EXT, \
+ op_swizzle_0321##EXT, \
+ op_swizzle_0003##EXT, \
+ op_swizzle_0001##EXT, \
+ op_swizzle_3000##EXT, \
+ op_swizzle_1000##EXT, \
+ op_clear_alpha0##EXT, \
+ op_clear_alpha1##EXT, \
+ op_clear_alpha3##EXT, \
+ op_clear_zero0##EXT, \
+ op_clear_zero1##EXT, \
+ op_clear_zero3##EXT, \
+ REF_PATTERN(clear##EXT, 1, 1, 1, 0), \
+ REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
+ REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
+ REF_PATTERN(clear##EXT, 1, 0, 0, 1), \
+ REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
+ REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
+ REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
+ REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
+ REF_PATTERN(clear##EXT, 0, 1, 0, 0), \
+ REF_PATTERN(clear##EXT, 0, 0, 1, 0), \
+ {{0}} \
+ }, \
+};
+
+#define DECL_FUNCS_16(SIZE, EXT, FLAG) \
+ DECL_PACKED_RW(EXT, 16) \
+ DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
+ DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
+ DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
+ DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
+ DECL_SHIFT16(EXT) \
+ DECL_CONVERT(EXT, U8, U16) \
+ DECL_CONVERT(EXT, U16, U8) \
+ DECL_EXPAND(EXT, U8, U16) \
+ \
+static const SwsOpTable ops16##EXT = { \
+ .cpu_flags = AV_CPU_FLAG_##FLAG, \
+ .block_size = SIZE, \
+ .entries = { \
+ op_read16_packed2##EXT, \
+ op_read16_packed3##EXT, \
+ op_read16_packed4##EXT, \
+ op_write16_packed2##EXT, \
+ op_write16_packed3##EXT, \
+ op_write16_packed4##EXT, \
+ REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
+ REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
+ REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
+ REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
+ REF_COMMON_PATTERNS(lshift16##EXT), \
+ REF_COMMON_PATTERNS(rshift16##EXT), \
+ {{0}} \
+ }, \
+};
+
+#define DECL_FUNCS_32(SIZE, EXT, FLAG) \
+ DECL_PACKED_RW(_m2##EXT, 32) \
+ DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
+ DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
+ DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
+ DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
+ DECL_CONVERT(EXT, U8, U32) \
+ DECL_CONVERT(EXT, U32, U8) \
+ DECL_CONVERT(EXT, U16, U32) \
+ DECL_CONVERT(EXT, U32, U16) \
+ DECL_CONVERT(EXT, U8, F32) \
+ DECL_CONVERT(EXT, F32, U8) \
+ DECL_CONVERT(EXT, U16, F32) \
+ DECL_CONVERT(EXT, F32, U16) \
+ DECL_EXPAND(EXT, U8, U32) \
+ DECL_MIN_MAX(EXT) \
+ DECL_SCALE(EXT) \
+ DECL_DITHER(EXT, 0) \
+ DECL_DITHER(EXT, 1) \
+ DECL_DITHER(EXT, 2) \
+ DECL_DITHER(EXT, 3) \
+ DECL_DITHER(EXT, 4) \
+ DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
+ DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
+ DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
+ DECL_LINEAR(EXT, dot3, 0b111) \
+ DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0)) \
+ DECL_LINEAR(EXT, row0a, SWS_MASK_ROW(0) | SWS_MASK_ALPHA) \
+ DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
+ DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
+ DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
+ DECL_LINEAR(EXT, matrix3, SWS_MASK_MAT3) \
+ DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
+ DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
+ DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4) \
+ DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4) \
+ \
+static const SwsOpTable ops32##EXT = { \
+ .cpu_flags = AV_CPU_FLAG_##FLAG, \
+ .block_size = SIZE, \
+ .entries = { \
+ op_read32_packed2_m2##EXT, \
+ op_read32_packed3_m2##EXT, \
+ op_read32_packed4_m2##EXT, \
+ op_write32_packed2_m2##EXT, \
+ op_write32_packed3_m2##EXT, \
+ op_write32_packed4_m2##EXT, \
+ REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
+ REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
+ REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
+ REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
+ REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
+ REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
+ REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
+ REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
+ REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
+ REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
+ REF_COMMON_PATTERNS(min##EXT), \
+ REF_COMMON_PATTERNS(max##EXT), \
+ REF_COMMON_PATTERNS(scale##EXT), \
+ REF_COMMON_PATTERNS(dither0##EXT), \
+ REF_COMMON_PATTERNS(dither1##EXT), \
+ REF_COMMON_PATTERNS(dither2##EXT), \
+ REF_COMMON_PATTERNS(dither3##EXT), \
+ REF_COMMON_PATTERNS(dither4##EXT), \
+ op_luma##EXT, \
+ op_alpha##EXT, \
+ op_lumalpha##EXT, \
+ op_dot3##EXT, \
+ op_row0##EXT, \
+ op_row0a##EXT, \
+ op_diag3##EXT, \
+ op_diag4##EXT, \
+ op_diagoff3##EXT, \
+ op_matrix3##EXT, \
+ op_affine3##EXT, \
+ op_affine3a##EXT, \
+ op_matrix4##EXT, \
+ op_affine4##EXT, \
+ {{0}} \
+ }, \
+};
+
+DECL_FUNCS_8(16, _m1_sse4, SSE4)
+DECL_FUNCS_8(32, _m1_avx2, AVX2)
+DECL_FUNCS_8(32, _m2_sse4, SSE4)
+DECL_FUNCS_8(64, _m2_avx2, AVX2)
+
+DECL_FUNCS_16(16, _m1_avx2, AVX2)
+DECL_FUNCS_16(32, _m2_avx2, AVX2)
+
+DECL_FUNCS_32(16, _avx2, AVX2)
+
+static av_const int get_mmsize(const int cpu_flags)
+{
+ if (cpu_flags & AV_CPU_FLAG_AVX2)
+ return 32;
+ else if (cpu_flags & AV_CPU_FLAG_SSE4)
+ return 16;
+ else
+ return AVERROR(ENOTSUP);
+}
+
+/**
+ * Returns true if the operation's implementation only depends on the block
+ * size, and not the underlying pixel type
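+ *
+ * (For example, a planar U16 read moves the same bytes as a planar U8 read
+ * with twice the block size; compile() exploits this by retyping such ops to
+ * SWS_PIXEL_U8 with a correspondingly scaled block size.)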
+ */
+static bool op_is_type_invariant(const SwsOp *op)
+{
+ switch (op->op) {
+ case SWS_OP_READ:
+ case SWS_OP_WRITE:
+ return !op->rw.packed && !op->rw.frac;
+ case SWS_OP_SWIZZLE:
+ case SWS_OP_CLEAR:
+ return true;
+ }
+
+ return false;
+}
+
+/* Tries to reduce a series of operations to an in-place shuffle mask.
+ * Returns 0 on success (the block size is stored in out->block_size), or a
+ * negative error code such as AVERROR(ENOTSUP) if no reduction is possible. */
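+/* For instance, a packed RGB24 -> BGR24 reorder (READ of 3 packed elements,
+ * SWIZZLE 2103, WRITE of 3 packed elements) collapses into a single 16-byte
+ * pshufb mask per lane. */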
+static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
+{
+ const SwsOp read = ops->ops[0];
+ const int read_size = ff_sws_pixel_type_size(read.type);
+ uint32_t mask[4] = {0};
+
+ if (!ops->num_ops || read.op != SWS_OP_READ)
+ return AVERROR(EINVAL);
+ if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
+ return AVERROR(ENOTSUP);
+
+ for (int i = 0; i < read.rw.elems; i++)
+ mask[i] = 0x01010101 * i * read_size + 0x03020100;
+
+ for (int opidx = 1; opidx < ops->num_ops; opidx++) {
+ const SwsOp *op = &ops->ops[opidx];
+ switch (op->op) {
+ case SWS_OP_SWIZZLE: {
+ uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
+ for (int i = 0; i < 4; i++)
+ mask[i] = orig[op->swizzle.in[i]];
+ break;
+ }
+
+ case SWS_OP_SWAP_BYTES:
+ for (int i = 0; i < 4; i++) {
+ switch (ff_sws_pixel_type_size(op->type)) {
+ case 2: mask[i] = av_bswap16(mask[i]); break;
+ case 4: mask[i] = av_bswap32(mask[i]); break;
+ }
+ }
+ break;
+
+ case SWS_OP_CLEAR:
+ for (int i = 0; i < 4; i++) {
+ if (!op->c.q4[i].den)
+ continue;
+ if (op->c.q4[i].num != 0)
+ return AVERROR(ENOTSUP);
+ mask[i] = 0x80808080ul; /* pshufb implicit clear to zero */
+ }
+ break;
+
+ case SWS_OP_CONVERT: {
+ if (!op->convert.expand)
+ return AVERROR(ENOTSUP);
+ for (int i = 0; i < 4; i++) {
+ switch (ff_sws_pixel_type_size(op->type)) {
+ case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF); break;
+ case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
+ }
+ }
+ break;
+ }
+
+ case SWS_OP_WRITE: {
+ if (op->rw.frac || !op->rw.packed)
+ return AVERROR(ENOTSUP);
+
+ /* Initialize to no-op */
+ uint8_t shuffle[16];
+ for (int i = 0; i < 16; i++)
+ shuffle[i] = 0x80;
+
+ const int write_size = ff_sws_pixel_type_size(op->type);
+ const int read_chunk = read.rw.elems * read_size;
+ const int write_chunk = op->rw.elems * write_size;
+ const int groups_per_lane = 16 / FFMAX(read_chunk, write_chunk);
+ for (int n = 0; n < groups_per_lane; n++) {
+ const int base_in = n * read_chunk;
+ const int base_out = n * write_chunk;
+ for (int i = 0; i < op->rw.elems; i++) {
+ const int offset = base_out + i * write_size;
+ for (int b = 0; b < write_size; b++)
+ shuffle[offset + b] = base_in + (mask[i] >> (b * 8));
+ }
+ }
+
+ const int in_per_lane = groups_per_lane * read_chunk;
+ const int out_per_lane = groups_per_lane * write_chunk;
+ if (in_per_lane < 16 || out_per_lane < 16)
+ mmsize = 16; /* avoid cross-lane shuffle */
+
+ const int num_lanes = mmsize / 16;
+ const int in_total = num_lanes * in_per_lane;
+ const int out_total = num_lanes * out_per_lane;
+ const int read_size = in_total <= 4 ? 4 : in_total <= 8 ? 8 : mmsize;
+ *out = (SwsCompiledOp) {
+ .priv = av_memdup(shuffle, sizeof(shuffle)),
+ .free = av_free,
+ .block_size = groups_per_lane * num_lanes,
+ .over_read = read_size - in_total,
+ .over_write = mmsize - out_total,
+ };
+
+ if (!out->priv)
+ return AVERROR(ENOMEM);
+
+#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
+do { \
+ SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
+ if (in_total == IN && out_total == OUT) \
+ out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
+} while (0)
+
+ ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
+ ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
+ ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
+ ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
+ ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
+ ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
+ ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
+ ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
+ ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
+ ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
+ ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
+ ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
+ ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
+ ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
+ av_assert1(out->func);
+ return 0;
+ }
+
+ default:
+ return AVERROR(ENOTSUP);
+ }
+ }
+
+ return AVERROR(EINVAL);
+}
+
+/* Normalize clear values into 32-bit integer constants */
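+/* (e.g. for a U8 clear, a value of 0x12 becomes the dword 0x12121212, so the
+ * asm kernels can always broadcast a single 32-bit constant per component) */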
+static void normalize_clear(SwsOp *op)
+{
+ static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
+ SwsOpPriv priv;
+ union {
+ uint32_t u32;
+ int i;
+ } c;
+
+ ff_sws_setup_q4(op, &priv);
+ for (int i = 0; i < 4; i++) {
+ if (!op->c.q4[i].den)
+ continue;
+ switch (ff_sws_pixel_type_size(op->type)) {
+ case 1: c.u32 = 0x1010101 * priv.u8[i]; break;
+ case 2: c.u32 = priv.u16[i] << 16 | priv.u16[i]; break;
+ case 4: c.u32 = priv.u32[i]; break;
+ }
+
+ op->c.q4[i].num = c.i;
+ op->c.q4[i].den = 1;
+ }
+}
+
+static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
+{
+ const int cpu_flags = av_get_cpu_flags();
+ const int mmsize = get_mmsize(cpu_flags);
+ if (mmsize < 0)
+ return mmsize;
+
+ av_assert1(ops->num_ops > 0);
+ const SwsOp read = ops->ops[0];
+ const SwsOp write = ops->ops[ops->num_ops - 1];
+ int ret;
+
+ /* Special fast path for in-place packed shuffle */
+ ret = solve_shuffle(ops, mmsize, out);
+ if (ret != AVERROR(ENOTSUP))
+ return ret;
+
+ SwsOpChain *chain = ff_sws_op_chain_alloc();
+ if (!chain)
+ return AVERROR(ENOMEM);
+
+ *out = (SwsCompiledOp) {
+ .priv = chain,
+ .free = (void (*)(void *)) ff_sws_op_chain_free,
+
+ /* Use at most two full vregs during the widest precision section */
+ .block_size = 2 * mmsize / ff_sws_op_list_max_size(ops),
+ };
+
+ /* 3-component reads/writes process one extra garbage word */
+ if (read.rw.packed && read.rw.elems == 3)
+ out->over_read = sizeof(uint32_t);
+ if (write.rw.packed && write.rw.elems == 3)
+ out->over_write = sizeof(uint32_t);
+
+ static const SwsOpTable *const tables[] = {
+ &ops8_m1_sse4,
+ &ops8_m1_avx2,
+ &ops8_m2_sse4,
+ &ops8_m2_avx2,
+ &ops16_m1_avx2,
+ &ops16_m2_avx2,
+ &ops32_avx2,
+ };
+
+ do {
+ int op_block_size = out->block_size;
+ SwsOp *op = &ops->ops[0];
+
+ if (op_is_type_invariant(op)) {
+ if (op->op == SWS_OP_CLEAR)
+ normalize_clear(op);
+ op_block_size *= ff_sws_pixel_type_size(op->type);
+ op->type = SWS_PIXEL_U8;
+ }
+
+ ret = ff_sws_op_compile_tables(tables, FF_ARRAY_ELEMS(tables), ops,
+ op_block_size, chain);
+ } while (ret == AVERROR(EAGAIN));
+ if (ret < 0) {
+ ff_sws_op_chain_free(chain);
+ return ret;
+ }
+
+ SWS_DECL_FUNC(ff_sws_process1_x86);
+ SWS_DECL_FUNC(ff_sws_process2_x86);
+ SWS_DECL_FUNC(ff_sws_process3_x86);
+ SWS_DECL_FUNC(ff_sws_process4_x86);
+
+ const int read_planes = read.rw.packed ? 1 : read.rw.elems;
+ const int write_planes = write.rw.packed ? 1 : write.rw.elems;
+ switch (FFMAX(read_planes, write_planes)) {
+ case 1: out->func = ff_sws_process1_x86; break;
+ case 2: out->func = ff_sws_process2_x86; break;
+ case 3: out->func = ff_sws_process3_x86; break;
+ case 4: out->func = ff_sws_process4_x86; break;
+ }
+
+ return ret;
+}
+
+SwsOpBackend backend_x86 = {
+ .name = "x86",
+ .compile = compile,
+};
diff --git a/libswscale/x86/ops_common.asm b/libswscale/x86/ops_common.asm
new file mode 100644
index 0000000000..15d171329d
--- /dev/null
+++ b/libswscale/x86/ops_common.asm
@@ -0,0 +1,208 @@
+;******************************************************************************
+;* Copyright (c) 2025 Niklas Haas
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+struc SwsOpExec
+ .in0 resq 1
+ .in1 resq 1
+ .in2 resq 1
+ .in3 resq 1
+ .out0 resq 1
+ .out1 resq 1
+ .out2 resq 1
+ .out3 resq 1
+ .in_stride0 resq 1
+ .in_stride1 resq 1
+ .in_stride2 resq 1
+ .in_stride3 resq 1
+ .out_stride0 resq 1
+ .out_stride1 resq 1
+ .out_stride2 resq 1
+ .out_stride3 resq 1
+ .x resd 1
+ .y resd 1
+ .width resd 1
+ .height resd 1
+ .slice_y resd 1
+ .slice_h resd 1
+ .pixel_bits_in resd 1
+ .pixel_bits_out resd 1
+endstruc
+
+struc SwsOpImpl
+ .cont resb 16
+ .priv resb 16
+ .next resb 0
+endstruc
+
+; common macros for declaring operations
+%macro op 1 ; name
+ %ifdef X
+ %define ADD_PAT(name) p %+ X %+ Y %+ Z %+ W %+ _ %+ name
+ %else
+ %define ADD_PAT(name) name
+ %endif
+
+ %ifdef V2
+ %if V2
+ %define ADD_MUL(name) name %+ _m2
+ %else
+ %define ADD_MUL(name) name %+ _m1
+ %endif
+ %else
+ %define ADD_MUL(name) name
+ %endif
+
+ cglobal ADD_PAT(ADD_MUL(%1)), 0, 0, 16
+
+ %undef ADD_PAT
+ %undef ADD_MUL
+%endmacro
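+; For example, assuming INIT_YMM avx2 with X/Y/Z/W = 1,0,0,1 and V2 = 1,
+; "op shuffle" expands to "cglobal p1001_shuffle_m2", which cglobal emits as
+; ff_p1001_shuffle_m2_avx2, matching the declarations in x86/ops.c.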
+
+%macro decl_v2 2+ ; v2, func
+ %xdefine V2 %1
+ %2
+ %undef V2
+%endmacro
+
+%macro decl_pattern 5+ ; X, Y, Z, W, func
+ %xdefine X %1
+ %xdefine Y %2
+ %xdefine Z %3
+ %xdefine W %4
+ %5
+ %undef X
+ %undef Y
+ %undef Z
+ %undef W
+%endmacro
+
+%macro decl_common_patterns 1+ ; func
+ decl_pattern 1, 0, 0, 0, %1 ; y
+ decl_pattern 1, 0, 0, 1, %1 ; ya
+ decl_pattern 1, 1, 1, 0, %1 ; yuv
+ decl_pattern 1, 1, 1, 1, %1 ; yuva
+%endmacro
+
+; common names for the internal calling convention
+%define mx m0
+%define my m1
+%define mz m2
+%define mw m3
+
+%define xmx xm0
+%define xmy xm1
+%define xmz xm2
+%define xmw xm3
+
+%define ymx ym0
+%define ymy ym1
+%define ymz ym2
+%define ymw ym3
+
+%define mx2 m4
+%define my2 m5
+%define mz2 m6
+%define mw2 m7
+
+%define xmx2 xm4
+%define xmy2 xm5
+%define xmz2 xm6
+%define xmw2 xm7
+
+%define ymx2 ym4
+%define ymy2 ym5
+%define ymz2 ym6
+%define ymw2 ym7
+
+; from entry point signature
+%define execq r0q
+%define implq r1q
+%define blocksd r2d
+
+; extra registers for free use by kernels, not saved between ops
+%define tmp0q r3q
+%define tmp1q r4q
+%define tmp2q r5q
+%define tmp3q r6q
+
+%define tmp0d r3d
+%define tmp1d r4d
+%define tmp2d r5d
+%define tmp3d r6d
+
+; pinned static registers for plane pointers
+%define in0q r7q
+%define out0q r8q
+%define in1q r9q
+%define out1q r10q
+%define in2q r11q
+%define out2q r12q
+%define in3q r13q
+%define out3q r14q
+
+; load the next operation kernel
+%macro LOAD_CONT 1 ; reg
+ mov %1, [implq + SwsOpImpl.cont]
+%endmacro
+
+; tail call into the next operation kernel
+%macro CONTINUE 1 ; reg
+ add implq, SwsOpImpl.next
+ jmp %1
+ annotate_function_size
+%endmacro
+
+%macro CONTINUE 0
+ LOAD_CONT tmp0q
+ CONTINUE tmp0q
+%endmacro
+
+; return to entry point after write, avoids unnecessary vzeroupper
+%macro END_CHAIN 0
+ ret
+ annotate_function_size
+%endmacro
+
+; helper for inline conditionals
+%rmacro IF 2+ ; cond, body
+ %if %1
+ %2
+ %endif
+%endmacro
+
+; alternate name for nested usage to work around some NASM bugs
+%rmacro IF1 2+
+ %if %1
+ %2
+ %endif
+%endmacro
+
+; move at least N bytes
+%macro MOVSZ 2+ ; size, args
+ %if %1 <= 4
+ movd %2
+ %elif %1 <= 8
+ movq %2
+ %else
+ movu %2
+ %endif
+%endmacro
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
new file mode 100644
index 0000000000..120ccc65b2
--- /dev/null
+++ b/libswscale/x86/ops_float.asm
@@ -0,0 +1,376 @@
+;******************************************************************************
+;* Copyright (c) 2025 Niklas Haas
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "ops_common.asm"
+
+SECTION .text
+
+;---------------------------------------------------------
+; Pixel type conversions
+
+%macro conv8to32f 0
+op convert_U8_F32
+ LOAD_CONT tmp0q
+IF X, vpsrldq xmx2, xmx, 8
+IF Y, vpsrldq xmy2, xmy, 8
+IF Z, vpsrldq xmz2, xmz, 8
+IF W, vpsrldq xmw2, xmw, 8
+IF X, pmovzxbd mx, xmx
+IF Y, pmovzxbd my, xmy
+IF Z, pmovzxbd mz, xmz
+IF W, pmovzxbd mw, xmw
+IF X, pmovzxbd mx2, xmx2
+IF Y, pmovzxbd my2, xmy2
+IF Z, pmovzxbd mz2, xmz2
+IF W, pmovzxbd mw2, xmw2
+IF X, vcvtdq2ps mx, mx
+IF Y, vcvtdq2ps my, my
+IF Z, vcvtdq2ps mz, mz
+IF W, vcvtdq2ps mw, mw
+IF X, vcvtdq2ps mx2, mx2
+IF Y, vcvtdq2ps my2, my2
+IF Z, vcvtdq2ps mz2, mz2
+IF W, vcvtdq2ps mw2, mw2
+ CONTINUE tmp0q
+%endmacro
+
+%macro conv16to32f 0
+op convert_U16_F32
+ LOAD_CONT tmp0q
+IF X, vextracti128 xmx2, mx, 1
+IF Y, vextracti128 xmy2, my, 1
+IF Z, vextracti128 xmz2, mz, 1
+IF W, vextracti128 xmw2, mw, 1
+IF X, pmovzxwd mx, xmx
+IF Y, pmovzxwd my, xmy
+IF Z, pmovzxwd mz, xmz
+IF W, pmovzxwd mw, xmw
+IF X, pmovzxwd mx2, xmx2
+IF Y, pmovzxwd my2, xmy2
+IF Z, pmovzxwd mz2, xmz2
+IF W, pmovzxwd mw2, xmw2
+IF X, vcvtdq2ps mx, mx
+IF Y, vcvtdq2ps my, my
+IF Z, vcvtdq2ps mz, mz
+IF W, vcvtdq2ps mw, mw
+IF X, vcvtdq2ps mx2, mx2
+IF Y, vcvtdq2ps my2, my2
+IF Z, vcvtdq2ps mz2, mz2
+IF W, vcvtdq2ps mw2, mw2
+ CONTINUE tmp0q
+%endmacro
+
+%macro conv32fto8 0
+op convert_F32_U8
+ LOAD_CONT tmp0q
+IF X, cvttps2dq mx, mx
+IF Y, cvttps2dq my, my
+IF Z, cvttps2dq mz, mz
+IF W, cvttps2dq mw, mw
+IF X, cvttps2dq mx2, mx2
+IF Y, cvttps2dq my2, my2
+IF Z, cvttps2dq mz2, mz2
+IF W, cvttps2dq mw2, mw2
+IF X, packusdw mx, mx2
+IF Y, packusdw my, my2
+IF Z, packusdw mz, mz2
+IF W, packusdw mw, mw2
+IF X, vextracti128 xmx2, mx, 1
+IF Y, vextracti128 xmy2, my, 1
+IF Z, vextracti128 xmz2, mz, 1
+IF W, vextracti128 xmw2, mw, 1
+IF X, packuswb xmx, xmx2
+IF Y, packuswb xmy, xmy2
+IF Z, packuswb xmz, xmz2
+IF W, packuswb xmw, xmw2
+IF X, vpshufd xmx, xmx, q3120
+IF Y, vpshufd xmy, xmy, q3120
+IF Z, vpshufd xmz, xmz, q3120
+IF W, vpshufd xmw, xmw, q3120
+ CONTINUE tmp0q
+%endmacro
+
+%macro conv32fto16 0
+op convert_F32_U16
+ LOAD_CONT tmp0q
+IF X, cvttps2dq mx, mx
+IF Y, cvttps2dq my, my
+IF Z, cvttps2dq mz, mz
+IF W, cvttps2dq mw, mw
+IF X, cvttps2dq mx2, mx2
+IF Y, cvttps2dq my2, my2
+IF Z, cvttps2dq mz2, mz2
+IF W, cvttps2dq mw2, mw2
+IF X, packusdw mx, mx2
+IF Y, packusdw my, my2
+IF Z, packusdw mz, mz2
+IF W, packusdw mw, mw2
+IF X, vpermq mx, mx, q3120
+IF Y, vpermq my, my, q3120
+IF Z, vpermq mz, mz, q3120
+IF W, vpermq mw, mw, q3120
+ CONTINUE tmp0q
+%endmacro
+
+%macro min_max 0
+op min
+IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0]
+IF Y, vbroadcastss m9, [implq + SwsOpImpl.priv + 4]
+IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 8]
+IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 12]
+ LOAD_CONT tmp0q
+IF X, minps mx, mx, m8
+IF Y, minps my, my, m9
+IF Z, minps mz, mz, m10
+IF W, minps mw, mw, m11
+IF X, minps mx2, m8
+IF Y, minps my2, m9
+IF Z, minps mz2, m10
+IF W, minps mw2, m11
+ CONTINUE tmp0q
+
+op max
+IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0]
+IF Y, vbroadcastss m9, [implq + SwsOpImpl.priv + 4]
+IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 8]
+IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 12]
+ LOAD_CONT tmp0q
+IF X, maxps mx, m8
+IF Y, maxps my, m9
+IF Z, maxps mz, m10
+IF W, maxps mw, m11
+IF X, maxps mx2, m8
+IF Y, maxps my2, m9
+IF Z, maxps mz2, m10
+IF W, maxps mw2, m11
+ CONTINUE tmp0q
+%endmacro
+
+%macro scale 0
+op scale
+ vbroadcastss m8, [implq + SwsOpImpl.priv]
+ LOAD_CONT tmp0q
+IF X, mulps mx, m8
+IF Y, mulps my, m8
+IF Z, mulps mz, m8
+IF W, mulps mw, m8
+IF X, mulps mx2, m8
+IF Y, mulps my2, m8
+IF Z, mulps mz2, m8
+IF W, mulps mw2, m8
+ CONTINUE tmp0q
+%endmacro
+
+%macro load_dither_row 5 ; size_log2, y, addr, out, out2
+ lea tmp0q, %2
+ and tmp0q, (1 << %1) - 1
+ shl tmp0q, %1+2
+%if %1 == 2
+ VBROADCASTI128 %4, [%3 + tmp0q]
+%else
+ mova %4, [%3 + tmp0q]
+ %if (4 << %1) > mmsize
+ mova %5, [%3 + tmp0q + mmsize]
+ %endif
+%endif
+%endmacro
+
+%macro dither 1 ; size_log2
+op dither%1
+ %define DX m8
+ %define DY m9
+ %define DZ m10
+ %define DW m11
+ %define DX2 DX
+ %define DY2 DY
+ %define DZ2 DZ
+ %define DW2 DW
+%if %1 == 0
+ ; constant offset for all channels
+ vbroadcastss DX, [implq + SwsOpImpl.priv]
+ %define DY DX
+ %define DZ DX
+ %define DW DX
+%elif %1 == 1
+ ; 2x2 matrix, only sign of y matters
+ mov tmp0d, [execq + SwsOpExec.y]
+ and tmp0d, 1
+ shl tmp0d, 3
+ %if X || Y
+ vbroadcastsd DX, [implq + SwsOpImpl.priv + tmp0q]
+ %endif
+ %if Z || W
+ xor tmp0d, 8
+ vbroadcastsd DZ, [implq + SwsOpImpl.priv + tmp0q]
+ %endif
+ %define DY DX
+ %define DW DZ
+%else
+ ; matrix is at least 4x4, load all four channels with custom offset
+ %if (4 << %1) > mmsize
+ %define DX2 m12
+ %define DY2 m13
+ %define DZ2 m14
+ %define DW2 m15
+ %endif
+ mov tmp1d, [execq + SwsOpExec.y]
+ mov tmp2q, [implq + SwsOpImpl.priv]
+IF X, load_dither_row %1, [tmp1d + 0], tmp2q, DX, DX2
+IF Y, load_dither_row %1, [tmp1d + 3], tmp2q, DY, DY2
+IF Z, load_dither_row %1, [tmp1d + 2], tmp2q, DZ, DZ2
+IF W, load_dither_row %1, [tmp1d + 5], tmp2q, DW, DW2
+%endif
+ LOAD_CONT tmp0q
+IF X, addps mx, DX
+IF Y, addps my, DY
+IF Z, addps mz, DZ
+IF W, addps mw, DW
+IF X, addps mx2, DX2
+IF Y, addps my2, DY2
+IF Z, addps mz2, DZ2
+IF W, addps mw2, DW2
+ CONTINUE tmp0q
+%endmacro
+
+%macro dither_fns 0
+ dither 0
+ dither 1
+ dither 2
+ dither 3
+ dither 4
+%endmacro
+
+%xdefine MASK(I, J) (1 << (5 * (I) + (J)))
+%xdefine MASK_OFF(I) MASK(I, 4)
+%xdefine MASK_ROW(I) (0b11111 << (5 * (I)))
+%xdefine MASK_COL(J) (0b1000010000100001 << J)
+%xdefine MASK_ALL (1 << 20) - 1
+%xdefine MASK_LUMA MASK(0, 0) | MASK_OFF(0)
+%xdefine MASK_ALPHA MASK(3, 3) | MASK_OFF(3)
+%xdefine MASK_DIAG3 MASK(0, 0) | MASK(1, 1) | MASK(2, 2)
+%xdefine MASK_OFF3 MASK_OFF(0) | MASK_OFF(1) | MASK_OFF(2)
+%xdefine MASK_MAT3 MASK(0, 0) | MASK(0, 1) | MASK(0, 2) |\
+ MASK(1, 0) | MASK(1, 1) | MASK(1, 2) |\
+ MASK(2, 0) | MASK(2, 1) | MASK(2, 2)
+%xdefine MASK_DIAG4 MASK_DIAG3 | MASK(3, 3)
+%xdefine MASK_OFF4 MASK_OFF3 | MASK_OFF(3)
+%xdefine MASK_MAT4 MASK_ALL & ~MASK_OFF4
+
+%macro linear_row 7 ; res, x, y, z, w, row, mask
+%define COL(J) ((%7) & MASK(%6, J)) ; true if mask contains component J
+%define NOP(J) (J == %6 && !COL(J)) ; true if J is untouched input component
+
+ ; load weights
+ IF COL(0), vbroadcastss m12, [tmp0q + %6 * 20 + 0]
+ IF COL(1), vbroadcastss m13, [tmp0q + %6 * 20 + 4]
+ IF COL(2), vbroadcastss m14, [tmp0q + %6 * 20 + 8]
+ IF COL(3), vbroadcastss m15, [tmp0q + %6 * 20 + 12]
+
+ ; initialize result vector as appropriate
+ %if COL(4) ; offset
+ vbroadcastss %1, [tmp0q + %6 * 20 + 16]
+ %elif NOP(0)
+ ; directly reuse first component vector if possible
+ mova %1, %2
+ %else
+ xorps %1, %1
+ %endif
+
+ IF COL(0), mulps m12, %2
+ IF COL(1), mulps m13, %3
+ IF COL(2), mulps m14, %4
+ IF COL(3), mulps m15, %5
+ IF COL(0), addps %1, m12
+ IF NOP(0) && COL(4), addps %1, %2 ; first vector was not reused
+ IF COL(1), addps %1, m13
+ IF NOP(1), addps %1, %3
+ IF COL(2), addps %1, m14
+ IF NOP(2), addps %1, %4
+ IF COL(3), addps %1, m15
+ IF NOP(3), addps %1, %5
+%endmacro
+
+%macro linear_inner 5 ; x, y, z, w, mask
+ %define ROW(I) ((%5) & MASK_ROW(I))
+ IF1 ROW(0), linear_row m8, %1, %2, %3, %4, 0, %5
+ IF1 ROW(1), linear_row m9, %1, %2, %3, %4, 1, %5
+ IF1 ROW(2), linear_row m10, %1, %2, %3, %4, 2, %5
+ IF1 ROW(3), linear_row m11, %1, %2, %3, %4, 3, %5
+ IF ROW(0), mova %1, m8
+ IF ROW(1), mova %2, m9
+ IF ROW(2), mova %3, m10
+ IF ROW(3), mova %4, m11
+%endmacro
+
+%macro linear_mask 2 ; name, mask
+op %1
+ mov tmp0q, [implq + SwsOpImpl.priv] ; address of matrix
+ linear_inner mx, my, mz, mw, %2
+ linear_inner mx2, my2, mz2, mw2, %2
+ CONTINUE
+%endmacro
+
+; specialized functions for very simple cases
+%macro linear_dot3 0
+op dot3
+ mov tmp0q, [implq + SwsOpImpl.priv]
+ vbroadcastss m12, [tmp0q + 0]
+ vbroadcastss m13, [tmp0q + 4]
+ vbroadcastss m14, [tmp0q + 8]
+ LOAD_CONT tmp0q
+ mulps mx, m12
+ mulps m8, my, m13
+ mulps m9, mz, m14
+ addps mx, m8
+ addps mx, m9
+ mulps mx2, m12
+ mulps m10, my2, m13
+ mulps m11, mz2, m14
+ addps mx2, m10
+ addps mx2, m11
+ CONTINUE tmp0q
+%endmacro
+
+%macro linear_fns 0
+ linear_dot3
+ linear_mask luma, MASK_LUMA
+ linear_mask alpha, MASK_ALPHA
+ linear_mask lumalpha, MASK_LUMA | MASK_ALPHA
+ linear_mask row0, MASK_ROW(0)
+ linear_mask row0a, MASK_ROW(0) | MASK_ALPHA
+ linear_mask diag3, MASK_DIAG3
+ linear_mask diag4, MASK_DIAG4
+ linear_mask diagoff3, MASK_DIAG3 | MASK_OFF3
+ linear_mask matrix3, MASK_MAT3
+ linear_mask affine3, MASK_MAT3 | MASK_OFF3
+ linear_mask affine3a, MASK_MAT3 | MASK_OFF3 | MASK_ALPHA
+ linear_mask matrix4, MASK_MAT4
+ linear_mask affine4, MASK_MAT4 | MASK_OFF4
+%endmacro
+
+INIT_YMM avx2
+decl_common_patterns conv8to32f
+decl_common_patterns conv16to32f
+decl_common_patterns conv32fto8
+decl_common_patterns conv32fto16
+decl_common_patterns min_max
+decl_common_patterns scale
+decl_common_patterns dither_fns
+linear_fns
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
new file mode 100644
index 0000000000..3f995d71e2
--- /dev/null
+++ b/libswscale/x86/ops_int.asm
@@ -0,0 +1,882 @@
+;******************************************************************************
+;* Copyright (c) 2025 Niklas Haas
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "ops_common.asm"
+
+SECTION_RODATA
+
+expand16_shuf: db 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+expand32_shuf: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+
+read8_unpack2: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+read8_unpack3: db 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1
+read8_unpack4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+read16_unpack2: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+read16_unpack3: db 0, 1, 6, 7, 2, 3, 8, 9, 4, 5, 10, 11, -1, -1, -1, -1
+read16_unpack4: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+write8_pack2: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+write8_pack3: db 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, -1, -1, -1, -1
+write16_pack3: db 0, 1, 4, 5, 8, 9, 2, 3, 6, 7, 10, 11, -1, -1, -1, -1
+
+%define write8_pack4 read8_unpack4
+%define write16_pack4 read16_unpack2
+%define write16_pack2 read16_unpack4
+
+align 32
+bits_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3
+bits_mask: db 128, 64, 32, 16, 8, 4, 2, 1,128, 64, 32, 16, 8, 4, 2, 1
+bits_reverse: db 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+
+nibble_mask: times 16 db 0x0F
+ones_mask: times 16 db 0x01
+
+SECTION .text
+
+;---------------------------------------------------------
+; Global entry point
+
+%macro process_fn 1 ; num_planes
+cglobal sws_process%1_x86, 6, 7 + 2 * %1, 16
+ ; set up static registers
+ mov in0q, [execq + SwsOpExec.in0]
+IF %1 > 1, mov in1q, [execq + SwsOpExec.in1]
+IF %1 > 2, mov in2q, [execq + SwsOpExec.in2]
+IF %1 > 3, mov in3q, [execq + SwsOpExec.in3]
+ mov out0q, [execq + SwsOpExec.out0]
+IF %1 > 1, mov out1q, [execq + SwsOpExec.out1]
+IF %1 > 2, mov out2q, [execq + SwsOpExec.out2]
+IF %1 > 3, mov out3q, [execq + SwsOpExec.out3]
+ push implq
+.loop:
+ mov tmp0q, [implq + SwsOpImpl.cont]
+ add implq, SwsOpImpl.next
+ call tmp0q
+ mov implq, [rsp + 0]
+ dec blocksd
+ jg .loop
+
+ ; clean up
+ add rsp, 8
+ RET
+%endmacro
+
+process_fn 1
+process_fn 2
+process_fn 3
+process_fn 4
+
+;---------------------------------------------------------
+; Planar reads / writes
+
+%macro read_planar 1 ; elems
+op read_planar%1
+ movu mx, [in0q]
+IF %1 > 1, movu my, [in1q]
+IF %1 > 2, movu mz, [in2q]
+IF %1 > 3, movu mw, [in3q]
+%if V2
+ movu mx2, [in0q + mmsize]
+IF %1 > 1, movu my2, [in1q + mmsize]
+IF %1 > 2, movu mz2, [in2q + mmsize]
+IF %1 > 3, movu mw2, [in3q + mmsize]
+%endif
+ LOAD_CONT tmp0q
+ add in0q, mmsize * (1 + V2)
+IF %1 > 1, add in1q, mmsize * (1 + V2)
+IF %1 > 2, add in2q, mmsize * (1 + V2)
+IF %1 > 3, add in3q, mmsize * (1 + V2)
+ CONTINUE tmp0q
+%endmacro
+
+%macro write_planar 1 ; elems
+op write_planar%1
+ movu [out0q], mx
+IF %1 > 1, movu [out1q], my
+IF %1 > 2, movu [out2q], mz
+IF %1 > 3, movu [out3q], mw
+%if V2
+ movu [out0q + mmsize], mx2
+IF %1 > 1, movu [out1q + mmsize], my2
+IF %1 > 2, movu [out2q + mmsize], mz2
+IF %1 > 3, movu [out3q + mmsize], mw2
+%endif
+ add out0q, mmsize * (1 + V2)
+IF %1 > 1, add out1q, mmsize * (1 + V2)
+IF %1 > 2, add out2q, mmsize * (1 + V2)
+IF %1 > 3, add out3q, mmsize * (1 + V2)
+ END_CHAIN
+%endmacro
+
+%macro read_packed2 1 ; depth
+op read%1_packed2
+ movu m8, [in0q + 0*mmsize]
+ movu m9, [in0q + 1*mmsize]
+ IF V2, movu m10, [in0q + 2*mmsize]
+ IF V2, movu m11, [in0q + 3*mmsize]
+IF %1 < 32, VBROADCASTI128 m12, [read%1_unpack2]
+ LOAD_CONT tmp0q
+ add in0q, mmsize * (2 + V2 * 2)
+%if %1 == 32
+ shufps m8, m8, q3120
+ shufps m9, m9, q3120
+ IF V2, shufps m10, m10, q3120
+ IF V2, shufps m11, m11, q3120
+%else
+ pshufb m8, m12 ; { X0 Y0 | X1 Y1 }
+ pshufb m9, m12 ; { X2 Y2 | X3 Y3 }
+ IF V2, pshufb m10, m12
+ IF V2, pshufb m11, m12
+%endif
+ unpcklpd mx, m8, m9 ; { X0 X2 | X1 X3 }
+ unpckhpd my, m8, m9 ; { Y0 Y2 | Y1 Y3 }
+ IF V2, unpcklpd mx2, m10, m11
+ IF V2, unpckhpd my2, m10, m11
+%if avx_enabled
+ vpermq mx, mx, q3120 ; { X0 X1 | X2 X3 }
+ vpermq my, my, q3120 ; { Y0 Y1 | Y2 Y3 }
+ IF V2, vpermq mx2, mx2, q3120
+ IF V2, vpermq my2, my2, q3120
+%endif
+ CONTINUE tmp0q
+%endmacro
+
+%macro write_packed2 1 ; depth
+op write%1_packed2
+IF %1 < 32, VBROADCASTI128 m12, [write%1_pack2]
+ LOAD_CONT tmp0q
+%if avx_enabled
+ vpermq mx, mx, q3120 ; { X0 X2 | X1 X3 }
+ vpermq my, my, q3120 ; { Y0 Y2 | Y1 Y3 }
+ IF V2, vpermq mx2, mx2, q3120
+ IF V2, vpermq my2, my2, q3120
+%endif
+ unpcklpd m8, mx, my ; { X0 Y0 | X1 Y1 }
+ unpckhpd m9, mx, my ; { X2 Y2 | X3 Y3 }
+ IF V2, unpcklpd m10, mx2, my2
+ IF V2, unpckhpd m11, mx2, my2
+%if %1 == 32
+ shufps m8, m8, q3120
+ shufps m9, m9, q3120
+ IF V2, shufps m10, m10, q3120
+ IF V2, shufps m11, m11, q3120
+%else
+ pshufb m8, m12
+ pshufb m9, m12
+ IF V2, pshufb m10, m12
+ IF V2, pshufb m11, m12
+%endif
+ movu [out0q + 0*mmsize], m8
+ movu [out0q + 1*mmsize], m9
+IF V2, movu [out0q + 2*mmsize], m10
+IF V2, movu [out0q + 3*mmsize], m11
+ add out0q, mmsize * (2 + V2 * 2)
+ END_CHAIN
+%endmacro
+
+%macro read_packed_inner 7 ; x, y, z, w, addr, num, depth
+ movu xm8, [%5 + 0 * %6]
+ movu xm9, [%5 + 4 * %6]
+ movu xm10, [%5 + 8 * %6]
+ movu xm11, [%5 + 12 * %6]
+ %if avx_enabled
+ vinserti128 m8, m8, [%5 + 16 * %6], 1
+ vinserti128 m9, m9, [%5 + 20 * %6], 1
+ vinserti128 m10, m10, [%5 + 24 * %6], 1
+ vinserti128 m11, m11, [%5 + 28 * %6], 1
+ %endif
+ %if %7 == 32
+ mova %1, m8
+ mova %2, m9
+ mova %3, m10
+ mova %4, m11
+ %else
+ pshufb %1, m8, m12 ; { X0 Y0 Z0 W0 | X4 Y4 Z4 W4 }
+ pshufb %2, m9, m12 ; { X1 Y1 Z1 W1 | X5 Y5 Z5 W5 }
+ pshufb %3, m10, m12 ; { X2 Y2 Z2 W2 | X6 Y6 Z6 W6 }
+ pshufb %4, m11, m12 ; { X3 Y3 Z3 W3 | X7 Y7 Z7 W7 }
+ %endif
+ punpckldq m8, %1, %2 ; { X0 X1 Y0 Y1 | X4 X5 Y4 Y5 }
+ punpckldq m9, %3, %4 ; { X2 X3 Y2 Y3 | X6 X7 Y6 Y7 }
+ punpckhdq m10, %1, %2 ; { Z0 Z1 W0 W1 | Z4 Z5 W4 W5 }
+ punpckhdq m11, %3, %4 ; { Z2 Z3 W2 W3 | Z6 Z7 W6 W7 }
+ punpcklqdq %1, m8, m9 ; { X0 X1 X2 X3 | X4 X5 X6 X7 }
+ punpckhqdq %2, m8, m9 ; { Y0 Y1 Y2 Y3 | Y4 Y5 Y6 Y7 }
+ punpcklqdq %3, m10, m11 ; { Z0 Z1 Z2 Z3 | Z4 Z5 Z6 Z7 }
+IF %6 > 3, punpckhqdq %4, m10, m11 ; { W0 W1 W2 W3 | W4 W5 W6 W7 }
+%endmacro
+
+%macro read_packed 2 ; num, depth
+op read%2_packed%1
+IF %2 < 32, VBROADCASTI128 m12, [read%2_unpack%1]
+ LOAD_CONT tmp0q
+ read_packed_inner mx, my, mz, mw, in0q, %1, %2
+IF1 V2, read_packed_inner mx2, my2, mz2, mw2, in0q + %1 * mmsize, %1, %2
+ add in0q, %1 * mmsize * (1 + V2)
+ CONTINUE tmp0q
+%endmacro
+
+%macro write_packed_inner 7 ; x, y, z, w, addr, num, depth
+ punpckldq m8, %1, %2 ; { X0 Y0 X1 Y1 | X4 Y4 X5 Y5 }
+ punpckldq m9, %3, %4 ; { Z0 W0 Z1 W1 | Z4 W4 Z5 W5 }
+ punpckhdq m10, %1, %2 ; { X2 Y2 X3 Y3 | X6 Y6 X7 Y7 }
+ punpckhdq m11, %3, %4 ; { Z2 W2 Z3 W3 | Z6 W6 Z7 W7 }
+ punpcklqdq %1, m8, m9 ; { X0 Y0 Z0 W0 | X4 Y4 Z4 W4 }
+ punpckhqdq %2, m8, m9 ; { X1 Y1 Z1 W1 | X5 Y5 Z5 W5 }
+ punpcklqdq %3, m10, m11 ; { X2 Y2 Z2 W2 | X6 Y6 Z6 W6 }
+ punpckhqdq %4, m10, m11 ; { X3 Y3 Z3 W3 | X7 Y7 Z7 W7 }
+ %if %7 == 32
+ mova m8, %1
+ mova m9, %2
+ mova m10, %3
+ mova m11, %4
+ %else
+ pshufb m8, %1, m12
+ pshufb m9, %2, m12
+ pshufb m10, %3, m12
+ pshufb m11, %4, m12
+ %endif
+ movu [%5 + 0*%6], xm8
+ movu [%5 + 4*%6], xm9
+ movu [%5 + 8*%6], xm10
+ movu [%5 + 12*%6], xm11
+ %if avx_enabled
+ vextracti128 [%5 + 16*%6], m8, 1
+ vextracti128 [%5 + 20*%6], m9, 1
+ vextracti128 [%5 + 24*%6], m10, 1
+ vextracti128 [%5 + 28*%6], m11, 1
+ %endif
+%endmacro
+
+%macro write_packed 2 ; num, depth
+op write%2_packed%1
+IF %2 < 32, VBROADCASTI128 m12, [write%2_pack%1]
+ write_packed_inner mx, my, mz, mw, out0q, %1, %2
+IF1 V2, write_packed_inner mx2, my2, mz2, mw2, out0q + %1 * mmsize, %1, %2
+ add out0q, %1 * mmsize * (1 + V2)
+ END_CHAIN
+%endmacro
+
+%macro rw_packed 1 ; depth
+ read_packed2 %1
+ read_packed 3, %1
+ read_packed 4, %1
+ write_packed2 %1
+ write_packed 3, %1
+ write_packed 4, %1
+%endmacro
+
+%macro read_nibbles 0
+op read_nibbles1
+%if avx_enabled
+ movu xmx, [in0q]
+IF V2, movu xmx2, [in0q + 16]
+%else
+ movq xmx, [in0q]
+IF V2, movq xmx2, [in0q + 8]
+%endif
+ VBROADCASTI128 m8, [nibble_mask]
+ LOAD_CONT tmp0q
+ add in0q, (mmsize >> 1) * (1 + V2)
+ pmovzxbw mx, xmx
+IF V2, pmovzxbw mx2, xmx2
+ psllw my, mx, 8
+IF V2, psllw my2, mx2, 8
+ psrlw mx, 4
+IF V2, psrlw mx2, 4
+ pand my, m8
+IF V2, pand my2, m8
+ por mx, my
+IF V2, por mx2, my2
+ CONTINUE tmp0q
+%endmacro
+
+%macro read_bits 0
+op read_bits1
+%if avx_enabled
+ vpbroadcastd mx, [in0q]
+IF V2, vpbroadcastd mx2, [in0q + 4]
+%else
+ movd mx, [in0q]
+IF V2, movd mx2, [in0q + 2]
+%endif
+ mova m8, [bits_shuf]
+ VBROADCASTI128 m9, [bits_mask]
+ VBROADCASTI128 m10, [ones_mask]
+ LOAD_CONT tmp0q
+ add in0q, (mmsize >> 3) * (1 + V2)
+ pshufb mx, m8
+IF V2, pshufb mx2, m8
+ pand mx, m9
+IF V2, pand mx2, m9
+ pcmpeqb mx, m9
+IF V2, pcmpeqb mx2, m9
+ pand mx, m10
+IF V2, pand mx2, m10
+ CONTINUE tmp0q
+%endmacro
+
+%macro write_bits 0
+op write_bits1
+ VBROADCASTI128 m8, [bits_reverse]
+ psllw mx, 7
+IF V2, psllw mx2, 7
+ pshufb mx, m8
+IF V2, pshufb mx2, m8
+ pmovmskb tmp0d, mx
+IF V2, pmovmskb tmp1d, mx2
+%if avx_enabled
+ mov [out0q], tmp0d
+IF V2, mov [out0q + 4], tmp1d
+%else
+ mov [out0q], tmp0d
+IF V2, mov [out0q + 2], tmp1d
+%endif
+ add out0q, (mmsize >> 3) * (1 + V2)
+ END_CHAIN
+%endmacro
+
+;---------------------------------------------------------
+; Generic byte order shuffle (packed swizzle, endian, etc)
+
+%macro shuffle 0
+op shuffle
+ VBROADCASTI128 m8, [implq + SwsOpImpl.priv]
+ LOAD_CONT tmp0q
+IF X, pshufb mx, m8
+IF Y, pshufb my, m8
+IF Z, pshufb mz, m8
+IF W, pshufb mw, m8
+%if V2
+IF X, pshufb mx2, m8
+IF Y, pshufb my2, m8
+IF Z, pshufb mz2, m8
+IF W, pshufb mw2, m8
+%endif
+ CONTINUE tmp0q
+%endmacro
+
+;---------------------------------------------------------
+; Clearing
+
+%macro clear_alpha 3 ; idx, vreg, vreg2
+op clear_alpha%1
+ LOAD_CONT tmp0q
+ pcmpeqb %2, %2
+IF V2, mova %3, %2
+ CONTINUE tmp0q
+%endmacro
+
+%macro clear_zero 3 ; idx, vreg, vreg2
+op clear_zero%1
+ LOAD_CONT tmp0q
+ pxor %2, %2
+IF V2, mova %3, %2
+ CONTINUE tmp0q
+%endmacro
+
+%macro clear_generic 0
+op clear
+ LOAD_CONT tmp0q
+%if avx_enabled
+ IF !X, vpbroadcastd mx, [implq + SwsOpImpl.priv + 0]
+ IF !Y, vpbroadcastd my, [implq + SwsOpImpl.priv + 4]
+ IF !Z, vpbroadcastd mz, [implq + SwsOpImpl.priv + 8]
+ IF !W, vpbroadcastd mw, [implq + SwsOpImpl.priv + 12]
+%else ; !avx_enabled
+ IF !X, movd mx, [implq + SwsOpImpl.priv + 0]
+ IF !Y, movd my, [implq + SwsOpImpl.priv + 4]
+ IF !Z, movd mz, [implq + SwsOpImpl.priv + 8]
+ IF !W, movd mw, [implq + SwsOpImpl.priv + 12]
+ IF !X, pshufd mx, mx, 0
+ IF !Y, pshufd my, my, 0
+ IF !Z, pshufd mz, mz, 0
+ IF !W, pshufd mw, mw, 0
+%endif
+%if V2
+ IF !X, mova mx2, mx
+ IF !Y, mova my2, my
+ IF !Z, mova mz2, mz
+ IF !W, mova mw2, mw
+%endif
+ CONTINUE tmp0q
+%endmacro
+
+%macro clear_funcs 0
+ decl_pattern 1, 1, 1, 0, clear_generic
+ decl_pattern 0, 1, 1, 1, clear_generic
+ decl_pattern 0, 0, 1, 1, clear_generic
+ decl_pattern 1, 0, 0, 1, clear_generic
+ decl_pattern 1, 1, 0, 0, clear_generic
+ decl_pattern 0, 1, 0, 1, clear_generic
+ decl_pattern 1, 0, 1, 0, clear_generic
+ decl_pattern 1, 0, 0, 0, clear_generic
+ decl_pattern 0, 1, 0, 0, clear_generic
+ decl_pattern 0, 0, 1, 0, clear_generic
+%endmacro
+
+;---------------------------------------------------------
+; Swizzling and duplicating
+
+; mA := mB, mB := mC, ... mX := mA
+%macro vrotate 2-* ; A, B, C, ...
+ %rep %0
+ %assign rot_a %1 + 4
+ %assign rot_b %2 + 4
+ mova m%1, m%2
+ IF V2, mova m%[rot_a], m%[rot_b]
+ %rotate 1
+ %endrep
+ %undef rot_a
+ %undef rot_b
+%endmacro
+
+%macro swizzle_funcs 0
+op swizzle_3012
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 3, 2, 1
+ CONTINUE tmp0q
+
+op swizzle_3021
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 3, 1
+ CONTINUE tmp0q
+
+op swizzle_2103
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 2
+ CONTINUE tmp0q
+
+op swizzle_3210
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 3
+ vrotate 8, 1, 2
+ CONTINUE tmp0q
+
+op swizzle_3102
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 3, 2
+ CONTINUE tmp0q
+
+op swizzle_3201
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 3, 1, 2
+ CONTINUE tmp0q
+
+op swizzle_1203
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 1, 2
+ CONTINUE tmp0q
+
+op swizzle_1023
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 1
+ CONTINUE tmp0q
+
+op swizzle_2013
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 2, 1
+ CONTINUE tmp0q
+
+op swizzle_2310
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 2, 1, 3
+ CONTINUE tmp0q
+
+op swizzle_2130
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 2, 3
+ CONTINUE tmp0q
+
+op swizzle_1230
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 1, 2, 3
+ CONTINUE tmp0q
+
+op swizzle_1320
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 1, 3
+ CONTINUE tmp0q
+
+op swizzle_0213
+ LOAD_CONT tmp0q
+ vrotate 8, 1, 2
+ CONTINUE tmp0q
+
+op swizzle_0231
+ LOAD_CONT tmp0q
+ vrotate 8, 1, 2, 3
+ CONTINUE tmp0q
+
+op swizzle_0312
+ LOAD_CONT tmp0q
+ vrotate 8, 1, 3, 2
+ CONTINUE tmp0q
+
+op swizzle_3120
+ LOAD_CONT tmp0q
+ vrotate 8, 0, 3
+ CONTINUE tmp0q
+
+op swizzle_0321
+ LOAD_CONT tmp0q
+ vrotate 8, 1, 3
+ CONTINUE tmp0q
+
+op swizzle_0003
+ LOAD_CONT tmp0q
+ mova my, mx
+ mova mz, mx
+%if V2
+ mova my2, mx2
+ mova mz2, mx2
+%endif
+ CONTINUE tmp0q
+
+op swizzle_0001
+ LOAD_CONT tmp0q
+ mova mw, my
+ mova mz, mx
+ mova my, mx
+%if V2
+ mova mw2, my2
+ mova mz2, mx2
+ mova my2, mx2
+%endif
+ CONTINUE tmp0q
+
+op swizzle_3000
+ LOAD_CONT tmp0q
+ mova my, mx
+ mova mz, mx
+ mova mx, mw
+ mova mw, my
+%if V2
+ mova my2, mx2
+ mova mz2, mx2
+ mova mx2, mw2
+ mova mw2, my2
+%endif
+ CONTINUE tmp0q
+
+op swizzle_1000
+ LOAD_CONT tmp0q
+ mova mz, mx
+ mova mw, mx
+ mova mx, my
+ mova my, mz
+%if V2
+ mova mz2, mx2
+ mova mw2, mx2
+ mova mx2, my2
+ mova my2, mz2
+%endif
+ CONTINUE tmp0q
+%endmacro
+
+%macro packed_shuffle 2-3 ; size_in, size_out, shift
+cglobal packed_shuffle%1_%2, 3, 5, 2, exec, shuffle, blocks, src, dst
+ mov srcq, [execq + SwsOpExec.in0]
+ mov dstq, [execq + SwsOpExec.out0]
+ VBROADCASTI128 m1, [shuffleq]
+ %ifnum %3
+ shl blocksd, %3
+ %else
+ imul blocksd, %2
+ %endif
+IF %1==%2, add srcq, blocksq
+ add dstq, blocksq
+ neg blocksq
+.loop:
+ %if %1 == %2
+ MOVSZ %1, m0, [srcq + blocksq]
+ %else
+ MOVSZ %1, m0, [srcq]
+ %endif
+ pshufb m0, m1
+ movu [dstq + blocksq], m0
+IF %1!=%2, add srcq, %1
+ add blocksq, %2
+ jl .loop
+ RET
+%endmacro
+
+;---------------------------------------------------------
+; Pixel type conversions
+
+%macro conv8to16 1 ; type
+op %1_U8_U16
+ LOAD_CONT tmp0q
+%if V2
+ %if avx_enabled
+ IF X, vextracti128 xmx2, mx, 1
+ IF Y, vextracti128 xmy2, my, 1
+ IF Z, vextracti128 xmz2, mz, 1
+ IF W, vextracti128 xmw2, mw, 1
+ %else
+ IF X, psrldq xmx2, mx, 8
+ IF Y, psrldq xmy2, my, 8
+ IF Z, psrldq xmz2, mz, 8
+ IF W, psrldq xmw2, mw, 8
+ %endif
+ IF X, pmovzxbw mx2, xmx2
+ IF Y, pmovzxbw my2, xmy2
+ IF Z, pmovzxbw mz2, xmz2
+ IF W, pmovzxbw mw2, xmw2
+%endif ; V2
+ IF X, pmovzxbw mx, xmx
+ IF Y, pmovzxbw my, xmy
+ IF Z, pmovzxbw mz, xmz
+ IF W, pmovzxbw mw, xmw
+
+%ifidn %1, expand
+ VBROADCASTI128 m8, [expand16_shuf]
+ %if V2
+ IF X, pshufb mx2, m8
+ IF Y, pshufb my2, m8
+ IF Z, pshufb mz2, m8
+ IF W, pshufb mw2, m8
+ %endif
+ IF X, pshufb mx, m8
+ IF Y, pshufb my, m8
+ IF Z, pshufb mz, m8
+ IF W, pshufb mw, m8
+%endif ; expand
+ CONTINUE tmp0q
+%endmacro
+
+%macro conv16to8 0
+op convert_U16_U8
+ LOAD_CONT tmp0q
+%if V2
+ ; this code technically works for the !V2 case as well, but slower
+IF X, packuswb mx, mx2
+IF Y, packuswb my, my2
+IF Z, packuswb mz, mz2
+IF W, packuswb mw, mw2
+IF X, vpermq mx, mx, q3120
+IF Y, vpermq my, my, q3120
+IF Z, vpermq mz, mz, q3120
+IF W, vpermq mw, mw, q3120
+%else
+IF X, vextracti128 xm8, mx, 1
+IF Y, vextracti128 xm9, my, 1
+IF Z, vextracti128 xm10, mz, 1
+IF W, vextracti128 xm11, mw, 1
+IF X, packuswb xmx, xm8
+IF Y, packuswb xmy, xm9
+IF Z, packuswb xmz, xm10
+IF W, packuswb xmw, xm11
+%endif
+ CONTINUE tmp0q
+%endmacro
+
+%macro conv8to32 1 ; type
+op %1_U8_U32
+ LOAD_CONT tmp0q
+IF X, psrldq xmx2, xmx, 8
+IF Y, psrldq xmy2, xmy, 8
+IF Z, psrldq xmz2, xmz, 8
+IF W, psrldq xmw2, xmw, 8
+IF X, pmovzxbd mx, xmx
+IF Y, pmovzxbd my, xmy
+IF Z, pmovzxbd mz, xmz
+IF W, pmovzxbd mw, xmw
+IF X, pmovzxbd mx2, xmx2
+IF Y, pmovzxbd my2, xmy2
+IF Z, pmovzxbd mz2, xmz2
+IF W, pmovzxbd mw2, xmw2
+%ifidn %1, expand
+ VBROADCASTI128 m8, [expand32_shuf]
+IF X, pshufb mx, m8
+IF Y, pshufb my, m8
+IF Z, pshufb mz, m8
+IF W, pshufb mw, m8
+IF X, pshufb mx2, m8
+IF Y, pshufb my2, m8
+IF Z, pshufb mz2, m8
+IF W, pshufb mw2, m8
+%endif ; expand
+ CONTINUE tmp0q
+%endmacro
+
+%macro conv32to8 0
+op convert_U32_U8
+ LOAD_CONT tmp0q
+IF X, packusdw mx, mx2
+IF Y, packusdw my, my2
+IF Z, packusdw mz, mz2
+IF W, packusdw mw, mw2
+IF X, vextracti128 xmx2, mx, 1
+IF Y, vextracti128 xmy2, my, 1
+IF Z, vextracti128 xmz2, mz, 1
+IF W, vextracti128 xmw2, mw, 1
+IF X, packuswb xmx, xmx2
+IF Y, packuswb xmy, xmy2
+IF Z, packuswb xmz, xmz2
+IF W, packuswb xmw, xmw2
+IF X, vpshufd xmx, xmx, q3120
+IF Y, vpshufd xmy, xmy, q3120
+IF Z, vpshufd xmz, xmz, q3120
+IF W, vpshufd xmw, xmw, q3120
+ CONTINUE tmp0q
+%endmacro
+
+%macro conv16to32 0
+op convert_U16_U32
+ LOAD_CONT tmp0q
+IF X, vextracti128 xmx2, mx, 1
+IF Y, vextracti128 xmy2, my, 1
+IF Z, vextracti128 xmz2, mz, 1
+IF W, vextracti128 xmw2, mw, 1
+IF X, pmovzxwd mx, xmx
+IF Y, pmovzxwd my, xmy
+IF Z, pmovzxwd mz, xmz
+IF W, pmovzxwd mw, xmw
+IF X, pmovzxwd mx2, xmx2
+IF Y, pmovzxwd my2, xmy2
+IF Z, pmovzxwd mz2, xmz2
+IF W, pmovzxwd mw2, xmw2
+ CONTINUE tmp0q
+%endmacro
+
+%macro conv32to16 0
+op convert_U32_U16
+ LOAD_CONT tmp0q
+IF X, packusdw mx, mx2
+IF Y, packusdw my, my2
+IF Z, packusdw mz, mz2
+IF W, packusdw mw, mw2
+IF X, vpermq mx, mx, q3120
+IF Y, vpermq my, my, q3120
+IF Z, vpermq mz, mz, q3120
+IF W, vpermq mw, mw, q3120
+ CONTINUE tmp0q
+%endmacro
+
+;---------------------------------------------------------
+; Shifting
+
+%macro lshift16 0
+op lshift16
+ vmovq xm8, [implq + SwsOpImpl.priv]
+ LOAD_CONT tmp0q
+IF X, psllw mx, xm8
+IF Y, psllw my, xm8
+IF Z, psllw mz, xm8
+IF W, psllw mw, xm8
+%if V2
+IF X, psllw mx2, xm8
+IF Y, psllw my2, xm8
+IF Z, psllw mz2, xm8
+IF W, psllw mw2, xm8
+%endif
+ CONTINUE tmp0q
+%endmacro
+
+%macro rshift16 0
+op rshift16
+ vmovq xm8, [implq + SwsOpImpl.priv]
+ LOAD_CONT tmp0q
+IF X, psrlw mx, xm8
+IF Y, psrlw my, xm8
+IF Z, psrlw mz, xm8
+IF W, psrlw mw, xm8
+%if V2
+IF X, psrlw mx2, xm8
+IF Y, psrlw my2, xm8
+IF Z, psrlw mz2, xm8
+IF W, psrlw mw2, xm8
+%endif
+ CONTINUE tmp0q
+%endmacro
+
+;---------------------------------------------------------
+; Function instantiations
+
+%macro funcs_u8 0
+ read_planar 1
+ read_planar 2
+ read_planar 3
+ read_planar 4
+ write_planar 1
+ write_planar 2
+ write_planar 3
+ write_planar 4
+
+ rw_packed 8
+ read_nibbles
+ read_bits
+ write_bits
+
+ clear_alpha 0, mx, mx2
+ clear_alpha 1, my, my2
+ clear_alpha 3, mw, mw2
+ clear_zero 0, mx, mx2
+ clear_zero 1, my, my2
+ clear_zero 3, mw, mw2
+ clear_funcs
+ swizzle_funcs
+
+ decl_common_patterns shuffle
+%endmacro
+
+%macro funcs_u16 0
+ rw_packed 16
+ decl_common_patterns conv8to16 convert
+ decl_common_patterns conv8to16 expand
+ decl_common_patterns conv16to8
+ decl_common_patterns lshift16
+ decl_common_patterns rshift16
+%endmacro
+
+INIT_XMM sse4
+decl_v2 0, funcs_u8
+decl_v2 1, funcs_u8
+
+packed_shuffle 5, 15 ; 8 -> 24
+packed_shuffle 4, 16, 4 ; 8 -> 32, 16 -> 64
+packed_shuffle 2, 12 ; 8 -> 48
+packed_shuffle 10, 15 ; 16 -> 24
+packed_shuffle 8, 16, 4 ; 16 -> 32, 32 -> 64
+packed_shuffle 4, 12 ; 16 -> 48
+packed_shuffle 15, 15 ; 24 -> 24
+packed_shuffle 12, 16, 4 ; 24 -> 32
+packed_shuffle 6, 12 ; 24 -> 48
+packed_shuffle 16, 12 ; 32 -> 24, 64 -> 48
+packed_shuffle 16, 16, 4 ; 32 -> 32, 64 -> 64
+packed_shuffle 8, 12 ; 32 -> 48
+packed_shuffle 12, 12 ; 48 -> 48
+
+INIT_YMM avx2
+decl_v2 0, funcs_u8
+decl_v2 1, funcs_u8
+decl_v2 0, funcs_u16
+decl_v2 1, funcs_u16
+
+packed_shuffle 32, 32
+
+INIT_YMM avx2
+decl_v2 1, rw_packed 32
+decl_common_patterns conv8to32 convert
+decl_common_patterns conv8to32 expand
+decl_common_patterns conv32to8
+decl_common_patterns conv16to32
+decl_common_patterns conv32to16
--
2.49.0