[FFmpeg-devel] [PATCH v2 5/7] swscale/la: Optimize the functions of the swscale series with lsx.
Hao Chen
chenhao at loongson.cn
Wed May 17 10:03:49 EEST 2023
From: Lu Wang <wanglu at loongson.cn>
./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 \
       -pix_fmt bgra -y /dev/null -an
before: 91fps
after: 160fps
---
libswscale/loongarch/Makefile | 5 +
libswscale/loongarch/input.S | 285 +++
libswscale/loongarch/output.S | 138 ++
libswscale/loongarch/output_lasx.c | 4 +-
libswscale/loongarch/output_lsx.c | 1828 ++++++++++++++++
libswscale/loongarch/swscale.S | 1868 +++++++++++++++++
libswscale/loongarch/swscale_init_loongarch.c | 32 +-
libswscale/loongarch/swscale_loongarch.h | 43 +-
libswscale/loongarch/swscale_lsx.c | 57 +
libswscale/utils.c | 3 +-
10 files changed, 4256 insertions(+), 7 deletions(-)
create mode 100644 libswscale/loongarch/input.S
create mode 100644 libswscale/loongarch/output.S
create mode 100644 libswscale/loongarch/output_lsx.c
create mode 100644 libswscale/loongarch/swscale.S
create mode 100644 libswscale/loongarch/swscale_lsx.c
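
For context while reviewing, here is a rough scalar sketch of what two of the new
routines compute, following the generic C paths in libswscale/input.c and
libswscale/output.c. The _ref names are illustrative only, and the constants shown
are the immediates loaded in input.S/output.S (0x801 << 8 == 524544, shift 9;
accumulate, >> 19 and clip in the planeX path). This sketch is not part of the patch.

/* Illustrative scalar references (assumed equivalents, not part of the patch). */
#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

static void planar_rgb_to_y_ref(uint8_t *_dst, const uint8_t *src[4],
                                int width, int32_t *rgb2yuv)
{
    uint16_t *dst = (uint16_t *) _dst;
    /* rgb2yuv[0..2] are the ry/gy/by coefficients read at offsets 0/4/8
     * in planar_rgb_to_y_lsx. */
    int32_t ry = rgb2yuv[0], gy = rgb2yuv[1], by = rgb2yuv[2];
    for (int i = 0; i < width; i++) {
        int g = src[0][i], b = src[1][i], r = src[2][i];
        /* 0x801 << 8 == 524544 and the final shift of 9 match the
         * immediates used in the assembly. */
        dst[i] = (ry * r + gy * g + by * b + (0x801 << 8)) >> 9;
    }
}

static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        /* ff_yuv2planeX_8_lsx performs the same accumulate, >> 19 and
         * clip on 8 pixels per iteration. */
        dest[i] = av_clip_uint8(val >> 19);
    }
}
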
diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index 8e665e826c..c0b6a449c0 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -4,3 +4,8 @@ LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
loongarch/yuv2rgb_lasx.o \
loongarch/rgb2rgb_lasx.o \
loongarch/output_lasx.o
+LSX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale.o \
+ loongarch/swscale_lsx.o \
+ loongarch/input.o \
+ loongarch/output.o \
+ loongarch/output_lsx.o
diff --git a/libswscale/loongarch/input.S b/libswscale/loongarch/input.S
new file mode 100644
index 0000000000..d01f7384b1
--- /dev/null
+++ b/libswscale/loongarch/input.S
@@ -0,0 +1,285 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
+ * int width, int32_t *rgb2yuv)
+ */
+function planar_rgb_to_y_lsx
+ ld.d a5, a1, 0
+ ld.d a6, a1, 8
+ ld.d a7, a1, 16
+
+ ld.w t1, a3, 0 // ry
+ ld.w t2, a3, 4 // gy
+ ld.w t3, a3, 8 // by
+ li.w t4, 9
+ li.w t5, 524544
+ li.w t7, 4
+ li.w t8, 8
+ vldi vr7, 0
+ vreplgr2vr.w vr1, t1
+ vreplgr2vr.w vr2, t2
+ vreplgr2vr.w vr3, t3
+ vreplgr2vr.w vr4, t4
+ vreplgr2vr.w vr5, t5
+ bge a2, t8, .WIDTH8
+ bge a2, t7, .WIDTH4
+ blt zero, a2, .WIDTH
+ b .END
+
+.WIDTH8:
+ vld vr8, a5, 0
+ vld vr9, a6, 0
+ vld vr10, a7, 0
+ vilvl.b vr11, vr7, vr8
+ vilvl.b vr12, vr7, vr9
+ vilvl.b vr13, vr7, vr10
+ vilvl.h vr14, vr7, vr11
+ vilvl.h vr15, vr7, vr12
+ vilvl.h vr16, vr7, vr13
+ vilvh.h vr17, vr7, vr11
+ vilvh.h vr18, vr7, vr12
+ vilvh.h vr19, vr7, vr13
+ vmul.w vr20, vr1, vr16
+ vmul.w vr21, vr1, vr19
+ vmadd.w vr20, vr2, vr14
+ vmadd.w vr20, vr3, vr15
+ vmadd.w vr21, vr2, vr17
+ vmadd.w vr21, vr3, vr18
+ vadd.w vr20, vr20, vr5
+ vadd.w vr21, vr21, vr5
+ vsra.w vr20, vr20, vr4
+ vsra.w vr21, vr21, vr4
+ vpickev.h vr20, vr21, vr20
+ vst vr20, a0, 0
+ addi.d a2, a2, -8
+ addi.d a5, a5, 8
+ addi.d a6, a6, 8
+ addi.d a7, a7, 8
+ addi.d a0, a0, 16
+ bge a2, t8, .WIDTH8
+ bge a2, t7, .WIDTH4
+ blt zero, a2, .WIDTH
+ b .END
+
+.WIDTH4:
+ vld vr8, a5, 0
+ vld vr9, a6, 0
+ vld vr10, a7, 0
+ vilvl.b vr11, vr7, vr8
+ vilvl.b vr12, vr7, vr9
+ vilvl.b vr13, vr7, vr10
+ vilvl.h vr14, vr7, vr11
+ vilvl.h vr15, vr7, vr12
+ vilvl.h vr16, vr7, vr13
+ vmul.w vr17, vr1, vr16
+ vmadd.w vr17, vr2, vr14
+ vmadd.w vr17, vr3, vr15
+ vadd.w vr17, vr17, vr5
+ vsra.w vr17, vr17, vr4
+ vpickev.h vr17, vr17, vr17
+ vstelm.d vr17, a0, 0, 0
+ addi.d a2, a2, -4
+ addi.d a5, a5, 4
+ addi.d a6, a6, 4
+ addi.d a7, a7, 4
+ addi.d a0, a0, 8
+ bge a2, t7, .WIDTH4
+ blt zero, a2, .WIDTH
+ b .END
+
+.WIDTH:
+ ld.bu t0, a5, 0
+ ld.bu t4, a6, 0
+ ld.bu t6, a7, 0
+ mul.w t8, t6, t1
+ mul.w t7, t0, t2
+ add.w t8, t8, t7
+ mul.w t7, t4, t3
+ add.w t8, t8, t7
+ add.w t8, t8, t5
+ srai.w t8, t8, 9
+ st.h t8, a0, 0
+ addi.d a2, a2, -1
+ addi.d a5, a5, 1
+ addi.d a6, a6, 1
+ addi.d a7, a7, 1
+ addi.d a0, a0, 2
+ blt zero, a2, .WIDTH
+.END:
+endfunc
+
+/* void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+ * int width, int32_t *rgb2yuv)
+ */
+function planar_rgb_to_uv_lsx
+ addi.d sp, sp, -24
+ st.d s1, sp, 0
+ st.d s2, sp, 8
+ st.d s3, sp, 16
+
+ ld.d a5, a2, 0
+ ld.d a6, a2, 8
+ ld.d a7, a2, 16
+ ld.w t1, a4, 12 // ru
+ ld.w t2, a4, 16 // gu
+ ld.w t3, a4, 20 // bu
+ ld.w s1, a4, 24 // rv
+ ld.w s2, a4, 28 // gv
+ ld.w s3, a4, 32 // bv
+ li.w t4, 9
+ li.w t5, 4194560
+ li.w t7, 4
+ li.w t8, 8
+ vldi vr0, 0
+ vreplgr2vr.w vr1, t1
+ vreplgr2vr.w vr2, t2
+ vreplgr2vr.w vr3, t3
+ vreplgr2vr.w vr4, s1
+ vreplgr2vr.w vr5, s2
+ vreplgr2vr.w vr6, s3
+ vreplgr2vr.w vr7, t4
+ vreplgr2vr.w vr8, t5
+ bge a3, t8, .LOOP_WIDTH8
+ bge a3, t7, .LOOP_WIDTH4
+ blt zero, a3, .LOOP_WIDTH
+ b .LOOP_END
+
+.LOOP_WIDTH8:
+ vld vr9, a5, 0
+ vld vr10, a6, 0
+ vld vr11, a7, 0
+ vilvl.b vr9, vr0, vr9
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr0, vr11
+ vilvl.h vr12, vr0, vr9
+ vilvl.h vr13, vr0, vr10
+ vilvl.h vr14, vr0, vr11
+ vilvh.h vr15, vr0, vr9
+ vilvh.h vr16, vr0, vr10
+ vilvh.h vr17, vr0, vr11
+ vmul.w vr18, vr1, vr14
+ vmul.w vr19, vr1, vr17
+ vmul.w vr20, vr4, vr14
+ vmul.w vr21, vr4, vr17
+ vmadd.w vr18, vr2, vr12
+ vmadd.w vr18, vr3, vr13
+ vmadd.w vr19, vr2, vr15
+ vmadd.w vr19, vr3, vr16
+ vmadd.w vr20, vr5, vr12
+ vmadd.w vr20, vr6, vr13
+ vmadd.w vr21, vr5, vr15
+ vmadd.w vr21, vr6, vr16
+ vadd.w vr18, vr18, vr8
+ vadd.w vr19, vr19, vr8
+ vadd.w vr20, vr20, vr8
+ vadd.w vr21, vr21, vr8
+ vsra.w vr18, vr18, vr7
+ vsra.w vr19, vr19, vr7
+ vsra.w vr20, vr20, vr7
+ vsra.w vr21, vr21, vr7
+ vpickev.h vr18, vr19, vr18
+ vpickev.h vr20, vr21, vr20
+ vst vr18, a0, 0
+ vst vr20, a1, 0
+ addi.d a3, a3, -8
+ addi.d a5, a5, 8
+ addi.d a6, a6, 8
+ addi.d a7, a7, 8
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+ bge a3, t8, .LOOP_WIDTH8
+ bge a3, t7, .LOOP_WIDTH4
+ blt zero, a3, .LOOP_WIDTH
+ b .LOOP_END
+
+.LOOP_WIDTH4:
+ vld vr9, a5, 0
+ vld vr10, a6, 0
+ vld vr11, a7, 0
+ vilvl.b vr9, vr0, vr9
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr0, vr11
+ vilvl.h vr12, vr0, vr9
+ vilvl.h vr13, vr0, vr10
+ vilvl.h vr14, vr0, vr11
+ vmul.w vr18, vr1, vr14
+ vmul.w vr19, vr4, vr14
+ vmadd.w vr18, vr2, vr12
+ vmadd.w vr18, vr3, vr13
+ vmadd.w vr19, vr5, vr12
+ vmadd.w vr19, vr6, vr13
+ vadd.w vr18, vr18, vr8
+ vadd.w vr19, vr19, vr8
+ vsra.w vr18, vr18, vr7
+ vsra.w vr19, vr19, vr7
+ vpickev.h vr18, vr18, vr18
+ vpickev.h vr19, vr19, vr19
+ vstelm.d vr18, a0, 0, 0
+ vstelm.d vr19, a1, 0, 0
+ addi.d a3, a3, -4
+ addi.d a5, a5, 4
+ addi.d a6, a6, 4
+ addi.d a7, a7, 4
+ addi.d a0, a0, 8
+ addi.d a1, a1, 8
+ bge a3, t7, .LOOP_WIDTH4
+ blt zero, a3, .LOOP_WIDTH
+ b .LOOP_END
+
+.LOOP_WIDTH:
+ ld.bu t0, a5, 0
+ ld.bu t4, a6, 0
+ ld.bu t6, a7, 0
+ mul.w t8, t6, t1
+ mul.w t7, t0, t2
+ add.w t8, t8, t7
+ mul.w t7, t4, t3
+ add.w t8, t8, t7
+ add.w t8, t8, t5
+ srai.w t8, t8, 9
+ st.h t8, a0, 0
+ mul.w t8, t6, s1
+ mul.w t7, t0, s2
+ add.w t8, t8, t7
+ mul.w t7, t4, s3
+ add.w t8, t8, t7
+ add.w t8, t8, t5
+ srai.w t8, t8, 9
+ st.h t8, a1, 0
+ addi.d a3, a3, -1
+ addi.d a5, a5, 1
+ addi.d a6, a6, 1
+ addi.d a7, a7, 1
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ blt zero, a3, .LOOP_WIDTH
+
+.LOOP_END:
+ ld.d s1, sp, 0
+ ld.d s2, sp, 8
+ ld.d s3, sp, 16
+ addi.d sp, sp, 24
+endfunc
diff --git a/libswscale/loongarch/output.S b/libswscale/loongarch/output.S
new file mode 100644
index 0000000000..b44bac502a
--- /dev/null
+++ b/libswscale/loongarch/output.S
@@ -0,0 +1,138 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+ * const int16_t **src, uint8_t *dest, int dstW,
+ * const uint8_t *dither, int offset)
+ */
+function ff_yuv2planeX_8_lsx
+ addi.w t1, a6, 1
+ addi.w t2, a6, 2
+ addi.w t3, a6, 3
+ addi.w t4, a6, 4
+ addi.w t5, a6, 5
+ addi.w t6, a6, 6
+ addi.w t7, a6, 7
+ andi t0, a6, 7
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a5, t0
+ ldx.bu t1, a5, t1
+ ldx.bu t2, a5, t2
+ ldx.bu t3, a5, t3
+ ldx.bu t4, a5, t4
+ ldx.bu t5, a5, t5
+ ldx.bu t6, a5, t6
+ ldx.bu t7, a5, t7
+ vreplgr2vr.w vr0, t0
+ vreplgr2vr.w vr1, t1
+ vreplgr2vr.w vr2, t2
+ vreplgr2vr.w vr3, t3
+ vreplgr2vr.w vr4, t4
+ vreplgr2vr.w vr5, t5
+ vreplgr2vr.w vr6, t6
+ vreplgr2vr.w vr7, t7
+ vilvl.w vr0, vr2, vr0
+ vilvl.w vr4, vr6, vr4
+ vilvl.w vr1, vr3, vr1
+ vilvl.w vr5, vr7, vr5
+ vilvl.d vr12, vr4, vr0
+ vilvl.d vr13, vr5, vr1
+ li.w t5, 0
+ li.w t8, 8
+ bge a4, t8, .WIDTH8
+ blt zero, a4, .WIDTH
+ b .END
+
+.WIDTH8:
+ li.d t1, 0
+ li.d t4, 0
+ vslli.w vr2, vr12, 12
+ vslli.w vr3, vr13, 12
+ move t3, a0
+
+.FILTERSIZE8:
+ ldx.d t2, a2, t1
+ vldx vr4, t2, t5
+ vldrepl.h vr5, t3, 0
+ vmaddwev.w.h vr2, vr4, vr5
+ vmaddwod.w.h vr3, vr4, vr5
+ addi.d t1, t1, 8
+ addi.d t3, t3, 2
+ addi.d t4, t4, 1
+ blt t4, a1, .FILTERSIZE8
+ vsrai.w vr2, vr2, 19
+ vsrai.w vr3, vr3, 19
+ vclip255.w vr2, vr2
+ vclip255.w vr3, vr3
+ vpickev.h vr2, vr3, vr2
+ vpickev.b vr2, vr2, vr2
+ vbsrl.v vr3, vr2, 4
+ vilvl.b vr2, vr3, vr2
+ fst.d f2, a3, 0
+ addi.d t5, t5, 16
+ addi.d a4, a4, -8
+ addi.d a3, a3, 8
+ bge a4, t8, .WIDTH8
+ blt zero, a4, .WIDTH
+ b .END
+
+.WIDTH:
+ li.d t1, 0
+ li.d t4, 0
+ vslli.w vr2, vr12, 12
+ vslli.w vr3, vr13, 12
+.FILTERSIZE:
+ ldx.d t2, a2, t1
+ vldx vr4, t2, t5
+ vldrepl.h vr5, a0, 0
+ vmaddwev.w.h vr2, vr4, vr5
+ vmaddwod.w.h vr3, vr4, vr5
+ addi.d t1, t1, 8
+ addi.d a0, a0, 2
+ addi.d t4, t4, 1
+ blt t4, a1, .FILTERSIZE
+ vsrai.w vr2, vr2, 19
+ vsrai.w vr3, vr3, 19
+ vclip255.w vr2, vr2
+ vclip255.w vr3, vr3
+ vpickev.h vr2, vr3, vr2
+ vpickev.b vr2, vr2, vr2
+ vbsrl.v vr3, vr2, 4
+ vilvl.b vr2, vr3, vr2
+
+.DEST:
+ vstelm.b vr2, a3, 0, 0
+ vbsrl.v vr2, vr2, 1
+ addi.d a4, a4, -1
+ addi.d a3, a3, 1
+ blt zero, a4, .DEST
+.END:
+endfunc
diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
index 36a4c4503b..277d7063e6 100644
--- a/libswscale/loongarch/output_lasx.c
+++ b/libswscale/loongarch/output_lasx.c
@@ -1773,11 +1773,9 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
-#undef yuvTorgb
-#undef yuvTorgb_setup
-av_cold void ff_sws_init_output_loongarch(SwsContext *c)
+av_cold void ff_sws_init_output_lasx(SwsContext *c)
{
if(c->flags & SWS_FULL_CHR_H_INT) {
diff --git a/libswscale/loongarch/output_lsx.c b/libswscale/loongarch/output_lsx.c
new file mode 100644
index 0000000000..768cc3abc6
--- /dev/null
+++ b/libswscale/loongarch/output_lsx.c
@@ -0,0 +1,1828 @@
+/*
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+
+/* Copied from libswscale/output.c */
+static av_always_inline void
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
+ unsigned A1, unsigned A2,
+ const void *_r, const void *_g, const void *_b, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
+ target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
+ uint32_t *dest = (uint32_t *) _dest;
+ const uint32_t *r = (const uint32_t *) _r;
+ const uint32_t *g = (const uint32_t *) _g;
+ const uint32_t *b = (const uint32_t *) _b;
+
+#if CONFIG_SMALL
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#else
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
+ int sh = (target == AV_PIX_FMT_RGB32_1 ||
+ target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
+ av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
+#endif
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#endif
+ } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
+ uint8_t *dest = (uint8_t *) _dest;
+ const uint8_t *r = (const uint8_t *) _r;
+ const uint8_t *g = (const uint8_t *) _g;
+ const uint8_t *b = (const uint8_t *) _b;
+
+#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
+#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
+
+ dest[i * 6 + 0] = r_b[Y1];
+ dest[i * 6 + 1] = g[Y1];
+ dest[i * 6 + 2] = b_r[Y1];
+ dest[i * 6 + 3] = r_b[Y2];
+ dest[i * 6 + 4] = g[Y2];
+ dest[i * 6 + 5] = b_r[Y2];
+#undef r_b
+#undef b_r
+ } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
+ target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
+ target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
+ uint16_t *dest = (uint16_t *) _dest;
+ const uint16_t *r = (const uint16_t *) _r;
+ const uint16_t *g = (const uint16_t *) _g;
+ const uint16_t *b = (const uint16_t *) _b;
+ int dr1, dg1, db1, dr2, dg2, db2;
+
+ if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
+ dr1 = ff_dither_2x2_8[ y & 1 ][0];
+ dg1 = ff_dither_2x2_4[ y & 1 ][0];
+ db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+ dr2 = ff_dither_2x2_8[ y & 1 ][1];
+ dg2 = ff_dither_2x2_4[ y & 1 ][1];
+ db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+ } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
+ dr1 = ff_dither_2x2_8[ y & 1 ][0];
+ dg1 = ff_dither_2x2_8[ y & 1 ][1];
+ db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+ dr2 = ff_dither_2x2_8[ y & 1 ][1];
+ dg2 = ff_dither_2x2_8[ y & 1 ][0];
+ db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+ } else {
+ dr1 = ff_dither_4x4_16[ y & 3 ][0];
+ dg1 = ff_dither_4x4_16[ y & 3 ][1];
+ db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
+ dr2 = ff_dither_4x4_16[ y & 3 ][1];
+ dg2 = ff_dither_4x4_16[ y & 3 ][0];
+ db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
+ }
+
+ dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+ dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+ } else { /* 8/4 bits */
+ uint8_t *dest = (uint8_t *) _dest;
+ const uint8_t *r = (const uint8_t *) _r;
+ const uint8_t *g = (const uint8_t *) _g;
+ const uint8_t *b = (const uint8_t *) _b;
+ int dr1, dg1, db1, dr2, dg2, db2;
+
+ if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
+ const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
+ const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
+ dr1 = dg1 = d32[(i * 2 + 0) & 7];
+ db1 = d64[(i * 2 + 0) & 7];
+ dr2 = dg2 = d32[(i * 2 + 1) & 7];
+ db2 = d64[(i * 2 + 1) & 7];
+ } else {
+ const uint8_t * const d64 = ff_dither_8x8_73 [y & 7];
+ const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
+ dr1 = db1 = d128[(i * 2 + 0) & 7];
+ dg1 = d64[(i * 2 + 0) & 7];
+ dr2 = db2 = d128[(i * 2 + 1) & 7];
+ dg2 = d64[(i * 2 + 1) & 7];
+ }
+
+ if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
+ dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
+ ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
+ } else {
+ dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+ dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+ }
+ }
+}
+
+#define WRITE_YUV2RGB_LSX(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
+{ \
+ Y1 = __lsx_vpickve2gr_w(vec_y1, t1); \
+ Y2 = __lsx_vpickve2gr_w(vec_y2, t2); \
+ U = __lsx_vpickve2gr_w(vec_u, t3); \
+ V = __lsx_vpickve2gr_w(vec_v, t4); \
+ r = c->table_rV[V]; \
+ g = (c->table_gU[U] + c->table_gV[V]); \
+ b = c->table_bU[U]; \
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0, \
+ r, g, b, y, target, 0); \
+ count++; \
+}
+
+static void
+yuv2rgb_X_template_lsx(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest, int dstW,
+ int y, enum AVPixelFormat target, int hasAlpha)
+{
+ int i, j;
+ int count = 0;
+ int t = 1 << 18;
+ int len = dstW >> 5;
+ int res = dstW & 31;
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m128i headroom = __lsx_vreplgr2vr_w(head);
+
+ for (i = 0; i < len; i++) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
+ __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2, yh_ev1, yh_ev2, yh_od1, yh_od2;
+ __m128i u_ev1, u_ev2, u_od1, u_od2, v_ev1, v_ev2, v_od1, v_od2, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_ev1 = yl_ev;
+ yl_od1 = yl_ev;
+ yh_ev1 = yl_ev;
+ yh_od1 = yl_ev;
+ u_ev1 = yl_ev;
+ v_ev1 = yl_ev;
+ u_od1 = yl_ev;
+ v_od1 = yl_ev;
+ yl_ev2 = yl_ev;
+ yl_od2 = yl_ev;
+ yh_ev2 = yl_ev;
+ yh_od2 = yl_ev;
+ u_ev2 = yl_ev;
+ v_ev2 = yl_ev;
+ u_od2 = yl_ev;
+ v_od2 = yl_ev;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
+ 16, l_src1, l_src2);
+ DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 32, lumSrc[j] + count_lum,
+ 48, l_src3, l_src4);
+ yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
+ yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
+ yh_ev1 = __lsx_vmaddwev_w_h(yh_ev1, temp, l_src3);
+ yh_od1 = __lsx_vmaddwod_w_h(yh_od1, temp, l_src3);
+ yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
+ yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
+ yh_ev2 = __lsx_vmaddwev_w_h(yh_ev2, temp, l_src4);
+ yh_od2 = __lsx_vmaddwod_w_h(yh_od2, temp, l_src4);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src1, v_src1);
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 16, chrVSrc[j] + count, 16,
+ u_src2, v_src2);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
+ u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
+ v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
+ v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
+ u_ev2 = __lsx_vmaddwev_w_h(u_ev2, temp, u_src2);
+ u_od2 = __lsx_vmaddwod_w_h(u_od2, temp, u_src2);
+ v_ev2 = __lsx_vmaddwev_w_h(v_ev2, temp, v_src2);
+ v_od2 = __lsx_vmaddwod_w_h(v_od2, temp, v_src2);
+ }
+ yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
+ yh_ev1 = __lsx_vsrai_w(yh_ev1, 19);
+ yl_od1 = __lsx_vsrai_w(yl_od1, 19);
+ yh_od1 = __lsx_vsrai_w(yh_od1, 19);
+ u_ev1 = __lsx_vsrai_w(u_ev1, 19);
+ v_ev1 = __lsx_vsrai_w(v_ev1, 19);
+ u_od1 = __lsx_vsrai_w(u_od1, 19);
+ v_od1 = __lsx_vsrai_w(v_od1, 19);
+ yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
+ yh_ev2 = __lsx_vsrai_w(yh_ev2, 19);
+ yl_od2 = __lsx_vsrai_w(yl_od2, 19);
+ yh_od2 = __lsx_vsrai_w(yh_od2, 19);
+ u_ev2 = __lsx_vsrai_w(u_ev2, 19);
+ v_ev2 = __lsx_vsrai_w(v_ev2, 19);
+ u_od2 = __lsx_vsrai_w(u_od2, 19);
+ v_od2 = __lsx_vsrai_w(v_od2, 19);
+ u_ev1 = __lsx_vadd_w(u_ev1, headroom);
+ v_ev1 = __lsx_vadd_w(v_ev1, headroom);
+ u_od1 = __lsx_vadd_w(u_od1, headroom);
+ v_od1 = __lsx_vadd_w(v_od1, headroom);
+ u_ev2 = __lsx_vadd_w(u_ev2, headroom);
+ v_ev2 = __lsx_vadd_w(v_ev2, headroom);
+ u_od2 = __lsx_vadd_w(u_od2, headroom);
+ v_od2 = __lsx_vadd_w(v_od2, headroom);
+
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 1, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 2, 2, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 3, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 0, 0, 2, 2);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 1, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 2, 2, 3, 3);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 3, 3, 3, 3);
+ WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_ev2, v_ev2, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_od2, v_od2, 1, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_ev2, v_ev2, 2, 2, 1, 1);
+ WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_od2, v_od2, 3, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_ev2, v_ev2, 0, 0, 2, 2);
+ WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_od2, v_od2, 1, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_ev2, v_ev2, 2, 2, 3, 3);
+ WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_od2, v_od2, 3, 3, 3, 3);
+ }
+
+ if (res >= 16) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, l_src2, u_src1, v_src1;
+ __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2;
+ __m128i u_ev1, u_od1, v_ev1, v_od1, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_ev1 = yl_ev;
+ yl_od1 = yl_ev;
+ u_ev1 = yl_ev;
+ v_ev1 = yl_ev;
+ u_od1 = yl_ev;
+ v_od1 = yl_ev;
+ yl_ev2 = yl_ev;
+ yl_od2 = yl_ev;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
+ 16, l_src1, l_src2);
+ yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
+ yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
+ yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
+ yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src1, v_src1);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
+ u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
+ v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
+ v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
+ }
+ yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
+ yl_od1 = __lsx_vsrai_w(yl_od1, 19);
+ u_ev1 = __lsx_vsrai_w(u_ev1, 19);
+ v_ev1 = __lsx_vsrai_w(v_ev1, 19);
+ u_od1 = __lsx_vsrai_w(u_od1, 19);
+ v_od1 = __lsx_vsrai_w(v_od1, 19);
+ yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
+ yl_od2 = __lsx_vsrai_w(yl_od2, 19);
+ u_ev1 = __lsx_vadd_w(u_ev1, headroom);
+ v_ev1 = __lsx_vadd_w(v_ev1, headroom);
+ u_od1 = __lsx_vadd_w(u_od1, headroom);
+ v_od1 = __lsx_vadd_w(v_od1, headroom);
+
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 1, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 2, 2, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 3, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 0, 0, 2, 2);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 1, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 2, 2, 3, 3);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 3, 3, 3, 3);
+ res -= 16;
+ }
+
+ if (res >= 8) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, u_src, v_src;
+ __m128i yl_ev, yl_od;
+ __m128i u_ev, u_od, v_ev, v_od, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_od = yl_ev;
+ u_ev = yl_ev;
+ v_ev = yl_ev;
+ u_od = yl_ev;
+ v_od = yl_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+ yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+ yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src, v_src);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+ u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+ v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+ v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+ }
+ yl_ev = __lsx_vsrai_w(yl_ev, 19);
+ yl_od = __lsx_vsrai_w(yl_od, 19);
+ u_ev = __lsx_vsrai_w(u_ev, 19);
+ v_ev = __lsx_vsrai_w(v_ev, 19);
+ u_od = __lsx_vsrai_w(u_od, 19);
+ v_od = __lsx_vsrai_w(v_od, 19);
+ u_ev = __lsx_vadd_w(u_ev, headroom);
+ v_ev = __lsx_vadd_w(v_ev, headroom);
+ u_od = __lsx_vadd_w(u_od, headroom);
+ v_od = __lsx_vadd_w(v_od, headroom);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
+ res -= 8;
+ }
+
+ if (res >= 4) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, u_src, v_src;
+ __m128i yl_ev, yl_od;
+ __m128i u_ev, u_od, v_ev, v_od, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_od = yl_ev;
+ u_ev = yl_ev;
+ v_ev = yl_ev;
+ u_od = yl_ev;
+ v_od = yl_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+ yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+ yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src, v_src);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+ u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+ v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+ v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+ }
+ yl_ev = __lsx_vsrai_w(yl_ev, 19);
+ yl_od = __lsx_vsrai_w(yl_od, 19);
+ u_ev = __lsx_vsrai_w(u_ev, 19);
+ v_ev = __lsx_vsrai_w(v_ev, 19);
+ u_od = __lsx_vsrai_w(u_od, 19);
+ v_od = __lsx_vsrai_w(v_od, 19);
+ u_ev = __lsx_vadd_w(u_ev, headroom);
+ v_ev = __lsx_vadd_w(v_ev, headroom);
+ u_od = __lsx_vadd_w(u_od, headroom);
+ v_od = __lsx_vadd_w(v_od, headroom);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
+ res -= 4;
+ }
+
+ if (res >= 2) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, u_src, v_src;
+ __m128i yl_ev, yl_od;
+ __m128i u_ev, u_od, v_ev, v_od, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_od = yl_ev;
+ u_ev = yl_ev;
+ v_ev = yl_ev;
+ u_od = yl_ev;
+ v_od = yl_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+ yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+ yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src, v_src);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+ u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+ v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+ v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+ }
+ yl_ev = __lsx_vsrai_w(yl_ev, 19);
+ yl_od = __lsx_vsrai_w(yl_od, 19);
+ u_ev = __lsx_vsrai_w(u_ev, 19);
+ v_ev = __lsx_vsrai_w(v_ev, 19);
+ u_od = __lsx_vsrai_w(u_od, 19);
+ v_od = __lsx_vsrai_w(v_od, 19);
+ u_ev = __lsx_vadd_w(u_ev, headroom);
+ v_ev = __lsx_vadd_w(v_ev, headroom);
+ u_od = __lsx_vadd_w(u_od, headroom);
+ v_od = __lsx_vadd_w(v_od, headroom);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+ res -= 2;
+ }
+
+ for (; count < len_count; count++) {
+ int Y1 = 1 << 18;
+ int Y2 = Y1;
+ int U = Y1;
+ int V = Y1;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ Y1 += lumSrc[j][count * 2] * lumFilter[j];
+ Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][count] * chrFilter[j];
+ V += chrVSrc[j][count] * chrFilter[j];
+ }
+ Y1 >>= 19;
+ Y2 >>= 19;
+ U >>= 19;
+ V >>= 19;
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+}
+
+static void
+yuv2rgb_2_template_lsx(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int i, count = 0;
+ int len = dstW - 7;
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m128i v_yalpha1 = __lsx_vreplgr2vr_w(yalpha1);
+ __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
+ __m128i v_yalpha = __lsx_vreplgr2vr_w(yalpha);
+ __m128i v_uvalpha = __lsx_vreplgr2vr_w(uvalpha);
+ __m128i headroom = __lsx_vreplgr2vr_w(head);
+ __m128i zero = __lsx_vldi(0);
+
+ for (i = 0; i < len; i += 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m128i y0_h, y0_l, y0, u0, v0;
+ __m128i y1_h, y1_l, y1, u1, v1;
+ __m128i y_l, y_h, u, v;
+
+ DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+ buf1, i_dex, y0, u0, v0, y1);
+ DUP2_ARG2(__lsx_vldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
+ DUP2_ARG1(__lsx_vexth_w_h, y0, y1, y0_h, y1_h);
+ DUP4_ARG2(__lsx_vilvl_h, zero, u0, zero, u1, zero, v0, zero, v1,
+ u0, u1, v0, v1);
+ y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+ y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
+ u0 = __lsx_vmul_w(u0, v_uvalpha1);
+ v0 = __lsx_vmul_w(v0, v_uvalpha1);
+ y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+ y_h = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
+ u = __lsx_vmadd_w(u0, v_uvalpha, u1);
+ v = __lsx_vmadd_w(v0, v_uvalpha, v1);
+ y_l = __lsx_vsrai_w(y_l, 19);
+ y_h = __lsx_vsrai_w(y_h, 19);
+ u = __lsx_vsrai_w(u, 19);
+ v = __lsx_vsrai_w(v, 19);
+ u = __lsx_vadd_w(u, headroom);
+ v = __lsx_vadd_w(v, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 0, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 2, 3, 3, 3);
+ }
+ if (dstW - i >= 4) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m128i y0_l, y0, u0, v0;
+ __m128i y1_l, y1, u1, v1;
+ __m128i y_l, u, v;
+
+ y0 = __lsx_vldx(buf0, i_dex);
+ u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
+ v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
+ y1 = __lsx_vldx(buf1, i_dex);
+ u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
+ v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
+ DUP2_ARG2(__lsx_vilvl_h, zero, y0, zero, y1, y0_l, y1_l);
+ DUP4_ARG2(__lsx_vilvl_h, zero, u0, zero, u1, zero, v0, zero, v1,
+ u0, u1, v0, v1);
+ y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+ u0 = __lsx_vmul_w(u0, v_uvalpha1);
+ v0 = __lsx_vmul_w(v0, v_uvalpha1);
+ y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+ u = __lsx_vmadd_w(u0, v_uvalpha, u1);
+ v = __lsx_vmadd_w(v0, v_uvalpha, v1);
+ y_l = __lsx_vsrai_w(y_l, 19);
+ u = __lsx_vsrai_w(u, 19);
+ v = __lsx_vsrai_w(v, 19);
+ u = __lsx_vadd_w(u, headroom);
+ v = __lsx_vadd_w(v, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+ i += 4;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2] * yalpha1 +
+ buf1[count * 2] * yalpha) >> 19;
+ int Y2 = (buf0[count * 2 + 1] * yalpha1 +
+ buf1[count * 2 + 1] * yalpha) >> 19;
+ int U = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
+ int V = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+}
+
+static void
+yuv2rgb_1_template_lsx(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i;
+ int len = (dstW - 7);
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+
+ if (uvalpha < 2048) {
+ int count = 0;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m128i headroom = __lsx_vreplgr2vr_h(head);
+
+ for (i = 0; i < len; i += 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m128i src_y, src_u, src_v;
+ __m128i u, v, uv, y_l, y_h;
+
+ src_y = __lsx_vldx(buf0, i_dex);
+ DUP2_ARG2(__lsx_vldx, ubuf0, c_dex, vbuf0, c_dex, src_u, src_v);
+ src_y = __lsx_vsrari_h(src_y, 7);
+ src_u = __lsx_vsrari_h(src_u, 7);
+ src_v = __lsx_vsrari_h(src_v, 7);
+ y_l = __lsx_vsllwil_w_h(src_y, 0);
+ y_h = __lsx_vexth_w_h(src_y);
+ uv = __lsx_vilvl_h(src_v, src_u);
+ u = __lsx_vaddwev_w_h(uv, headroom);
+ v = __lsx_vaddwod_w_h(uv, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 0, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 2, 3, 3, 3);
+ }
+ if (dstW - i >= 4){
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m128i src_y, src_u, src_v;
+ __m128i y_l, u, v, uv;
+
+ src_y = __lsx_vldx(buf0, i_dex);
+ src_u = __lsx_vldrepl_d((ubuf0 + count), 0);
+ src_v = __lsx_vldrepl_d((vbuf0 + count), 0);
+ y_l = __lsx_vsrari_h(src_y, 7);
+ y_l = __lsx_vsllwil_w_h(y_l, 0);
+ uv = __lsx_vilvl_h(src_v, src_u);
+ uv = __lsx_vsrari_h(uv, 7);
+ u = __lsx_vaddwev_w_h(uv, headroom);
+ v = __lsx_vaddwod_w_h(uv, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+ i += 4;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2 ] + 64) >> 7;
+ int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[count] + 64) >> 7;
+ int V = (vbuf0[count] + 64) >> 7;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ int count = 0;
+ int HEADROOM = YUVRGB_TABLE_HEADROOM;
+ __m128i headroom = __lsx_vreplgr2vr_w(HEADROOM);
+
+ for (i = 0; i < len; i += 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m128i src_y, src_u0, src_v0, src_u1, src_v1;
+ __m128i y_l, y_h, u1, u2, v1, v2;
+
+ DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+ ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
+ src_v1 = __lsx_vldx(vbuf1, c_dex);
+ src_y = __lsx_vsrari_h(src_y, 7);
+ u1 = __lsx_vaddwev_w_h(src_u0, src_u1);
+ v1 = __lsx_vaddwev_w_h(src_v0, src_v1);
+ u2 = __lsx_vaddwod_w_h(src_u0, src_u1);
+ v2 = __lsx_vaddwod_w_h(src_v0, src_v1);
+ y_l = __lsx_vsllwil_w_h(src_y, 0);
+ y_h = __lsx_vexth_w_h(src_y);
+ u1 = __lsx_vsrari_w(u1, 8);
+ v1 = __lsx_vsrari_w(v1, 8);
+ u2 = __lsx_vsrari_w(u2, 8);
+ v2 = __lsx_vsrari_w(v2, 8);
+ u1 = __lsx_vadd_w(u1, headroom);
+ v1 = __lsx_vadd_w(v1, headroom);
+ u2 = __lsx_vadd_w(u2, headroom);
+ v2 = __lsx_vadd_w(v2, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u1, v1, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u2, v2, 2, 3, 0, 0);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u1, v1, 0, 1, 1, 1);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u2, v2, 2, 3, 1, 1);
+ }
+ if (dstW - i >= 4) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m128i src_y, src_u0, src_v0, src_u1, src_v1;
+ __m128i uv;
+
+ src_y = __lsx_vldx(buf0, i_dex);
+ src_u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
+ src_v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
+ src_u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
+ src_v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
+
+ src_u0 = __lsx_vilvl_h(src_u1, src_u0);
+ src_v0 = __lsx_vilvl_h(src_v1, src_v0);
+ src_y = __lsx_vsrari_h(src_y, 7);
+ src_y = __lsx_vsllwil_w_h(src_y, 0);
+ uv = __lsx_vilvl_w(src_v0, src_u0);
+ uv = __lsx_vhaddw_w_h(uv, uv);
+ uv = __lsx_vsrari_w(uv, 8);
+ uv = __lsx_vadd_w(uv, headroom);
+ WRITE_YUV2RGB_LSX(src_y, src_y, uv, uv, 0, 1, 0, 1);
+ WRITE_YUV2RGB_LSX(src_y, src_y, uv, uv, 2, 3, 2, 3);
+ i += 4;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2 ] + 64) >> 7;
+ int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[count] + ubuf1[count] + 128) >> 8;
+ int V = (vbuf0[count] + vbuf1[count] + 128) >> 8;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+ }
+}
+
+#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _X_lsx(SwsContext *c, const int16_t *lumFilter, \
+ const int16_t **lumSrc, int lumFilterSize, \
+ const int16_t *chrFilter, const int16_t **chrUSrc, \
+ const int16_t **chrVSrc, int chrFilterSize, \
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \
+ int y) \
+{ \
+ name ## base ## _X_template_lsx(c, lumFilter, lumSrc, lumFilterSize, \
+ chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+ alpSrc, dest, dstW, y, fmt, hasAlpha); \
+}
+
+#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
+YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _2_lsx(SwsContext *c, const int16_t *buf[2], \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \
+ int yalpha, int uvalpha, int y) \
+{ \
+ name ## base ## _2_template_lsx(c, buf, ubuf, vbuf, abuf, dest, \
+ dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
+}
+
+#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
+YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _1_lsx(SwsContext *c, const int16_t *buf0, \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf0, uint8_t *dest, int dstW, \
+ int uvalpha, int y) \
+{ \
+ name ## base ## _1_template_lsx(c, buf0, ubuf, vbuf, abuf0, dest, \
+ dstW, uvalpha, y, fmt, hasAlpha); \
+}
+
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+#endif
+YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1, 0)
+YUV2RGBWRAPPER(yuv2rgb,, x32, AV_PIX_FMT_RGB32, 0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
+YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 16, AV_PIX_FMT_RGB565, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 15, AV_PIX_FMT_RGB555, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 12, AV_PIX_FMT_RGB444, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 8, AV_PIX_FMT_RGB8, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 4, AV_PIX_FMT_RGB4, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0)
+
+// This function is copied from libswscale/output.c
+static av_always_inline void yuv2rgb_write_full(SwsContext *c,
+ uint8_t *dest, int i, int R, int A, int G, int B,
+ int y, enum AVPixelFormat target, int hasAlpha, int err[4])
+{
+ int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
+
+ if ((R | G | B) & 0xC0000000) {
+ R = av_clip_uintp2(R, 30);
+ G = av_clip_uintp2(G, 30);
+ B = av_clip_uintp2(B, 30);
+ }
+
+ switch(target) {
+ case AV_PIX_FMT_ARGB:
+ dest[0] = hasAlpha ? A : 255;
+ dest[1] = R >> 22;
+ dest[2] = G >> 22;
+ dest[3] = B >> 22;
+ break;
+ case AV_PIX_FMT_RGB24:
+ dest[0] = R >> 22;
+ dest[1] = G >> 22;
+ dest[2] = B >> 22;
+ break;
+ case AV_PIX_FMT_RGBA:
+ dest[0] = R >> 22;
+ dest[1] = G >> 22;
+ dest[2] = B >> 22;
+ dest[3] = hasAlpha ? A : 255;
+ break;
+ case AV_PIX_FMT_ABGR:
+ dest[0] = hasAlpha ? A : 255;
+ dest[1] = B >> 22;
+ dest[2] = G >> 22;
+ dest[3] = R >> 22;
+ break;
+ case AV_PIX_FMT_BGR24:
+ dest[0] = B >> 22;
+ dest[1] = G >> 22;
+ dest[2] = R >> 22;
+ break;
+ case AV_PIX_FMT_BGRA:
+ dest[0] = B >> 22;
+ dest[1] = G >> 22;
+ dest[2] = R >> 22;
+ dest[3] = hasAlpha ? A : 255;
+ break;
+ case AV_PIX_FMT_BGR4_BYTE:
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR8:
+ case AV_PIX_FMT_RGB8:
+ {
+ int r,g,b;
+
+ switch (c->dither) {
+ default:
+ case SWS_DITHER_AUTO:
+ case SWS_DITHER_ED:
+ R >>= 22;
+ G >>= 22;
+ B >>= 22;
+ R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
+ G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
+ B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+ r = R >> (isrgb8 ? 5 : 7);
+ g = G >> (isrgb8 ? 5 : 6);
+ b = B >> (isrgb8 ? 6 : 7);
+ r = av_clip(r, 0, isrgb8 ? 7 : 1);
+ g = av_clip(g, 0, isrgb8 ? 7 : 3);
+ b = av_clip(b, 0, isrgb8 ? 3 : 1);
+ err[0] = R - r*(isrgb8 ? 36 : 255);
+ err[1] = G - g*(isrgb8 ? 36 : 85);
+ err[2] = B - b*(isrgb8 ? 85 : 255);
+ break;
+ case SWS_DITHER_A_DITHER:
+ if (isrgb8) {
+ /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
+ r = (((R >> 19) + A_DITHER(i,y) -96)>>8);
+ g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
+ b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
+ r = av_clip_uintp2(r, 3);
+ g = av_clip_uintp2(g, 3);
+ b = av_clip_uintp2(b, 2);
+ } else {
+ r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
+ g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
+ b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
+ r = av_clip_uintp2(r, 1);
+ g = av_clip_uintp2(g, 2);
+ b = av_clip_uintp2(b, 1);
+ }
+ break;
+ case SWS_DITHER_X_DITHER:
+ if (isrgb8) {
+ /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
+ r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
+ g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
+ b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
+ r = av_clip_uintp2(r, 3);
+ g = av_clip_uintp2(g, 3);
+ b = av_clip_uintp2(b, 2);
+ } else {
+ r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
+ g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
+ b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
+ r = av_clip_uintp2(r, 1);
+ g = av_clip_uintp2(g, 2);
+ b = av_clip_uintp2(b, 1);
+ }
+
+ break;
+ }
+
+ if(target == AV_PIX_FMT_BGR4_BYTE) {
+ dest[0] = r + 2*g + 8*b;
+ } else if(target == AV_PIX_FMT_RGB4_BYTE) {
+ dest[0] = b + 2*g + 8*r;
+ } else if(target == AV_PIX_FMT_BGR8) {
+ dest[0] = r + 8*g + 64*b;
+ } else if(target == AV_PIX_FMT_RGB8) {
+ dest[0] = b + 4*g + 32*r;
+ } else
+ av_assert2(0);
+ break; }
+ }
+}
+
+#define YUVTORGB_SETUP_LSX \
+ int y_offset = c->yuv2rgb_y_offset; \
+ int y_coeff = c->yuv2rgb_y_coeff; \
+ int v2r_coe = c->yuv2rgb_v2r_coeff; \
+ int v2g_coe = c->yuv2rgb_v2g_coeff; \
+ int u2g_coe = c->yuv2rgb_u2g_coeff; \
+ int u2b_coe = c->yuv2rgb_u2b_coeff; \
+ __m128i offset = __lsx_vreplgr2vr_w(y_offset); \
+ __m128i coeff = __lsx_vreplgr2vr_w(y_coeff); \
+ __m128i v2r = __lsx_vreplgr2vr_w(v2r_coe); \
+ __m128i v2g = __lsx_vreplgr2vr_w(v2g_coe); \
+ __m128i u2g = __lsx_vreplgr2vr_w(u2g_coe); \
+ __m128i u2b = __lsx_vreplgr2vr_w(u2b_coe); \
+
+#define YUVTORGB_LSX(y, u, v, R, G, B, offset, coeff, \
+ y_temp, v2r, v2g, u2g, u2b) \
+{ \
+ y = __lsx_vsub_w(y, offset); \
+ y = __lsx_vmul_w(y, coeff); \
+ y = __lsx_vadd_w(y, y_temp); \
+ R = __lsx_vmadd_w(y, v, v2r); \
+ v = __lsx_vmadd_w(y, v, v2g); \
+ G = __lsx_vmadd_w(v, u, u2g); \
+ B = __lsx_vmadd_w(y, u, u2b); \
+}
+
+#define WRITE_FULL_A_LSX(r, g, b, a, t1, s) \
+{ \
+ R = __lsx_vpickve2gr_w(r, t1); \
+ G = __lsx_vpickve2gr_w(g, t1); \
+ B = __lsx_vpickve2gr_w(b, t1); \
+ A = __lsx_vpickve2gr_w(a, t1); \
+ if (A & 0x100) \
+ A = av_clip_uint8(A); \
+ yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\
+ dest += step; \
+}
+
+#define WRITE_FULL_LSX(r, g, b, t1, s) \
+{ \
+ R = __lsx_vpickve2gr_w(r, t1); \
+ G = __lsx_vpickve2gr_w(g, t1); \
+ B = __lsx_vpickve2gr_w(b, t1); \
+ yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
+ dest += step; \
+}
+
+static void
+yuv2rgb_full_X_template_lsx(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest,
+ int dstW, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ int i, j, B, G, R, A;
+ int step = (target == AV_PIX_FMT_RGB24 ||
+ target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ int err[4] = {0};
+ int a_temp = 1 << 18;
+ int templ = 1 << 9;
+ int tempc = templ - (128 << 19);
+ int ytemp = 1 << 21;
+ int len = dstW - 7;
+ __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
+ YUVTORGB_SETUP_LSX
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
+
+ for (i = 0; i < len; i += 8) {
+ __m128i l_src, u_src, v_src;
+ __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
+ __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+ int n = i << 1;
+
+ y_ev = y_od = __lsx_vreplgr2vr_w(templ);
+ u_ev = u_od = v_ev = v_od = __lsx_vreplgr2vr_w(tempc);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src = __lsx_vldx(lumSrc[j], n);
+ y_ev = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
+ y_od = __lsx_vmaddwod_w_h(y_od, l_src, temp);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n,
+ u_src, v_src);
+ DUP2_ARG3(__lsx_vmaddwev_w_h, u_ev, u_src, temp, v_ev,
+ v_src, temp, u_ev, v_ev);
+ DUP2_ARG3(__lsx_vmaddwod_w_h, u_od, u_src, temp, v_od,
+ v_src, temp, u_od, v_od);
+ }
+ y_ev = __lsx_vsrai_w(y_ev, 10);
+ y_od = __lsx_vsrai_w(y_od, 10);
+ u_ev = __lsx_vsrai_w(u_ev, 10);
+ u_od = __lsx_vsrai_w(u_od, 10);
+ v_ev = __lsx_vsrai_w(v_ev, 10);
+ v_od = __lsx_vsrai_w(v_od, 10);
+ YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m128i a_src, a_ev, a_od;
+
+ a_ev = a_od = __lsx_vreplgr2vr_w(a_temp);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h(lumFilter + j, 0);
+ a_src = __lsx_vldx(alpSrc[j], n);
+ a_ev = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
+ a_od = __lsx_vmaddwod_w_h(a_od, a_src, temp);
+ }
+ a_ev = __lsx_vsrai_w(a_ev, 19);
+ a_od = __lsx_vsrai_w(a_od, 19);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 0, 1);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 2);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 1, 3);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 4);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 2, 5);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 6);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 3, 7);
+ } else {
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 0, 1);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 2);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 1, 3);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 4);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 2, 5);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 6);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 3, 7);
+ }
+ }
+ if (dstW - i >= 4) {
+ __m128i l_src, u_src, v_src;
+ __m128i y_ev, u_ev, v_ev, uv, temp;
+ __m128i R_ev, G_ev, B_ev;
+ int n = i << 1;
+
+ y_ev = __lsx_vreplgr2vr_w(templ);
+ u_ev = v_ev = __lsx_vreplgr2vr_w(tempc);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src = __lsx_vldx(lumSrc[j], n);
+ l_src = __lsx_vilvl_h(l_src, l_src);
+ y_ev = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
+ uv = __lsx_vilvl_h(v_src, u_src);
+ u_ev = __lsx_vmaddwev_w_h(u_ev, uv, temp);
+ v_ev = __lsx_vmaddwod_w_h(v_ev, uv, temp);
+ }
+ y_ev = __lsx_vsrai_w(y_ev, 10);
+ u_ev = __lsx_vsrai_w(u_ev, 10);
+ v_ev = __lsx_vsrai_w(v_ev, 10);
+ YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m128i a_src, a_ev;
+
+ a_ev = __lsx_vreplgr2vr_w(a_temp);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h(lumFilter + j, 0);
+ a_src = __lsx_vldx(alpSrc[j], n);
+ a_src = __lsx_vilvl_h(a_src, a_src);
+ a_ev = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
+ }
+ a_ev = __lsx_vsrai_w(a_ev, 19);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 1);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 2);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 3);
+ } else {
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 1);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 2);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 3);
+ }
+ i += 4;
+ }
+ for (; i < dstW; i++) {
+ int Y = templ;
+ int V, U = V = tempc;
+
+ A = 0;
+ for (j = 0; j < lumFilterSize; j++) {
+ Y += lumSrc[j][i] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][i] * chrFilter[j];
+ V += chrVSrc[j][i] * chrFilter[j];
+
+ }
+ Y >>= 10;
+ U >>= 10;
+ V >>= 10;
+ if (hasAlpha) {
+ A = 1 << 18;
+ for (j = 0; j < lumFilterSize; j++) {
+ A += alpSrc[j][i] * lumFilter[j];
+ }
+ A >>= 19;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_2_template_lsx(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+ *abuf0 = hasAlpha ? abuf[0] : NULL,
+ *abuf1 = hasAlpha ? abuf[1] : NULL;
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int uvtemp = 128 << 19;
+ int atemp = 1 << 18;
+ int err[4] = {0};
+ int ytemp = 1 << 21;
+ int len = dstW - 7;
+ int i, R, G, B, A;
+ int step = (target == AV_PIX_FMT_RGB24 ||
+ target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
+ __m128i v_yalpha1 = __lsx_vreplgr2vr_w(yalpha1);
+ __m128i v_uvalpha = __lsx_vreplgr2vr_w(uvalpha);
+ __m128i v_yalpha = __lsx_vreplgr2vr_w(yalpha);
+ __m128i uv = __lsx_vreplgr2vr_w(uvtemp);
+ __m128i a_bias = __lsx_vreplgr2vr_w(atemp);
+ __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
+ YUVTORGB_SETUP_LSX
+
+ av_assert2(yalpha <= 4096U);
+ av_assert2(uvalpha <= 4096U);
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
+
+ for (i = 0; i < len; i += 8) {
+ __m128i b0, b1, ub0, ub1, vb0, vb1;
+ __m128i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
+ __m128i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
+ __m128i y_l, y_h, v_l, v_h, u_l, u_h;
+ __m128i R_l, R_h, G_l, G_h, B_l, B_h;
+ int n = i << 1;
+
+ DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0,
+ n, ubuf1, n, b0, b1, ub0, ub1);
+ DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0 , vb1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
+ DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
+ u0_l, u1_l, v0_l, v1_l);
+ DUP2_ARG1(__lsx_vexth_w_h, b0, b1, y0_h, y1_h);
+ DUP4_ARG1(__lsx_vexth_w_h, ub0, ub1, vb0, vb1,
+ u0_h, u1_h, v0_h, v1_h);
+ y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+ y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
+ u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
+ u0_h = __lsx_vmul_w(u0_h, v_uvalpha1);
+ v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
+ v0_h = __lsx_vmul_w(v0_h, v_uvalpha1);
+ y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+ y_h = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
+ u_l = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
+ u_h = __lsx_vmadd_w(u0_h, v_uvalpha, u1_h);
+ v_l = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
+ v_h = __lsx_vmadd_w(v0_h, v_uvalpha, v1_h);
+ u_l = __lsx_vsub_w(u_l, uv);
+ u_h = __lsx_vsub_w(u_h, uv);
+ v_l = __lsx_vsub_w(v_l, uv);
+ v_h = __lsx_vsub_w(v_h, uv);
+ y_l = __lsx_vsrai_w(y_l, 10);
+ y_h = __lsx_vsrai_w(y_h, 10);
+ u_l = __lsx_vsrai_w(u_l, 10);
+ u_h = __lsx_vsrai_w(u_h, 10);
+ v_l = __lsx_vsrai_w(v_l, 10);
+ v_h = __lsx_vsrai_w(v_h, 10);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m128i a0, a1, a0_l, a0_h;
+ __m128i a_l, a_h, a1_l, a1_h;
+
+ DUP2_ARG2(__lsx_vldx, abuf0, n, abuf1, n, a0, a1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
+ DUP2_ARG1(__lsx_vexth_w_h, a0, a1, a0_h, a1_h);
+ a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
+ a_h = __lsx_vmadd_w(a_bias, a0_h, v_yalpha1);
+ a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
+ a_h = __lsx_vmadd_w(a_h, v_yalpha, a1_h);
+ a_l = __lsx_vsrai_w(a_l, 19);
+ a_h = __lsx_vsrai_w(a_h, 19);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 0, 4);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 1, 5);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 2, 6);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 3, 7);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 0, 4);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 1, 5);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 2, 6);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 3, 7);
+ }
+ }
+ if (dstW - i >= 4) {
+ __m128i b0, b1, ub0, ub1, vb0, vb1;
+ __m128i y0_l, y1_l, u0_l;
+ __m128i v0_l, u1_l, v1_l;
+ __m128i y_l, u_l, v_l;
+ __m128i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0, n,
+ ubuf1, n, b0, b1, ub0, ub1);
+ DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
+ DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
+ u0_l, u1_l, v0_l, v1_l);
+ y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+ u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
+ v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
+ y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+ u_l = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
+ v_l = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
+ u_l = __lsx_vsub_w(u_l, uv);
+ v_l = __lsx_vsub_w(v_l, uv);
+ y_l = __lsx_vsrai_w(y_l, 10);
+ u_l = __lsx_vsrai_w(u_l, 10);
+ v_l = __lsx_vsrai_w(v_l, 10);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m128i a0, a1, a0_l;
+ __m128i a_l, a1_l;
+
+ DUP2_ARG2(__lsx_vldx, abuf0, n, abuf1, n, a0, a1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
+ a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
+ a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
+ a_l = __lsx_vsrai_w(a_l, 19);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ }
+ i += 4;
+ }
+ for (; i < dstW; i++) {
+ int Y = ( buf0[i] * yalpha1 + buf1[i] * yalpha ) >> 10;
+ int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
+ int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;
+
+ A = 0;
+ if (hasAlpha) {
+ A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_1_template_lsx(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i, B, G, R, A;
+ int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ int err[4] = {0};
+ int ytemp = 1 << 21;
+ int bias_int = 64;
+ int len = dstW - 7;
+ __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
+ YUVTORGB_SETUP_LSX
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
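+ /* uvalpha < 2048: only the first chroma row (ubuf[0]/vbuf[0]) contributes;
+ * otherwise the two chroma rows are summed and effectively averaged. */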
+ if (uvalpha < 2048) {
+ int uvtemp = 128 << 7;
+ __m128i uv = __lsx_vreplgr2vr_w(uvtemp);
+ __m128i bias = __lsx_vreplgr2vr_w(bias_int);
+
+ for (i = 0; i < len; i += 8) {
+ __m128i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
+ __m128i y_l, y_h, u_l, u_h, v_l, v_h;
+ __m128i R_l, R_h, G_l, G_h, B_l, B_h;
+ int n = i << 1;
+
+ DUP2_ARG2(__lsx_vldx, buf0, n, ubuf0, n, b, ub);
+ vb = __lsx_vldx(vbuf0, n);
+ y_l = __lsx_vsllwil_w_h(b, 2);
+ y_h = __lsx_vexth_w_h(b);
+ DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
+ DUP2_ARG1(__lsx_vexth_w_h, ub, vb, ub_h, vb_h);
+ y_h = __lsx_vslli_w(y_h, 2);
+ u_l = __lsx_vsub_w(ub_l, uv);
+ u_h = __lsx_vsub_w(ub_h, uv);
+ v_l = __lsx_vsub_w(vb_l, uv);
+ v_h = __lsx_vsub_w(vb_h, uv);
+ u_l = __lsx_vslli_w(u_l, 2);
+ u_h = __lsx_vslli_w(u_h, 2);
+ v_l = __lsx_vslli_w(v_l, 2);
+ v_h = __lsx_vslli_w(v_h, 2);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if(hasAlpha) {
+ __m128i a_src;
+ __m128i a_l, a_h;
+
+ a_src = __lsx_vld(abuf0 + i, 0);
+ a_l = __lsx_vsllwil_w_h(a_src, 0);
+ a_h = __lsx_vexth_w_h(a_src);
+ a_l = __lsx_vadd_w(a_l, bias);
+ a_h = __lsx_vadd_w(a_h, bias);
+ a_l = __lsx_vsrai_w(a_l, 7);
+ a_h = __lsx_vsrai_w(a_h, 7);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 0, 4);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 1, 5);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 2, 6);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 3, 7);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 0, 4);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 1, 5);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 2, 6);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 3, 7);
+ }
+ }
+ if (dstW - i >= 4) {
+ __m128i b, ub, vb, ub_l, vb_l;
+ __m128i y_l, u_l, v_l;
+ __m128i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP2_ARG2(__lsx_vldx, buf0, n, ubuf0, n, b, ub);
+ vb = __lsx_vldx(vbuf0, n);
+ y_l = __lsx_vsllwil_w_h(b, 0);
+ DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
+ y_l = __lsx_vslli_w(y_l, 2);
+ u_l = __lsx_vsub_w(ub_l, uv);
+ v_l = __lsx_vsub_w(vb_l, uv);
+ u_l = __lsx_vslli_w(u_l, 2);
+ v_l = __lsx_vslli_w(v_l, 2);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if(hasAlpha) {
+ __m128i a_src, a_l;
+
+ a_src = __lsx_vldx(abuf0, n);
+ a_src = __lsx_vsllwil_w_h(a_src, 0);
+ a_l = __lsx_vadd_w(bias, a_src);
+ a_l = __lsx_vsrai_w(a_l, 7);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ }
+ i += 4;
+ }
+ for (; i < dstW; i++) {
+ int Y = buf0[i] << 2;
+ int U = (ubuf0[i] - uvtemp) << 2;
+ int V = (vbuf0[i] - uvtemp) << 2;
+
+ A = 0;
+ if(hasAlpha) {
+ A = (abuf0[i] + 64) >> 7;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ int uvtemp = 128 << 8;
+ __m128i uv = __lsx_vreplgr2vr_w(uvtemp);
+ __m128i zero = __lsx_vldi(0);
+ __m128i bias = __lsx_vreplgr2vr_h(bias_int);
+
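+ /* 8 pixels per pass: the even 32-bit lanes hold pixels 0,2,4,6 and the odd
+ * lanes hold 1,3,5,7, hence the interleaved WRITE_FULL_*_LSX ordering below. */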
+ for (i = 0; i < len; i += 8) {
+ __m128i b, ub0, ub1, vb0, vb1;
+ __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od;
+ __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+ int n = i << 1;
+
+ DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
+ ubuf1, n, b, ub0, vb0, ub1);
+ vb1 = __lsx_vldx(vbuf1, n);
+ y_ev = __lsx_vaddwev_w_h(b, zero);
+ y_od = __lsx_vaddwod_w_h(b, zero);
+ DUP2_ARG2(__lsx_vaddwev_w_h, ub0, ub1, vb0, vb1, u_ev, v_ev);
+ DUP2_ARG2(__lsx_vaddwod_w_h, ub0, ub1, vb0, vb1, u_od, v_od);
+ DUP2_ARG2(__lsx_vslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
+ DUP4_ARG2(__lsx_vsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
+ u_ev, u_od, v_ev, v_od);
+ DUP4_ARG2(__lsx_vslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
+ u_ev, u_od, v_ev, v_od);
+ YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if(hasAlpha) {
+ __m128i a_src;
+ __m128i a_ev, a_od;
+
+ a_src = __lsx_vld(abuf0 + i, 0);
+ a_ev = __lsx_vaddwev_w_h(bias, a_src);
+ a_od = __lsx_vaddwod_w_h(bias, a_src);
+ a_ev = __lsx_vsrai_w(a_ev, 7);
+ a_od = __lsx_vsrai_w(a_od, 7);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 0, 1);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 2);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 1, 3);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 4);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 2, 5);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 6);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 3, 7);
+ } else {
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 0, 1);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 2);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 1, 3);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 4);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 2, 5);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 6);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 3, 7);
+ }
+ }
+ if (dstW - i >= 4) {
+ __m128i b, ub0, ub1, vb0, vb1;
+ __m128i y_l, u_l, v_l;
+ __m128i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
+ ubuf1, n, b, ub0, vb0, ub1);
+ vb1 = __lsx_vldx(vbuf1, n);
+ y_l = __lsx_vsllwil_w_h(b, 0);
+ y_l = __lsx_vslli_w(y_l, 2);
+ DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, vb0, 0, ub1, 0, vb1, 0,
+ ub0, vb0, ub1, vb1);
+ DUP2_ARG2(__lsx_vadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
+ u_l = __lsx_vsub_w(u_l, uv);
+ v_l = __lsx_vsub_w(v_l, uv);
+ u_l = __lsx_vslli_w(u_l, 1);
+ v_l = __lsx_vslli_w(v_l, 1);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if(hasAlpha) {
+ __m128i a_src;
+ __m128i a_l;
+
+ a_src = __lsx_vld(abuf0 + i, 0);
+ a_src = __lsx_vilvl_h(a_src, a_src);
+ a_l = __lsx_vaddwev_w_h(bias, a_src);
+ a_l = __lsx_vsrai_w(a_l, 7);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ }
+ i += 4;
+ }
+ for (; i < dstW; i++) {
+ int Y = buf0[i] << 2;
+ int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
+ int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
+
+ A = 0;
+ if(hasAlpha) {
+ A = (abuf0[i] + 64) >> 7;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+#if CONFIG_SMALL
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+#else
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
+
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
+
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c)
+{
+ if(c->flags & SWS_FULL_CHR_H_INT) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGBA:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2rgba32_full_X_lsx;
+ c->yuv2packed2 = yuv2rgba32_full_2_lsx;
+ c->yuv2packed1 = yuv2rgba32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2rgba32_full_X_lsx;
+ c->yuv2packed2 = yuv2rgba32_full_2_lsx;
+ c->yuv2packed1 = yuv2rgba32_full_1_lsx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2rgbx32_full_X_lsx;
+ c->yuv2packed2 = yuv2rgbx32_full_2_lsx;
+ c->yuv2packed1 = yuv2rgbx32_full_1_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_ARGB:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2argb32_full_X_lsx;
+ c->yuv2packed2 = yuv2argb32_full_2_lsx;
+ c->yuv2packed1 = yuv2argb32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2argb32_full_X_lsx;
+ c->yuv2packed2 = yuv2argb32_full_2_lsx;
+ c->yuv2packed1 = yuv2argb32_full_1_lsx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2xrgb32_full_X_lsx;
+ c->yuv2packed2 = yuv2xrgb32_full_2_lsx;
+ c->yuv2packed1 = yuv2xrgb32_full_1_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_BGRA:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2bgra32_full_X_lsx;
+ c->yuv2packed2 = yuv2bgra32_full_2_lsx;
+ c->yuv2packed1 = yuv2bgra32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2bgra32_full_X_lsx;
+ c->yuv2packed2 = yuv2bgra32_full_2_lsx;
+ c->yuv2packed1 = yuv2bgra32_full_1_lsx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2bgrx32_full_X_lsx;
+ c->yuv2packed2 = yuv2bgrx32_full_2_lsx;
+ c->yuv2packed1 = yuv2bgrx32_full_1_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_ABGR:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2abgr32_full_X_lsx;
+ c->yuv2packed2 = yuv2abgr32_full_2_lsx;
+ c->yuv2packed1 = yuv2abgr32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2abgr32_full_X_lsx;
+ c->yuv2packed2 = yuv2abgr32_full_2_lsx;
+ c->yuv2packed1 = yuv2abgr32_full_1_lsx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2xbgr32_full_X_lsx;
+ c->yuv2packed2 = yuv2xbgr32_full_2_lsx;
+ c->yuv2packed1 = yuv2xbgr32_full_1_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->yuv2packedX = yuv2rgb24_full_X_lsx;
+ c->yuv2packed2 = yuv2rgb24_full_2_lsx;
+ c->yuv2packed1 = yuv2rgb24_full_1_lsx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->yuv2packedX = yuv2bgr24_full_X_lsx;
+ c->yuv2packed2 = yuv2bgr24_full_2_lsx;
+ c->yuv2packed1 = yuv2bgr24_full_1_lsx;
+ break;
+ case AV_PIX_FMT_BGR4_BYTE:
+ c->yuv2packedX = yuv2bgr4_byte_full_X_lsx;
+ c->yuv2packed2 = yuv2bgr4_byte_full_2_lsx;
+ c->yuv2packed1 = yuv2bgr4_byte_full_1_lsx;
+ break;
+ case AV_PIX_FMT_RGB4_BYTE:
+ c->yuv2packedX = yuv2rgb4_byte_full_X_lsx;
+ c->yuv2packed2 = yuv2rgb4_byte_full_2_lsx;
+ c->yuv2packed1 = yuv2rgb4_byte_full_1_lsx;
+ break;
+ case AV_PIX_FMT_BGR8:
+ c->yuv2packedX = yuv2bgr8_full_X_lsx;
+ c->yuv2packed2 = yuv2bgr8_full_2_lsx;
+ c->yuv2packed1 = yuv2bgr8_full_1_lsx;
+ break;
+ case AV_PIX_FMT_RGB8:
+ c->yuv2packedX = yuv2rgb8_full_X_lsx;
+ c->yuv2packed2 = yuv2rgb8_full_2_lsx;
+ c->yuv2packed1 = yuv2rgb8_full_1_lsx;
+ break;
+ }
+ } else {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB32:
+ case AV_PIX_FMT_BGR32:
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packed1 = yuv2rgbx32_1_lsx;
+ c->yuv2packed2 = yuv2rgbx32_2_lsx;
+ c->yuv2packedX = yuv2rgbx32_X_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ case AV_PIX_FMT_BGR32_1:
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packed1 = yuv2rgbx32_1_1_lsx;
+ c->yuv2packed2 = yuv2rgbx32_1_2_lsx;
+ c->yuv2packedX = yuv2rgbx32_1_X_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->yuv2packed1 = yuv2rgb24_1_lsx;
+ c->yuv2packed2 = yuv2rgb24_2_lsx;
+ c->yuv2packedX = yuv2rgb24_X_lsx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->yuv2packed1 = yuv2bgr24_1_lsx;
+ c->yuv2packed2 = yuv2bgr24_2_lsx;
+ c->yuv2packedX = yuv2bgr24_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ case AV_PIX_FMT_RGB565BE:
+ case AV_PIX_FMT_BGR565LE:
+ case AV_PIX_FMT_BGR565BE:
+ c->yuv2packed1 = yuv2rgb16_1_lsx;
+ c->yuv2packed2 = yuv2rgb16_2_lsx;
+ c->yuv2packedX = yuv2rgb16_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ case AV_PIX_FMT_RGB555BE:
+ case AV_PIX_FMT_BGR555LE:
+ case AV_PIX_FMT_BGR555BE:
+ c->yuv2packed1 = yuv2rgb15_1_lsx;
+ c->yuv2packed2 = yuv2rgb15_2_lsx;
+ c->yuv2packedX = yuv2rgb15_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ case AV_PIX_FMT_RGB444BE:
+ case AV_PIX_FMT_BGR444LE:
+ case AV_PIX_FMT_BGR444BE:
+ c->yuv2packed1 = yuv2rgb12_1_lsx;
+ c->yuv2packed2 = yuv2rgb12_2_lsx;
+ c->yuv2packedX = yuv2rgb12_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ c->yuv2packed1 = yuv2rgb8_1_lsx;
+ c->yuv2packed2 = yuv2rgb8_2_lsx;
+ c->yuv2packedX = yuv2rgb8_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB4:
+ case AV_PIX_FMT_BGR4:
+ c->yuv2packed1 = yuv2rgb4_1_lsx;
+ c->yuv2packed2 = yuv2rgb4_2_lsx;
+ c->yuv2packedX = yuv2rgb4_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR4_BYTE:
+ c->yuv2packed1 = yuv2rgb4b_1_lsx;
+ c->yuv2packed2 = yuv2rgb4b_2_lsx;
+ c->yuv2packedX = yuv2rgb4b_X_lsx;
+ break;
+ }
+ }
+}
diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
new file mode 100644
index 0000000000..aa4c5cbe28
--- /dev/null
+++ b/libswscale/loongarch/swscale.S
@@ -0,0 +1,1868 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
+ * const uint8_t *src, const int16_t *filter,
+ * const int32_t *filterPos, int filterSize)
+ */
+function ff_hscale_8_to_15_lsx
+ addi.d sp, sp, -72
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+ li.w t0, 32767
+ li.w t8, 8
+ li.w t7, 4
+ vldi vr0, 0
+ vreplgr2vr.w vr20, t0
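+ // Dispatch on filterSize (a6): dedicated vector paths for 4 and 8 taps,
+ // a generic vector loop for more than 8 taps, scalar fallback otherwise.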
+ beq a6, t7, .LOOP_DSTW4
+ beq a6, t8, .LOOP_DSTW8
+ blt t8, a6, .LOOP_START
+ b .END_DSTW4
+
+.LOOP_START:
+ li.w t1, 0
+ li.w s1, 0
+ li.w s2, 0
+ li.w s3, 0
+ li.w s4, 0
+ li.w s5, 0
+ vldi vr22, 0
+ addi.w s0, a6, -7
+ slli.w s7, a6, 1
+ slli.w s8, a6, 2
+ add.w t6, s7, s8
+.LOOP_DSTW:
+ ld.w t2, a5, 0
+ ld.w t3, a5, 4
+ ld.w t4, a5, 8
+ ld.w t5, a5, 12
+ fldx.d f1, a3, t2
+ fldx.d f2, a3, t3
+ fldx.d f3, a3, t4
+ fldx.d f4, a3, t5
+ vld vr9, a4, 0
+ vldx vr10, a4, s7
+ vldx vr11, a4, s8
+ vldx vr12, a4, t6
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr2, vr0, vr2
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr4, vr0, vr4
+ vdp2.w.h vr17, vr1, vr9
+ vdp2.w.h vr18, vr2, vr10
+ vdp2.w.h vr19, vr3, vr11
+ vdp2.w.h vr21, vr4, vr12
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.d vr1, vr3, vr1
+ vadd.w vr22, vr22, vr1
+ addi.w s1, s1, 8
+ addi.d a3, a3, 8
+ addi.d a4, a4, 16
+ blt s1, s0, .LOOP_DSTW
+ blt s1, a6, .DSTWA
+ b .END_FILTER
+.DSTWA:
+ ld.w t2, a5, 0
+ li.w t3, 0
+ move s6, s1
+.FILTERSIZEA:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s2, s2, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERSIZEA
+
+ ld.w t2, a5, 4
+ li.w t3, 0
+ move s6, s1
+ addi.w t1, t1, 1
+.FILTERSIZEB:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s3, s3, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERSIZEB
+ ld.w t2, a5, 8
+ addi.w t1, t1, 1
+ li.w t3, 0
+ move s6, s1
+.FILTERSIZEC:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s4, s4, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERSIZEC
+ ld.w t2, a5, 12
+ addi.w t1, t1, 1
+ move s6, s1
+ li.w t3, 0
+.FILTERSIZED:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s5, s5, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERSIZED
+.END_FILTER:
+ vpickve2gr.w t1, vr22, 0
+ vpickve2gr.w t2, vr22, 1
+ vpickve2gr.w t3, vr22, 2
+ vpickve2gr.w t4, vr22, 3
+ add.w s2, s2, t1
+ add.w s3, s3, t2
+ add.w s4, s4, t3
+ add.w s5, s5, t4
+ srai.w s2, s2, 7
+ srai.w s3, s3, 7
+ srai.w s4, s4, 7
+ srai.w s5, s5, 7
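+ // slt/maskeqz/masknez/or below implements min(value, 32767) without a branch.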
+ slt t1, s2, t0
+ slt t2, s3, t0
+ slt t3, s4, t0
+ slt t4, s5, t0
+ maskeqz s2, s2, t1
+ maskeqz s3, s3, t2
+ maskeqz s4, s4, t3
+ maskeqz s5, s5, t4
+ masknez t1, t0, t1
+ masknez t2, t0, t2
+ masknez t3, t0, t3
+ masknez t4, t0, t4
+ or s2, s2, t1
+ or s3, s3, t2
+ or s4, s4, t3
+ or s5, s5, t4
+ st.h s2, a1, 0
+ st.h s3, a1, 2
+ st.h s4, a1, 4
+ st.h s5, a1, 6
+
+ addi.d a1, a1, 8
+ sub.d a3, a3, s1
+ addi.d a5, a5, 16
+ slli.d t3, a6, 3
+ add.d a4, a4, t3
+ sub.d a4, a4, s1
+ sub.d a4, a4, s1
+ addi.d a2, a2, -4
+ bge a2, t7, .LOOP_START
+ blt zero, a2, .RES
+ b .END_LOOP
+.RES:
+ li.w t1, 0
+.DSTW:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTERSIZE:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTERSIZE
+ srai.w t8, t8, 7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DSTW
+ b .END_LOOP
+
+.LOOP_DSTW8:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ fldx.d f1, a3, t1
+ fldx.d f2, a3, t2
+ fldx.d f3, a3, t3
+ fldx.d f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ fldx.d f5, a3, t1
+ fldx.d f6, a3, t2
+ fldx.d f7, a3, t3
+ fldx.d f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vld vr13, a4, 64
+ vld vr14, a4, 80
+ vld vr15, a4, 96
+ vld vr16, a4, 112
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr2, vr0, vr2
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr4, vr0, vr4
+ vilvl.b vr5, vr0, vr5
+ vilvl.b vr6, vr0, vr6
+ vilvl.b vr7, vr0, vr7
+ vilvl.b vr8, vr0, vr8
+
+ vdp2.w.h vr17, vr1, vr9
+ vdp2.w.h vr18, vr2, vr10
+ vdp2.w.h vr19, vr3, vr11
+ vdp2.w.h vr21, vr4, vr12
+ vdp2.w.h vr1, vr5, vr13
+ vdp2.w.h vr2, vr6, vr14
+ vdp2.w.h vr3, vr7, vr15
+ vdp2.w.h vr4, vr8, vr16
+ vhaddw.d.w vr5, vr1, vr1
+ vhaddw.d.w vr6, vr2, vr2
+ vhaddw.d.w vr7, vr3, vr3
+ vhaddw.d.w vr8, vr4, vr4
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vhaddw.q.d vr5, vr5, vr5
+ vhaddw.q.d vr6, vr6, vr6
+ vhaddw.q.d vr7, vr7, vr7
+ vhaddw.q.d vr8, vr8, vr8
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.d vr1, vr3, vr1
+ vilvl.d vr5, vr7, vr5
+ vsrai.w vr1, vr1, 7
+ vsrai.w vr5, vr5, 7
+ vmin.w vr1, vr1, vr20
+ vmin.w vr5, vr5, vr20
+
+ vpickev.h vr1, vr5, vr1
+ vst vr1, a1, 0
+ addi.d a1, a1, 16
+ addi.d a5, a5, 32
+ addi.d a4, a4, 128
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_DSTW8
+ blt zero, a2, .RES8
+ b .END_LOOP
+.RES8:
+ li.w t1, 0
+.DSTW8:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTERSIZE8:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTERSIZE8
+ srai.w t8, t8, 7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DSTW8
+ b .END_LOOP
+
+.LOOP_DSTW4:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ fldx.s f1, a3, t1
+ fldx.s f2, a3, t2
+ fldx.s f3, a3, t3
+ fldx.s f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ fldx.s f5, a3, t1
+ fldx.s f6, a3, t2
+ fldx.s f7, a3, t3
+ fldx.s f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr5, vr0, vr5
+ vilvl.b vr7, vr0, vr7
+
+ vdp2.w.h vr13, vr1, vr9
+ vdp2.w.h vr14, vr3, vr10
+ vdp2.w.h vr15, vr5, vr11
+ vdp2.w.h vr16, vr7, vr12
+ vhaddw.d.w vr13, vr13, vr13
+ vhaddw.d.w vr14, vr14, vr14
+ vhaddw.d.w vr15, vr15, vr15
+ vhaddw.d.w vr16, vr16, vr16
+ vpickev.w vr13, vr14, vr13
+ vpickev.w vr15, vr16, vr15
+ vsrai.w vr13, vr13, 7
+ vsrai.w vr15, vr15, 7
+ vmin.w vr13, vr13, vr20
+ vmin.w vr15, vr15, vr20
+
+ vpickev.h vr13, vr15, vr13
+ vst vr13, a1, 0
+ addi.d a1, a1, 16
+ addi.d a5, a5, 32
+ addi.d a4, a4, 64
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_DSTW4
+ blt zero, a2, .RES4
+ b .END_LOOP
+.RES4:
+ li.w t1, 0
+.DSTW4:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTERSIZE4:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTERSIZE4
+ srai.w t8, t8, 7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DSTW4
+ b .END_LOOP
+.END_DSTW4:
+
+ li.w t1, 0
+.LOOP_DSTW1:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTERSIZE1:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTERSIZE1
+ srai.w t8, t8, 7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .LOOP_DSTW1
+ b .END_LOOP
+.END_LOOP:
+
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ addi.d sp, sp, 72
+endfunc
+
+/* void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *dst, int dstW,
+ * const uint8_t *src, const int16_t *filter,
+ * const int32_t *filterPos, int filterSize)
+ */
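+/* Same structure as ff_hscale_8_to_15_lsx, but the accumulator is shifted
+ * right by 3, clamped to (1 << 19) - 1 and stored as 32-bit values. */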
+function ff_hscale_8_to_19_lsx
+ addi.d sp, sp, -72
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+ li.w t0, 524287
+ li.w t8, 8
+ li.w t7, 4
+ vldi vr0, 0
+ vreplgr2vr.w vr20, t0
+ beq a6, t7, .LOOP_DST4
+ beq a6, t8, .LOOP_DST8
+ blt t8, a6, .LOOP
+ b .END_DST4
+
+.LOOP:
+ li.w t1, 0
+ li.w s1, 0
+ li.w s2, 0
+ li.w s3, 0
+ li.w s4, 0
+ li.w s5, 0
+ vldi vr22, 0
+ addi.w s0, a6, -7
+ slli.w s7, a6, 1
+ slli.w s8, a6, 2
+ add.w t6, s7, s8
+.LOOP_DST:
+ ld.w t2, a5, 0
+ ld.w t3, a5, 4
+ ld.w t4, a5, 8
+ ld.w t5, a5, 12
+ fldx.d f1, a3, t2
+ fldx.d f2, a3, t3
+ fldx.d f3, a3, t4
+ fldx.d f4, a3, t5
+ vld vr9, a4, 0
+ vldx vr10, a4, s7
+ vldx vr11, a4, s8
+ vldx vr12, a4, t6
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr2, vr0, vr2
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr4, vr0, vr4
+ vdp2.w.h vr17, vr1, vr9
+ vdp2.w.h vr18, vr2, vr10
+ vdp2.w.h vr19, vr3, vr11
+ vdp2.w.h vr21, vr4, vr12
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.d vr1, vr3, vr1
+ vadd.w vr22, vr22, vr1
+ addi.w s1, s1, 8
+ addi.d a3, a3, 8
+ addi.d a4, a4, 16
+ blt s1, s0, .LOOP_DST
+ blt s1, a6, .DSTA
+ b .END_FILTERA
+.DSTA:
+ ld.w t2, a5, 0
+ li.w t3, 0
+ move s6, s1
+.FILTERA:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s2, s2, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERA
+
+ ld.w t2, a5, 4
+ li.w t3, 0
+ move s6, s1
+ addi.w t1, t1, 1
+.FILTERB:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s3, s3, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERB
+ ld.w t2, a5, 8
+ addi.w t1, t1, 1
+ li.w t3, 0
+ move s6, s1
+.FILTERC:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s4, s4, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERC
+ ld.w t2, a5, 12
+ addi.w t1, t1, 1
+ move s6, s1
+ li.w t3, 0
+.FILTERD:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s5, s5, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERD
+.END_FILTERA:
+ vpickve2gr.w t1, vr22, 0
+ vpickve2gr.w t2, vr22, 1
+ vpickve2gr.w t3, vr22, 2
+ vpickve2gr.w t4, vr22, 3
+ add.w s2, s2, t1
+ add.w s3, s3, t2
+ add.w s4, s4, t3
+ add.w s5, s5, t4
+ srai.w s2, s2, 3
+ srai.w s3, s3, 3
+ srai.w s4, s4, 3
+ srai.w s5, s5, 3
+ slt t1, s2, t0
+ slt t2, s3, t0
+ slt t3, s4, t0
+ slt t4, s5, t0
+ maskeqz s2, s2, t1
+ maskeqz s3, s3, t2
+ maskeqz s4, s4, t3
+ maskeqz s5, s5, t4
+ masknez t1, t0, t1
+ masknez t2, t0, t2
+ masknez t3, t0, t3
+ masknez t4, t0, t4
+ or s2, s2, t1
+ or s3, s3, t2
+ or s4, s4, t3
+ or s5, s5, t4
+ st.w s2, a1, 0
+ st.w s3, a1, 4
+ st.w s4, a1, 8
+ st.w s5, a1, 12
+
+ addi.d a1, a1, 16
+ sub.d a3, a3, s1
+ addi.d a5, a5, 16
+ slli.d t3, a6, 3
+ add.d a4, a4, t3
+ sub.d a4, a4, s1
+ sub.d a4, a4, s1
+ addi.d a2, a2, -4
+ bge a2, t7, .LOOP
+ blt zero, a2, .RESA
+ b .END
+.RESA:
+ li.w t1, 0
+.DST:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTER:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTER
+ srai.w t8, t8, 3
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DST
+ b .END
+
+.LOOP_DST8:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ fldx.d f1, a3, t1
+ fldx.d f2, a3, t2
+ fldx.d f3, a3, t3
+ fldx.d f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ fldx.d f5, a3, t1
+ fldx.d f6, a3, t2
+ fldx.d f7, a3, t3
+ fldx.d f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vld vr13, a4, 64
+ vld vr14, a4, 80
+ vld vr15, a4, 96
+ vld vr16, a4, 112
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr2, vr0, vr2
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr4, vr0, vr4
+ vilvl.b vr5, vr0, vr5
+ vilvl.b vr6, vr0, vr6
+ vilvl.b vr7, vr0, vr7
+ vilvl.b vr8, vr0, vr8
+
+ vdp2.w.h vr17, vr1, vr9
+ vdp2.w.h vr18, vr2, vr10
+ vdp2.w.h vr19, vr3, vr11
+ vdp2.w.h vr21, vr4, vr12
+ vdp2.w.h vr1, vr5, vr13
+ vdp2.w.h vr2, vr6, vr14
+ vdp2.w.h vr3, vr7, vr15
+ vdp2.w.h vr4, vr8, vr16
+ vhaddw.d.w vr5, vr1, vr1
+ vhaddw.d.w vr6, vr2, vr2
+ vhaddw.d.w vr7, vr3, vr3
+ vhaddw.d.w vr8, vr4, vr4
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vhaddw.q.d vr5, vr5, vr5
+ vhaddw.q.d vr6, vr6, vr6
+ vhaddw.q.d vr7, vr7, vr7
+ vhaddw.q.d vr8, vr8, vr8
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.d vr1, vr3, vr1
+ vilvl.d vr5, vr7, vr5
+ vsrai.w vr1, vr1, 3
+ vsrai.w vr5, vr5, 3
+ vmin.w vr1, vr1, vr20
+ vmin.w vr5, vr5, vr20
+
+ vst vr1, a1, 0
+ vst vr5, a1, 16
+ addi.d a1, a1, 32
+ addi.d a5, a5, 32
+ addi.d a4, a4, 128
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_DST8
+ blt zero, a2, .REST8
+ b .END
+.REST8:
+ li.w t1, 0
+.DST8:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTER8:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTER8
+ srai.w t8, t8, 3
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DST8
+ b .END
+
+.LOOP_DST4:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ fldx.s f1, a3, t1
+ fldx.s f2, a3, t2
+ fldx.s f3, a3, t3
+ fldx.s f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ fldx.s f5, a3, t1
+ fldx.s f6, a3, t2
+ fldx.s f7, a3, t3
+ fldx.s f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr5, vr0, vr5
+ vilvl.b vr7, vr0, vr7
+
+ vdp2.w.h vr13, vr1, vr9
+ vdp2.w.h vr14, vr3, vr10
+ vdp2.w.h vr15, vr5, vr11
+ vdp2.w.h vr16, vr7, vr12
+ vhaddw.d.w vr13, vr13, vr13
+ vhaddw.d.w vr14, vr14, vr14
+ vhaddw.d.w vr15, vr15, vr15
+ vhaddw.d.w vr16, vr16, vr16
+ vpickev.w vr13, vr14, vr13
+ vpickev.w vr15, vr16, vr15
+ vsrai.w vr13, vr13, 3
+ vsrai.w vr15, vr15, 3
+ vmin.w vr13, vr13, vr20
+ vmin.w vr15, vr15, vr20
+
+ vst vr13, a1, 0
+ vst vr15, a1, 16
+ addi.d a1, a1, 32
+ addi.d a5, a5, 32
+ addi.d a4, a4, 64
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_DST4
+ blt zero, a2, .REST4
+ b .END
+.REST4:
+ li.w t1, 0
+.DST4:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTER4:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTER4
+ srai.w t8, t8, 3
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DST4
+ b .END
+.END_DST4:
+
+ li.w t1, 0
+.LOOP_DST1:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTER1:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTER1
+ srai.w t8, t8, 3
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .LOOP_DST1
+ b .END
+.END:
+
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ addi.d sp, sp, 72
+endfunc
+
+/* void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
+ * const uint8_t *src, const int16_t *filter,
+ * const int32_t *filterPos, int filterSize, int sh)
+ */
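+/* The _sub variant takes the accumulator shift in a7; the C wrapper
+ * ff_hscale_16_to_15_lsx() in swscale_lsx.c derives it from the source
+ * bit depth before calling in here. */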
+function ff_hscale_16_to_15_sub_lsx
+ addi.d sp, sp, -72
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+ li.w t0, 32767
+ li.w t8, 8
+ li.w t7, 4
+ vreplgr2vr.w vr20, t0
+ vreplgr2vr.w vr0, a7
+ beq a6, t7, .LOOP_HS15_DST4
+ beq a6, t8, .LOOP_HS15_DST8
+ blt t8, a6, .LOOP_HS15
+ b .END_HS15_DST4
+
+.LOOP_HS15:
+ li.w t1, 0
+ li.w s1, 0
+ li.w s2, 0
+ li.w s3, 0
+ li.w s4, 0
+ li.w s5, 0
+ vldi vr22, 0
+ addi.w s0, a6, -7
+ slli.w s7, a6, 1
+ slli.w s8, a6, 2
+ add.w t6, s7, s8
+.LOOP_HS15_DST:
+ ld.w t2, a5, 0
+ ld.w t3, a5, 4
+ ld.w t4, a5, 8
+ ld.w t5, a5, 12
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ slli.w t5, t5, 1
+ vldx vr1, a3, t2
+ vldx vr2, a3, t3
+ vldx vr3, a3, t4
+ vldx vr4, a3, t5
+ vld vr9, a4, 0
+ vldx vr10, a4, s7
+ vldx vr11, a4, s8
+ vldx vr12, a4, t6
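+ // Even/odd widening multiply-accumulate: unsigned 16-bit pixels times
+ // signed 16-bit coefficients, summed into 32-bit lanes.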
+ vmulwev.w.hu.h vr17, vr1, vr9
+ vmulwev.w.hu.h vr18, vr2, vr10
+ vmulwev.w.hu.h vr19, vr3, vr11
+ vmulwev.w.hu.h vr21, vr4, vr12
+ vmaddwod.w.hu.h vr17, vr1, vr9
+ vmaddwod.w.hu.h vr18, vr2, vr10
+ vmaddwod.w.hu.h vr19, vr3, vr11
+ vmaddwod.w.hu.h vr21, vr4, vr12
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.d vr1, vr3, vr1
+ vadd.w vr22, vr22, vr1
+ addi.w s1, s1, 8
+ addi.d a3, a3, 16
+ addi.d a4, a4, 16
+ blt s1, s0, .LOOP_HS15_DST
+ blt s1, a6, .HS15_DSTA
+ b .END_HS15_FILTERA
+.HS15_DSTA:
+ ld.w t2, a5, 0
+ li.w t3, 0
+ move s6, s1
+.HS15_FILTERA:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s2, s2, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS15_FILTERA
+
+ ld.w t2, a5, 4
+ li.w t3, 0
+ move s6, s1
+ addi.w t1, t1, 1
+.HS15_FILTERB:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s3, s3, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS15_FILTERB
+ ld.w t2, a5, 8
+ addi.w t1, t1, 1
+ li.w t3, 0
+ move s6, s1
+.HS15_FILTERC:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s4, s4, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS15_FILTERC
+ ld.w t2, a5, 12
+ addi.w t1, t1, 1
+ move s6, s1
+ li.w t3, 0
+.HS15_FILTERD:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s5, s5, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS15_FILTERD
+.END_HS15_FILTERA:
+ vpickve2gr.w t1, vr22, 0
+ vpickve2gr.w t2, vr22, 1
+ vpickve2gr.w t3, vr22, 2
+ vpickve2gr.w t4, vr22, 3
+ add.w s2, s2, t1
+ add.w s3, s3, t2
+ add.w s4, s4, t3
+ add.w s5, s5, t4
+ sra.w s2, s2, a7
+ sra.w s3, s3, a7
+ sra.w s4, s4, a7
+ sra.w s5, s5, a7
+ slt t1, s2, t0
+ slt t2, s3, t0
+ slt t3, s4, t0
+ slt t4, s5, t0
+ maskeqz s2, s2, t1
+ maskeqz s3, s3, t2
+ maskeqz s4, s4, t3
+ maskeqz s5, s5, t4
+ masknez t1, t0, t1
+ masknez t2, t0, t2
+ masknez t3, t0, t3
+ masknez t4, t0, t4
+ or s2, s2, t1
+ or s3, s3, t2
+ or s4, s4, t3
+ or s5, s5, t4
+ st.h s2, a1, 0
+ st.h s3, a1, 2
+ st.h s4, a1, 4
+ st.h s5, a1, 6
+
+ addi.d a1, a1, 8
+ sub.d a3, a3, s1
+ sub.d a3, a3, s1
+ addi.d a5, a5, 16
+ slli.d t3, a6, 3
+ add.d a4, a4, t3
+ sub.d a4, a4, s1
+ sub.d a4, a4, s1
+ addi.d a2, a2, -4
+ bge a2, t7, .LOOP_HS15
+ blt zero, a2, .HS15_RESA
+ b .HS15_END
+.HS15_RESA:
+ li.w t1, 0
+.HS15_DST:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS15_FILTER:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS15_FILTER
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS15_DST
+ b .HS15_END
+
+.LOOP_HS15_DST8:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ vldx vr1, a3, t1
+ vldx vr2, a3, t2
+ vldx vr3, a3, t3
+ vldx vr4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ vldx vr5, a3, t1
+ vldx vr6, a3, t2
+ vldx vr7, a3, t3
+ vldx vr8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vld vr13, a4, 64
+ vld vr14, a4, 80
+ vld vr15, a4, 96
+ vld vr16, a4, 112
+
+ vmulwev.w.hu.h vr17, vr1, vr9
+ vmulwev.w.hu.h vr18, vr2, vr10
+ vmulwev.w.hu.h vr19, vr3, vr11
+ vmulwev.w.hu.h vr21, vr4, vr12
+ vmaddwod.w.hu.h vr17, vr1, vr9
+ vmaddwod.w.hu.h vr18, vr2, vr10
+ vmaddwod.w.hu.h vr19, vr3, vr11
+ vmaddwod.w.hu.h vr21, vr4, vr12
+ vmulwev.w.hu.h vr1, vr5, vr13
+ vmulwev.w.hu.h vr2, vr6, vr14
+ vmulwev.w.hu.h vr3, vr7, vr15
+ vmulwev.w.hu.h vr4, vr8, vr16
+ vmaddwod.w.hu.h vr1, vr5, vr13
+ vmaddwod.w.hu.h vr2, vr6, vr14
+ vmaddwod.w.hu.h vr3, vr7, vr15
+ vmaddwod.w.hu.h vr4, vr8, vr16
+ vhaddw.d.w vr5, vr1, vr1
+ vhaddw.d.w vr6, vr2, vr2
+ vhaddw.d.w vr7, vr3, vr3
+ vhaddw.d.w vr8, vr4, vr4
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vhaddw.q.d vr5, vr5, vr5
+ vhaddw.q.d vr6, vr6, vr6
+ vhaddw.q.d vr7, vr7, vr7
+ vhaddw.q.d vr8, vr8, vr8
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.d vr1, vr3, vr1
+ vilvl.d vr5, vr7, vr5
+ vsra.w vr1, vr1, vr0
+ vsra.w vr5, vr5, vr0
+ vmin.w vr1, vr1, vr20
+ vmin.w vr5, vr5, vr20
+
+ vpickev.h vr1, vr5, vr1
+ vst vr1, a1, 0
+ addi.d a1, a1, 16
+ addi.d a5, a5, 32
+ addi.d a4, a4, 128
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_HS15_DST8
+ blt zero, a2, .HS15_REST8
+ b .HS15_END
+.HS15_REST8:
+ li.w t1, 0
+.HS15_DST8:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS15_FILTER8:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS15_FILTER8
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS15_DST8
+ b .HS15_END
+
+.LOOP_HS15_DST4:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ fldx.d f1, a3, t1
+ fldx.d f2, a3, t2
+ fldx.d f3, a3, t3
+ fldx.d f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ fldx.d f5, a3, t1
+ fldx.d f6, a3, t2
+ fldx.d f7, a3, t3
+ fldx.d f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vilvl.d vr1, vr2, vr1
+ vilvl.d vr3, vr4, vr3
+ vilvl.d vr5, vr6, vr5
+ vilvl.d vr7, vr8, vr7
+ vmulwev.w.hu.h vr13, vr1, vr9
+ vmulwev.w.hu.h vr14, vr3, vr10
+ vmulwev.w.hu.h vr15, vr5, vr11
+ vmulwev.w.hu.h vr16, vr7, vr12
+ vmaddwod.w.hu.h vr13, vr1, vr9
+ vmaddwod.w.hu.h vr14, vr3, vr10
+ vmaddwod.w.hu.h vr15, vr5, vr11
+ vmaddwod.w.hu.h vr16, vr7, vr12
+ vhaddw.d.w vr13, vr13, vr13
+ vhaddw.d.w vr14, vr14, vr14
+ vhaddw.d.w vr15, vr15, vr15
+ vhaddw.d.w vr16, vr16, vr16
+ vpickev.w vr13, vr14, vr13
+ vpickev.w vr15, vr16, vr15
+ vsra.w vr13, vr13, vr0
+ vsra.w vr15, vr15, vr0
+ vmin.w vr13, vr13, vr20
+ vmin.w vr15, vr15, vr20
+
+ vpickev.h vr13, vr15, vr13
+ vst vr13, a1, 0
+ addi.d a1, a1, 16
+ addi.d a5, a5, 32
+ addi.d a4, a4, 64
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_HS15_DST4
+ blt zero, a2, .HS15_REST4
+ b .HS15_END
+.HS15_REST4:
+ li.w t1, 0
+.HS15_DST4:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS15_FILTER4:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS15_FILTER4
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS15_DST4
+ b .HS15_END
+.END_HS15_DST4:
+
+ li.w t1, 0
+.LOOP_HS15_DST1:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS15_FILTER1:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS15_FILTER1
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .LOOP_HS15_DST1
+ b .HS15_END
+.HS15_END:
+
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ addi.d sp, sp, 72
+endfunc
+
+/* void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
+ * const uint8_t *src, const int16_t *filter,
+ * const int32_t *filterPos, int filterSize, int sh)
+ */
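+/* As ff_hscale_16_to_15_sub_lsx, but clamping to (1 << 19) - 1 and storing
+ * 32-bit values; the shift count is again passed in a7. */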
+function ff_hscale_16_to_19_sub_lsx
+ addi.d sp, sp, -72
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+
+ li.w t0, 524287
+ li.w t8, 8
+ li.w t7, 4
+ vreplgr2vr.w vr20, t0
+ vreplgr2vr.w vr0, a7
+ beq a6, t7, .LOOP_HS19_DST4
+ beq a6, t8, .LOOP_HS19_DST8
+ blt t8, a6, .LOOP_HS19
+ b .END_HS19_DST4
+
+.LOOP_HS19:
+ li.w t1, 0
+ li.w s1, 0
+ li.w s2, 0
+ li.w s3, 0
+ li.w s4, 0
+ li.w s5, 0
+ vldi vr22, 0
+ addi.w s0, a6, -7
+ slli.w s7, a6, 1
+ slli.w s8, a6, 2
+ add.w t6, s7, s8
+.LOOP_HS19_DST:
+ ld.w t2, a5, 0
+ ld.w t3, a5, 4
+ ld.w t4, a5, 8
+ ld.w t5, a5, 12
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ slli.w t5, t5, 1
+ vldx vr1, a3, t2
+ vldx vr2, a3, t3
+ vldx vr3, a3, t4
+ vldx vr4, a3, t5
+ vld vr9, a4, 0
+ vldx vr10, a4, s7
+ vldx vr11, a4, s8
+ vldx vr12, a4, t6
+ vmulwev.w.hu.h vr17, vr1, vr9
+ vmulwev.w.hu.h vr18, vr2, vr10
+ vmulwev.w.hu.h vr19, vr3, vr11
+ vmulwev.w.hu.h vr21, vr4, vr12
+ vmaddwod.w.hu.h vr17, vr1, vr9
+ vmaddwod.w.hu.h vr18, vr2, vr10
+ vmaddwod.w.hu.h vr19, vr3, vr11
+ vmaddwod.w.hu.h vr21, vr4, vr12
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.d vr1, vr3, vr1
+ vadd.w vr22, vr22, vr1
+ addi.w s1, s1, 8
+ addi.d a3, a3, 16
+ addi.d a4, a4, 16
+ blt s1, s0, .LOOP_HS19_DST
+ blt s1, a6, .HS19_DSTA
+ b .END_HS19_FILTERA
+.HS19_DSTA:
+ ld.w t2, a5, 0
+ li.w t3, 0
+ move s6, s1
+.HS19_FILTERA:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s2, s2, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS19_FILTERA
+
+ ld.w t2, a5, 4
+ li.w t3, 0
+ move s6, s1
+ addi.w t1, t1, 1
+.HS19_FILTERB:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s3, s3, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS19_FILTERB
+ ld.w t2, a5, 8
+ addi.w t1, t1, 1
+ li.w t3, 0
+ move s6, s1
+.HS19_FILTERC:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s4, s4, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS19_FILTERC
+ ld.w t2, a5, 12
+ addi.w t1, t1, 1
+ move s6, s1
+ li.w t3, 0
+.HS19_FILTERD:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s5, s5, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS19_FILTERD
+.END_HS19_FILTERA:
+ vpickve2gr.w t1, vr22, 0
+ vpickve2gr.w t2, vr22, 1
+ vpickve2gr.w t3, vr22, 2
+ vpickve2gr.w t4, vr22, 3
+ add.w s2, s2, t1
+ add.w s3, s3, t2
+ add.w s4, s4, t3
+ add.w s5, s5, t4
+ sra.w s2, s2, a7
+ sra.w s3, s3, a7
+ sra.w s4, s4, a7
+ sra.w s5, s5, a7
+ slt t1, s2, t0
+ slt t2, s3, t0
+ slt t3, s4, t0
+ slt t4, s5, t0
+ maskeqz s2, s2, t1
+ maskeqz s3, s3, t2
+ maskeqz s4, s4, t3
+ maskeqz s5, s5, t4
+ masknez t1, t0, t1
+ masknez t2, t0, t2
+ masknez t3, t0, t3
+ masknez t4, t0, t4
+ or s2, s2, t1
+ or s3, s3, t2
+ or s4, s4, t3
+ or s5, s5, t4
+ st.w s2, a1, 0
+ st.w s3, a1, 4
+ st.w s4, a1, 8
+ st.w s5, a1, 12
+
+ addi.d a1, a1, 16
+ sub.d a3, a3, s1
+ sub.d a3, a3, s1
+ addi.d a5, a5, 16
+ slli.d t3, a6, 3
+ add.d a4, a4, t3
+ sub.d a4, a4, s1
+ sub.d a4, a4, s1
+ addi.d a2, a2, -4
+ bge a2, t7, .LOOP_HS19
+ blt zero, a2, .HS19_RESA
+ b .HS19_END
+.HS19_RESA:
+ li.w t1, 0
+.HS19_DST:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS19_FILTER:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS19_FILTER
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS19_DST
+ b .HS19_END
+
+.LOOP_HS19_DST8:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ vldx vr1, a3, t1
+ vldx vr2, a3, t2
+ vldx vr3, a3, t3
+ vldx vr4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ vldx vr5, a3, t1
+ vldx vr6, a3, t2
+ vldx vr7, a3, t3
+ vldx vr8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vld vr13, a4, 64
+ vld vr14, a4, 80
+ vld vr15, a4, 96
+ vld vr16, a4, 112
+ vmulwev.w.hu.h vr17, vr1, vr9
+ vmulwev.w.hu.h vr18, vr2, vr10
+ vmulwev.w.hu.h vr19, vr3, vr11
+ vmulwev.w.hu.h vr21, vr4, vr12
+ vmaddwod.w.hu.h vr17, vr1, vr9
+ vmaddwod.w.hu.h vr18, vr2, vr10
+ vmaddwod.w.hu.h vr19, vr3, vr11
+ vmaddwod.w.hu.h vr21, vr4, vr12
+ vmulwev.w.hu.h vr1, vr5, vr13
+ vmulwev.w.hu.h vr2, vr6, vr14
+ vmulwev.w.hu.h vr3, vr7, vr15
+ vmulwev.w.hu.h vr4, vr8, vr16
+ vmaddwod.w.hu.h vr1, vr5, vr13
+ vmaddwod.w.hu.h vr2, vr6, vr14
+ vmaddwod.w.hu.h vr3, vr7, vr15
+ vmaddwod.w.hu.h vr4, vr8, vr16
+ vhaddw.d.w vr5, vr1, vr1
+ vhaddw.d.w vr6, vr2, vr2
+ vhaddw.d.w vr7, vr3, vr3
+ vhaddw.d.w vr8, vr4, vr4
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vhaddw.q.d vr5, vr5, vr5
+ vhaddw.q.d vr6, vr6, vr6
+ vhaddw.q.d vr7, vr7, vr7
+ vhaddw.q.d vr8, vr8, vr8
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.d vr1, vr3, vr1
+ vilvl.d vr5, vr7, vr5
+ vsra.w vr1, vr1, vr0
+ vsra.w vr5, vr5, vr0
+ vmin.w vr1, vr1, vr20
+ vmin.w vr5, vr5, vr20
+
+ vst vr1, a1, 0
+ vst vr5, a1, 16
+ addi.d a1, a1, 32
+ addi.d a5, a5, 32
+ addi.d a4, a4, 128
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_HS19_DST8
+ blt zero, a2, .HS19_REST8
+ b .HS19_END
+.HS19_REST8:
+ li.w t1, 0
+.HS19_DST8:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS19_FILTER8:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS19_FILTER8
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS19_DST8
+ b .HS19_END
+
+.LOOP_HS19_DST4:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ fldx.d f1, a3, t1
+ fldx.d f2, a3, t2
+ fldx.d f3, a3, t3
+ fldx.d f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ fldx.d f5, a3, t1
+ fldx.d f6, a3, t2
+ fldx.d f7, a3, t3
+ fldx.d f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vilvl.d vr1, vr2, vr1
+ vilvl.d vr3, vr4, vr3
+ vilvl.d vr5, vr6, vr5
+ vilvl.d vr7, vr8, vr7
+ vmulwev.w.hu.h vr13, vr1, vr9
+ vmulwev.w.hu.h vr14, vr3, vr10
+ vmulwev.w.hu.h vr15, vr5, vr11
+ vmulwev.w.hu.h vr16, vr7, vr12
+ vmaddwod.w.hu.h vr13, vr1, vr9
+ vmaddwod.w.hu.h vr14, vr3, vr10
+ vmaddwod.w.hu.h vr15, vr5, vr11
+ vmaddwod.w.hu.h vr16, vr7, vr12
+ vhaddw.d.w vr13, vr13, vr13
+ vhaddw.d.w vr14, vr14, vr14
+ vhaddw.d.w vr15, vr15, vr15
+ vhaddw.d.w vr16, vr16, vr16
+ vpickev.w vr13, vr14, vr13
+ vpickev.w vr15, vr16, vr15
+ vsra.w vr13, vr13, vr0
+ vsra.w vr15, vr15, vr0
+ vmin.w vr13, vr13, vr20
+ vmin.w vr15, vr15, vr20
+
+ vst vr13, a1, 0
+ vst vr15, a1, 16
+ addi.d a1, a1, 32
+ addi.d a5, a5, 32
+ addi.d a4, a4, 64
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_HS19_DST4
+ blt zero, a2, .HS19_REST4
+ b .HS19_END
+.HS19_REST4:
+ li.w t1, 0
+.HS19_DST4:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS19_FILTER4:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS19_FILTER4
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS19_DST4
+ b .HS19_END
+.END_HS19_DST4:
+
+ li.w t1, 0
+.LOOP_HS19_DST1:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS19_FILTER1:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS19_FILTER1
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .LOOP_HS19_DST1
+ b .HS19_END
+.HS19_END:
+
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ addi.d sp, sp, 72
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 97fe947e2e..c13a1662ec 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -27,8 +27,33 @@
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_lsx(cpu_flags)) {
+ ff_sws_init_output_lsx(c);
+ if (c->srcBpc == 8) {
+ if (c->dstBpc <= 14) {
+ c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
+ } else {
+ c->hyScale = c->hcScale = ff_hscale_8_to_19_lsx;
+ }
+ } else {
+ c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
+ : ff_hscale_16_to_15_lsx;
+ }
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_GBRAP:
+ case AV_PIX_FMT_GBRP:
+ {
+ c->readChrPlanar = planar_rgb_to_uv_lsx;
+ c->readLumPlanar = planar_rgb_to_y_lsx;
+ }
+ break;
+ }
+ if (c->dstBpc == 8)
+ c->yuv2planeX = ff_yuv2planeX_8_lsx;
+ }
+#if HAVE_LASX
if (have_lasx(cpu_flags)) {
- ff_sws_init_output_loongarch(c);
+ ff_sws_init_output_lasx(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -51,17 +76,21 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
if (c->dstBpc == 8)
c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
+#endif // #if HAVE_LASX
}
av_cold void rgb2rgb_init_loongarch(void)
{
+#if HAVE_LASX
int cpu_flags = av_get_cpu_flags();
if (have_lasx(cpu_flags))
interleaveBytes = ff_interleave_bytes_lasx;
+#endif // #if HAVE_LASX
}
av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
{
+#if HAVE_LASX
int cpu_flags = av_get_cpu_flags();
if (have_lasx(cpu_flags)) {
switch (c->dstFormat) {
@@ -91,5 +120,6 @@ av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
return yuv420_abgr32_lasx;
}
}
+#endif // #if HAVE_LASX
return NULL;
}
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index c52eb1016b..bc29913ac6 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -24,7 +24,45 @@
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
+#include "config.h"
+void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize, int sh);
+
+void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize, int sh);
+
+void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+ int width, int32_t *rgb2yuv, void *opq);
+
+void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
+ int32_t *rgb2yuv, void *opq);
+
+void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c);
+
+#if HAVE_LASX
void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
@@ -69,10 +107,11 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
-av_cold void ff_sws_init_output_loongarch(SwsContext *c);
-
void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
+av_cold void ff_sws_init_output_lasx(SwsContext *c);
+#endif // #if HAVE_LASX
+
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/loongarch/swscale_lsx.c b/libswscale/loongarch/swscale_lsx.c
new file mode 100644
index 0000000000..da8eabfca3
--- /dev/null
+++ b/libswscale/loongarch/swscale_lsx.c
@@ -0,0 +1,57 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+
+void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int sh = desc->comp[0].depth - 1;
+
+    if (sh < 15) {
+        sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
+             (desc->comp[0].depth - 1);
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
+        sh = 15;
+    }
+    ff_hscale_16_to_15_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
+}
+
+void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int bits = desc->comp[0].depth - 1;
+    int sh = bits - 4;
+
+    if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) {
+        sh = 9;
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
+        sh = 16 - 1 - 4;
+    }
+    ff_hscale_16_to_19_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
+}
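As a quick cross-check of the shift selection in the two wrappers above, a few
worked examples (my own, derived from the code, not from the patch
description):

    yuv420p10 (depth 10, planar YUV):  16_to_15 -> sh = 9,  16_to_19 -> sh = 5
    rgb48le   (depth 16, packed RGB):  16_to_15 -> sh = 15, 16_to_19 -> sh = 11
    grayf32le (float, FLAG_FLOAT set): 16_to_15 -> sh = 15, 16_to_19 -> sh = 11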
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 925c536bf1..b02e6cdc64 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -653,7 +653,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
filterAlign = 1;
}
- if (have_lasx(cpu_flags)) {
+ if (have_lasx(cpu_flags) || have_lsx(cpu_flags)) {
int reNum = minFilterSize & (0x07);
if (minFilterSize < 5)
@@ -1806,6 +1806,7 @@ static av_cold int sws_init_single_context(SwsContext *c, SwsFilter *srcFilter,
const int filterAlign = X86_MMX(cpu_flags) ? 4 :
PPC_ALTIVEC(cpu_flags) ? 8 :
have_neon(cpu_flags) ? 4 :
+ have_lsx(cpu_flags) ? 8 :
have_lasx(cpu_flags) ? 8 : 1;
if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
--
2.20.1