[FFmpeg-devel] [PATCH v2 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c files

Sat Sep 3 05:23:28 EEST 2022

ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -pix_fmt rgb24 -y /dev/null -an
before: 178fps
after:  210fps

Signed-off-by: Hao Chen <chenhao at loongson.cn>
---
 libswscale/loongarch/Makefile                 |   2 +
 libswscale/loongarch/rgb2rgb_lasx.c           |  52 +++
 libswscale/loongarch/swscale_init_loongarch.c |  42 +++
 libswscale/loongarch/swscale_loongarch.h      |  22 ++
 libswscale/loongarch/yuv2rgb_lasx.c           | 321 ++++++++++++++++++
 libswscale/rgb2rgb.c                          |   2 +
 libswscale/rgb2rgb.h                          |   1 +
 libswscale/yuv2rgb.c                          |   2 +
 8 files changed, 444 insertions(+)
 create mode 100644 libswscale/loongarch/rgb2rgb_lasx.c
 create mode 100644 libswscale/loongarch/yuv2rgb_lasx.c

diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index 586a1717b6..4345971514 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -1,3 +1,5 @@
 OBJS-$(CONFIG_SWSCALE)      += loongarch/swscale_init_loongarch.o
 LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
                                loongarch/input_lasx.o   \
+                               loongarch/yuv2rgb_lasx.o \
+                               loongarch/rgb2rgb_lasx.o
diff --git a/libswscale/loongarch/rgb2rgb_lasx.c b/libswscale/loongarch/rgb2rgb_lasx.c
new file mode 100644
index 0000000000..1b6be90217
--- /dev/null
+++ b/libswscale/loongarch/rgb2rgb_lasx.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao at loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
+                              uint8_t *dest, int width, int height,
+                              int src1Stride, int src2Stride, int dstStride)
+{
+    int h;
+    int len = width & (0xFFFFFFF0);
+
+    for (h = 0; h < height; h++) {
+        int w, index = 0;
+        __m256i src_1, src_2, dst;
+
+        for (w = 0; w < len; w += 16) {
+            DUP2_ARG2(__lasx_xvld, src1 + w, 0, src2 + w, 0, src_1, src_2);
+            src_1 = __lasx_xvpermi_d(src_1, 0xD8);
+            src_2 = __lasx_xvpermi_d(src_2, 0xD8);
+            dst   = __lasx_xvilvl_b(src_2, src_1);
+            __lasx_xvst(dst, dest + index, 0);
+            index  += 32;
+        }
+        for (; w < width; w++) {
+            dest[(w << 1) + 0] = src1[w];
+            dest[(w << 1) + 1] = src2[w];
+        }
+        dest += dstStride;
+        src1 += src1Stride;
+        src2 += src2Stride;
+    }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 197dc6e1e7..1e0bb1b116 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -21,6 +21,7 @@
 
 #include "swscale_loongarch.h"
 #include "libswscale/swscale_internal.h"
+#include "libswscale/rgb2rgb.h"
 #include "libavutil/loongarch/cpu.h"
 
 av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
@@ -48,3 +49,44 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
         }
     }
 }
+
+av_cold void rgb2rgb_init_loongarch(void)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (have_lasx(cpu_flags))
+        interleaveBytes = ff_interleave_bytes_lasx;
+}
+
+av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (have_lasx(cpu_flags)) {
+        switch (c->dstFormat) {
+            case AV_PIX_FMT_RGB24:
+                return yuv420_rgb24_lasx;
+            case AV_PIX_FMT_BGR24:
+                return yuv420_bgr24_lasx;
+            case AV_PIX_FMT_RGBA:
+                if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+                    break;
+                } else
+                    return yuv420_rgba32_lasx;
+            case AV_PIX_FMT_ARGB:
+                if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+                    break;
+                } else
+                    return yuv420_argb32_lasx;
+            case AV_PIX_FMT_BGRA:
+                if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+                    break;
+                } else
+                    return yuv420_bgra32_lasx;
+            case AV_PIX_FMT_ABGR:
+                if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+                    break;
+                } else
+                    return yuv420_abgr32_lasx;
+        }
+    }
+    return NULL;
+}
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 67e24d544c..07a8145da2 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -47,4 +47,26 @@ void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4]
 void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
                           int32_t *rgb2yuv);
 
+int yuv420_rgb24_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgr24_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_rgba32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                       int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgra32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                       int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_argb32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                       int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_abgr32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                       int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
+                              uint8_t *dest, int width, int height,
+                              int src1Stride, int src2Stride, int dstStride);
+
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/loongarch/yuv2rgb_lasx.c b/libswscale/loongarch/yuv2rgb_lasx.c
new file mode 100644
index 0000000000..4ea47f10db
--- /dev/null
+++ b/libswscale/loongarch/yuv2rgb_lasx.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao at loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+#define YUV2RGB_LOAD_COE                                     \
+    /* Load x_offset */                                      \
+    __m256i y_offset = __lasx_xvreplgr2vr_d(c->yOffset);     \
+    __m256i u_offset = __lasx_xvreplgr2vr_d(c->uOffset);     \
+    __m256i v_offset = __lasx_xvreplgr2vr_d(c->vOffset);     \
+    /* Load x_coeff  */                                      \
+    __m256i ug_coeff = __lasx_xvreplgr2vr_d(c->ugCoeff);     \
+    __m256i vg_coeff = __lasx_xvreplgr2vr_d(c->vgCoeff);     \
+    __m256i y_coeff  = __lasx_xvreplgr2vr_d(c->yCoeff);      \
+    __m256i ub_coeff = __lasx_xvreplgr2vr_d(c->ubCoeff);     \
+    __m256i vr_coeff = __lasx_xvreplgr2vr_d(c->vrCoeff);     \
+
+#define LOAD_YUV_16                                          \
+    m_y1 = __lasx_xvld(py_1, 0);                             \
+    m_y2 = __lasx_xvld(py_2, 0);                             \
+    m_u  = __lasx_xvldrepl_d(pu, 0);                         \
+    m_v  = __lasx_xvldrepl_d(pv, 0);                         \
+    m_u  = __lasx_xvilvl_b(m_u, m_u);                        \
+    m_v  = __lasx_xvilvl_b(m_v, m_v);                        \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, m_y1, m_y2, m_u, m_v,    \
+              m_y1, m_y2, m_u, m_v);                         \
+
+/* YUV2RGB method
+ * The conversion method is as follows:
+ * R = Y' * y_coeff + V' * vr_coeff
+ * G = Y' * y_coeff + V' * vg_coeff + U' * ug_coeff
+ * B = Y' * y_coeff + U' * ub_coeff
+ *
+ * where X' = X * 8 - x_offset
+ *
+ */
+
+#define YUV2RGB                                                            \
+    m_y1 = __lasx_xvslli_h(m_y1, 3);                                       \
+    m_y2 = __lasx_xvslli_h(m_y2, 3);                                       \
+    m_u  = __lasx_xvslli_h(m_u, 3);                                        \
+    m_v  = __lasx_xvslli_h(m_v, 3);                                        \
+    m_y1 = __lasx_xvsub_h(m_y1, y_offset);                                 \
+    m_y2 = __lasx_xvsub_h(m_y2, y_offset);                                 \
+    m_u  = __lasx_xvsub_h(m_u, u_offset);                                  \
+    m_v  = __lasx_xvsub_h(m_v, v_offset);                                  \
+    y_1  = __lasx_xvmuh_h(m_y1, y_coeff);                                  \
+    y_2  = __lasx_xvmuh_h(m_y2, y_coeff);                                  \
+    u2g  = __lasx_xvmuh_h(m_u, ug_coeff);                                  \
+    u2b  = __lasx_xvmuh_h(m_u, ub_coeff);                                  \
+    v2r  = __lasx_xvmuh_h(m_v, vr_coeff);                                  \
+    v2g  = __lasx_xvmuh_h(m_v, vg_coeff);                                  \
+    r1   = __lasx_xvsadd_h(y_1, v2r);                                      \
+    v2g  = __lasx_xvsadd_h(v2g, u2g);                                      \
+    g1   = __lasx_xvsadd_h(y_1, v2g);                                      \
+    b1   = __lasx_xvsadd_h(y_1, u2b);                                      \
+    r2   = __lasx_xvsadd_h(y_2, v2r);                                      \
+    g2   = __lasx_xvsadd_h(y_2, v2g);                                      \
+    b2   = __lasx_xvsadd_h(y_2, u2b);                                      \
+    DUP4_ARG1(__lasx_xvclip255_h, r1, g1, b1, r2, r1, g1, b1, r2);         \
+    DUP2_ARG1(__lasx_xvclip255_h, g2, b2, g2, b2);                         \
+
+#define YUV2RGB_RES                                                        \
+    m_y1 = __lasx_xvldrepl_d(py_1, 0);                       \
+    m_y2 = __lasx_xvldrepl_d(py_2, 0);                       \
+    m_u  = __lasx_xvldrepl_d(pu, 0);                         \
+    m_v  = __lasx_xvldrepl_d(pv, 0);                         \
+    m_y1 = __lasx_xvilvl_d(m_y2, m_y1);                      \
+    m_u  = __lasx_xvilvl_b(m_u, m_u);                        \
+    m_v  = __lasx_xvilvl_b(m_v, m_v);                        \
+    m_y1 = __lasx_vext2xv_hu_bu(m_y1);                       \
+    m_u  = __lasx_vext2xv_hu_bu(m_u);                        \
+    m_v  = __lasx_vext2xv_hu_bu(m_v);                        \
+    m_y1 = __lasx_xvslli_h(m_y1, 3);                         \
+    m_u  = __lasx_xvslli_h(m_u, 3);                          \
+    m_v  = __lasx_xvslli_h(m_v, 3);                          \
+    m_y1 = __lasx_xvsub_h(m_y1, y_offset);                   \
+    m_u  = __lasx_xvsub_h(m_u, u_offset);                    \
+    m_v  = __lasx_xvsub_h(m_v, v_offset);                    \
+    y_1  = __lasx_xvmuh_h(m_y1, y_coeff);                    \
+    u2g  = __lasx_xvmuh_h(m_u, ug_coeff);                    \
+    u2b  = __lasx_xvmuh_h(m_u, ub_coeff);                    \
+    v2r  = __lasx_xvmuh_h(m_v, vr_coeff);                    \
+    v2g  = __lasx_xvmuh_h(m_v, vg_coeff);                    \
+    r1   = __lasx_xvsadd_h(y_1, v2r);                        \
+    v2g  = __lasx_xvsadd_h(v2g, u2g);                        \
+    g1   = __lasx_xvsadd_h(y_1, v2g);                        \
+    b1   = __lasx_xvsadd_h(y_1, u2b);                        \
+    r1   = __lasx_xvclip255_h(r1);                           \
+    g1   = __lasx_xvclip255_h(g1);                           \
+    b1   = __lasx_xvclip255_h(b1);                           \
+
+#define RGB_PACK(r, g, b, rgb_l, rgb_h)                                    \
+{                                                                          \
+    __m256i rg;                                                            \
+    rg = __lasx_xvpackev_b(g, r);                                          \
+    DUP2_ARG3(__lasx_xvshuf_b, b, rg, shuf2, b, rg, shuf3, rgb_l, rgb_h);  \
+}
+
+#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h)                               \
+{                                                                          \
+    __m256i ra, bg, tmp0, tmp1;                                            \
+    ra    = __lasx_xvpackev_b(r, a);                                       \
+    bg    = __lasx_xvpackev_b(b, g);                                       \
+    tmp0  = __lasx_xvilvl_h(bg, ra);                                       \
+    tmp1  = __lasx_xvilvh_h(bg, ra);                                       \
+    rgb_l = __lasx_xvpermi_q(tmp1, tmp0, 0x20);                            \
+    rgb_h = __lasx_xvpermi_q(tmp1, tmp0, 0x31);                            \
+}
+
+#define RGB_STORE_RES(rgb_l, rgb_h, image_1, image_2)                      \
+{                                                                          \
+    __lasx_xvstelm_d(rgb_l, image_1, 0,  0);                               \
+    __lasx_xvstelm_d(rgb_l, image_1, 8,  1);                               \
+    __lasx_xvstelm_d(rgb_h, image_1, 16, 0);                               \
+    __lasx_xvstelm_d(rgb_l, image_2, 0,  2);                               \
+    __lasx_xvstelm_d(rgb_l, image_2, 8,  3);                               \
+    __lasx_xvstelm_d(rgb_h, image_2, 16, 2);                               \
+}
+
+#define RGB_STORE(rgb_l, rgb_h, image)                                       \
+{                                                                            \
+    __lasx_xvstelm_d(rgb_l, image, 0,  0);                                   \
+    __lasx_xvstelm_d(rgb_l, image, 8,  1);                                   \
+    __lasx_xvstelm_d(rgb_h, image, 16, 0);                                   \
+    __lasx_xvstelm_d(rgb_l, image, 24, 2);                                   \
+    __lasx_xvstelm_d(rgb_l, image, 32, 3);                                   \
+    __lasx_xvstelm_d(rgb_h, image, 40, 2);                                   \
+}
+
+#define RGB32_STORE(rgb_l, rgb_h, image)                                     \
+{                                                                            \
+    __lasx_xvst(rgb_l, image, 0);                                            \
+    __lasx_xvst(rgb_h, image, 32);                                           \
+}
+
+#define RGB32_STORE_RES(rgb_l, rgb_h, image_1, image_2)                      \
+{                                                                            \
+    __lasx_xvst(rgb_l, image_1, 0);                                          \
+    __lasx_xvst(rgb_h, image_2, 0);                                          \
+}
+
+#define YUV2RGBFUNC(func_name, dst_type, alpha)                                     \
+           int func_name(SwsContext *c, const uint8_t *src[],                       \
+                         int srcStride[], int srcSliceY, int srcSliceH,             \
+                         uint8_t *dst[], int dstStride[])                           \
+{                                                                                   \
+    int x, y, h_size, vshift, res;                                                  \
+    __m256i m_y1, m_y2, m_u, m_v;                                                   \
+    __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                           \
+    __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                                 \
+    __m256i shuf2 = {0x0504120302100100, 0x0A18090816070614,                        \
+                     0x0504120302100100, 0x0A18090816070614};                       \
+    __m256i shuf3 = {0x1E0F0E1C0D0C1A0B, 0x0101010101010101,                        \
+                     0x1E0F0E1C0D0C1A0B, 0x0101010101010101};                       \
+    y      = (c->dstW + 7) & ~7;                                                    \
+    h_size = y >> 4;                                                                \
+    res    = y & 15;                                                                \
+    YUV2RGB_LOAD_COE                                                                \
+                                                                                    \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                                    \
+    for (y = 0; y < srcSliceH; y += 2) {                                            \
+        dst_type *image1    = (dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]);\
+        dst_type *image2    = (dst_type *)(image1 +                   dstStride[0]);\
+        const uint8_t *py_1 = src[0] +               y * srcStride[0];              \
+        const uint8_t *py_2 = py_1   +                   srcStride[0];              \
+        const uint8_t *pu   = src[1] +   (y >> vshift) * srcStride[1];              \
+        const uint8_t *pv   = src[2] +   (y >> vshift) * srcStride[2];              \
+        for(x = 0; x < h_size; x++) {                                               \
+
+#define YUV2RGBFUNC32(func_name, dst_type, alpha)                                   \
+           int func_name(SwsContext *c, const uint8_t *src[],                       \
+                         int srcStride[], int srcSliceY, int srcSliceH,             \
+                         uint8_t *dst[], int dstStride[])                           \
+{                                                                                   \
+    int x, y, h_size, vshift, res;                                                  \
+    __m256i m_y1, m_y2, m_u, m_v;                                                   \
+    __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                           \
+    __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                                 \
+    __m256i a = __lasx_xvldi(0xFF);                                                 \
+                                                                                    \
+    y      = (c->dstW + 7) & ~7;                                                    \
+    h_size = y >> 4;                                                                \
+    res    = y & 15;                                                                \
+    YUV2RGB_LOAD_COE                                                                \
+                                                                                    \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                                    \
+    for (y = 0; y < srcSliceH; y += 2) {                                            \
+        int yd = y + srcSliceY;                                                     \
+        dst_type av_unused *r, *g, *b;                                              \
+        dst_type *image1    = (dst_type *)(dst[0] + (yd)     * dstStride[0]);       \
+        dst_type *image2    = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]);       \
+        const uint8_t *py_1 = src[0] +               y * srcStride[0];              \
+        const uint8_t *py_2 = py_1   +                   srcStride[0];              \
+        const uint8_t *pu   = src[1] +   (y >> vshift) * srcStride[1];              \
+        const uint8_t *pv   = src[2] +   (y >> vshift) * srcStride[2];              \
+        for(x = 0; x < h_size; x++) {                                               \
+
+#define DEALYUV2RGBREMAIN                                                           \
+            py_1 += 16;                                                             \
+            py_2 += 16;                                                             \
+            pu += 8;                                                                \
+            pv += 8;                                                                \
+            image1 += 48;                                                           \
+            image2 += 48;                                                           \
+        }                                                                           \
+        if (res) {                                                                  \
+
+#define DEALYUV2RGBREMAIN32                                                         \
+            py_1 += 16;                                                             \
+            py_2 += 16;                                                             \
+            pu += 8;                                                                \
+            pv += 8;                                                                \
+            image1 += 16;                                                           \
+            image2 += 16;                                                           \
+        }                                                                           \
+        if (res) {                                                                  \
+
+
+#define END_FUNC()                          \
+        }                                   \
+    }                                       \
+    return srcSliceH;                       \
+}
+
+YUV2RGBFUNC(yuv420_rgb24_lasx, uint8_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+    RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
+    RGB_STORE(rgb1_l, rgb1_h, image1);
+    RGB_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN
+    YUV2RGB_RES
+    RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+    RGB_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+    END_FUNC()
+
+YUV2RGBFUNC(yuv420_bgr24_lasx, uint8_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+    RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
+    RGB_STORE(rgb1_l, rgb1_h, image1);
+    RGB_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN
+    YUV2RGB_RES
+    RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+    RGB_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_rgba32_lasx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+    RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN32
+    YUV2RGB_RES
+    RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+    RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_bgra32_lasx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+    RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN32
+    YUV2RGB_RES
+    RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+    RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_argb32_lasx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+    RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN32
+    YUV2RGB_RES
+    RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+    RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_abgr32_lasx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+    RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN32
+    YUV2RGB_RES
+    RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+    RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+    END_FUNC()
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 4f1ac9c465..3af775b389 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -141,6 +141,8 @@ av_cold void ff_sws_rgb2rgb_init(void)
     rgb2rgb_init_aarch64();
 #elif ARCH_X86
     rgb2rgb_init_x86();
+#elif ARCH_LOONGARCH64
+    rgb2rgb_init_loongarch();
 #endif
 }
 
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 7272e98c57..db85bfc42f 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -168,5 +168,6 @@ void ff_sws_rgb2rgb_init(void);
 
 void rgb2rgb_init_aarch64(void);
 void rgb2rgb_init_x86(void);
+void rgb2rgb_init_loongarch(void);
 
 #endif /* SWSCALE_RGB2RGB_H */
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 6ee483d12a..9c3f5e23c6 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -683,6 +683,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
     t = ff_yuv2rgb_init_ppc(c);
 #elif ARCH_X86
     t = ff_yuv2rgb_init_x86(c);
+#elif ARCH_LOONGARCH64
+    t = ff_yuv2rgb_init_loongarch(c);
 #endif
 
     if (t)
-- 
2.20.1