[FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.

Mon Aug 29 15:30:42 EEST 2022

Hao Chen:
> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
> rgb24 -y /dev/null -an
> before: 150fps
> after:  183fps
> 
> Signed-off-by: Hao Chen <chenhao at loongson.cn>
> ---
>  libswscale/loongarch/Makefile                 |    3 +-
>  libswscale/loongarch/output_lasx.c            | 1982 +++++++++++++++++
>  libswscale/loongarch/swscale_init_loongarch.c |    3 +
>  libswscale/loongarch/swscale_loongarch.h      |    6 +
>  4 files changed, 1993 insertions(+), 1 deletion(-)
>  create mode 100644 libswscale/loongarch/output_lasx.c
> 
> diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
> index 4345971514..54d48b3de0 100644
> --- a/libswscale/loongarch/Makefile
> +++ b/libswscale/loongarch/Makefile
> @@ -2,4 +2,5 @@ OBJS-$(CONFIG_SWSCALE)      += loongarch/swscale_init_loongarch.o
>  LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
>                                 loongarch/input_lasx.o   \
>                                 loongarch/yuv2rgb_lasx.o \
> -                               loongarch/rgb2rgb_lasx.o
> +                               loongarch/rgb2rgb_lasx.o \
> +							   loongarch/output_lasx.o
> diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
> new file mode 100644
> index 0000000000..19f82692ff
> --- /dev/null
> +++ b/libswscale/loongarch/output_lasx.c
> @@ -0,0 +1,1982 @@
> +/*
> + * Copyright (C) 2022 Loongson Technology Corporation Limited
> + * Contributed by Hao Chen(chenhao at loongson.cn)
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "swscale_loongarch.h"
> +#include "libavutil/loongarch/loongson_intrinsics.h"
> +
> +void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
> +                          const int16_t **src, uint8_t *dest, int dstW,
> +                          const uint8_t *dither, int offset)
> +{
> +    int i;
> +    int len = dstW - 15;
> +    __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400,
> +                    0x1C0C180814041000, 0x1C1814100C080400};
> +    __m256i val1, val2, val3;
> +    uint8_t dither0 = dither[offset & 7];
> +    uint8_t dither1 = dither[(offset + 1) & 7];
> +    uint8_t dither2 = dither[(offset + 2) & 7];
> +    uint8_t dither3 = dither[(offset + 3) & 7];
> +    uint8_t dither4 = dither[(offset + 4) & 7];
> +    uint8_t dither5 = dither[(offset + 5) & 7];
> +    uint8_t dither6 = dither[(offset + 6) & 7];
> +    uint8_t dither7 = dither[(offset + 7) & 7];
> +    int val_1[8] = {dither0, dither2, dither4, dither6,
> +                    dither0, dither2, dither4, dither6};
> +    int val_2[8] = {dither1, dither3, dither5, dither7,
> +                    dither1, dither3, dither5, dither7};
> +    int val_3[8] = {dither0, dither1, dither2, dither3,
> +                    dither4, dither5, dither6, dither7};
> +
> +    DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
> +    val3 = __lasx_xvld(val_3, 0);
> +
> +    for (i = 0; i < len; i += 16) {
> +        int j;
> +        __m256i src0, filter0, val;
> +        __m256i val_ev, val_od;
> +
> +        val_ev = __lasx_xvslli_w(val1, 12);
> +        val_od = __lasx_xvslli_w(val2, 12);
> +
> +        for (j = 0; j < filterSize; j++) {
> +            src0  = __lasx_xvld(src[j]+ i, 0);
> +            filter0 = __lasx_xvldrepl_h((filter + j), 0);
> +            val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0);
> +            val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0);
> +        }
> +        val_ev = __lasx_xvsrai_w(val_ev, 19);
> +        val_od = __lasx_xvsrai_w(val_od, 19);
> +        val_ev = __lasx_xvclip255_w(val_ev);
> +        val_od = __lasx_xvclip255_w(val_od);
> +        val    = __lasx_xvshuf_b(val_od, val_ev, mask);
> +        __lasx_xvstelm_d(val, (dest + i), 0, 0);
> +        __lasx_xvstelm_d(val, (dest + i), 8, 2);
> +    }
> +    if (dstW - i >= 8){
> +        int j;
> +        __m256i src0, filter0, val_h;
> +        __m256i val_l;
> +
> +        val_l = __lasx_xvslli_w(val3, 12);
> +
> +        for (j = 0; j < filterSize; j++) {
> +            src0  = __lasx_xvld(src[j] + i, 0);
> +            src0  = __lasx_vext2xv_w_h(src0);
> +            filter0 = __lasx_xvldrepl_h((filter + j), 0);
> +            filter0 = __lasx_vext2xv_w_h(filter0);
> +            val_l = __lasx_xvmadd_w(val_l, src0, filter0);
> +        }
> +        val_l = __lasx_xvsrai_w(val_l, 19);
> +        val_l = __lasx_xvclip255_w(val_l);
> +        val_h = __lasx_xvpermi_d(val_l, 0x4E);
> +        val_l = __lasx_xvshuf_b(val_h, val_l, mask);
> +        __lasx_xvstelm_d(val_l, (dest + i), 0, 1);
> +        i += 8;
> +    }
> +    for (; i < dstW; i++) {
> +        int val = dither[(i + offset) & 7] << 12;
> +        int j;
> +        for (j = 0; j< filterSize; j++)
> +            val += src[j][i] * filter[j];
> +
> +        dest[i] = av_clip_uint8(val >> 19);
> +    }
> +}
> +
> +/*Copy from libswscale/output.c*/
> +static av_always_inline void
> +yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
> +              unsigned A1, unsigned A2,
> +              const void *_r, const void *_g, const void *_b, int y,
> +              enum AVPixelFormat target, int hasAlpha)
> +{
> +    if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
> +        target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
> +        uint32_t *dest = (uint32_t *) _dest;
> +        const uint32_t *r = (const uint32_t *) _r;
> +        const uint32_t *g = (const uint32_t *) _g;
> +        const uint32_t *b = (const uint32_t *) _b;
> +
> +#if CONFIG_SMALL
> +        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
> +        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
> +#else
> +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
> +        int sh = (target == AV_PIX_FMT_RGB32_1 ||
> +                  target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
> +        av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
> +#endif
> +        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
> +        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
> +#endif
> +    } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
> +        uint8_t *dest = (uint8_t *) _dest;
> +        const uint8_t *r = (const uint8_t *) _r;
> +        const uint8_t *g = (const uint8_t *) _g;
> +        const uint8_t *b = (const uint8_t *) _b;
> +
> +#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
> +#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
> +
> +        dest[i * 6 + 0] = r_b[Y1];
> +        dest[i * 6 + 1] =   g[Y1];
> +        dest[i * 6 + 2] = b_r[Y1];
> +        dest[i * 6 + 3] = r_b[Y2];
> +        dest[i * 6 + 4] =   g[Y2];
> +        dest[i * 6 + 5] = b_r[Y2];
> +#undef r_b
> +#undef b_r
> +    } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
> +               target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
> +               target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
> +        uint16_t *dest = (uint16_t *) _dest;
> +        const uint16_t *r = (const uint16_t *) _r;
> +        const uint16_t *g = (const uint16_t *) _g;
> +        const uint16_t *b = (const uint16_t *) _b;
> +        int dr1, dg1, db1, dr2, dg2, db2;
> +
> +        if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
> +            dr1 = ff_dither_2x2_8[ y & 1     ][0];
> +            dg1 = ff_dither_2x2_4[ y & 1     ][0];
> +            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
> +            dr2 = ff_dither_2x2_8[ y & 1     ][1];
> +            dg2 = ff_dither_2x2_4[ y & 1     ][1];
> +            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
> +    } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
> +            dr1 = ff_dither_2x2_8[ y & 1     ][0];
> +            dg1 = ff_dither_2x2_8[ y & 1     ][1];
> +            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
> +            dr2 = ff_dither_2x2_8[ y & 1     ][1];
> +            dg2 = ff_dither_2x2_8[ y & 1     ][0];
> +            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
> +        } else {
> +            dr1 = ff_dither_4x4_16[ y & 3     ][0];
> +            dg1 = ff_dither_4x4_16[ y & 3     ][1];
> +            db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
> +            dr2 = ff_dither_4x4_16[ y & 3     ][1];
> +            dg2 = ff_dither_4x4_16[ y & 3     ][0];
> +            db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
> +        }
> +
> +        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
> +        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
> +    } else /* 8/4 bits */ {
> +        uint8_t *dest = (uint8_t *) _dest;
> +        const uint8_t *r = (const uint8_t *) _r;
> +        const uint8_t *g = (const uint8_t *) _g;
> +        const uint8_t *b = (const uint8_t *) _b;
> +        int dr1, dg1, db1, dr2, dg2, db2;
> +
> +        if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
> +            const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
> +            const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
> +            dr1 = dg1 = d32[(i * 2 + 0) & 7];
> +            db1 =       d64[(i * 2 + 0) & 7];
> +            dr2 = dg2 = d32[(i * 2 + 1) & 7];
> +            db2 =       d64[(i * 2 + 1) & 7];
> +        } else {
> +            const uint8_t * const d64  = ff_dither_8x8_73 [y & 7];
> +            const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
> +            dr1 = db1 = d128[(i * 2 + 0) & 7];
> +            dg1 =        d64[(i * 2 + 0) & 7];
> +            dr2 = db2 = d128[(i * 2 + 1) & 7];
> +            dg2 =        d64[(i * 2 + 1) & 7];
> +        }
> +
> +        if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
> +            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
> +                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
> +        } else {
> +            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
> +            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
> +        }
> +    }
> +}
> +
> +#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4)    \
> +{                                                                      \
> +    Y1 = __lasx_xvpickve2gr_w(vec_y1, t1);                             \
> +    Y2 = __lasx_xvpickve2gr_w(vec_y2, t2);                             \
> +    U  = __lasx_xvpickve2gr_w(vec_u, t3);                              \
> +    V  = __lasx_xvpickve2gr_w(vec_v, t4);                              \
> +    r  =  c->table_rV[V];                                              \
> +    g  = (c->table_gU[U] + c->table_gV[V]);                            \
> +    b  =  c->table_bU[U];                                              \
> +    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                           \
> +                  r, g, b, y, target, 0);                              \
> +    count++;                                                           \
> +}
> +
> +static void
> +yuv2rgb_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
> +                        const int16_t **lumSrc, int lumFilterSize,
> +                        const int16_t *chrFilter, const int16_t **chrUSrc,
> +                        const int16_t **chrVSrc, int chrFilterSize,
> +                        const int16_t **alpSrc, uint8_t *dest, int dstW,
> +                        int y, enum AVPixelFormat target, int hasAlpha)
> +{
> +    int i, j;
> +    int count = 0;
> +    int t     = 1 << 18;
> +    int len   = dstW >> 6;
> +    int res   = dstW & 63;
> +    int len_count = (dstW + 1) >> 1;
> +    const void *r, *g, *b;
> +    int head = YUVRGB_TABLE_HEADROOM;
> +    __m256i headroom  = __lasx_xvreplgr2vr_w(head);
> +
> +    for (i = 0; i < len; i++) {
> +        int Y1, Y2, U, V, count_lum = count << 1;
> +        __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
> +        __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
> +        __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;
> +
> +        yl1_ev = __lasx_xvldrepl_w(&t, 0);
> +        yl1_od = yl1_ev;
> +        yh1_ev = yl1_ev;
> +        yh1_od = yl1_ev;
> +        u1_ev  = yl1_ev;
> +        v1_ev  = yl1_ev;
> +        u1_od  = yl1_ev;
> +        v1_od  = yl1_ev;
> +        yl2_ev = yl1_ev;
> +        yl2_od = yl1_ev;
> +        yh2_ev = yl1_ev;
> +        yh2_od = yl1_ev;
> +        u2_ev  = yl1_ev;
> +        v2_ev  = yl1_ev;
> +        u2_od  = yl1_ev;
> +        v2_od  = yl1_ev;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            int16_t *src_lum = lumSrc[j] + count_lum;
> +            temp    = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
> +                      src_lum, 96, l_src1, l_src2, l_src3, l_src4);
> +
> +            yl1_ev  = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1);
> +            yl1_od  = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1);
> +            yh1_ev  = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
> +            yh1_od  = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
> +            yl2_ev  = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
> +            yl2_od  = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
> +            yh2_ev  = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
> +            yh2_od  = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
> +                      u_src1, u_src2);
> +            DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
> +                      v_src1, v_src2);
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            u1_ev  = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
> +            u1_od  = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
> +            v1_ev  = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
> +            v1_od  = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
> +            u2_ev  = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
> +            u2_od  = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
> +            v2_ev  = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
> +            v2_od  = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);
> +        }
> +        yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
> +        yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
> +        yl1_od = __lasx_xvsrai_w(yl1_od, 19);
> +        yh1_od = __lasx_xvsrai_w(yh1_od, 19);
> +        u1_ev  = __lasx_xvsrai_w(u1_ev, 19);
> +        v1_ev  = __lasx_xvsrai_w(v1_ev, 19);
> +        u1_od  = __lasx_xvsrai_w(u1_od, 19);
> +        v1_od  = __lasx_xvsrai_w(v1_od, 19);
> +        yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
> +        yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
> +        yl2_od = __lasx_xvsrai_w(yl2_od, 19);
> +        yh2_od = __lasx_xvsrai_w(yh2_od, 19);
> +        u2_ev  = __lasx_xvsrai_w(u2_ev, 19);
> +        v2_ev  = __lasx_xvsrai_w(v2_ev, 19);
> +        u2_od  = __lasx_xvsrai_w(u2_od, 19);
> +        v2_od  = __lasx_xvsrai_w(v2_od, 19);
> +        u1_ev  = __lasx_xvadd_w(u1_ev, headroom);
> +        v1_ev  = __lasx_xvadd_w(v1_ev, headroom);
> +        u1_od  = __lasx_xvadd_w(u1_od, headroom);
> +        v1_od  = __lasx_xvadd_w(v1_od, headroom);
> +        u2_ev  = __lasx_xvadd_w(u2_ev, headroom);
> +        v2_ev  = __lasx_xvadd_w(v2_ev, headroom);
> +        u2_od  = __lasx_xvadd_w(u2_od, headroom);
> +        v2_od  = __lasx_xvadd_w(v2_od, headroom);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7);
> +    }
> +    if (res >= 32) {
> +        int Y1, Y2, U, V, count_lum = count << 1;
> +        __m256i l_src1, l_src2, u_src, v_src;
> +        __m256i yl_ev, yl_od, yh_ev, yh_od;
> +        __m256i u_ev, u_od, v_ev, v_od, temp;
> +
> +        yl_ev = __lasx_xvldrepl_w(&t, 0);
> +        yl_od = yl_ev;
> +        yh_ev = yl_ev;
> +        yh_od = yl_ev;
> +        u_ev  = yl_ev;
> +        v_ev  = yl_ev;
> +        u_od  = yl_ev;
> +        v_od  = yl_ev;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp   = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
> +                      32, l_src1, l_src2);
> +            yl_ev  = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
> +            yl_od  = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
> +            yh_ev  = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
> +            yh_od  = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
> +                      u_src, v_src);
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            u_ev  = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
> +            u_od  = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
> +            v_ev  = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
> +            v_od  = __lasx_xvmaddwod_w_h(v_od, temp, v_src);
> +        }
> +        yl_ev = __lasx_xvsrai_w(yl_ev, 19);
> +        yh_ev = __lasx_xvsrai_w(yh_ev, 19);
> +        yl_od = __lasx_xvsrai_w(yl_od, 19);
> +        yh_od = __lasx_xvsrai_w(yh_od, 19);
> +        u_ev  = __lasx_xvsrai_w(u_ev, 19);
> +        v_ev  = __lasx_xvsrai_w(v_ev, 19);
> +        u_od  = __lasx_xvsrai_w(u_od, 19);
> +        v_od  = __lasx_xvsrai_w(v_od, 19);
> +        u_ev  = __lasx_xvadd_w(u_ev, headroom);
> +        v_ev  = __lasx_xvadd_w(v_ev, headroom);
> +        u_od  = __lasx_xvadd_w(u_od, headroom);
> +        v_od  = __lasx_xvadd_w(v_od, headroom);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7);
> +        res -= 32;
> +    }
> +    if (res >= 16) {
> +        int Y1, Y2, U, V;
> +        int count_lum = count << 1;
> +        __m256i l_src, u_src, v_src;
> +        __m256i y_ev, y_od, u, v, temp;
> +
> +        y_ev = __lasx_xvldrepl_w(&t, 0);
> +        y_od = y_ev;
> +        u    = y_ev;
> +        v    = y_ev;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
> +            y_od  = __lasx_xvmaddwod_w_h(y_od, temp, l_src);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
> +                      0, u_src, v_src);
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            u_src = __lasx_vext2xv_w_h(u_src);
> +            v_src = __lasx_vext2xv_w_h(v_src);
> +            u     = __lasx_xvmaddwev_w_h(u, temp, u_src);
> +            v     = __lasx_xvmaddwev_w_h(v, temp, v_src);
> +        }
> +        y_ev = __lasx_xvsrai_w(y_ev, 19);
> +        y_od = __lasx_xvsrai_w(y_od, 19);
> +        u    = __lasx_xvsrai_w(u, 19);
> +        v    = __lasx_xvsrai_w(v, 19);
> +        u    = __lasx_xvadd_w(u, headroom);
> +        v    = __lasx_xvadd_w(v, headroom);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7);
> +        res -= 16;
> +    }
> +    if (res >= 8) {
> +        int Y1, Y2, U, V;
> +        int count_lum = count << 1;
> +        __m256i l_src, u_src, v_src;
> +        __m256i y_ev, uv, temp;
> +
> +        y_ev = __lasx_xvldrepl_w(&t, 0);
> +        uv   = y_ev;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
> +            l_src = __lasx_vext2xv_w_h(l_src);
> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
> +            v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            u_src = __lasx_xvilvl_d(v_src, u_src);
> +            u_src = __lasx_vext2xv_w_h(u_src);
> +            uv    = __lasx_xvmaddwev_w_h(uv, temp, u_src);
> +        }
> +        y_ev = __lasx_xvsrai_w(y_ev, 19);
> +        uv   = __lasx_xvsrai_w(uv, 19);
> +        uv   = __lasx_xvadd_w(uv, headroom);
> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4);
> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5);
> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6);
> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7);
> +    }
> +    for (; count < len_count; count++) {
> +        int Y1 = 1 << 18;
> +        int Y2 = Y1;
> +        int U  = Y1;
> +        int V  = Y1;
> +
> +        for (j = 0; j < lumFilterSize; j++) {
> +            Y1 += lumSrc[j][count * 2]     * lumFilter[j];
> +            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            U += chrUSrc[j][count] * chrFilter[j];
> +            V += chrVSrc[j][count] * chrFilter[j];
> +        }
> +        Y1 >>= 19;
> +        Y2 >>= 19;
> +        U  >>= 19;
> +        V  >>= 19;
> +        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM];
> +        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
> +             c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
> +        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
> +
> +        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
> +                      r, g, b, y, target, 0);
> +    }
> +}
> +
> +static void
> +yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
> +                        const int16_t *ubuf[2], const int16_t *vbuf[2],
> +                        const int16_t *abuf[2], uint8_t *dest, int dstW,
> +                        int yalpha, int uvalpha, int y,
> +                        enum AVPixelFormat target, int hasAlpha)
> +{
> +    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
> +                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
> +                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
> +    int yalpha1   = 4096 - yalpha;
> +    int uvalpha1  = 4096 - uvalpha;
> +    int i, count  = 0;
> +    int len       = dstW - 15;
> +    int len_count = (dstW + 1) >> 1;
> +    const void *r, *g, *b;
> +    int head  = YUVRGB_TABLE_HEADROOM;
> +    __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
> +    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
> +    __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
> +    __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
> +    __m256i headroom   = __lasx_xvreplgr2vr_w(head);
> +
> +    for (i = 0; i < len; i += 16) {
> +        int Y1, Y2, U, V;
> +        int i_dex = i << 1;
> +        int c_dex = count << 1;
> +        __m256i y0_h, y0_l, y0, u0, v0;
> +        __m256i y1_h, y1_l, y1, u1, v1;
> +        __m256i y_l, y_h, u, v;
> +
> +        DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
> +                  buf1, i_dex, y0, u0, v0, y1);
> +        DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
> +        DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
> +        DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
> +        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
> +        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
> +        u0   = __lasx_xvmul_w(u0, v_uvalpha1);
> +        v0   = __lasx_xvmul_w(v0, v_uvalpha1);
> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
> +        y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
> +        u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
> +        v    = __lasx_xvmadd_w(v0, v_uvalpha, v1);
> +        y_l  = __lasx_xvsrai_w(y_l, 19);
> +        y_h  = __lasx_xvsrai_w(y_h, 19);
> +        u    = __lasx_xvsrai_w(u, 19);
> +        v    = __lasx_xvsrai_w(v, 19);
> +        u    = __lasx_xvadd_w(u, headroom);
> +        v    = __lasx_xvadd_w(v, headroom);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
> +        WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
> +        WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
> +        WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
> +        WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
> +    }
> +    if (dstW - i >= 8) {
> +        int Y1, Y2, U, V;
> +        int i_dex = i << 1;
> +        __m256i y0_l, y0, u0, v0;
> +        __m256i y1_l, y1, u1, v1;
> +        __m256i y_l, u, v;
> +
> +        y0   = __lasx_xvldx(buf0, i_dex);

1. Not long ago, I tried to constify the src pointer of several asm
functions and noticed that they produced new warnings for loongarch
(according to patchwork:
https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/),
even though I was sure that the code is const-correct. After finding
(via https://github.com/opencv/opencv/pull/21833) a toolchain
(https://gitee.com/wenux/cross-compiler-la-on-x86) that can build the
lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support
at the moment; at least, my self-compiled loongarch-GCC did not support
lsx and lasx), the issue was clear: lsxintrin.h and lasxintrin.h do not
use const at all, even for functions that only read data (I presume the
vl in __lsx_vldx stands for "vector load"?).
So I sent another iteration
https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html of
that patchset that now added wrappers for __lsx_vldx() and
__lasx_xvldx() and cc'ed you and some other developers from loongson to
alert you of the issue in the hope that you fix the headers, so that my
wrappers wouldn't need to be applied. That didn't work, as my mails
could not be delivered to you. So I applied the patchset.
2. You use __lasx_xvldx() to read through a const int16_t pointer. This will
give new warnings unless the above issue has been fixed. Has it?
3. I don't know whether it has, as patchwork's fate tests have not been
working for a few days now. Given that the mails I receive from patchwork
when it doesn't like a commit message arrive from "Patchwork
<yinshiyou-hf at loongson.cn>", I presume that loongson is now somehow
running patchwork, so you should be able to inform the right people to
fix it.
4. If you have fixed the const issue, can you please make an updated
toolchain with lsx and lasx support enabled available to us?

- Andreas

> +        u0   = __lasx_xvldrepl_d((ubuf0 + count), 0);
> +        v0   = __lasx_xvldrepl_d((vbuf0 + count), 0);
> +        y1   = __lasx_xvldx(buf1, i_dex);
> +        u1   = __lasx_xvldrepl_d((ubuf1 + count), 0);
> +        v1   = __lasx_xvldrepl_d((vbuf1 + count), 0);
> +        DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
> +        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
> +        u0   = __lasx_xvmul_w(u0, v_uvalpha1);
> +        v0   = __lasx_xvmul_w(v0, v_uvalpha1);
> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
> +        u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
> +        v    = __lasx_xvmadd_w(v0, v_uvalpha, v1);
> +        y_l  = __lasx_xvsrai_w(y_l, 19);
> +        u    = __lasx_xvsrai_w(u, 19);
> +        v    = __lasx_xvsrai_w(v, 19);
> +        u    = __lasx_xvadd_w(u, headroom);
> +        v    = __lasx_xvadd_w(v, headroom);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 2, 2);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 3, 3);
> +        i += 8;
> +    }
> +    for (; count < len_count; count++) {
> +        int Y1 = (buf0[count * 2]     * yalpha1  +
> +                  buf1[count * 2]     * yalpha)  >> 19;
> +        int Y2 = (buf0[count * 2 + 1] * yalpha1  +
> +                  buf1[count * 2 + 1] * yalpha) >> 19;
> +        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
> +        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
> +
> +        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
> +        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
> +             c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
> +        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
> +
> +        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
> +                      r, g, b, y, target, 0);
> +    }
> +}
> +
> +static void
> +yuv2rgb_1_template_lasx(SwsContext *c, const int16_t *buf0,
> +                        const int16_t *ubuf[2], const int16_t *vbuf[2],
> +                        const int16_t *abuf0, uint8_t *dest, int dstW,
> +                        int uvalpha, int y, enum AVPixelFormat target,
> +                        int hasAlpha)
> +{
> +    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
> +    int i;
> +    int len       = (dstW - 15);
> +    int len_count = (dstW + 1) >> 1;
> +    const void *r, *g, *b;
> +
> +    if (uvalpha < 2048) {
> +        int count    = 0;
> +        int head = YUVRGB_TABLE_HEADROOM;
> +        __m256i headroom  = __lasx_xvreplgr2vr_h(head);
> +
> +        for (i = 0; i < len; i += 16) {
> +            int Y1, Y2, U, V;
> +            int i_dex = i << 1;
> +            int c_dex = count << 1;
> +            __m256i src_y, src_u, src_v;
> +            __m256i u, v, y_l, y_h;
> +
> +            DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
> +            src_v = __lasx_xvldx(vbuf0, c_dex);
> +            src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
> +            src_y = __lasx_xvsrari_h(src_y, 7);
> +            src_u = __lasx_xvsrari_h(src_u, 7);
> +            y_l   = __lasx_xvsllwil_w_h(src_y, 0);
> +            y_h   = __lasx_xvexth_w_h(src_y);
> +            u     = __lasx_xvaddwev_w_h(src_u, headroom);
> +            v     = __lasx_xvaddwod_w_h(src_u, headroom);
> +            WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
> +            WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
> +            WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
> +            WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
> +            WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
> +            WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
> +            WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
> +            WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
> +        }
> +        if (dstW - i >= 8){
> +            int Y1, Y2, U, V;
> +            int i_dex = i << 1;
> +            __m256i src_y, src_u, src_v;
> +            __m256i y_l, uv;
> +
> +            src_y  = __lasx_xvldx(buf0, i_dex);
> +            src_u  = __lasx_xvldrepl_d((ubuf0 + count), 0);
> +            src_v  = __lasx_xvldrepl_d((vbuf0 + count), 0);
> +            src_u  = __lasx_xvilvl_d(src_v, src_u);
> +            y_l    = __lasx_xvsrari_h(src_y, 7);
> +            uv     = __lasx_xvsrari_h(src_u, 7);
> +            y_l    = __lasx_vext2xv_w_h(y_l);
> +            uv     = __lasx_vext2xv_w_h(uv);
> +            uv     = __lasx_xvaddwev_w_h(uv, headroom);
> +            WRITE_YUV2RGB(y_l, y_l, uv, uv, 0, 1, 0, 4);
> +            WRITE_YUV2RGB(y_l, y_l, uv, uv, 2, 3, 1, 5);
> +            WRITE_YUV2RGB(y_l, y_l, uv, uv, 4, 5, 2, 6);
> +            WRITE_YUV2RGB(y_l, y_l, uv, uv, 6, 7, 3, 7);
> +            i += 8;
> +        }
> +        for (; count < len_count; count++) {
> +            int Y1 = (buf0[count * 2    ] + 64) >> 7;
> +            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
> +            int U  = (ubuf0[count]        + 64) >> 7;
> +            int V  = (vbuf0[count]        + 64) >> 7;
> +
> +            r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
> +            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
> +                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
> +            b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
> +
> +            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
> +                          r, g, b, y, target, 0);
> +        }
> +    } else {
> +        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
> +        int count = 0;
> +        int HEADROOM = YUVRGB_TABLE_HEADROOM;
> +        __m256i headroom    = __lasx_xvreplgr2vr_w(HEADROOM);
> +
> +        for (i = 0; i < len; i += 16) {
> +            int Y1, Y2, U, V;
> +            int i_dex = i << 1;
> +            int c_dex = count << 1;
> +            __m256i src_y, src_u0, src_v0, src_u1, src_v1;
> +            __m256i y_l, y_h, u, v;
> +
> +            DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
> +                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
> +            src_v1 = __lasx_xvldx(vbuf1, c_dex);
> +            src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
> +            src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
> +            src_y  = __lasx_xvsrari_h(src_y, 7);
> +            u      = __lasx_xvaddwev_w_h(src_u0, src_u1);
> +            v      = __lasx_xvaddwod_w_h(src_u0, src_u1);
> +            y_l    = __lasx_xvsllwil_w_h(src_y, 0);
> +            y_h    = __lasx_xvexth_w_h(src_y);
> +            u      = __lasx_xvsrari_w(u, 8);
> +            v      = __lasx_xvsrari_w(v, 8);
> +            u      = __lasx_xvadd_w(u, headroom);
> +            v      = __lasx_xvadd_w(v, headroom);
> +            WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
> +            WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
> +            WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
> +            WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
> +            WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
> +            WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
> +            WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
> +            WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
> +        }
> +        if (dstW - i >= 8) {
> +            int Y1, Y2, U, V;
> +            int i_dex = i << 1;
> +            __m256i src_y, src_u0, src_v0, src_u1, src_v1;
> +            __m256i uv;
> +
> +            src_y  = __lasx_xvldx(buf0, i_dex);
> +            src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
> +            src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
> +            src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
> +            src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
> +
> +            src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
> +            src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
> +            src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
> +            src_y  = __lasx_xvsrari_h(src_y, 7);
> +            uv     = __lasx_xvhaddw_w_h(src_u0, src_u0);
> +            src_y  = __lasx_vext2xv_w_h(src_y);
> +            uv     = __lasx_xvsrari_w(uv, 8);
> +            uv     = __lasx_xvadd_w(uv, headroom);
> +            WRITE_YUV2RGB(src_y, src_y, uv, uv, 0, 1, 0, 4);
> +            WRITE_YUV2RGB(src_y, src_y, uv, uv, 2, 3, 1, 5);
> +            WRITE_YUV2RGB(src_y, src_y, uv, uv, 4, 5, 2, 6);
> +            WRITE_YUV2RGB(src_y, src_y, uv, uv, 6, 7, 3, 7);
> +            i += 8;
> +        }
> +        for (; count < len_count; count++) {
> +            int Y1 = (buf0[count * 2    ]         +  64) >> 7;
> +            int Y2 = (buf0[count * 2 + 1]         +  64) >> 7;
> +            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
> +            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;
> +
> +            r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
> +            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
> +                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
> +            b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
> +
> +            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
> +                          r, g, b, y, target, 0);
> +        }
> +    }
> +}
> +
> +#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                \
> +static void name ## ext ## _X_lasx(SwsContext *c, const int16_t *lumFilter,            \
> +                                   const int16_t **lumSrc, int lumFilterSize,          \
> +                                   const int16_t *chrFilter, const int16_t **chrUSrc,  \
> +                                   const int16_t **chrVSrc, int chrFilterSize,         \
> +                                   const int16_t **alpSrc, uint8_t *dest, int dstW,    \
> +                                   int y)                                              \
> +{                                                                                      \
> +    name ## base ## _X_template_lasx(c, lumFilter, lumSrc, lumFilterSize,              \
> +                                     chrFilter, chrUSrc, chrVSrc, chrFilterSize,       \
> +                                     alpSrc, dest, dstW, y, fmt, hasAlpha);            \
> +}
> +
> +#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                               \
> +YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                        \
> +static void name ## ext ## _2_lasx(SwsContext *c, const int16_t *buf[2],               \
> +                                   const int16_t *ubuf[2], const int16_t *vbuf[2],     \
> +                                   const int16_t *abuf[2], uint8_t *dest, int dstW,    \
> +                                   int yalpha, int uvalpha, int y)                     \
> +{                                                                                      \
> +    name ## base ## _2_template_lasx(c, buf, ubuf, vbuf, abuf, dest,                   \
> +                                     dstW, yalpha, uvalpha, y, fmt, hasAlpha);         \
> +}
> +
> +#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                                 \
> +YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                                       \
> +static void name ## ext ## _1_lasx(SwsContext *c, const int16_t *buf0,                 \
> +                                   const int16_t *ubuf[2], const int16_t *vbuf[2],     \
> +                                   const int16_t *abuf0, uint8_t *dest, int dstW,      \
> +                                   int uvalpha, int y)                                 \
> +{                                                                                      \
> +    name ## base ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0, dest,                 \
> +                                     dstW, uvalpha, y, fmt, hasAlpha);                 \
> +}
> +
> +
> +#if CONFIG_SMALL
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +#endif
> +YUV2RGBWRAPPER(yuv2rgb,, x32_1,  AV_PIX_FMT_RGB32_1, 0)
> +YUV2RGBWRAPPER(yuv2rgb,, x32,    AV_PIX_FMT_RGB32,   0)
> +#endif
> +YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24,     0)
> +YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24,     0)
> +YUV2RGBWRAPPER(yuv2rgb,,  16,    AV_PIX_FMT_RGB565,    0)
> +YUV2RGBWRAPPER(yuv2rgb,,  15,    AV_PIX_FMT_RGB555,    0)
> +YUV2RGBWRAPPER(yuv2rgb,,  12,    AV_PIX_FMT_RGB444,    0)
> +YUV2RGBWRAPPER(yuv2rgb,,   8,    AV_PIX_FMT_RGB8,      0)
> +YUV2RGBWRAPPER(yuv2rgb,,   4,    AV_PIX_FMT_RGB4,      0)
> +YUV2RGBWRAPPER(yuv2rgb,,   4b,   AV_PIX_FMT_RGB4_BYTE, 0)
> +
> +// This function is copied from libswscale/output.c
> +static av_always_inline void yuv2rgb_write_full(SwsContext *c,
> +    uint8_t *dest, int i, int R, int A, int G, int B,
> +    int y, enum AVPixelFormat target, int hasAlpha, int err[4])
> +{
> +    int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
> +
> +    if ((R | G | B) & 0xC0000000) {
> +        R = av_clip_uintp2(R, 30);
> +        G = av_clip_uintp2(G, 30);
> +        B = av_clip_uintp2(B, 30);
> +    }
> +
> +    switch(target) {
> +    case AV_PIX_FMT_ARGB:
> +        dest[0] = hasAlpha ? A : 255;
> +        dest[1] = R >> 22;
> +        dest[2] = G >> 22;
> +        dest[3] = B >> 22;
> +        break;
> +    case AV_PIX_FMT_RGB24:
> +        dest[0] = R >> 22;
> +        dest[1] = G >> 22;
> +        dest[2] = B >> 22;
> +        break;
> +    case AV_PIX_FMT_RGBA:
> +        dest[0] = R >> 22;
> +        dest[1] = G >> 22;
> +        dest[2] = B >> 22;
> +        dest[3] = hasAlpha ? A : 255;
> +        break;
> +    case AV_PIX_FMT_ABGR:
> +        dest[0] = hasAlpha ? A : 255;
> +        dest[1] = B >> 22;
> +        dest[2] = G >> 22;
> +        dest[3] = R >> 22;
> +        break;
> +    case AV_PIX_FMT_BGR24:
> +        dest[0] = B >> 22;
> +        dest[1] = G >> 22;
> +        dest[2] = R >> 22;
> +        break;
> +    case AV_PIX_FMT_BGRA:
> +        dest[0] = B >> 22;
> +        dest[1] = G >> 22;
> +        dest[2] = R >> 22;
> +        dest[3] = hasAlpha ? A : 255;
> +        break;
> +    case AV_PIX_FMT_BGR4_BYTE:
> +    case AV_PIX_FMT_RGB4_BYTE:
> +    case AV_PIX_FMT_BGR8:
> +    case AV_PIX_FMT_RGB8:
> +    {
> +        int r,g,b;
> +
> +        switch (c->dither) {
> +        default:
> +        case SWS_DITHER_AUTO:
> +        case SWS_DITHER_ED:
> +            R >>= 22;
> +            G >>= 22;
> +            B >>= 22;
> +            R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
> +            G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
> +            B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
> +            c->dither_error[0][i] = err[0];
> +            c->dither_error[1][i] = err[1];
> +            c->dither_error[2][i] = err[2];
> +            r = R >> (isrgb8 ? 5 : 7);
> +            g = G >> (isrgb8 ? 5 : 6);
> +            b = B >> (isrgb8 ? 6 : 7);
> +            r = av_clip(r, 0, isrgb8 ? 7 : 1);
> +            g = av_clip(g, 0, isrgb8 ? 7 : 3);
> +            b = av_clip(b, 0, isrgb8 ? 3 : 1);
> +            err[0] = R - r*(isrgb8 ? 36 : 255);
> +            err[1] = G - g*(isrgb8 ? 36 : 85);
> +            err[2] = B - b*(isrgb8 ? 85 : 255);
> +            break;
> +        case SWS_DITHER_A_DITHER:
> +            if (isrgb8) {
> +  /* see http://pippin.gimp.org/a_dither/ for details/origin */
> +#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))
> +                r = (((R >> 19) + A_DITHER(i,y)  -96)>>8);
> +                g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
> +                b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
> +                r = av_clip_uintp2(r, 3);
> +                g = av_clip_uintp2(g, 3);
> +                b = av_clip_uintp2(b, 2);
> +            } else {
> +                r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
> +                g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
> +                b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
> +                r = av_clip_uintp2(r, 1);
> +                g = av_clip_uintp2(g, 2);
> +                b = av_clip_uintp2(b, 1);
> +            }
> +            break;
> +        case SWS_DITHER_X_DITHER:
> +            if (isrgb8) {
> +  /* see http://pippin.gimp.org/a_dither/ for details/origin */
> +#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
> +                r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
> +                g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
> +                b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
> +                r = av_clip_uintp2(r, 3);
> +                g = av_clip_uintp2(g, 3);
> +                b = av_clip_uintp2(b, 2);
> +            } else {
> +                r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
> +                g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
> +                b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
> +                r = av_clip_uintp2(r, 1);
> +                g = av_clip_uintp2(g, 2);
> +                b = av_clip_uintp2(b, 1);
> +            }
> +
> +            break;
> +        }
> +
> +        if(target == AV_PIX_FMT_BGR4_BYTE) {
> +            dest[0] = r + 2*g + 8*b;
> +        } else if(target == AV_PIX_FMT_RGB4_BYTE) {
> +            dest[0] = b + 2*g + 8*r;
> +        } else if(target == AV_PIX_FMT_BGR8) {
> +            dest[0] = r + 8*g + 64*b;
> +        } else if(target == AV_PIX_FMT_RGB8) {
> +            dest[0] = b + 4*g + 32*r;
> +        } else
> +            av_assert2(0);
> +        break; }
> +    }
> +}
> +
> +#define YUVTORGB_SETUP                                           \
> +    int y_offset   = c->yuv2rgb_y_offset;                        \
> +    int y_coeff    = c->yuv2rgb_y_coeff;                         \
> +    int v2r_coe    = c->yuv2rgb_v2r_coeff;                       \
> +    int v2g_coe    = c->yuv2rgb_v2g_coeff;                       \
> +    int u2g_coe    = c->yuv2rgb_u2g_coeff;                       \
> +    int u2b_coe    = c->yuv2rgb_u2b_coeff;                       \
> +    __m256i offset = __lasx_xvreplgr2vr_w(y_offset);             \
> +    __m256i coeff  = __lasx_xvreplgr2vr_w(y_coeff);              \
> +    __m256i v2r    = __lasx_xvreplgr2vr_w(v2r_coe);              \
> +    __m256i v2g    = __lasx_xvreplgr2vr_w(v2g_coe);              \
> +    __m256i u2g    = __lasx_xvreplgr2vr_w(u2g_coe);              \
> +    __m256i u2b    = __lasx_xvreplgr2vr_w(u2b_coe);              \
> +
> +
> +#define YUVTORGB(y, u, v, R, G, B, offset, coeff,              \
> +                 y_temp, v2r, v2g, u2g, u2b)                   \
> +{                                                              \
> +     y = __lasx_xvsub_w(y, offset);                            \
> +     y = __lasx_xvmul_w(y, coeff);                             \
> +     y = __lasx_xvadd_w(y, y_temp);                            \
> +     R = __lasx_xvmadd_w(y, v, v2r);                           \
> +     v = __lasx_xvmadd_w(y, v, v2g);                           \
> +     G = __lasx_xvmadd_w(v, u, u2g);                           \
> +     B = __lasx_xvmadd_w(y, u, u2b);                           \
> +}
> +
> +#define WRITE_FULL_A(r, g, b, a, t1, s)                                      \
> +{                                                                            \
> +    R = __lasx_xvpickve2gr_w(r, t1);                                         \
> +    G = __lasx_xvpickve2gr_w(g, t1);                                         \
> +    B = __lasx_xvpickve2gr_w(b, t1);                                         \
> +    A = __lasx_xvpickve2gr_w(a, t1);                                         \
> +    if (A & 0x100)                                                           \
> +        A = av_clip_uint8(A);                                                \
> +    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\
> +    dest += step;                                                            \
> +}
> +
> +#define WRITE_FULL(r, g, b, t1, s)                                            \
> +{                                                                             \
> +    R = __lasx_xvpickve2gr_w(r, t1);                                          \
> +    G = __lasx_xvpickve2gr_w(g, t1);                                          \
> +    B = __lasx_xvpickve2gr_w(b, t1);                                          \
> +    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
> +    dest += step;                                                             \
> +}
> +
> +static void
> +yuv2rgb_full_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
> +                             const int16_t **lumSrc, int lumFilterSize,
> +                             const int16_t *chrFilter, const int16_t **chrUSrc,
> +                             const int16_t **chrVSrc, int chrFilterSize,
> +                             const int16_t **alpSrc, uint8_t *dest,
> +                             int dstW, int y, enum AVPixelFormat target,
> +                             int hasAlpha)
> +{
> +    int i, j, B, G, R, A;
> +    int step       = (target == AV_PIX_FMT_RGB24 ||
> +                      target == AV_PIX_FMT_BGR24) ? 3 : 4;
> +    int err[4]     = {0};
> +    int a_temp     = 1 << 18;
> +    int templ      = 1 << 9;
> +    int tempc      = templ - (128 << 19);
> +    int ytemp      = 1 << 21;
> +    int len        = dstW - 15;
> +    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
> +    YUVTORGB_SETUP
> +
> +    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
> +       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
> +        step = 1;
> +
> +    for (i = 0; i < len; i += 16) {
> +        __m256i l_src, u_src, v_src;
> +        __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
> +        __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
> +        int n = i << 1;
> +
> +        y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
> +        u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            l_src = __lasx_xvldx(lumSrc[j], n);
> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
> +            y_od  = __lasx_xvmaddwod_w_h(y_od, l_src, temp);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
> +                      u_src, v_src);
> +            DUP2_ARG3(__lasx_xvmaddwev_w_h, u_ev, u_src, temp, v_ev,
> +                      v_src, temp, u_ev, v_ev);
> +            DUP2_ARG3(__lasx_xvmaddwod_w_h, u_od, u_src, temp, v_od,
> +                      v_src, temp, u_od, v_od);
> +        }
> +        y_ev = __lasx_xvsrai_w(y_ev, 10);
> +        y_od = __lasx_xvsrai_w(y_od, 10);
> +        u_ev = __lasx_xvsrai_w(u_ev, 10);
> +        u_od = __lasx_xvsrai_w(u_od, 10);
> +        v_ev = __lasx_xvsrai_w(v_ev, 10);
> +        v_od = __lasx_xvsrai_w(v_od, 10);
> +        YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +        YUVTORGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +
> +        if (hasAlpha) {
> +            __m256i a_src, a_ev, a_od;
> +
> +            a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
> +            for (j = 0; j < lumFilterSize; j++) {
> +                temp  = __lasx_xvldrepl_h(lumFilter + j, 0);
> +                a_src = __lasx_xvldx(alpSrc[j], n);
> +                a_ev  = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
> +                a_od  = __lasx_xvmaddwod_w_h(a_od, a_src, temp);
> +            }
> +            a_ev = __lasx_xvsrai_w(a_ev, 19);
> +            a_od = __lasx_xvsrai_w(a_od, 19);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
> +        } else {
> +            WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
> +            WRITE_FULL(R_od, G_od, B_od, 0, 1);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
> +            WRITE_FULL(R_od, G_od, B_od, 1, 3);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
> +            WRITE_FULL(R_od, G_od, B_od, 2, 5);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
> +            WRITE_FULL(R_od, G_od, B_od, 3, 7);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
> +            WRITE_FULL(R_od, G_od, B_od, 4, 9);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
> +            WRITE_FULL(R_od, G_od, B_od, 5, 11);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
> +            WRITE_FULL(R_od, G_od, B_od, 6, 13);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
> +            WRITE_FULL(R_od, G_od, B_od, 7, 15);
> +        }
> +    }
> +    if (dstW - i >= 8) {
> +        __m256i l_src, u_src, v_src;
> +        __m256i y_ev, u_ev, v_ev, uv, temp;
> +        __m256i R_ev, G_ev, B_ev;
> +        int n = i << 1;
> +
> +        y_ev = __lasx_xvreplgr2vr_w(templ);
> +        u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            l_src = __lasx_xvldx(lumSrc[j], n);
> +            l_src = __lasx_xvpermi_d(l_src, 0xD8);
> +            l_src = __lasx_xvilvl_h(l_src, l_src);
> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
> +            u_src = __lasx_xvpermi_d(u_src, 0xD8);
> +            v_src = __lasx_xvpermi_d(v_src, 0xD8);
> +            uv    = __lasx_xvilvl_h(v_src, u_src);
> +            u_ev  = __lasx_xvmaddwev_w_h(u_ev, uv, temp);
> +            v_ev  = __lasx_xvmaddwod_w_h(v_ev, uv, temp);
> +        }
> +        y_ev = __lasx_xvsrai_w(y_ev, 10);
> +        u_ev = __lasx_xvsrai_w(u_ev, 10);
> +        v_ev = __lasx_xvsrai_w(v_ev, 10);
> +        YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +
> +        if (hasAlpha) {
> +            __m256i a_src, a_ev;
> +
> +            a_ev = __lasx_xvreplgr2vr_w(a_temp);
> +            for (j = 0; j < lumFilterSize; j++) {
> +                temp  = __lasx_xvldrepl_h(lumFilter + j, 0);
> +                a_src = __lasx_xvldx(alpSrc[j], n);
> +                a_src = __lasx_xvpermi_d(a_src, 0xD8);
> +                a_src = __lasx_xvilvl_h(a_src, a_src);
> +                a_ev  =  __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
> +            }
> +            a_ev = __lasx_xvsrai_w(a_ev, 19);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 1);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 2);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 3);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 4);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 5);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 6);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 7);
> +        } else {
> +            WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 1, 1);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 2, 2);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 3, 3);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 4, 4);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 5, 5);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 6, 6);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 7, 7);
> +        }
> +        i += 8;
> +    }
> +    for (; i < dstW; i++) {
> +        int Y = templ;
> +        int V, U = V = tempc;
> +
> +        A = 0;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            Y += lumSrc[j][i] * lumFilter[j];
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            U += chrUSrc[j][i] * chrFilter[j];
> +            V += chrVSrc[j][i] * chrFilter[j];
> +
> +        }
> +        Y >>= 10;
> +        U >>= 10;
> +        V >>= 10;
> +        if (hasAlpha) {
> +            A = 1 << 18;
> +            for (j = 0; j < lumFilterSize; j++) {
> +                A += alpSrc[j][i] * lumFilter[j];
> +            }
> +            A >>= 19;
> +            if (A & 0x100)
> +                A = av_clip_uint8(A);
> +        }
> +        Y -= y_offset;
> +        Y *= y_coeff;
> +        Y += ytemp;
> +        R  = (unsigned)Y + V * v2r_coe;
> +        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
> +        B  = (unsigned)Y + U * u2b_coe;
> +        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
> +        dest += step;
> +    }
> +    c->dither_error[0][i] = err[0];
> +    c->dither_error[1][i] = err[1];
> +    c->dither_error[2][i] = err[2];
> +}
> +/**
> + * Blend two input rows and emit one row of full-chroma packed RGB.
> + *
> + * Luma and alpha are weighted (4096 - yalpha) on buf[0]/abuf[0] and yalpha
> + * on buf[1]/abuf[1]; chroma is weighted the same way with uvalpha. Pixels
> + * are handled 16 per iteration, then at most one group of 8, then one at a
> + * time in scalar code. abuf is only read when hasAlpha is non-zero.
> + * YUVTORGB_SETUP, YUVTORGB, WRITE_FULL and WRITE_FULL_A are macros defined
> + * elsewhere.
> + */
> +static void
> +yuv2rgb_full_2_template_lasx(SwsContext *c, const int16_t *buf[2],
> +                             const int16_t *ubuf[2], const int16_t *vbuf[2],
> +                             const int16_t *abuf[2], uint8_t *dest, int dstW,
> +                             int yalpha, int uvalpha, int y,
> +                             enum AVPixelFormat target, int hasAlpha)
> +{
> +    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
> +                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
> +                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
> +                  *abuf0 = hasAlpha ? abuf[0] : NULL,
> +                  *abuf1 = hasAlpha ? abuf[1] : NULL;
> +    int yalpha1  = 4096 - yalpha; /* weight of buf0 / abuf0 */
> +    int uvalpha1 = 4096 - uvalpha; /* weight of ubuf0 / vbuf0 */
> +    int uvtemp   = 128 << 19; /* subtracted from blended chroma before >> 10 */
> +    int atemp    = 1 << 18; /* added to blended alpha before >> 19 */
> +    int err[4]   = {0};
> +    int ytemp    = 1 << 21;
> +    int len      = dstW - 15; /* the 16-pixel loop runs while i < len */
> +    int i, R, G, B, A;
> +    int step = (target == AV_PIX_FMT_RGB24 ||
> +                target == AV_PIX_FMT_BGR24) ? 3 : 4;
> +    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
> +    __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
> +    __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
> +    __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
> +    __m256i uv         = __lasx_xvreplgr2vr_w(uvtemp);
> +    __m256i a_bias     = __lasx_xvreplgr2vr_w(atemp);
> +    __m256i y_temp     = __lasx_xvreplgr2vr_w(ytemp);
> +    YUVTORGB_SETUP
> +
> +    av_assert2(yalpha  <= 4096U);
> +    av_assert2(uvalpha <= 4096U);
> +    /* The 4-bit-in-byte and 8-bit targets advance dest by one byte per pixel. */
> +    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
> +       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
> +        step = 1;
> +    /* Main loop: 16 pixels per iteration. */
> +    for (i = 0; i < len; i += 16) {
> +        __m256i b0, b1, ub0, ub1, vb0, vb1;
> +        __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
> +        __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
> +        __m256i y_l, y_h, v_l, v_h, u_l, u_h;
> +        __m256i R_l, R_h, G_l, G_h, B_l, B_h;
> +        int n = i << 1; /* 2 * i; presumably the byte offset of int16_t sample i */
> +
> +        DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
> +                  n, ubuf1, n, b0, b1, ub0, ub1);
> +        DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
> +        DUP2_ARG2(__lasx_xvsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
> +        DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
> +                  u0_l, u1_l, v0_l, v1_l);
> +        DUP2_ARG1(__lasx_xvexth_w_h, b0, b1, y0_h, y1_h);
> +        DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
> +                  u0_h, u1_h, v0_h, v1_h);
> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
> +        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
> +        u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
> +        u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
> +        v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
> +        v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
> +        y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
> +        u_l  = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
> +        u_h  = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
> +        v_l  = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
> +        v_h  = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
> +        u_l  = __lasx_xvsub_w(u_l, uv);
> +        u_h  = __lasx_xvsub_w(u_h, uv);
> +        v_l  = __lasx_xvsub_w(v_l, uv);
> +        v_h  = __lasx_xvsub_w(v_h, uv);
> +        y_l  = __lasx_xvsrai_w(y_l, 10);
> +        y_h  = __lasx_xvsrai_w(y_h, 10);
> +        u_l  = __lasx_xvsrai_w(u_l, 10);
> +        u_h  = __lasx_xvsrai_w(u_h, 10);
> +        v_l  = __lasx_xvsrai_w(v_l, 10);
> +        v_h  = __lasx_xvsrai_w(v_h, 10);
> +        YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +        YUVTORGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +        /* The stores interleave the _l and _h vectors four lanes at a time;
> +         * the last macro argument runs 0..15 (presumably the pixel position
> +         * within the group -- the macros are defined elsewhere, confirm
> +         * there). */
> +        if (hasAlpha) {
> +            __m256i a0, a1, a0_l, a0_h;
> +            __m256i a_l, a_h, a1_l, a1_h;
> +
> +            DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
> +            DUP2_ARG2(__lasx_xvsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
> +            DUP2_ARG1(__lasx_xvexth_w_h, a0, a1, a0_h, a1_h);
> +            a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
> +            a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
> +            a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
> +            a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
> +            a_l = __lasx_xvsrai_w(a_l, 19);
> +            a_h = __lasx_xvsrai_w(a_h, 19);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
> +        } else {
> +            WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +            WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +            WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +            WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +            WRITE_FULL(R_h, G_h, B_h, 0, 4);
> +            WRITE_FULL(R_h, G_h, B_h, 1, 5);
> +            WRITE_FULL(R_h, G_h, B_h, 2, 6);
> +            WRITE_FULL(R_h, G_h, B_h, 3, 7);
> +            WRITE_FULL(R_l, G_l, B_l, 4, 8);
> +            WRITE_FULL(R_l, G_l, B_l, 5, 9);
> +            WRITE_FULL(R_l, G_l, B_l, 6, 10);
> +            WRITE_FULL(R_l, G_l, B_l, 7, 11);
> +            WRITE_FULL(R_h, G_h, B_h, 4, 12);
> +            WRITE_FULL(R_h, G_h, B_h, 5, 13);
> +            WRITE_FULL(R_h, G_h, B_h, 6, 14);
> +            WRITE_FULL(R_h, G_h, B_h, 7, 15);
> +        }
> +    }
> +    if (dstW - i >= 8) { /* at most one further group of 8 pixels */
> +        __m256i b0, b1, ub0, ub1, vb0, vb1;
> +        __m256i y0_l, y1_l, u0_l;
> +        __m256i v0_l, u1_l, v1_l;
> +        __m256i y_l, u_l, v_l;
> +        __m256i R_l, G_l, B_l;
> +        int n = i << 1;
> +
> +        DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
> +                  ubuf1, n, b0, b1, ub0, ub1);
> +        DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
> +        DUP2_ARG1(__lasx_vext2xv_w_h, b0, b1, y0_l, y1_l);
> +        DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
> +                  u0_l, u1_l, v0_l, v1_l);
> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
> +        u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
> +        v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
> +        u_l  = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
> +        v_l  = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
> +        u_l  = __lasx_xvsub_w(u_l, uv);
> +        v_l  = __lasx_xvsub_w(v_l, uv);
> +        y_l  = __lasx_xvsrai_w(y_l, 10);
> +        u_l  = __lasx_xvsrai_w(u_l, 10);
> +        v_l  = __lasx_xvsrai_w(v_l, 10);
> +        YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +
> +        if (hasAlpha) {
> +            __m256i a0, a1, a0_l;
> +            __m256i a_l, a1_l;
> +
> +            DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
> +            DUP2_ARG1(__lasx_vext2xv_w_h, a0, a1, a0_l, a1_l);
> +            a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
> +            a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
> +            a_l = __lasx_xvsrai_w(a_l, 19);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
> +        } else {
> +            WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +            WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +            WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +            WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +            WRITE_FULL(R_l, G_l, B_l, 4, 4);
> +            WRITE_FULL(R_l, G_l, B_l, 5, 5);
> +            WRITE_FULL(R_l, G_l, B_l, 6, 6);
> +            WRITE_FULL(R_l, G_l, B_l, 7, 7);
> +        }
> +        i += 8;
> +    }
> +    for (; i < dstW; i++){ /* scalar tail for the remaining pixels */
> +        int Y = ( buf0[i] * yalpha1  +  buf1[i] * yalpha         ) >> 10;
> +        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha- uvtemp) >> 10;
> +        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha- uvtemp) >> 10;
> +
> +        A = 0;
> +        if (hasAlpha){
> +            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
> +            if (A & 0x100) /* bit 8 set: value is outside 0..255 */
> +                A = av_clip_uint8(A);
> +        }
> +
> +        Y -= y_offset;
> +        Y *= y_coeff;
> +        Y += ytemp;
> +        R  = (unsigned)Y + V * v2r_coe;
> +        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
> +        B  = (unsigned)Y + U * u2b_coe;
> +        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
> +        dest += step;
> +    }
> +    c->dither_error[0][i] = err[0]; /* i is the index at which the pixel loops stopped */
> +    c->dither_error[1][i] = err[1];
> +    c->dither_error[2][i] = err[2];
> +}
> +
> +/**
> + * Write one row of full-chroma packed RGB from a single luma row.
> + *
> + * When uvalpha < 2048 only the first chroma row (ubuf[0]/vbuf[0]) is used;
> + * otherwise the two chroma rows are summed (ubuf[0] + ubuf[1],
> + * vbuf[0] + vbuf[1]). Pixels are handled 16 per iteration, then at most one
> + * group of 8, then one at a time in scalar code.
> + * YUVTORGB_SETUP, YUVTORGB, WRITE_FULL and WRITE_FULL_A are macros defined
> + * elsewhere.
> + *
> + * @param c        scaler context; err[0..2] is stored into its dither_error
> + * @param buf0     luma row
> + * @param ubuf     chroma U rows; ubuf[1] is only read when uvalpha >= 2048
> + * @param vbuf     chroma V rows; vbuf[1] is only read when uvalpha >= 2048
> + * @param abuf0    alpha row, only read when hasAlpha is non-zero
> + * @param dest     output row, advanced by 1, 3 or 4 bytes per pixel
> + * @param dstW     number of pixels to produce
> + * @param uvalpha  selects the one-row (< 2048) or two-row chroma path
> + * @param y        passed through to the pixel write helpers
> + * @param target   output pixel format
> + * @param hasAlpha non-zero to compute and pass an alpha value
> + */
> +static void
> +yuv2rgb_full_1_template_lasx(SwsContext *c, const int16_t *buf0,
> +                             const int16_t *ubuf[2], const int16_t *vbuf[2],
> +                             const int16_t *abuf0, uint8_t *dest, int dstW,
> +                             int uvalpha, int y, enum AVPixelFormat target,
> +                             int hasAlpha)
> +{
> +    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
> +    int i, B, G, R, A;
> +    int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
> +    int err[4]     = {0};
> +    int ytemp      = 1 << 21;
> +    int bias_int   = 64; /* rounding term for the alpha >> 7 */
> +    int len        = dstW - 15; /* the 16-pixel loops run while i < len */
> +    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
> +    YUVTORGB_SETUP
> +
> +    /* The 4-bit-in-byte and 8-bit targets advance dest by one byte per pixel. */
> +    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
> +       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
> +        step = 1;
> +    if (uvalpha < 2048) {
> +        /* Single chroma row: U/V = (row0 - (128 << 7)) << 2. */
> +        int uvtemp   = 128 << 7;
> +        __m256i uv   = __lasx_xvreplgr2vr_w(uvtemp);
> +        __m256i bias = __lasx_xvreplgr2vr_w(bias_int);
> +
> +        for (i = 0; i < len; i += 16) {
> +            __m256i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
> +            __m256i y_l, y_h, u_l, u_h, v_l, v_h;
> +            __m256i R_l, R_h, G_l, G_h, B_l, B_h;
> +            int n = i << 1; /* 2 * i; presumably the byte offset of sample i */
> +
> +            DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
> +            vb  = __lasx_xvldx(vbuf0, n);
> +            y_l = __lasx_xvsllwil_w_h(b, 2);
> +            y_h = __lasx_xvexth_w_h(b);
> +            DUP2_ARG2(__lasx_xvsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
> +            DUP2_ARG1(__lasx_xvexth_w_h, ub, vb, ub_h, vb_h);
> +            y_h = __lasx_xvslli_w(y_h, 2);
> +            u_l = __lasx_xvsub_w(ub_l, uv);
> +            u_h = __lasx_xvsub_w(ub_h, uv);
> +            v_l = __lasx_xvsub_w(vb_l, uv);
> +            v_h = __lasx_xvsub_w(vb_h, uv);
> +            u_l = __lasx_xvslli_w(u_l, 2);
> +            u_h = __lasx_xvslli_w(u_h, 2);
> +            v_l = __lasx_xvslli_w(v_l, 2);
> +            v_h = __lasx_xvslli_w(v_h, 2);
> +            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +            YUVTORGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +
> +            if(hasAlpha) {
> +                __m256i a_src;
> +                __m256i a_l, a_h;
> +
> +                a_src = __lasx_xvld(abuf0 + i, 0);
> +                a_l   = __lasx_xvsllwil_w_h(a_src, 0);
> +                a_h   = __lasx_xvexth_w_h(a_src);
> +                a_l   = __lasx_xvadd_w(a_l, bias);
> +                a_h   = __lasx_xvadd_w(a_h, bias);
> +                a_l   = __lasx_xvsrai_w(a_l, 7);
> +                a_h   = __lasx_xvsrai_w(a_h, 7);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
> +            } else {
> +                WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +                WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +                WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +                WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +                WRITE_FULL(R_h, G_h, B_h, 0, 4);
> +                WRITE_FULL(R_h, G_h, B_h, 1, 5);
> +                WRITE_FULL(R_h, G_h, B_h, 2, 6);
> +                WRITE_FULL(R_h, G_h, B_h, 3, 7);
> +                WRITE_FULL(R_l, G_l, B_l, 4, 8);
> +                WRITE_FULL(R_l, G_l, B_l, 5, 9);
> +                WRITE_FULL(R_l, G_l, B_l, 6, 10);
> +                WRITE_FULL(R_l, G_l, B_l, 7, 11);
> +                WRITE_FULL(R_h, G_h, B_h, 4, 12);
> +                WRITE_FULL(R_h, G_h, B_h, 5, 13);
> +                WRITE_FULL(R_h, G_h, B_h, 6, 14);
> +                WRITE_FULL(R_h, G_h, B_h, 7, 15);
> +            }
> +        }
> +        if (dstW - i >= 8) { /* at most one further group of 8 pixels */
> +            __m256i b, ub, vb, ub_l, vb_l;
> +            __m256i y_l, u_l, v_l;
> +            __m256i R_l, G_l, B_l;
> +            int n = i << 1;
> +
> +            DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
> +            vb  = __lasx_xvldx(vbuf0, n);
> +            y_l = __lasx_vext2xv_w_h(b);
> +            DUP2_ARG1(__lasx_vext2xv_w_h, ub, vb, ub_l, vb_l);
> +            y_l = __lasx_xvslli_w(y_l, 2);
> +            u_l = __lasx_xvsub_w(ub_l, uv);
> +            v_l = __lasx_xvsub_w(vb_l, uv);
> +            u_l = __lasx_xvslli_w(u_l, 2);
> +            v_l = __lasx_xvslli_w(v_l, 2);
> +            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +
> +            if(hasAlpha) {
> +                __m256i a_src, a_l;
> +
> +                a_src = __lasx_xvldx(abuf0, n);
> +                a_src = __lasx_vext2xv_w_h(a_src);
> +                a_l   = __lasx_xvadd_w(bias, a_src);
> +                a_l   = __lasx_xvsrai_w(a_l, 7);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
> +            } else {
> +                WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +                WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +                WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +                WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +                WRITE_FULL(R_l, G_l, B_l, 4, 4);
> +                WRITE_FULL(R_l, G_l, B_l, 5, 5);
> +                WRITE_FULL(R_l, G_l, B_l, 6, 6);
> +                WRITE_FULL(R_l, G_l, B_l, 7, 7);
> +            }
> +            i += 8;
> +        }
> +        for (; i < dstW; i++) { /* scalar tail for the remaining pixels */
> +            int Y = buf0[i] << 2;
> +            int U = (ubuf0[i] - uvtemp) << 2;
> +            int V = (vbuf0[i] - uvtemp) << 2;
> +
> +            A = 0;
> +            if(hasAlpha) {
> +                A = (abuf0[i] + 64) >> 7;
> +                if (A & 0x100) /* bit 8 set: value is outside 0..255 */
> +                    A = av_clip_uint8(A);
> +            }
> +            Y -= y_offset;
> +            Y *= y_coeff;
> +            Y += ytemp;
> +            R  = (unsigned)Y + V * v2r_coe;
> +            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
> +            B  = (unsigned)Y + U * u2b_coe;
> +            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
> +            dest += step;
> +        }
> +    } else {
> +        /* Two chroma rows: U/V = (row0 + row1 - (128 << 8)) << 1. */
> +        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
> +        int uvtemp   = 128 << 8;
> +        __m256i uv   = __lasx_xvreplgr2vr_w(uvtemp);
> +        __m256i zero = __lasx_xvldi(0);
> +        __m256i bias = __lasx_xvreplgr2vr_h(bias_int);
> +
> +        for (i = 0; i < len; i += 16) {
> +            __m256i b, ub0, ub1, vb0, vb1;
> +            __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
> +            __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
> +            int n = i << 1;
> +
> +            DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
> +                      ubuf1, n, b, ub0, vb0, ub1);
> +            /* Second V row: load from vbuf1, not from the vbuf pointer array. */
> +            vb1 = __lasx_xvldx(vbuf1, n);
> +            y_ev = __lasx_xvaddwev_w_h(b, zero);
> +            y_od = __lasx_xvaddwod_w_h(b, zero);
> +            /* DUP2_ARG2 pairs consecutive arguments, so this yields
> +             * U = ub0 + ub1 and V = vb0 + vb1, matching the 8-pixel and
> +             * scalar paths below. */
> +            DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, ub1, vb0, vb1, u_ev, v_ev);
> +            DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, ub1, vb0, vb1, u_od, v_od);
> +            DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
> +            DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
> +                      u_ev, u_od, v_ev, v_od);
> +            DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
> +                      u_ev, u_od, v_ev, v_od);
> +            YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +            YUVTORGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +
> +            /* The _ev/_od vectors are stored alternately, lane k of each
> +             * going with trailing arguments 2k and 2k + 1. */
> +            if(hasAlpha) {
> +                __m256i a_src;
> +                __m256i a_ev, a_od;
> +
> +                a_src = __lasx_xvld(abuf0 + i, 0);
> +                a_ev  = __lasx_xvaddwev_w_h(bias, a_src);
> +                a_od  = __lasx_xvaddwod_w_h(bias, a_src);
> +                a_ev  = __lasx_xvsrai_w(a_ev, 7);
> +                a_od  = __lasx_xvsrai_w(a_od, 7);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
> +            } else {
> +                WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
> +                WRITE_FULL(R_od, G_od, B_od, 0, 1);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
> +                WRITE_FULL(R_od, G_od, B_od, 1, 3);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
> +                WRITE_FULL(R_od, G_od, B_od, 2, 5);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
> +                WRITE_FULL(R_od, G_od, B_od, 3, 7);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
> +                WRITE_FULL(R_od, G_od, B_od, 4, 9);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
> +                WRITE_FULL(R_od, G_od, B_od, 5, 11);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
> +                WRITE_FULL(R_od, G_od, B_od, 6, 13);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
> +                WRITE_FULL(R_od, G_od, B_od, 7, 15);
> +            }
> +        }
> +        if (dstW - i >= 8) { /* at most one further group of 8 pixels */
> +            __m256i b, ub0, ub1, vb0, vb1;
> +            __m256i y_l, u_l, v_l;
> +            __m256i R_l, G_l, B_l;
> +            int n = i << 1;
> +
> +            DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
> +                      ubuf1, n, b, ub0, vb0, ub1);
> +            vb1 = __lasx_xvldx(vbuf1, n);
> +            y_l = __lasx_vext2xv_w_h(b);
> +            y_l = __lasx_xvslli_w(y_l, 2);
> +            DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
> +                      ub0, vb0, ub1, vb1);
> +            DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
> +            u_l = __lasx_xvsub_w(u_l, uv);
> +            v_l = __lasx_xvsub_w(v_l, uv);
> +            u_l = __lasx_xvslli_w(u_l, 1);
> +            v_l = __lasx_xvslli_w(v_l, 1);
> +            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +
> +            if(hasAlpha) {
> +                __m256i a_src;
> +                __m256i a_l;
> +
> +                a_src  = __lasx_xvld(abuf0 + i, 0);
> +                a_src  = __lasx_xvpermi_d(a_src, 0xD8);
> +                a_src  = __lasx_xvilvl_h(a_src, a_src);
> +                a_l    = __lasx_xvaddwev_w_h(bias, a_src);
> +                a_l   = __lasx_xvsrai_w(a_l, 7);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
> +            } else {
> +                WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +                WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +                WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +                WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +                WRITE_FULL(R_l, G_l, B_l, 4, 4);
> +                WRITE_FULL(R_l, G_l, B_l, 5, 5);
> +                WRITE_FULL(R_l, G_l, B_l, 6, 6);
> +                WRITE_FULL(R_l, G_l, B_l, 7, 7);
> +            }
> +            i += 8;
> +        }
> +        for (; i < dstW; i++) { /* scalar tail for the remaining pixels */
> +            int Y = buf0[i] << 2;
> +            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
> +            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
> +
> +            A = 0;
> +            if(hasAlpha) {
> +                A = (abuf0[i] + 64) >> 7;
> +                if (A & 0x100) /* bit 8 set: value is outside 0..255 */
> +                    A = av_clip_uint8(A);
> +            }
> +            Y -= y_offset;
> +            Y *= y_coeff;
> +            Y += ytemp;
> +            R  = (unsigned)Y + V * v2r_coe;
> +            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
> +            B  = (unsigned)Y + U * u2b_coe;
> +            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
> +            dest += step;
> +        }
> +    }
> +    /* i is the index at which the pixel loops stopped. */
> +    c->dither_error[0][i] = err[0];
> +    c->dither_error[1][i] = err[1];
> +    c->dither_error[2][i] = err[2];
> +}
> +#if CONFIG_SMALL /* one variant per 32-bit format; hasAlpha is the expression CONFIG_SWSCALE_ALPHA && c->needAlpha */
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
> +               CONFIG_SWSCALE_ALPHA && c->needAlpha)
> +YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
> +               CONFIG_SWSCALE_ALPHA && c->needAlpha)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
> +               CONFIG_SWSCALE_ALPHA && c->needAlpha)
> +YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
> +               CONFIG_SWSCALE_ALPHA && c->needAlpha)
> +#else /* !CONFIG_SMALL: separate variants with hasAlpha fixed to 1 and to 0 */
> +#if CONFIG_SWSCALE_ALPHA
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,  1)
> +YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,  1)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,  1)
> +YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,  1)
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA,  0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR,  0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA,  0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB,  0)
> +#endif /* !CONFIG_SMALL */
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full,  AV_PIX_FMT_BGR24, 0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full,  AV_PIX_FMT_RGB24, 0)
> +/* Targets for which the full-chroma templates advance dest by one byte per pixel. */
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full,  AV_PIX_FMT_BGR4_BYTE, 0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full,  AV_PIX_FMT_RGB4_BYTE, 0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)
> +#undef yuvTorgb /* NOTE(review): the templates use YUVTORGB / YUVTORGB_SETUP in upper case; confirm these lower-case names are the ones defined */
> +#undef yuvTorgb_setup
> +
> +
> +av_cold void ff_sws_init_output_loongarch(SwsContext *c)
> +{
> +
> +    if(c->flags & SWS_FULL_CHR_H_INT) {
> +        switch (c->dstFormat) {
> +        case AV_PIX_FMT_RGBA:
> +#if CONFIG_SMALL
> +            c->yuv2packedX = yuv2rgba32_full_X_lasx;
> +            c->yuv2packed2 = yuv2rgba32_full_2_lasx;
> +            c->yuv2packed1 = yuv2rgba32_full_1_lasx;
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +                c->yuv2packedX = yuv2rgba32_full_X_lasx;
> +                c->yuv2packed2 = yuv2rgba32_full_2_lasx;
> +                c->yuv2packed1 = yuv2rgba32_full_1_lasx;
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packedX = yuv2rgbx32_full_X_lasx;
> +                c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
> +                c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_ARGB:
> +#if CONFIG_SMALL
> +            c->yuv2packedX = yuv2argb32_full_X_lasx;
> +            c->yuv2packed2 = yuv2argb32_full_2_lasx;
> +            c->yuv2packed1 = yuv2argb32_full_1_lasx;
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +                c->yuv2packedX = yuv2argb32_full_X_lasx;
> +                c->yuv2packed2 = yuv2argb32_full_2_lasx;
> +                c->yuv2packed1 = yuv2argb32_full_1_lasx;
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packedX = yuv2xrgb32_full_X_lasx;
> +                c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
> +                c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_BGRA:
> +#if CONFIG_SMALL
> +            c->yuv2packedX = yuv2bgra32_full_X_lasx;
> +            c->yuv2packed2 = yuv2bgra32_full_2_lasx;
> +            c->yuv2packed1 = yuv2bgra32_full_1_lasx;
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +                c->yuv2packedX = yuv2bgra32_full_X_lasx;
> +                c->yuv2packed2 = yuv2bgra32_full_2_lasx;
> +                c->yuv2packed1 = yuv2bgra32_full_1_lasx;
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packedX = yuv2bgrx32_full_X_lasx;
> +                c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
> +                c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_ABGR:
> +#if CONFIG_SMALL
> +            c->yuv2packedX = yuv2abgr32_full_X_lasx;
> +            c->yuv2packed2 = yuv2abgr32_full_2_lasx;
> +            c->yuv2packed1 = yuv2abgr32_full_1_lasx;
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +                c->yuv2packedX = yuv2abgr32_full_X_lasx;
> +                c->yuv2packed2 = yuv2abgr32_full_2_lasx;
> +                c->yuv2packed1 = yuv2abgr32_full_1_lasx;
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packedX = yuv2xbgr32_full_X_lasx;
> +                c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
> +                c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_RGB24:
> +            c->yuv2packedX = yuv2rgb24_full_X_lasx;
> +            c->yuv2packed2 = yuv2rgb24_full_2_lasx;
> +            c->yuv2packed1 = yuv2rgb24_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_BGR24:
> +            c->yuv2packedX = yuv2bgr24_full_X_lasx;
> +            c->yuv2packed2 = yuv2bgr24_full_2_lasx;
> +            c->yuv2packed1 = yuv2bgr24_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_BGR4_BYTE:
> +            c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
> +            c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
> +            c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB4_BYTE:
> +            c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
> +            c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
> +            c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_BGR8:
> +            c->yuv2packedX = yuv2bgr8_full_X_lasx;
> +            c->yuv2packed2 = yuv2bgr8_full_2_lasx;
> +            c->yuv2packed1 = yuv2bgr8_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB8:
> +            c->yuv2packedX = yuv2rgb8_full_X_lasx;
> +            c->yuv2packed2 = yuv2rgb8_full_2_lasx;
> +            c->yuv2packed1 = yuv2rgb8_full_1_lasx;
> +            break;
> +    }
> +    } else {
> +        switch (c->dstFormat) {
> +        case AV_PIX_FMT_RGB32:
> +        case AV_PIX_FMT_BGR32:
> +#if CONFIG_SMALL
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packed1 = yuv2rgbx32_1_lasx;
> +                c->yuv2packed2 = yuv2rgbx32_2_lasx;
> +                c->yuv2packedX = yuv2rgbx32_X_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_RGB32_1:
> +        case AV_PIX_FMT_BGR32_1:
> +#if CONFIG_SMALL
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
> +                c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
> +                c->yuv2packedX = yuv2rgbx32_1_X_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_RGB24:
> +            c->yuv2packed1 = yuv2rgb24_1_lasx;
> +            c->yuv2packed2 = yuv2rgb24_2_lasx;
> +            c->yuv2packedX = yuv2rgb24_X_lasx;
> +            break;
> +        case AV_PIX_FMT_BGR24:
> +            c->yuv2packed1 = yuv2bgr24_1_lasx;
> +            c->yuv2packed2 = yuv2bgr24_2_lasx;
> +            c->yuv2packedX = yuv2bgr24_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB565LE:
> +        case AV_PIX_FMT_RGB565BE:
> +        case AV_PIX_FMT_BGR565LE:
> +        case AV_PIX_FMT_BGR565BE:
> +            c->yuv2packed1 = yuv2rgb16_1_lasx;
> +            c->yuv2packed2 = yuv2rgb16_2_lasx;
> +            c->yuv2packedX = yuv2rgb16_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB555LE:
> +        case AV_PIX_FMT_RGB555BE:
> +        case AV_PIX_FMT_BGR555LE:
> +        case AV_PIX_FMT_BGR555BE:
> +            c->yuv2packed1 = yuv2rgb15_1_lasx;
> +            c->yuv2packed2 = yuv2rgb15_2_lasx;
> +            c->yuv2packedX = yuv2rgb15_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB444LE:
> +        case AV_PIX_FMT_RGB444BE:
> +        case AV_PIX_FMT_BGR444LE:
> +        case AV_PIX_FMT_BGR444BE:
> +            c->yuv2packed1 = yuv2rgb12_1_lasx;
> +            c->yuv2packed2 = yuv2rgb12_2_lasx;
> +            c->yuv2packedX = yuv2rgb12_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB8:
> +        case AV_PIX_FMT_BGR8:
> +            c->yuv2packed1 = yuv2rgb8_1_lasx;
> +            c->yuv2packed2 = yuv2rgb8_2_lasx;
> +            c->yuv2packedX = yuv2rgb8_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB4:
> +        case AV_PIX_FMT_BGR4:
> +            c->yuv2packed1 = yuv2rgb4_1_lasx;
> +            c->yuv2packed2 = yuv2rgb4_2_lasx;
> +            c->yuv2packedX = yuv2rgb4_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB4_BYTE:
> +        case AV_PIX_FMT_BGR4_BYTE:
> +            c->yuv2packed1 = yuv2rgb4b_1_lasx;
> +            c->yuv2packed2 = yuv2rgb4b_2_lasx;
> +            c->yuv2packedX = yuv2rgb4b_X_lasx;
> +            break;
> +        }
> +    }
> +}
> diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
> index 1e0bb1b116..97fe947e2e 100644
> --- a/libswscale/loongarch/swscale_init_loongarch.c
> +++ b/libswscale/loongarch/swscale_init_loongarch.c
> @@ -28,6 +28,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
>  {
>      int cpu_flags = av_get_cpu_flags();
>      if (have_lasx(cpu_flags)) {
> +        ff_sws_init_output_loongarch(c);
>          if (c->srcBpc == 8) {
>              if (c->dstBpc <= 14) {
>                  c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
> @@ -47,6 +48,8 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
>              }
>              break;
>          }
> +        if (c->dstBpc == 8)
> +            c->yuv2planeX = ff_yuv2planeX_8_lasx;
>      }
>  }
>  
> diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
> index 07a8145da2..d6c0399737 100644
> --- a/libswscale/loongarch/swscale_loongarch.h
> +++ b/libswscale/loongarch/swscale_loongarch.h
> @@ -69,4 +69,10 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
>                                uint8_t *dest, int width, int height,
>                                int src1Stride, int src2Stride, int dstStride);
>  
> +av_cold void ff_sws_init_output_loongarch(SwsContext *c);
> +
> +void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
> +                          const int16_t **src, uint8_t *dest, int dstW,
> +                          const uint8_t *dither, int offset);
> +
>  #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */