[FFmpeg-devel] [PATCH v1 2/6] avcodec/la: Add LSX optimization for loop filter.
Hao Chen
chenhao at loongson.cn
Thu May 4 11:49:48 EEST 2023
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 161fps
after: 199fps
---
libavcodec/loongarch/Makefile | 3 +-
libavcodec/loongarch/h264dsp.S | 2873 +++++++++++++++++
libavcodec/loongarch/h264dsp_init_loongarch.c | 37 +-
libavcodec/loongarch/h264dsp_lasx.c | 1354 +-------
libavcodec/loongarch/h264dsp_loongarch.h | 67 +-
5 files changed, 2959 insertions(+), 1375 deletions(-)
create mode 100644 libavcodec/loongarch/h264dsp.S
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 4bf06d903b..6eabe71c0b 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -31,4 +31,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_mc_uni_lsx.o \
loongarch/hevc_mc_uniw_lsx.o
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
- loongarch/h264idct_la.o
+ loongarch/h264idct_la.o \
+ loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/h264dsp.S b/libavcodec/loongarch/h264dsp.S
new file mode 100644
index 0000000000..9031e474ae
--- /dev/null
+++ b/libavcodec/loongarch/h264dsp.S
@@ -0,0 +1,2873 @@
+/*
+ * Loongson LSX/LASX optimized h264dsp
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+const vec_shuf //vshuf.b index table for the luma filters: output byte i selects source byte i / 4, so each of 4 tc0 bytes covers 4 lanes
+.rept 2 //pattern emitted twice (32 bytes); the LSX functions visible here load only the first 16 with vld
+.byte 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
+.endr
+endconst
+
+.macro AVC_LPF_P1_OR_Q1 _in0, _in1, _in2, _in3, _in4, _in5, _out, _tmp0, _tmp1 //_out = _in2 + clip((_in3 + avgr(_in0, _in1) - 2 * _in2) >> 1, _in4, _in5) on 16-bit lanes; callers pass (p0, q0, p1|q1, p2|q2, -tc, tc)
+ vavgr.hu \_tmp0, \_in0, \_in1 //unsigned rounding average: (_in0 + _in1 + 1) >> 1
+ vslli.h \_tmp1, \_in2, 1 //2 * _in2
+ vsub.h \_tmp0, \_tmp0, \_tmp1 //avgr(_in0, _in1) - 2 * _in2
+ vavg.h \_tmp0, \_in3, \_tmp0 //signed average with _in3, i.e. (_in3 + tmp0) >> 1
+ vclip.h \_tmp0, \_tmp0, \_in4, \_in5 //clamp the delta to [_in4, _in5]; vclip.h is not defined in this chunk (presumably loongson_asm.S)
+ vadd.h \_out, \_in2, \_tmp0 //apply the clamped delta to the original sample
+.endm
+
+.macro AVC_LPF_P0Q0 _in0, _in1, _in2, _in3, _in4, _in5, _out0, \
+ _out1, _tmp0, _tmp1 //delta = clip(((_in0 - _in1) * 4 + (_in2 - _in3) + 4) >> 3, _in4, _in5); _out0 = clip255(_in1 + delta), _out1 = clip255(_in0 - delta); callers pass (q0, p0, p1, q1, -tc, tc)
+ vsub.h \_tmp0, \_in0, \_in1 //_in0 - _in1
+ vsub.h \_tmp1, \_in2, \_in3 //_in2 - _in3
+ vslli.h \_tmp0, \_tmp0, 2 //(_in0 - _in1) * 4
+ vaddi.hu \_tmp1, \_tmp1, 4 //rounding term for the >> 3 below
+ vadd.h \_tmp0, \_tmp0, \_tmp1
+ vsrai.h \_tmp0, \_tmp0, 3 //arithmetic shift keeps the sign of the delta
+ vclip.h \_tmp0, \_tmp0, \_in4, \_in5 //clamp delta to [_in4, _in5]
+ vadd.h \_out0, \_in1, \_tmp0 //_in1 + delta
+ vsub.h \_out1, \_in0, \_tmp0 //_in0 - delta
+ vclip255.h \_out0, \_out0 //saturate to the 8-bit pixel range (vclip255.h is not defined in this chunk)
+ vclip255.h \_out1, \_out1
+.endm
+
+function ff_h264_h_lpf_luma_8_lsx //tc-based luma deblock across a vertical edge: a0 = pix, a1 = stride, a2 = alpha, a3 = beta, a4 = pointer to 4 tc0 bytes; processes 16 rows x 8 columns starting at pix - 4
+ slli.d t0, a1, 1 //img_width_2x
+ slli.d t1, a1, 2 //img_width_4x
+ slli.d t2, a1, 3 //img_width_8x
+ addi.d sp, sp, -64 //64-byte frame for f24-f31, which are restored at .END_LUMA_8
+ fst.d f24, sp, 0 //vr24-vr31 are used as scratch below, so their low 64 bits are preserved
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ la.local t4, vec_shuf
+ add.d t3, t0, a1 //img_width_3x
+ vldrepl.w vr0, a4, 0 //tmp_vec0: the 4 tc0 bytes replicated into every word
+ vld vr1, t4, 0 //tc_vec: shuffle indices from vec_shuf
+ vshuf.b vr1, vr0, vr0, vr1 //tc_vec: each tc0 byte spread over 4 consecutive lanes (one lane per row)
+ vslti.b vr2, vr1, 0 //0xff where tc < 0
+ vxori.b vr2, vr2, 255 //invert: 0xff where tc >= 0
+ vandi.b vr2, vr2, 1 //bs_vec: 1 where tc >= 0, else 0
+ vsetnez.v $fcc0, vr2
+ bceqz $fcc0, .END_LUMA_8 //every lane has tc < 0: nothing to filter
+ vldi vr0, 0 //zero
+ addi.d t4, a0, -4 //src: 4 pixels left of the edge (p3 column)
+ vslt.bu vr3, vr0, vr2 //is_bs_greater_than0
+ add.d t5, t4, t2 //src_tmp: row 8
+ vld vr4, t4, 0 //row0
+ vldx vr5, t4, a1 //row1
+ vldx vr6, t4, t0 //row2
+ vldx vr7, t4, t3 //row3
+ add.d t6, t4, t1 // src += img_width_4x
+ vld vr8, t6, 0 //row4
+ vldx vr9, t6, a1 //row5
+ vldx vr10, t6, t0 //row6
+ vldx vr11, t6, t3 //row7
+ vld vr12, t5, 0 //row8
+ vldx vr13, t5, a1 //row9
+ vldx vr14, t5, t0 //row10
+ vldx vr15, t5, t3 //row11
+ add.d t6, t5, t1 // src_tmp += img_width_4x
+ vld vr16, t6, 0 //row12
+ vldx vr17, t6, a1 //row13
+ vldx vr18, t6, t0 //row14
+ vldx vr19, t6, t3 //row15
+ LSX_TRANSPOSE16X8_B vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \
+ vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17, \
+ vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
+ //vr10: p3_org, vr11: p2_org, vr12: p1_org, vr13: p0_org
+ //vr14: q0_org, vr15: q1_org, vr16: q2_org, vr17: q3_org
+ vabsd.bu vr20, vr13, vr14 //p0_asub_q0
+ vabsd.bu vr21, vr12, vr13 //p1_asub_p0
+ vabsd.bu vr22, vr15, vr14 //q1_asub_q0
+
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+
+ vslt.bu vr6, vr20, vr4 //is_less_than_alpha
+ vslt.bu vr7, vr21, vr5 //is_less_than_beta
+ vand.v vr8, vr6, vr7 //is_less_than
+ vslt.bu vr7, vr22, vr5 //is_less_than_beta
+ vand.v vr8, vr7, vr8 //is_less_than
+ vand.v vr8, vr8, vr3 //is_less_than: final per-row filter mask, kept in vr8 until the p0/q0 select
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_LUMA_8 //no row passes the alpha/beta test
+ vneg.b vr9, vr1 //neg_tc_h
+ vsllwil.hu.bu vr18, vr1, 0 //tc_h.0
+ vexth.hu.bu vr19, vr1 //tc_h.1
+ vexth.h.b vr2, vr9 //neg_tc_h.1
+ vsllwil.h.b vr9, vr9, 0 //neg_tc_h.0
+
+ vsllwil.hu.bu vr23, vr12, 0 //p1_org_h.0
+ vexth.hu.bu vr3, vr12 //p1_org_h.1
+ vsllwil.hu.bu vr24, vr13, 0 //p0_org_h.0
+ vexth.hu.bu vr4, vr13 //p0_org_h.1
+ vsllwil.hu.bu vr25, vr14, 0 //q0_org_h.0
+ vexth.hu.bu vr6, vr14 //q0_org_h.1
+
+ vabsd.bu vr0, vr11, vr13 //p2_asub_p0
+ vslt.bu vr7, vr0, vr5 //p2_asub_p0 < beta
+ vand.v vr7, vr8, vr7 //is_less_than_beta
+ vsetnez.v $fcc0, vr7
+ bceqz $fcc0, .END_LUMA_BETA //no row needs the p1 update
+ vsllwil.hu.bu vr26, vr11, 0 //p2_org_h.0
+ vexth.hu.bu vr0, vr11 //p2_org_h.1
+ AVC_LPF_P1_OR_Q1 vr24, vr25, vr23, vr26, vr9, vr18, vr27, vr28, vr29 //vr27: p1_h.0
+ AVC_LPF_P1_OR_Q1 vr4, vr6, vr3, vr0, vr2, vr19, vr28, vr29, vr30 //vr28: p1_h.1
+ vpickev.b vr27, vr28, vr27 //pack both halves back to bytes
+ vbitsel.v vr12, vr12, vr27, vr7 //replace p1 only in the selected rows
+ vandi.b vr7, vr7, 1
+ vadd.b vr1, vr1, vr7 //tc += 1 for rows whose p1 was filtered
+.END_LUMA_BETA:
+ vabsd.bu vr26, vr16, vr14 //q2_asub_q0
+ vslt.bu vr7, vr26, vr5 //q2_asub_q0 < beta
+ vand.v vr7, vr7, vr8 //restricted to rows being filtered
+ vsllwil.hu.bu vr27, vr15, 0 //q1_org_h.0
+ vexth.hu.bu vr26, vr15 //q1_org_h.1
+ vsetnez.v $fcc0, vr7
+ bceqz $fcc0, .END_LUMA_BETA_SEC //no row needs the q1 update
+ vsllwil.hu.bu vr28, vr16, 0 //q2_org_h.0
+ vexth.hu.bu vr0, vr16 //q2_org_h.1
+ AVC_LPF_P1_OR_Q1 vr24, vr25, vr27, vr28, vr9, vr18, vr29, vr30, vr31 //vr29: q1_h.0
+ AVC_LPF_P1_OR_Q1 vr4, vr6, vr26, vr0, vr2, vr19, vr22, vr30, vr31 //vr22:q1_h.1
+ vpickev.b vr29, vr22, vr29 //pack both halves back to bytes
+ vbitsel.v vr15, vr15, vr29, vr7 //replace q1 only in the selected rows
+ vandi.b vr7, vr7, 1
+ vadd.b vr1, vr1, vr7 //tc += 1 for rows whose q1 was filtered
+.END_LUMA_BETA_SEC:
+ vneg.b vr22, vr1 //neg_thresh_h
+ vsllwil.h.b vr28, vr22, 0 //neg_thresh_h.0
+ vexth.h.b vr29, vr22 //neg_thresh_h.1
+ vsllwil.hu.bu vr18, vr1, 0 //tc_h.0
+ vexth.hu.bu vr1, vr1 //tc_h.1
+ AVC_LPF_P0Q0 vr25, vr24, vr23, vr27, vr28, vr18, vr30, vr31, vr0, vr2 //low half: vr30 = p0, vr31 = q0 (uses the original p1/q1 widened earlier)
+ AVC_LPF_P0Q0 vr6, vr4, vr3, vr26, vr29, vr1, vr20, vr21, vr0, vr2 //high half: vr20 = p0, vr21 = q0
+ vpickev.b vr30, vr20, vr30 //p0_h
+ vpickev.b vr31, vr21, vr31 //q0_h
+ vbitsel.v vr13, vr13, vr30, vr8 //p0_org
+ vbitsel.v vr14, vr14, vr31, vr8 //q0_org
+ //vr10: p3_org, vr11: p2_org, vr12: p1_org, vr13: p0_org
+ //vr14: q0_org, vr15: q1_org, vr16: q2_org, vr17: q3_org
+ //the interleaves below transpose the 8 column vectors back into 16 rows of 8 bytes
+ vilvl.b vr4, vr12, vr10 // row0.0
+ vilvl.b vr5, vr16, vr14 // row0.1
+ vilvl.b vr6, vr13, vr11 // row2.0
+ vilvl.b vr7, vr17, vr15 // row2.1
+
+ vilvh.b vr8, vr12, vr10 // row1.0
+ vilvh.b vr9, vr16, vr14 // row1.1
+ vilvh.b vr10, vr13, vr11 // row3.0
+ vilvh.b vr11, vr17, vr15 // row3.1
+
+ vilvl.b vr12, vr6, vr4 // row4.0
+ vilvl.b vr13, vr7, vr5 // row4.1
+ vilvl.b vr14, vr10, vr8 // row6.0
+ vilvl.b vr15, vr11, vr9 // row6.1
+
+ vilvh.b vr16, vr6, vr4 // row5.0
+ vilvh.b vr17, vr7, vr5 // row5.1
+ vilvh.b vr18, vr10, vr8 // row7.0
+ vilvh.b vr19, vr11, vr9 // row7.1
+
+ vilvl.w vr4, vr13, vr12 // row4: 0, 4, 1, 5
+ vilvh.w vr5, vr13, vr12 // row4: 2, 6, 3, 7
+ vilvl.w vr6, vr17, vr16 // row5: 0, 4, 1, 5
+ vilvh.w vr7, vr17, vr16 // row5: 2, 6, 3, 7
+
+ vilvl.w vr8, vr15, vr14 // row6: 0, 4, 1, 5
+ vilvh.w vr9, vr15, vr14 // row6: 2, 6, 3, 7
+ vilvl.w vr10, vr19, vr18 // row7: 0, 4, 1, 5
+ vilvh.w vr11, vr19, vr18 // row7: 2, 6, 3, 7
+
+ vbsrl.v vr20, vr4, 8 //move the upper 8 bytes (the odd row of each pair) down so fst.d can store them
+ vbsrl.v vr21, vr5, 8
+ vbsrl.v vr22, vr6, 8
+ vbsrl.v vr23, vr7, 8
+
+ vbsrl.v vr24, vr8, 8
+ vbsrl.v vr25, vr9, 8
+ vbsrl.v vr26, vr10, 8
+ vbsrl.v vr27, vr11, 8
+
+ fst.d f4, t4, 0 //write back 16 rows of 8 bytes, starting at pix - 4
+ fstx.d f20, t4, a1
+ fstx.d f5, t4, t0
+ fstx.d f21, t4, t3
+ add.d t4, t4, t1 //advance 4 rows
+ fst.d f6, t4, 0
+ fstx.d f22, t4, a1
+ fstx.d f7, t4, t0
+ fstx.d f23, t4, t3
+ add.d t4, t4, t1 //advance 4 rows
+ fst.d f8, t4, 0
+ fstx.d f24, t4, a1
+ fstx.d f9, t4, t0
+ fstx.d f25, t4, t3
+ add.d t4, t4, t1 //advance 4 rows
+ fst.d f10, t4, 0
+ fstx.d f26, t4, a1
+ fstx.d f11, t4, t0
+ fstx.d f27, t4, t3
+
+.END_LUMA_8:
+ fld.d f24, sp, 0 //restore the registers saved on entry
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+function ff_h264_v_lpf_luma_8_lsx //tc-based luma deblock across a horizontal edge: a0 = pix (q0 row), a1 = stride, a2 = alpha, a3 = beta, a4 = pointer to 4 tc0 bytes; filters 16 columns
+ slli.d t0, a1, 1 //img_width_2x
+ la.local t4, vec_shuf
+ vldrepl.w vr0, a4, 0 //tmp_vec0: the 4 tc0 bytes replicated into every word
+ vld vr1, t4, 0 //tc_vec: shuffle indices from vec_shuf
+ add.d t1, t0, a1 //img_width_3x
+ vshuf.b vr1, vr0, vr0, vr1 //tc_vec: each tc0 byte spread over 4 consecutive columns
+ addi.d sp, sp, -32 //32-byte frame (slot at 24 unused) so sp stays 16-byte aligned
+ fst.d f24, sp, 0 //vr24-vr26 are used as scratch below
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ vslti.b vr2, vr1, 0 //0xff where tc < 0
+ vxori.b vr2, vr2, 255 //invert: 0xff where tc >= 0
+ vandi.b vr2, vr2, 1 //bs_vec: 1 where tc >= 0, else 0
+ vsetnez.v $fcc0, vr2
+ bceqz $fcc0, .END_V_LUMA_8 //every lane has tc < 0: nothing to filter
+ sub.d t2, a0, t1 //data - img_width_3x
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+ vldi vr0, 0 //zero
+ vld vr10, t2, 0 //p2_org
+ vldx vr11, t2, a1 //p1_org
+ vldx vr12, t2, t0 //p0_org
+ vld vr13, a0, 0 //q0_org
+ vldx vr14, a0, a1 //q1_org
+
+ vslt.bu vr0, vr0, vr2 //is_bs_greater_than0
+ vabsd.bu vr16, vr11, vr12 //p1_asub_p0
+ vabsd.bu vr15, vr12, vr13 //p0_asub_q0
+ vabsd.bu vr17, vr14, vr13 //q1_asub_q0
+
+ vslt.bu vr6, vr15, vr4 //is_less_than_alpha
+ vslt.bu vr7, vr16, vr5 //is_less_than_beta
+ vand.v vr8, vr6, vr7 //is_less_than
+ vslt.bu vr7, vr17, vr5 //is_less_than_beta
+ vand.v vr8, vr7, vr8
+ vand.v vr8, vr8, vr0 //is_less_than: final per-column filter mask, kept in vr8 to the end
+
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_V_LUMA_8 //no column passes the alpha/beta test
+ vldx vr15, a0, t0 //q2_org
+ vneg.b vr0, vr1 //neg_tc_h
+ vsllwil.h.b vr18, vr1, 0 //tc_h.0
+ vexth.h.b vr19, vr1 //tc_h.1
+ vsllwil.h.b vr9, vr0, 0 //neg_tc_h.0
+ vexth.h.b vr2, vr0 //neg_tc_h.1
+
+ vsllwil.hu.bu vr16, vr11, 0 //p1_org_h.0
+ vexth.hu.bu vr17, vr11 //p1_org_h.1
+ vsllwil.hu.bu vr20, vr12, 0 //p0_org_h.0
+ vexth.hu.bu vr21, vr12 //p0_org_h.1
+ vsllwil.hu.bu vr22, vr13, 0 //q0_org_h.0
+ vexth.hu.bu vr23, vr13 //q0_org_h.1
+
+ vabsd.bu vr0, vr10, vr12 //p2_asub_p0
+ vslt.bu vr7, vr0, vr5 //is_less_than_beta
+ vand.v vr7, vr7, vr8 //is_less_than_beta
+
+ vsetnez.v $fcc0, vr7 //test the p1 mask itself; vr8 is already known non-zero, so testing it never skipped
+ bceqz $fcc0, .END_V_LESS_BETA //no column needs the p1 update
+ vsllwil.hu.bu vr3, vr10, 0 //p2_org_h.0
+ vexth.hu.bu vr4, vr10 //p2_org_h.1
+ AVC_LPF_P1_OR_Q1 vr20, vr22, vr16, vr3, vr9, vr18, vr24, vr0, vr26 //vr24: p1_h.0
+ AVC_LPF_P1_OR_Q1 vr21, vr23, vr17, vr4, vr2, vr19, vr25, vr0, vr26 //vr25: p1_h.1
+ vpickev.b vr24, vr25, vr24 //pack both halves back to bytes
+ vbitsel.v vr24, vr11, vr24, vr7 //filtered p1 only in the selected columns
+ //the whole p1 row is rewritten; unselected columns keep their original bytes
+ vstx vr24, t2, a1
+ vandi.b vr7, vr7, 1
+ vadd.b vr1, vr7, vr1 //tc += 1 for columns whose p1 was filtered
+.END_V_LESS_BETA:
+ vabsd.bu vr0, vr15, vr13 //q2_asub_q0
+ vslt.bu vr7, vr0, vr5 //is_less_than_beta
+ vand.v vr7, vr7, vr8 //is_less_than_beta
+ vsllwil.hu.bu vr3, vr14, 0 //q1_org_h.0
+ vexth.hu.bu vr4, vr14 //q1_org_h.1
+
+ vsetnez.v $fcc0, vr7
+ bceqz $fcc0, .END_V_LESS_BETA_SEC //no column needs the q1 update
+ vsllwil.hu.bu vr11, vr15, 0 //q2_org_h.0
+ vexth.hu.bu vr15, vr15 //q2_org_h.1
+ AVC_LPF_P1_OR_Q1 vr20, vr22, vr3, vr11, vr9, vr18, vr24, vr0, vr26 //vr24: q1_h.0
+ AVC_LPF_P1_OR_Q1 vr21, vr23, vr4, vr15, vr2, vr19, vr25, vr0, vr26 //vr25: q1_h.1
+ vpickev.b vr24, vr25, vr24 //pack both halves back to bytes
+ vbitsel.v vr24, vr14, vr24, vr7 //filtered q1 only in the selected columns
+ vstx vr24, a0, a1
+ vandi.b vr7, vr7, 1
+ vadd.b vr1, vr1, vr7 //tc += 1 for columns whose q1 was filtered
+.END_V_LESS_BETA_SEC:
+ vneg.b vr0, vr1
+ vsllwil.h.b vr9, vr0, 0 //neg_thresh_h.0
+ vexth.h.b vr2, vr0 //neg_thresh_h.1
+ vsllwil.hu.bu vr18, vr1, 0 //tc_h.0
+ vexth.hu.bu vr19, vr1 //tc_h.1
+ AVC_LPF_P0Q0 vr22, vr20, vr16, vr3, vr9, vr18, vr11, vr15, vr0, vr26 //low half: vr11 = p0, vr15 = q0 (original p1/q1 as inputs)
+ AVC_LPF_P0Q0 vr23, vr21, vr17, vr4, vr2, vr19, vr10, vr14, vr0, vr26 //high half: vr10 = p0, vr14 = q0
+ vpickev.b vr11, vr10, vr11 //p0_h
+ vpickev.b vr15, vr14, vr15 //q0_h
+ vbitsel.v vr11, vr12, vr11, vr8 //p0_h
+ vbitsel.v vr15, vr13, vr15, vr8 //q0_h
+ vstx vr11, t2, t0
+ vst vr15, a0, 0
+.END_V_LUMA_8:
+ fld.d f24, sp, 0 //restore the registers saved on entry
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ addi.d sp, sp, 32
+endfunc
+
+const chroma_shuf //vshuf.b index table for the chroma filters: each of 4 tc0 bytes covers 2 lanes
+.byte 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3 //same 8-entry pattern in both 64-bit halves, so either half can be widened
+endconst
+
+function ff_h264_h_lpf_chroma_8_lsx //tc-based chroma deblock across a vertical edge: a0 = pix, a1 = stride, a2 = alpha, a3 = beta, a4 = pointer to 4 tc0 bytes; updates p0/q0 in 8 rows
+ slli.d t0, a1, 1 //img_width_2x
+ slli.d t1, a1, 2 //img_width_4x
+ la.local t4, chroma_shuf
+ add.d t2, t0, a1 //img_width_3x
+ vldrepl.w vr0, a4, 0 //tmp_vec0
+ vld vr1, t4, 0 //tc_vec
+ vshuf.b vr1, vr0, vr0, vr1 //tc_vec
+ vslti.b vr2, vr1, 0 //0xff where tc < 0
+ vxori.b vr2, vr2, 255 //invert: 0xff where tc >= 0
+ vandi.b vr2, vr2, 1 //bs_vec
+ vsetnez.v $fcc0, vr2
+ bceqz $fcc0, .END_CHROMA_8 //every lane has tc < 0: nothing to filter
+ vldi vr0, 0
+ addi.d t4, a0, -2 //2 pixels left of the edge (p1 column)
+ vslt.bu vr3, vr0, vr2 //is_bs_greater_than0
+ add.d t5, t4, t1 //row 4
+ vld vr4, t4, 0 //row0; NOTE(review): loads 16 bytes although only bytes 0..3 reach the shuffles below - confirm the over-read is acceptable
+ vldx vr5, t4, a1 //row1
+ vldx vr6, t4, t0 //row2
+ vldx vr7, t4, t2 //row3
+ vld vr8, t5, 0 //row4
+ vldx vr9, t5, a1 //row5
+ vldx vr10, t5, t0 //row6
+ vldx vr11, t5, t2 //row7
+ vilvl.b vr12, vr6, vr4 //p1_org
+ vilvl.b vr13, vr7, vr5 //p0_org
+ vilvl.b vr14, vr10, vr8 //q0_org
+ vilvl.b vr15, vr11, vr9 //q1_org
+ vilvl.b vr4, vr13, vr12 //row0
+ vilvl.b vr5, vr15, vr14 //row1
+ vilvl.w vr6, vr5, vr4 //row2
+ vilvh.w vr7, vr5, vr4 //row3
+ vilvl.d vr12, vr6, vr6 //p1_org
+ vilvh.d vr13, vr6, vr6 //p0_org
+ vilvl.d vr14, vr7, vr7 //q0_org
+ vilvh.d vr15, vr7, vr7 //q1_org
+ //each column vector now holds its 8 row samples duplicated in both 64-bit halves
+ vabsd.bu vr20, vr13, vr14 //p0_asub_q0
+ vabsd.bu vr21, vr12, vr13 //p1_asub_p0
+ vabsd.bu vr22, vr15, vr14 //q1_asub_q0
+
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+
+ vslt.bu vr6, vr20, vr4 //is_less_than_alpha
+ vslt.bu vr7, vr21, vr5 //is_less_than_beta
+ vand.v vr8, vr6, vr7 //is_less_than
+ vslt.bu vr7, vr22, vr5 //is_less_than_beta
+ vand.v vr8, vr7, vr8 //is_less_than
+ vand.v vr8, vr8, vr3 //is_less_than
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_CHROMA_8 //no row passes the alpha/beta test
+
+ vneg.b vr9, vr1 //neg_tc_h
+ vexth.hu.bu vr3, vr12 //p1_org_h
+ vexth.hu.bu vr4, vr13 //p0_org_h.1
+ vexth.hu.bu vr5, vr14 //q0_org_h.1
+ vexth.hu.bu vr6, vr15 //q1_org_h.1
+
+ vexth.hu.bu vr18, vr1 //tc_h.1
+ vexth.h.b vr2, vr9 //neg_tc_h.1
+
+ AVC_LPF_P0Q0 vr5, vr4, vr3, vr6, vr2, vr18, vr10, vr11, vr16, vr17 //vr10 = filtered p0, vr11 = filtered q0 (16-bit)
+ vpickev.b vr10, vr10, vr10 //p0_h
+ vpickev.b vr11, vr11, vr11 //q0_h
+ vbitsel.v vr13, vr13, vr10, vr8 //keep original p0 where the mask is clear
+ vbitsel.v vr14, vr14, vr11, vr8 //keep original q0 where the mask is clear
+ vilvl.b vr15, vr14, vr13 //one (p0, q0) byte pair per row
+ addi.d t4, t4, 1 //p0 column = pix - 1
+ add.d t5, t4, a1
+ add.d t6, t4, t0
+ add.d t7, t4, t2
+ vstelm.h vr15, t4, 0, 0 //store 2 bytes (p0, q0) per row, rows 0..3
+ vstelm.h vr15, t5, 0, 1
+ vstelm.h vr15, t6, 0, 2
+ vstelm.h vr15, t7, 0, 3
+ add.d t4, t4, t1 //advance 4 rows
+ add.d t5, t4, a1
+ add.d t6, t4, t0
+ add.d t7, t4, t2
+ vstelm.h vr15, t4, 0, 4 //rows 4..7
+ vstelm.h vr15, t5, 0, 5
+ vstelm.h vr15, t6, 0, 6
+ vstelm.h vr15, t7, 0, 7
+.END_CHROMA_8:
+endfunc
+
+function ff_h264_v_lpf_chroma_8_lsx //tc-based chroma deblock across a horizontal edge: a0 = pix (q0 row), a1 = stride, a2 = alpha, a3 = beta, a4 = pointer to 4 tc0 bytes; updates 8 columns of p0/q0
+ slli.d t0, a1, 1 //img_width_2x
+ la.local t4, chroma_shuf
+ vldrepl.w vr0, a4, 0 //tmp_vec0
+ vld vr1, t4, 0 //tc_vec
+ vshuf.b vr1, vr0, vr0, vr1 //tc_vec
+ vslti.b vr2, vr1, 0 //0xff where tc < 0
+ vxori.b vr2, vr2, 255 //invert: 0xff where tc >= 0
+ vandi.b vr2, vr2, 1 //bs_vec
+ vsetnez.v $fcc0, vr2
+ bceqz $fcc0, .END_CHROMA_V_8 //every lane has tc < 0: nothing to filter
+ vldi vr0, 0
+ sub.d t4, a0, t0 //p1 row = pix - 2 * stride
+ vslt.bu vr3, vr0, vr2 //is_bs_greater_than0
+ vld vr12, t4, 0 //p1_org
+ vldx vr13, t4, a1 //p0_org
+ vld vr14, a0, 0 //q0_org
+ vldx vr15, a0, a1 //q1_org
+
+ vabsd.bu vr20, vr13, vr14 //p0_asub_q0
+ vabsd.bu vr21, vr12, vr13 //p1_asub_p0
+ vabsd.bu vr22, vr15, vr14 //q1_asub_q0
+
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+
+ vslt.bu vr6, vr20, vr4 //is_less_than_alpha
+ vslt.bu vr7, vr21, vr5 //is_less_than_beta
+ vand.v vr8, vr6, vr7 //is_less_than
+ vslt.bu vr7, vr22, vr5 //is_less_than_beta
+ vand.v vr8, vr7, vr8 //is_less_than
+ vand.v vr8, vr8, vr3 //is_less_than
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_CHROMA_V_8 //no column passes the test (upper 8 lanes are tested too, but only the lower 8 are stored)
+
+ vneg.b vr9, vr1 //neg_tc_h
+ vsllwil.hu.bu vr3, vr12, 0 //p1_org_h (low 8 columns)
+ vsllwil.hu.bu vr4, vr13, 0 //p0_org_h (low 8 columns)
+ vsllwil.hu.bu vr5, vr14, 0 //q0_org_h (low 8 columns)
+ vsllwil.hu.bu vr6, vr15, 0 //q1_org_h (low 8 columns)
+
+ vexth.hu.bu vr18, vr1 //tc_h.1: high half of tc_vec, identical to the low half because chroma_shuf repeats its pattern
+ vexth.h.b vr2, vr9 //neg_tc_h.1
+
+ AVC_LPF_P0Q0 vr5, vr4, vr3, vr6, vr2, vr18, vr10, vr11, vr16, vr17 //vr10 = filtered p0, vr11 = filtered q0 (16-bit)
+ vpickev.b vr10, vr10, vr10 //p0_h
+ vpickev.b vr11, vr11, vr11 //q0_h
+ vbitsel.v vr10, vr13, vr10, vr8 //keep original p0 where the mask is clear
+ vbitsel.v vr11, vr14, vr11, vr8 //keep original q0 where the mask is clear
+ fstx.d f10, t4, a1 //store 8 bytes of p0
+ fst.d f11, a0, 0 //store 8 bytes of q0
+.END_CHROMA_V_8:
+endfunc
+
+.macro AVC_LPF_P0P1P2_OR_Q0Q1Q2 _in0, _in1, _in2, _in3, _in4, _in5, \
+ _out0, _out1, _out2, _tmp0, _const3 //strong intra filter for one side of the edge; callers pass (p3, p0, q0, p1, p2, q1) or the mirrored q-side set; _const3 is used both as multiplier and as rounding-shift count
+ vadd.h \_tmp0, \_in1, \_in2 //_in1 + _in2
+ vadd.h \_tmp0, \_tmp0, \_in3 //tmp0 = _in1 + _in2 + _in3
+ vslli.h \_out2, \_in0, 1 //2 * _in0
+ vslli.h \_out0, \_tmp0, 1 //2 * tmp0
+ vadd.h \_out0, \_out0, \_in4
+ vadd.h \_out1, \_in4, \_tmp0 //_out1 sum = _in4 + tmp0
+ vadd.h \_out0, \_out0, \_in5 //_out0 sum = 2 * tmp0 + _in4 + _in5
+ vmadd.h \_out2, \_in4, \_const3 //_out2 += _in4 * _const3
+ vsrar.h \_out0, \_out0, \_const3 //rounding shift right by _const3
+ vadd.h \_out2, \_out2, \_tmp0 //_out2 sum = 2 * _in0 + _in4 * _const3 + tmp0
+ vsrari.h \_out1, \_out1, 2 //(_in4 + tmp0 + 2) >> 2
+ vsrar.h \_out2, \_out2, \_const3 //rounding shift right by _const3
+.endm
+
+.macro AVC_LPF_P0_OR_Q0 _in0, _in1, _in2, _out0, _tmp0 //_out0 = (_in0 + _in1 + 2 * _in2 + 2) >> 2 on 16-bit lanes; callers pass (p0, q1, p1) or (q0, p1, q1)
+ vslli.h \_tmp0, \_in2, 1 //2 * _in2
+ vadd.h \_out0, \_in0, \_in1
+ vadd.h \_out0, \_out0, \_tmp0
+ vsrari.h \_out0, \_out0, 2 //rounding shift right by 2
+.endm
+
+//LSX optimization is enough for this function.
+function ff_h264_h_lpf_luma_intra_8_lsx //intra luma deblock across a vertical edge: a0 = pix, a1 = stride, a2 = alpha, a3 = beta; processes 16 rows x 8 columns starting at pix - 4
+ slli.d t0, a1, 1 //img_width_2x
+ slli.d t1, a1, 2 //img_width_4x
+ addi.d t4, a0, -4 //src
+ addi.d sp, sp, -64 //64-byte frame for f24-f31, which are restored at .END_H_INTRA_8
+ fst.d f24, sp, 0 //vr24-vr31 are used as scratch below
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ add.d t2, t0, a1 //img_width_3x
+ add.d t5, t4, t1 //row 4
+ vld vr0, t4, 0 //row0
+ vldx vr1, t4, a1 //row1
+ vldx vr2, t4, t0 //row2
+ vldx vr3, t4, t2 //row3
+ add.d t6, t5, t1 //row 8
+ vld vr4, t5, 0 //row4
+ vldx vr5, t5, a1 //row5
+ vldx vr6, t5, t0 //row6
+ vldx vr7, t5, t2 //row7
+ add.d t7, t6, t1 //row 12
+ vld vr8, t6, 0 //row8
+ vldx vr9, t6, a1 //row9
+ vldx vr10, t6, t0 //row10
+ vldx vr11, t6, t2 //row11
+ vld vr12, t7, 0 //row12
+ vldx vr13, t7, a1 //row13
+ vldx vr14, t7, t0 //row14
+ vldx vr15, t7, t2 //row15
+ LSX_TRANSPOSE16X8_B vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+ // vr0: p3_org, vr1: p2_org, vr2: p1_org, vr3: p0_org
+ // vr4: q0_org, vr5: q1_org, vr6: q2_org, vr7: q3_org
+
+ vreplgr2vr.b vr16, a2 //alpha_in
+ vreplgr2vr.b vr17, a3 //beta_in
+ vabsd.bu vr10, vr3, vr4 //p0_asub_q0
+ vabsd.bu vr11, vr2, vr3 //p1_asub_p0
+ vabsd.bu vr12, vr5, vr4 //q1_asub_q0
+
+ vslt.bu vr8, vr10, vr16 //is_less_than_alpha
+ vslt.bu vr9, vr11, vr17 //is_less_than_beta
+ vand.v vr18, vr8, vr9 //is_less_than
+ vslt.bu vr9, vr12, vr17 //is_less_than_beta
+ vand.v vr18, vr18, vr9 //is_less_than: per-row filter mask, kept in vr18
+
+ vsetnez.v $fcc0, vr18
+ bceqz $fcc0, .END_H_INTRA_8 //no row passes the alpha/beta test
+ vsrli.b vr16, vr16, 2 //less_alpha_shift2_add2
+ vaddi.bu vr16, vr16, 2 //(alpha >> 2) + 2
+ vslt.bu vr16, vr10, vr16 //p0_asub_q0 < (alpha >> 2) + 2: strong-filter precondition
+ vsllwil.hu.bu vr10, vr2, 0 //p1_org_h.0
+ vexth.hu.bu vr11, vr2 //p1_org_h.1
+ vsllwil.hu.bu vr12, vr3, 0 //p0_org_h.0
+ vexth.hu.bu vr13, vr3 //p0_org_h.1
+
+ vsllwil.hu.bu vr14, vr4, 0 //q0_org_h.0
+ vexth.hu.bu vr15, vr4 //q0_org_h.1
+ vsllwil.hu.bu vr19, vr5, 0 //q1_org_h.0
+ vexth.hu.bu vr20, vr5 //q1_org_h.1
+
+ vabsd.bu vr21, vr1, vr3 //p2_asub_p0
+ vslt.bu vr9, vr21, vr17 //is_less_than_beta
+ vand.v vr9, vr9, vr16 //strong-filter condition for the p side
+ vxori.b vr22, vr9, 0xff //negate_is_less_than_beta
+ vand.v vr9, vr9, vr18 //vr9: rows getting the strong p filter
+ vand.v vr22, vr22, vr18 //vr22: rows getting only the weak p0 filter
+
+ vsetnez.v $fcc0, vr9
+ bceqz $fcc0, .END_H_INTRA_LESS_BETA //no row needs the strong p filter
+ vsllwil.hu.bu vr23, vr1, 0 //p2_org_h.0
+ vexth.hu.bu vr24, vr1 //p2_org_h.1
+ vsllwil.hu.bu vr25, vr0, 0 //p3_org_h.0
+ vexth.hu.bu vr26, vr0 //p3_org_h.1
+ vldi vr27, 0x403 //broadcast constant for the macro (presumably 3 in every 16-bit lane - confirm against the vldi encoding)
+
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr25, vr12, vr14, vr10, vr23, vr19, vr28, vr29, vr30, vr31, vr27
+ //vr28: p0_h.0 vr29: p1_h.0 vr30: p2_h.0
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr26, vr13, vr15, vr11, vr24, vr20, vr23, vr25, vr21, vr31, vr27
+ //vr23: p0_h.1 vr25: p1_h.1 vr21: p2_h.1
+ vpickev.b vr28, vr23, vr28 //p0_h
+ vpickev.b vr29, vr25, vr29 //p1_h
+ vpickev.b vr30, vr21, vr30 //p2_h
+ vbitsel.v vr3, vr3, vr28, vr9 //strong-filtered p0 in the selected rows
+ vbitsel.v vr2, vr2, vr29, vr9 //strong-filtered p1 in the selected rows
+ vbitsel.v vr1, vr1, vr30, vr9 //strong-filtered p2 in the selected rows
+.END_H_INTRA_LESS_BETA:
+ AVC_LPF_P0_OR_Q0 vr12, vr19, vr10, vr23, vr25 //weak p0 from the original widened p0, q1, p1 (low half)
+ AVC_LPF_P0_OR_Q0 vr13, vr20, vr11, vr24, vr25 //same for the high half
+ //vr23: p0_h.0 vr24: p0_h.1
+ vpickev.b vr23, vr24, vr23
+ vbitsel.v vr3, vr3, vr23, vr22 //weak p0 only in the rows selected by vr22
+
+ vabsd.bu vr21, vr6, vr4 //q2_asub_q0
+ vslt.bu vr9, vr21, vr17 //is_less_than_beta
+ vand.v vr9, vr9, vr16 //strong-filter condition for the q side
+ vxori.b vr22, vr9, 0xff //negate_is_less_than_beta
+ vand.v vr9, vr9, vr18 //vr9: rows getting the strong q filter
+ vand.v vr22, vr22, vr18 //vr22: rows getting only the weak q0 filter
+
+ vsetnez.v $fcc0, vr9
+ bceqz $fcc0, .END_H_INTRA_LESS_BETA_SEC //no row needs the strong q filter
+ vsllwil.hu.bu vr23, vr6, 0 //q2_org_h.0
+ vexth.hu.bu vr24, vr6 //q2_org_h.1
+ vsllwil.hu.bu vr25, vr7, 0 //q3_org_h.0
+ vexth.hu.bu vr26, vr7 //q3_org_h.1
+ vldi vr27, 0x403 //reloaded because the p-side block above may have been skipped
+
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr25, vr14, vr12, vr19, vr23, vr10, vr28, vr29, vr30, vr31, vr27 //low half: vr28 = q0, vr29 = q1, vr30 = q2
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr26, vr15, vr13, vr20, vr24, vr11, vr23, vr25, vr21, vr31, vr27 //high half: vr23 = q0, vr25 = q1, vr21 = q2
+ vpickev.b vr28, vr23, vr28 //q0_h
+ vpickev.b vr29, vr25, vr29 //q1_h
+ vpickev.b vr30, vr21, vr30 //q2_h
+ vbitsel.v vr4, vr4, vr28, vr9 //strong-filtered q0 in the selected rows
+ vbitsel.v vr5, vr5, vr29, vr9 //strong-filtered q1 in the selected rows
+ vbitsel.v vr6, vr6, vr30, vr9 //strong-filtered q2 in the selected rows
+.END_H_INTRA_LESS_BETA_SEC:
+ AVC_LPF_P0_OR_Q0 vr14, vr10, vr19, vr23, vr25 //weak q0 from the original widened q0, p1, q1 (low half)
+ AVC_LPF_P0_OR_Q0 vr15, vr11, vr20, vr24, vr25 //same for the high half
+ vpickev.b vr23, vr24, vr23
+ vbitsel.v vr4, vr4, vr23, vr22 //weak q0 only in the rows selected by vr22
+ //the interleaves below transpose the 8 column vectors back into 16 rows of 8 bytes
+ vilvl.b vr14, vr2, vr0 // row0.0
+ vilvl.b vr15, vr6, vr4 // row0.1
+ vilvl.b vr16, vr3, vr1 // row2.0
+ vilvl.b vr17, vr7, vr5 // row2.1
+
+ vilvh.b vr18, vr2, vr0 // row1.0
+ vilvh.b vr19, vr6, vr4 // row1.1
+ vilvh.b vr20, vr3, vr1 // row3.0
+ vilvh.b vr21, vr7, vr5 // row3.1
+
+ vilvl.b vr2, vr16, vr14 // row4.0
+ vilvl.b vr3, vr17, vr15 // row4.1
+ vilvl.b vr4, vr20, vr18 // row6.0
+ vilvl.b vr5, vr21, vr19 // row6.1
+
+ vilvh.b vr6, vr16, vr14 // row5.0
+ vilvh.b vr7, vr17, vr15 // row5.1
+ vilvh.b vr8, vr20, vr18 // row7.0
+ vilvh.b vr9, vr21, vr19 // row7.1
+
+ vilvl.w vr14, vr3, vr2 // row4: 0, 4, 1, 5
+ vilvh.w vr15, vr3, vr2 // row4: 2, 6, 3, 7
+ vilvl.w vr16, vr7, vr6 // row5: 0, 4, 1, 5
+ vilvh.w vr17, vr7, vr6 // row5: 2, 6, 3, 7
+
+ vilvl.w vr18, vr5, vr4 // row6: 0, 4, 1, 5
+ vilvh.w vr19, vr5, vr4 // row6: 2, 6, 3, 7
+ vilvl.w vr20, vr9, vr8 // row7: 0, 4, 1, 5
+ vilvh.w vr21, vr9, vr8 // row7: 2, 6, 3, 7
+
+ vbsrl.v vr0, vr14, 8 //move the upper 8 bytes (the odd row of each pair) down so fst.d can store them
+ vbsrl.v vr1, vr15, 8
+ vbsrl.v vr2, vr16, 8
+ vbsrl.v vr3, vr17, 8
+
+ vbsrl.v vr4, vr18, 8
+ vbsrl.v vr5, vr19, 8
+ vbsrl.v vr6, vr20, 8
+ vbsrl.v vr7, vr21, 8
+
+ fst.d f14, t4, 0 //write back 16 rows of 8 bytes, starting at pix - 4
+ fstx.d f0, t4, a1
+ fstx.d f15, t4, t0
+ fstx.d f1, t4, t2
+ fst.d f16, t5, 0 //rows 4..7
+ fstx.d f2, t5, a1
+ fstx.d f17, t5, t0
+ fstx.d f3, t5, t2
+ fst.d f18, t6, 0 //rows 8..11
+ fstx.d f4, t6, a1
+ fstx.d f19, t6, t0
+ fstx.d f5, t6, t2
+ fst.d f20, t7, 0 //rows 12..15
+ fstx.d f6, t7, a1
+ fstx.d f21, t7, t0
+ fstx.d f7, t7, t2
+.END_H_INTRA_8:
+ fld.d f24, sp, 0 //restore the registers saved on entry
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+//LSX optimization is enough for this function.
+function ff_h264_v_lpf_luma_intra_8_lsx //intra luma deblock across a horizontal edge: a0 = pix (q0 row), a1 = stride, a2 = alpha, a3 = beta; filters 16 columns and stores each modified row directly
+ slli.d t0, a1, 1 //img_width_2x
+ add.d t1, t0, a1 //img_width_3x
+ addi.d sp, sp, -64 //64-byte frame for f24-f31, which are restored at .END_V_INTRA_8
+ fst.d f24, sp, 0 //vr24-vr31 are used as scratch below
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ sub.d t4, a0, t1 //src - img_width_3x
+
+ vld vr0, a0, 0 //q0_org
+ vldx vr1, a0, a1 //q1_org
+ vldx vr2, t4, a1 //p1_org
+ vldx vr3, t4, t0 //p0_org
+
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+
+ vabsd.bu vr6, vr3, vr0 //p0_asub_q0
+ vabsd.bu vr7, vr2, vr3 //p1_asub_p0
+ vabsd.bu vr8, vr1, vr0 //q1_asub_q0
+
+ vslt.bu vr9, vr6, vr4 //is_less_than_alpha
+ vslt.bu vr10, vr7, vr5 //is_less_than_beta
+ vand.v vr11, vr9, vr10 //is_less_than
+ vslt.bu vr10, vr8, vr5 //q1_asub_q0 < beta
+ vand.v vr11, vr10, vr11 //is_less_than: per-column filter mask, kept in vr11
+
+ vsetnez.v $fcc0, vr11
+ bceqz $fcc0, .END_V_INTRA_8 //no column passes the alpha/beta test
+
+ vld vr12, t4, 0 //p2_org
+ vldx vr13, a0, t0 //q2_org
+ vsrli.b vr14, vr4, 2 //is_alpha_shift2_add2
+ vsllwil.hu.bu vr15, vr2, 0 //p1_org_h.0
+ vexth.hu.bu vr16, vr2 //p1_org_h.1
+ vaddi.bu vr14, vr14, 2 //(alpha >> 2) + 2
+ vsllwil.hu.bu vr17, vr3, 0 //p0_org_h.0
+ vexth.hu.bu vr18, vr3 //p0_org_h.1
+ vslt.bu vr14, vr6, vr14 //p0_asub_q0 < (alpha >> 2) + 2: strong-filter precondition
+ vsllwil.hu.bu vr19, vr0, 0 //q0_org_h.0
+ vexth.hu.bu vr20, vr0 //q0_org_h.1
+ vsllwil.hu.bu vr21, vr1, 0 //q1_org_h.0
+ vexth.hu.bu vr22, vr1 //q1_org_h.1
+
+ vabsd.bu vr23, vr12, vr3 //p2_asub_p0
+ vslt.bu vr10, vr23, vr5 //is_less_than_beta
+ vand.v vr10, vr10, vr14 //strong-filter condition for the p side
+ vxori.b vr23, vr10, 0xff //negate_is_less_than_beta
+ vand.v vr10, vr10, vr11 //vr10: columns getting the strong p filter
+ vand.v vr23, vr23, vr11 //vr23: columns getting only the weak p0 filter
+
+ vsetnez.v $fcc0, vr10
+ bceqz $fcc0, .END_V_INTRA_LESS_BETA //no column needs the strong p filter
+ sub.d t5, t4, a1 //p3 row = pix - 4 * stride
+ vld vr24, t5, 0 //p3_org
+ vsllwil.hu.bu vr26, vr12, 0 //p2_org_h.0
+ vexth.hu.bu vr27, vr12 //p2_org_h.1
+ vsllwil.hu.bu vr28, vr24, 0 //p3_org_h.0
+ vexth.hu.bu vr29, vr24 //p3_org_h.1
+ vldi vr4, 0x403 //broadcast constant for the macro (presumably 3 in every 16-bit lane - confirm against the vldi encoding); alpha in vr4 is no longer needed
+
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr28, vr17, vr19, vr15, vr26, vr21, vr25, vr30, vr31, vr24, vr4 //low half: vr25 = p0, vr30 = p1, vr31 = p2
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr29, vr18, vr20, vr16, vr27, vr22, vr6, vr7, vr8, vr24, vr4 //high half: vr6 = p0, vr7 = p1, vr8 = p2
+
+ vpickev.b vr25, vr6, vr25 //p0_h
+ vpickev.b vr30, vr7, vr30 //p1_h
+ vpickev.b vr31, vr8, vr31 //p2_h
+
+ vbitsel.v vr3, vr3, vr25, vr10 //strong-filtered p0 in the selected columns
+ vbitsel.v vr2, vr2, vr30, vr10 //strong-filtered p1 in the selected columns
+ vbitsel.v vr12, vr12, vr31, vr10 //strong-filtered p2 in the selected columns
+
+ vstx vr2, t4, a1 //store p1 row
+ vst vr12, t4, 0 //store p2 row
+.END_V_INTRA_LESS_BETA:
+ AVC_LPF_P0_OR_Q0 vr17, vr21, vr15, vr24, vr30 //weak p0 from the original widened p0, q1, p1 (low half)
+ AVC_LPF_P0_OR_Q0 vr18, vr22, vr16, vr25, vr30 //same for the high half
+ vpickev.b vr24, vr25, vr24
+ vbitsel.v vr3, vr3, vr24, vr23 //weak p0 only in the columns selected by vr23
+ vstx vr3, t4, t0 //store p0 row
+
+ vabsd.bu vr23, vr13, vr0 //q2_asub_q0
+ vslt.bu vr10, vr23, vr5 //is_less_than_beta
+ vand.v vr10, vr10, vr14 //strong-filter condition for the q side
+ vxori.b vr23, vr10, 0xff //negate_is_less_than_beta
+ vand.v vr10, vr10, vr11 //vr10: columns getting the strong q filter
+ vand.v vr23, vr23, vr11 //vr23: columns getting only the weak q0 filter
+
+ vsetnez.v $fcc0, vr10
+ bceqz $fcc0, .END_V_INTRA_LESS_BETA_SEC //no column needs the strong q filter
+ vldx vr24, a0, t1 //q3_org
+
+ vsllwil.hu.bu vr26, vr13, 0 //q2_org_h.0
+ vexth.hu.bu vr27, vr13 //q2_org_h.1
+ vsllwil.hu.bu vr28, vr24, 0 //q3_org_h.0
+ vexth.hu.bu vr29, vr24 //q3_org_h.1
+ vldi vr4, 0x403 //reloaded because the p-side block above may have been skipped
+
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr28, vr19, vr17, vr21, vr26, vr15, vr25, vr30, vr31, vr24, vr4 //low half: vr25 = q0, vr30 = q1, vr31 = q2
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr29, vr20, vr18, vr22, vr27, vr16, vr6, vr7, vr8, vr24, vr4 //high half: vr6 = q0, vr7 = q1, vr8 = q2
+
+ vpickev.b vr25, vr6, vr25 //q0_h
+ vpickev.b vr30, vr7, vr30 //q1_h
+ vpickev.b vr31, vr8, vr31 //q2_h
+
+ vbitsel.v vr0, vr0, vr25, vr10 //strong-filtered q0 in the selected columns
+ vbitsel.v vr1, vr1, vr30, vr10 //strong-filtered q1 in the selected columns
+ vbitsel.v vr13, vr13, vr31, vr10 //strong-filtered q2 in the selected columns
+ vstx vr1, a0, a1 //store q1 row
+ vstx vr13, a0, t0 //store q2 row
+.END_V_INTRA_LESS_BETA_SEC:
+ AVC_LPF_P0_OR_Q0 vr19, vr15, vr21, vr24, vr30 //weak q0 from the original widened q0, p1, q1 (low half)
+ AVC_LPF_P0_OR_Q0 vr20, vr16, vr22, vr25, vr30 //same for the high half
+ vpickev.b vr24, vr25, vr24
+ vbitsel.v vr0, vr0, vr24, vr23 //weak q0 only in the columns selected by vr23
+ vst vr0, a0, 0 //store q0 row
+.END_V_INTRA_8:
+ fld.d f24, sp, 0 //restore the registers saved on entry
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+function ff_h264_h_lpf_chroma_intra_8_lsx //intra chroma deblock across a vertical edge: a0 = pix, a1 = stride, a2 = alpha, a3 = beta; updates p0/q0 in 8 rows
+ addi.d t4, a0, -2 //2 pixels left of the edge (p1 column)
+ slli.d t0, a1, 1 //img_2x
+ slli.d t2, a1, 2 //img_4x
+ add.d t1, t0, a1 //img_3x
+
+ add.d t5, t4, t2 //row 4
+ fld.s f0, t4, 0 //row0
+ fldx.s f1, t4, a1 //row1
+ fldx.s f2, t4, t0 //row2
+ fldx.s f3, t4, t1 //row3
+ fld.s f4, t5, 0 //row4
+ fldx.s f5, t5, a1 //row5
+ fldx.s f6, t5, t0 //row6
+ fldx.s f7, t5, t1 //row7
+
+ vilvl.b vr8, vr2, vr0 //p1_org
+ vilvl.b vr9, vr3, vr1 //p0_org
+ vilvl.b vr10, vr6, vr4 //q0_org
+ vilvl.b vr11, vr7, vr5 //q1_org
+
+ vilvl.b vr0, vr9, vr8
+ vilvl.b vr1, vr11, vr10
+ vilvl.w vr2, vr1, vr0
+ vilvh.w vr3, vr1, vr0
+
+ vilvl.d vr8, vr2, vr2 //p1_org
+ vilvh.d vr9, vr2, vr2 //p0_org
+ vilvl.d vr10, vr3, vr3 //q0_org
+ vilvh.d vr11, vr3, vr3 //q1_org
+
+ vreplgr2vr.b vr0, a2 //alpha
+ vreplgr2vr.b vr1, a3 //beta
+
+ vabsd.bu vr2, vr9, vr10 //p0_asub_q0
+ vabsd.bu vr3, vr8, vr9 //p1_asub_p0
+ vabsd.bu vr4, vr11, vr10 //q1_asub_q0
+
+ vslt.bu vr5, vr2, vr0 //is_less_than_alpha
+ vslt.bu vr6, vr3, vr1 //is_less_than_beta
+ vand.v vr7, vr5, vr6 //is_less_than
+ vslt.bu vr6, vr4, vr1 //q1_asub_q0 < beta
+ vand.v vr7, vr7, vr6 //is_less_than: per-row filter mask
+
+ vsetnez.v $fcc0, vr7
+ bceqz $fcc0, .END_H_CHROMA_INTRA_8 //no row passes the test: skip the filter and the write-back
+
+ vexth.hu.bu vr12, vr8 //p1_org_h
+ vexth.hu.bu vr13, vr9 //p0_org_h
+ vexth.hu.bu vr14, vr10 //q0_org_h
+ vexth.hu.bu vr15, vr11 //q1_org_h
+
+ AVC_LPF_P0_OR_Q0 vr13, vr15, vr12, vr16, vr18 //vr16 = (p0 + q1 + 2 * p1 + 2) >> 2
+ AVC_LPF_P0_OR_Q0 vr14, vr12, vr15, vr17, vr18 //vr17 = (q0 + p1 + 2 * q1 + 2) >> 2
+
+ vpickev.b vr18, vr16, vr16 //pack filtered p0 to bytes
+ vpickev.b vr19, vr17, vr17 //pack filtered q0 to bytes
+ vbitsel.v vr9, vr9, vr18, vr7 //keep original p0 where the mask is clear
+ vbitsel.v vr10, vr10, vr19, vr7 //keep original q0 where the mask is clear
+ vilvl.b vr11, vr10, vr9 //one (p0, q0) byte pair per row
+ addi.d t4, t4, 1 //p0 column = pix - 1
+ vstelm.h vr11, t4, 0, 0 //store 2 bytes (p0, q0) per row
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 1
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 2
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 3
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 4
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 5
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 6
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 7
+.END_H_CHROMA_INTRA_8:
+endfunc
+
+function ff_h264_v_lpf_chroma_intra_8_lsx //intra chroma deblock across a horizontal edge: a0 = pix (q0 row), a1 = stride, a2 = alpha, a3 = beta; updates 8 columns of p0/q0
+ slli.d t0, a1, 1 //img_width_2x
+ sub.d t2, a0, a1 //p0 row = pix - stride
+ sub.d t1, a0, t0 //data - img_width_2x
+
+ vreplgr2vr.b vr0, a2 //alpha
+ vreplgr2vr.b vr1, a3 //beta
+
+ vld vr2, t1, 0 //p1_org
+ vldx vr3, t1, a1 //p0_org
+ vld vr4, a0, 0 //q0_org
+ vldx vr5, a0, a1 //q1_org
+
+ vabsd.bu vr6, vr3, vr4 //p0_asub_q0
+ vabsd.bu vr7, vr2, vr3 //p1_asub_p0
+ vabsd.bu vr8, vr5, vr4 //q1_asub_q0
+
+ vslt.bu vr9, vr6, vr0 //is_less_than_alpha
+ vslt.bu vr10, vr7, vr1 //is_less_than_beta
+ vand.v vr11, vr9, vr10 //is_less_than
+ vslt.bu vr10, vr8, vr1 //q1_asub_q0 < beta
+ vand.v vr11, vr10, vr11 //is_less_than: per-column filter mask
+
+ vsetnez.v $fcc0, vr11
+ bceqz $fcc0, .END_V_CHROMA_INTRA_8 //no column passes the test (upper 8 lanes are tested too, but only the lower 8 are stored)
+
+ vsllwil.hu.bu vr6, vr2, 0 //p1_org_h.0
+ vsllwil.hu.bu vr8, vr3, 0 //p0_org_h.0
+ vsllwil.hu.bu vr13, vr4, 0 //q0_org_h.0
+ vsllwil.hu.bu vr15, vr5, 0 //q1_org_h.0
+
+ AVC_LPF_P0_OR_Q0 vr8, vr15, vr6, vr17, vr23 //vr17 = (p0 + q1 + 2 * p1 + 2) >> 2
+ AVC_LPF_P0_OR_Q0 vr13, vr6, vr15, vr18, vr23 //vr18 = (q0 + p1 + 2 * q1 + 2) >> 2
+
+ vpickev.b vr19, vr17, vr17 //pack filtered p0 to bytes
+ vpickev.b vr20, vr18, vr18 //pack filtered q0 to bytes
+ vbitsel.v vr3, vr3, vr19, vr11 //keep original p0 where the mask is clear
+ vbitsel.v vr4, vr4, vr20, vr11 //keep original q0 where the mask is clear
+
+ vstelm.d vr3, t2, 0, 0 //store 8 bytes of p0
+ vstelm.d vr4, a0, 0, 0 //store 8 bytes of q0
+.END_V_CHROMA_INTRA_8:
+endfunc
+
+function ff_biweight_h264_pixels16_8_lsx //blend 16-byte rows of a1 into a0 with two weights; 8 rows, or 16 rows when a3 == 16
+ slli.d t0, a2, 1 //t0 = 2 * a2 (a2 is the row step for both pointers)
+ slli.d t2, a2, 2 //t2 = 4 * a2
+ add.d t1, t0, a2 //t1 = 3 * a2
+ addi.d a7, a7, 1
+ ori a7, a7, 1
+ sll.d a7, a7, a4 //a7 = ((a7 + 1) | 1) << a4
+ addi.d a4, a4, 1 //final right shift is a4 + 1
+//NOTE(review): a5/a6 are replicated as bytes and used as the signed .b operand below; a weight of 128 would wrap to -128 - confirm caller range
+ vreplgr2vr.b vr0, a6 //tmp0
+ vreplgr2vr.b vr1, a5 //tmp1
+ vreplgr2vr.h vr8, a7 //offset
+ vreplgr2vr.h vr9, a4 //denom
+ vilvh.b vr20, vr1, vr0 //wgt: even bytes = a6 (pairs with a1 pixels), odd bytes = a5 (pairs with a0 pixels)
+//load rows 0-7 from a1 (t4 = a1 + 4 rows)
+ add.d t4, a1, t2
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+//load rows 0-7 from a0 (t5 = a0 + 4 rows)
+ add.d t5, a0, t2
+ vld vr10, a0, 0
+ vldx vr11, a0, a2
+ vldx vr12, a0, t0
+ vldx vr13, a0, t1
+ vld vr14, t5, 0
+ vldx vr15, t5, a2
+ vldx vr16, t5, t0
+ vldx vr17, t5, t1
+//rows 0-3: interleave a1/a0 bytes; low halves go to vr18/19/21/22, high halves back into vr0-vr3
+ vilvl.b vr18, vr10, vr0
+ vilvl.b vr19, vr11, vr1
+ vilvl.b vr21, vr12, vr2
+ vilvl.b vr22, vr13, vr3
+ vilvh.b vr0, vr10, vr0
+ vilvh.b vr1, vr11, vr1
+ vilvh.b vr2, vr12, vr2
+ vilvh.b vr3, vr13, vr3
+//rows 4-7: low halves go to vr10-vr13, high halves to vr14-vr17
+ vilvl.b vr10, vr14, vr4
+ vilvl.b vr11, vr15, vr5
+ vilvl.b vr12, vr16, vr6
+ vilvl.b vr13, vr17, vr7
+ vilvh.b vr14, vr14, vr4
+ vilvh.b vr15, vr15, vr5
+ vilvh.b vr16, vr16, vr6
+ vilvh.b vr17, vr17, vr7
+//every accumulator starts at the offset (vr8), then adds the even-byte and odd-byte products with vr20
+ vmov vr4, vr8
+ vmov vr5, vr8
+ vmov vr6, vr8
+ vmov vr7, vr8
+ vmaddwev.h.bu.b vr4, vr18, vr20
+ vmaddwev.h.bu.b vr5, vr19, vr20
+ vmaddwev.h.bu.b vr6, vr21, vr20
+ vmaddwev.h.bu.b vr7, vr22, vr20
+ vmaddwod.h.bu.b vr4, vr18, vr20
+ vmaddwod.h.bu.b vr5, vr19, vr20
+ vmaddwod.h.bu.b vr6, vr21, vr20
+ vmaddwod.h.bu.b vr7, vr22, vr20
+ vmov vr18, vr8
+ vmov vr19, vr8
+ vmov vr21, vr8
+ vmov vr22, vr8
+ vmaddwev.h.bu.b vr18, vr0, vr20
+ vmaddwev.h.bu.b vr19, vr1, vr20
+ vmaddwev.h.bu.b vr21, vr2, vr20
+ vmaddwev.h.bu.b vr22, vr3, vr20
+ vmaddwod.h.bu.b vr18, vr0, vr20
+ vmaddwod.h.bu.b vr19, vr1, vr20
+ vmaddwod.h.bu.b vr21, vr2, vr20
+ vmaddwod.h.bu.b vr22, vr3, vr20
+ vmov vr0, vr8
+ vmov vr1, vr8
+ vmov vr2, vr8
+ vmov vr3, vr8
+ vmaddwev.h.bu.b vr0, vr10, vr20
+ vmaddwev.h.bu.b vr1, vr11, vr20
+ vmaddwev.h.bu.b vr2, vr12, vr20
+ vmaddwev.h.bu.b vr3, vr13, vr20
+ vmaddwod.h.bu.b vr0, vr10, vr20
+ vmaddwod.h.bu.b vr1, vr11, vr20
+ vmaddwod.h.bu.b vr2, vr12, vr20
+ vmaddwod.h.bu.b vr3, vr13, vr20
+ vmov vr10, vr8
+ vmov vr11, vr8
+ vmov vr12, vr8
+ vmov vr13, vr8
+ vmaddwev.h.bu.b vr10, vr14, vr20
+ vmaddwev.h.bu.b vr11, vr15, vr20
+ vmaddwev.h.bu.b vr12, vr16, vr20
+ vmaddwev.h.bu.b vr13, vr17, vr20
+ vmaddwod.h.bu.b vr10, vr14, vr20
+ vmaddwod.h.bu.b vr11, vr15, vr20
+ vmaddwod.h.bu.b vr12, vr16, vr20
+ vmaddwod.h.bu.b vr13, vr17, vr20
+//arithmetic shift right by vr9, saturate to unsigned bytes; 8 result bytes land in the low half
+ vssran.bu.h vr4, vr4, vr9
+ vssran.bu.h vr5, vr5, vr9
+ vssran.bu.h vr6, vr6, vr9
+ vssran.bu.h vr7, vr7, vr9
+ vssran.bu.h vr18, vr18, vr9
+ vssran.bu.h vr19, vr19, vr9
+ vssran.bu.h vr21, vr21, vr9
+ vssran.bu.h vr22, vr22, vr9
+ vssran.bu.h vr0, vr0, vr9
+ vssran.bu.h vr1, vr1, vr9
+ vssran.bu.h vr2, vr2, vr9
+ vssran.bu.h vr3, vr3, vr9
+ vssran.bu.h vr10, vr10, vr9
+ vssran.bu.h vr11, vr11, vr9
+ vssran.bu.h vr12, vr12, vr9
+ vssran.bu.h vr13, vr13, vr9
+//join the two 8-byte halves of each row into one 16-byte row
+ vilvl.d vr4, vr18, vr4
+ vilvl.d vr5, vr19, vr5
+ vilvl.d vr6, vr21, vr6
+ vilvl.d vr7, vr22, vr7
+ vilvl.d vr0, vr10, vr0
+ vilvl.d vr1, vr11, vr1
+ vilvl.d vr2, vr12, vr2
+ vilvl.d vr3, vr13, vr3
+//store rows 0-7 to a0
+ vst vr4, a0, 0
+ vstx vr5, a0, a2
+ vstx vr6, a0, t0
+ vstx vr7, a0, t1
+ vst vr0, t5, 0
+ vstx vr1, t5, a2
+ vstx vr2, t5, t0
+ vstx vr3, t5, t1
+//rows 8-15 are processed only when a3 == 16
+ addi.d t6, zero, 16
+ bne a3, t6, .END_BIWEIGHT_PIXELS16
+ add.d t4, t4, t2 //t4 = a1 + 8 rows
+ add.d t5, t5, t2 //t5 = a0 + 8 rows
+ vld vr0, t4, 0
+ vldx vr1, t4, a2
+ vldx vr2, t4, t0
+ vldx vr3, t4, t1
+ add.d t6, t4, t2 //t6 = a1 + 12 rows
+ add.d t7, t5, t2 //t7 = a0 + 12 rows
+ vld vr4, t6, 0
+ vldx vr5, t6, a2
+ vldx vr6, t6, t0
+ vldx vr7, t6, t1
+
+ vld vr10, t5, 0
+ vldx vr11, t5, a2
+ vldx vr12, t5, t0
+ vldx vr13, t5, t1
+ vld vr14, t7, 0
+ vldx vr15, t7, a2
+ vldx vr16, t7, t0
+ vldx vr17, t7, t1
+//same interleave / multiply-accumulate / narrow sequence as for rows 0-7
+ vilvl.b vr18, vr10, vr0
+ vilvl.b vr19, vr11, vr1
+ vilvl.b vr21, vr12, vr2
+ vilvl.b vr22, vr13, vr3
+ vilvh.b vr0, vr10, vr0
+ vilvh.b vr1, vr11, vr1
+ vilvh.b vr2, vr12, vr2
+ vilvh.b vr3, vr13, vr3
+
+ vilvl.b vr10, vr14, vr4
+ vilvl.b vr11, vr15, vr5
+ vilvl.b vr12, vr16, vr6
+ vilvl.b vr13, vr17, vr7
+ vilvh.b vr14, vr14, vr4
+ vilvh.b vr15, vr15, vr5
+ vilvh.b vr16, vr16, vr6
+ vilvh.b vr17, vr17, vr7
+
+ vmov vr4, vr8
+ vmov vr5, vr8
+ vmov vr6, vr8
+ vmov vr7, vr8
+ vmaddwev.h.bu.b vr4, vr18, vr20
+ vmaddwev.h.bu.b vr5, vr19, vr20
+ vmaddwev.h.bu.b vr6, vr21, vr20
+ vmaddwev.h.bu.b vr7, vr22, vr20
+ vmaddwod.h.bu.b vr4, vr18, vr20
+ vmaddwod.h.bu.b vr5, vr19, vr20
+ vmaddwod.h.bu.b vr6, vr21, vr20
+ vmaddwod.h.bu.b vr7, vr22, vr20
+ vmov vr18, vr8
+ vmov vr19, vr8
+ vmov vr21, vr8
+ vmov vr22, vr8
+ vmaddwev.h.bu.b vr18, vr0, vr20
+ vmaddwev.h.bu.b vr19, vr1, vr20
+ vmaddwev.h.bu.b vr21, vr2, vr20
+ vmaddwev.h.bu.b vr22, vr3, vr20
+ vmaddwod.h.bu.b vr18, vr0, vr20
+ vmaddwod.h.bu.b vr19, vr1, vr20
+ vmaddwod.h.bu.b vr21, vr2, vr20
+ vmaddwod.h.bu.b vr22, vr3, vr20
+ vmov vr0, vr8
+ vmov vr1, vr8
+ vmov vr2, vr8
+ vmov vr3, vr8
+ vmaddwev.h.bu.b vr0, vr10, vr20
+ vmaddwev.h.bu.b vr1, vr11, vr20
+ vmaddwev.h.bu.b vr2, vr12, vr20
+ vmaddwev.h.bu.b vr3, vr13, vr20
+ vmaddwod.h.bu.b vr0, vr10, vr20
+ vmaddwod.h.bu.b vr1, vr11, vr20
+ vmaddwod.h.bu.b vr2, vr12, vr20
+ vmaddwod.h.bu.b vr3, vr13, vr20
+ vmov vr10, vr8
+ vmov vr11, vr8
+ vmov vr12, vr8
+ vmov vr13, vr8
+ vmaddwev.h.bu.b vr10, vr14, vr20
+ vmaddwev.h.bu.b vr11, vr15, vr20
+ vmaddwev.h.bu.b vr12, vr16, vr20
+ vmaddwev.h.bu.b vr13, vr17, vr20
+ vmaddwod.h.bu.b vr10, vr14, vr20
+ vmaddwod.h.bu.b vr11, vr15, vr20
+ vmaddwod.h.bu.b vr12, vr16, vr20
+ vmaddwod.h.bu.b vr13, vr17, vr20
+
+ vssran.bu.h vr4, vr4, vr9
+ vssran.bu.h vr5, vr5, vr9
+ vssran.bu.h vr6, vr6, vr9
+ vssran.bu.h vr7, vr7, vr9
+ vssran.bu.h vr18, vr18, vr9
+ vssran.bu.h vr19, vr19, vr9
+ vssran.bu.h vr21, vr21, vr9
+ vssran.bu.h vr22, vr22, vr9
+ vssran.bu.h vr0, vr0, vr9
+ vssran.bu.h vr1, vr1, vr9
+ vssran.bu.h vr2, vr2, vr9
+ vssran.bu.h vr3, vr3, vr9
+ vssran.bu.h vr10, vr10, vr9
+ vssran.bu.h vr11, vr11, vr9
+ vssran.bu.h vr12, vr12, vr9
+ vssran.bu.h vr13, vr13, vr9
+
+ vilvl.d vr4, vr18, vr4
+ vilvl.d vr5, vr19, vr5
+ vilvl.d vr6, vr21, vr6
+ vilvl.d vr7, vr22, vr7
+ vilvl.d vr0, vr10, vr0
+ vilvl.d vr1, vr11, vr1
+ vilvl.d vr2, vr12, vr2
+ vilvl.d vr3, vr13, vr3
+//store rows 8-15 to a0
+ vst vr4, t5, 0
+ vstx vr5, t5, a2
+ vstx vr6, t5, t0
+ vstx vr7, t5, t1
+ vst vr0, t7, 0
+ vstx vr1, t7, a2
+ vstx vr2, t7, t0
+ vstx vr3, t7, t1
+.END_BIWEIGHT_PIXELS16:
+endfunc
+
+function ff_biweight_h264_pixels16_8_lasx //256-bit variant: blend 16-byte rows of a1 into a0, two rows per register; 8 rows, or 16 when a3 == 16
+ slli.d t0, a2, 1 //t0 = 2 * a2 (a2 is the row step for both pointers)
+ slli.d t2, a2, 2 //t2 = 4 * a2
+ add.d t1, t0, a2 //t1 = 3 * a2
+ addi.d a7, a7, 1
+ ori a7, a7, 1
+ sll.d a7, a7, a4 //a7 = ((a7 + 1) | 1) << a4
+ addi.d a4, a4, 1 //final right shift is a4 + 1
+//NOTE(review): a5/a6 are replicated as bytes and used as the signed .b operand below; a weight of 128 would wrap to -128 - confirm caller range
+ xvreplgr2vr.b xr0, a6 //tmp0
+ xvreplgr2vr.b xr1, a5 //tmp1
+ xvreplgr2vr.h xr8, a7 //offset
+ xvreplgr2vr.h xr9, a4 //denom
+ xvilvh.b xr20, xr1, xr0 //wgt: even bytes = a6 (pairs with a1 pixels), odd bytes = a5 (pairs with a0 pixels)
+//load rows 0-7 from a1 (t4 = a1 + 4 rows)
+ add.d t4, a1, t2
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+//load rows 0-7 from a0 (t5 = a0 + 4 rows)
+ add.d t5, a0, t2
+ vld vr10, a0, 0
+ vldx vr11, a0, a2
+ vldx vr12, a0, t0
+ vldx vr13, a0, t1
+ vld vr14, t5, 0
+ vldx vr15, t5, a2
+ vldx vr16, t5, t0
+ vldx vr17, t5, t1
+//pack two rows per 256-bit register: even row in the low 128 bits, odd row in the high 128 bits
+ xvpermi.q xr1, xr0, 0x20
+ xvpermi.q xr3, xr2, 0x20
+ xvpermi.q xr5, xr4, 0x20
+ xvpermi.q xr7, xr6, 0x20
+
+ xvpermi.q xr11, xr10, 0x20
+ xvpermi.q xr13, xr12, 0x20
+ xvpermi.q xr15, xr14, 0x20
+ xvpermi.q xr17, xr16, 0x20
+//interleave a1/a0 bytes of pixels 0-7 of each row pair
+ xvilvl.b xr0, xr11, xr1 //vec0
+ xvilvl.b xr2, xr13, xr3 //vec2
+ xvilvl.b xr4, xr15, xr5 //vec4
+ xvilvl.b xr6, xr17, xr7 //vec6
+//interleave a1/a0 bytes of pixels 8-15 of each row pair
+ xvilvh.b xr10, xr11, xr1 //vec1
+ xvilvh.b xr12, xr13, xr3 //vec3
+ xvilvh.b xr14, xr15, xr5 //vec5
+ xvilvh.b xr16, xr17, xr7 //vec7
+//every accumulator starts at the offset (xr8), then adds the even-byte and odd-byte products with xr20
+ xmov xr1, xr8
+ xmov xr3, xr8
+ xmov xr5, xr8
+ xmov xr7, xr8
+ xvmaddwev.h.bu.b xr1, xr0, xr20
+ xvmaddwev.h.bu.b xr3, xr2, xr20
+ xvmaddwev.h.bu.b xr5, xr4, xr20
+ xvmaddwev.h.bu.b xr7, xr6, xr20
+ xvmaddwod.h.bu.b xr1, xr0, xr20
+ xvmaddwod.h.bu.b xr3, xr2, xr20
+ xvmaddwod.h.bu.b xr5, xr4, xr20
+ xvmaddwod.h.bu.b xr7, xr6, xr20
+ xmov xr11, xr8
+ xmov xr13, xr8
+ xmov xr15, xr8
+ xmov xr17, xr8
+ xvmaddwev.h.bu.b xr11, xr10, xr20
+ xvmaddwev.h.bu.b xr13, xr12, xr20
+ xvmaddwev.h.bu.b xr15, xr14, xr20
+ xvmaddwev.h.bu.b xr17, xr16, xr20
+ xvmaddwod.h.bu.b xr11, xr10, xr20
+ xvmaddwod.h.bu.b xr13, xr12, xr20
+ xvmaddwod.h.bu.b xr15, xr14, xr20
+ xvmaddwod.h.bu.b xr17, xr16, xr20
+//arithmetic shift right by xr9, saturate to unsigned bytes (per 128-bit lane)
+ xvssran.bu.h xr1, xr1, xr9 //vec0
+ xvssran.bu.h xr3, xr3, xr9 //vec2
+ xvssran.bu.h xr5, xr5, xr9 //vec4
+ xvssran.bu.h xr7, xr7, xr9 //vec6
+ xvssran.bu.h xr11, xr11, xr9 //vec1
+ xvssran.bu.h xr13, xr13, xr9 //vec3
+ xvssran.bu.h xr15, xr15, xr9 //vec5
+ xvssran.bu.h xr17, xr17, xr9 //vec7
+//join pixels 0-7 and 8-15 so each 128-bit lane holds one full row
+ xvilvl.d xr0, xr11, xr1
+ xvilvl.d xr2, xr13, xr3
+ xvilvl.d xr4, xr15, xr5
+ xvilvl.d xr6, xr17, xr7
+//0x4E swaps the 128-bit halves, bringing the odd row into the low half for the 128-bit stores
+ xvpermi.d xr1, xr0, 0x4E
+ xvpermi.d xr3, xr2, 0x4E
+ xvpermi.d xr5, xr4, 0x4E
+ xvpermi.d xr7, xr6, 0x4E
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ vstx vr2, a0, t0
+ vstx vr3, a0, t1
+ vst vr4, t5, 0
+ vstx vr5, t5, a2
+ vstx vr6, t5, t0
+ vstx vr7, t5, t1
+//rows 8-15 are processed only when a3 == 16
+ addi.d t6, zero, 16
+ bne a3, t6, .END_BIWEIGHT_PIXELS16_LASX
+ add.d t4, t4, t2 //t4 = a1 + 8 rows
+ add.d t5, t5, t2 //t5 = a0 + 8 rows
+ vld vr0, t4, 0
+ vldx vr1, t4, a2
+ vldx vr2, t4, t0
+ vldx vr3, t4, t1
+ add.d t6, t4, t2 //t6 = a1 + 12 rows
+ add.d t7, t5, t2 //t7 = a0 + 12 rows
+ vld vr4, t6, 0
+ vldx vr5, t6, a2
+ vldx vr6, t6, t0
+ vldx vr7, t6, t1
+
+ vld vr10, t5, 0
+ vldx vr11, t5, a2
+ vldx vr12, t5, t0
+ vldx vr13, t5, t1
+ vld vr14, t7, 0
+ vldx vr15, t7, a2
+ vldx vr16, t7, t0
+ vldx vr17, t7, t1
+//same pack / interleave / multiply-accumulate / narrow sequence as for rows 0-7
+ xvpermi.q xr1, xr0, 0x20
+ xvpermi.q xr3, xr2, 0x20
+ xvpermi.q xr5, xr4, 0x20
+ xvpermi.q xr7, xr6, 0x20
+
+ xvpermi.q xr11, xr10, 0x20
+ xvpermi.q xr13, xr12, 0x20
+ xvpermi.q xr15, xr14, 0x20
+ xvpermi.q xr17, xr16, 0x20
+
+ xvilvl.b xr0, xr11, xr1 //vec0
+ xvilvl.b xr2, xr13, xr3 //vec2
+ xvilvl.b xr4, xr15, xr5 //vec4
+ xvilvl.b xr6, xr17, xr7 //vec6
+
+ xvilvh.b xr10, xr11, xr1 //vec1
+ xvilvh.b xr12, xr13, xr3 //vec3
+ xvilvh.b xr14, xr15, xr5 //vec5
+ xvilvh.b xr16, xr17, xr7 //vec7
+
+ xmov xr1, xr8
+ xmov xr3, xr8
+ xmov xr5, xr8
+ xmov xr7, xr8
+ xvmaddwev.h.bu.b xr1, xr0, xr20
+ xvmaddwev.h.bu.b xr3, xr2, xr20
+ xvmaddwev.h.bu.b xr5, xr4, xr20
+ xvmaddwev.h.bu.b xr7, xr6, xr20
+ xvmaddwod.h.bu.b xr1, xr0, xr20
+ xvmaddwod.h.bu.b xr3, xr2, xr20
+ xvmaddwod.h.bu.b xr5, xr4, xr20
+ xvmaddwod.h.bu.b xr7, xr6, xr20
+ xmov xr11, xr8
+ xmov xr13, xr8
+ xmov xr15, xr8
+ xmov xr17, xr8
+ xvmaddwev.h.bu.b xr11, xr10, xr20
+ xvmaddwev.h.bu.b xr13, xr12, xr20
+ xvmaddwev.h.bu.b xr15, xr14, xr20
+ xvmaddwev.h.bu.b xr17, xr16, xr20
+ xvmaddwod.h.bu.b xr11, xr10, xr20
+ xvmaddwod.h.bu.b xr13, xr12, xr20
+ xvmaddwod.h.bu.b xr15, xr14, xr20
+ xvmaddwod.h.bu.b xr17, xr16, xr20
+
+ xvssran.bu.h xr1, xr1, xr9 //vec0
+ xvssran.bu.h xr3, xr3, xr9 //vec2
+ xvssran.bu.h xr5, xr5, xr9 //vec4
+ xvssran.bu.h xr7, xr7, xr9 //vec6
+ xvssran.bu.h xr11, xr11, xr9 //vec1
+ xvssran.bu.h xr13, xr13, xr9 //vec3
+ xvssran.bu.h xr15, xr15, xr9 //vec5
+ xvssran.bu.h xr17, xr17, xr9 //vec7
+
+ xvilvl.d xr0, xr11, xr1
+ xvilvl.d xr2, xr13, xr3
+ xvilvl.d xr4, xr15, xr5
+ xvilvl.d xr6, xr17, xr7
+
+ xvpermi.d xr1, xr0, 0x4E
+ xvpermi.d xr3, xr2, 0x4E
+ xvpermi.d xr5, xr4, 0x4E
+ xvpermi.d xr7, xr6, 0x4E
+//store rows 8-15 to a0
+ vst vr0, t5, 0
+ vstx vr1, t5, a2
+ vstx vr2, t5, t0
+ vstx vr3, t5, t1
+ vst vr4, t7, 0
+ vstx vr5, t7, a2
+ vstx vr6, t7, t0
+ vstx vr7, t7, t1
+.END_BIWEIGHT_PIXELS16_LASX:
+endfunc
+
+function ff_biweight_h264_pixels8_8_lsx //blend 8-byte rows of a1 into a0 with two weights; 4 rows if a3 < 8, 8 rows if a3 < 16, else 16 rows
+ slli.d t0, a2, 1 //t0 = 2 * a2 (a2 is the row step for both pointers)
+ slli.d t2, a2, 2 //t2 = 4 * a2
+ add.d t1, t0, a2 //t1 = 3 * a2
+ addi.d a7, a7, 1
+ ori a7, a7, 1
+ sll.d a7, a7, a4 //a7 = ((a7 + 1) | 1) << a4
+ addi.d a4, a4, 1 //final right shift is a4 + 1
+ addi.d t3, zero, 8 //first row-count threshold
+//NOTE(review): a5/a6 are replicated as bytes and used as the signed .b operand below; a weight of 128 would wrap to -128 - confirm caller range
+ vreplgr2vr.b vr0, a6 //tmp0
+ vreplgr2vr.b vr1, a5 //tmp1
+ vreplgr2vr.h vr8, a7 //offset
+ vreplgr2vr.h vr9, a4 //denom
+ vilvh.b vr20, vr1, vr0 //wgt: even bytes = a6 (pairs with a1 pixels), odd bytes = a5 (pairs with a0 pixels)
+//rows 0-3: 8 bytes per row from a1 (f0-f3) and a0 (f10-f13)
+ fld.d f0, a1, 0 //src0
+ fldx.d f1, a1, a2 //src1
+ fldx.d f2, a1, t0 //src2
+ fldx.d f3, a1, t1 //src3
+ fld.d f10, a0, 0 //src10
+ fldx.d f11, a0, a2 //src11
+ fldx.d f12, a0, t0 //src12
+ fldx.d f13, a0, t1 //src13
+//pack two 8-byte rows into each 128-bit register
+ vilvl.d vr4, vr1, vr0 //src0
+ vilvl.d vr5, vr3, vr2 //src2
+ vilvl.d vr6, vr11, vr10 //dst0
+ vilvl.d vr7, vr13, vr12 //dst2
+//interleave a1/a0 bytes: one full row of byte pairs per register
+ vilvl.b vr0, vr6, vr4 //vec0.0
+ vilvh.b vr1, vr6, vr4 //vec0.1
+ vilvl.b vr2, vr7, vr5 //vec1.0
+ vilvh.b vr3, vr7, vr5 //vec1.1
+//every accumulator starts at the offset (vr8), then adds the even-byte and odd-byte products with vr20
+ vmov vr4, vr8
+ vmov vr5, vr8
+ vmov vr6, vr8
+ vmov vr7, vr8
+ vmaddwev.h.bu.b vr4, vr0, vr20
+ vmaddwev.h.bu.b vr5, vr1, vr20
+ vmaddwev.h.bu.b vr6, vr2, vr20
+ vmaddwev.h.bu.b vr7, vr3, vr20
+ vmaddwod.h.bu.b vr4, vr0, vr20
+ vmaddwod.h.bu.b vr5, vr1, vr20
+ vmaddwod.h.bu.b vr6, vr2, vr20
+ vmaddwod.h.bu.b vr7, vr3, vr20
+//arithmetic shift right by vr9, saturate to unsigned bytes; 8 result bytes land in the low half
+ vssran.bu.h vr0, vr4, vr9 //vec0
+ vssran.bu.h vr1, vr5, vr9 //vec0
+ vssran.bu.h vr2, vr6, vr9 //vec1
+ vssran.bu.h vr3, vr7, vr9 //vec1
+//vr4 = rows 0|1, vr6 = rows 2|3
+ vilvl.d vr4, vr1, vr0
+ vilvl.d vr6, vr3, vr2
+//shift by 8 bytes so the odd rows are in the low 64 bits for fst.d
+ vbsrl.v vr5, vr4, 8
+ vbsrl.v vr7, vr6, 8
+
+ fst.d f4, a0, 0
+ fstx.d f5, a0, a2
+ fstx.d f6, a0, t0
+ fstx.d f7, a0, t1
+//stop after 4 rows when a3 < 8
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS8
+ addi.d t3, zero, 16 //next row-count threshold
+ add.d a1, a1, t2 //advance both pointers by 4 rows
+ add.d a0, a0, t2
+//rows 4-7: same sequence as rows 0-3
+ fld.d f0, a1, 0 //src0
+ fldx.d f1, a1, a2 //src1
+ fldx.d f2, a1, t0 //src2
+ fldx.d f3, a1, t1 //src3
+ fld.d f10, a0, 0 //src10
+ fldx.d f11, a0, a2 //src11
+ fldx.d f12, a0, t0 //src12
+ fldx.d f13, a0, t1 //src13
+
+ vilvl.d vr4, vr1, vr0 //src0
+ vilvl.d vr5, vr3, vr2 //src2
+ vilvl.d vr6, vr11, vr10 //dst0
+ vilvl.d vr7, vr13, vr12 //dst2
+
+ vilvl.b vr0, vr6, vr4 //vec0.0
+ vilvh.b vr1, vr6, vr4 //vec0.1
+ vilvl.b vr2, vr7, vr5 //vec1.0
+ vilvh.b vr3, vr7, vr5 //vec1.1
+
+ vmov vr4, vr8
+ vmov vr5, vr8
+ vmov vr6, vr8
+ vmov vr7, vr8
+ vmaddwev.h.bu.b vr4, vr0, vr20
+ vmaddwev.h.bu.b vr5, vr1, vr20
+ vmaddwev.h.bu.b vr6, vr2, vr20
+ vmaddwev.h.bu.b vr7, vr3, vr20
+ vmaddwod.h.bu.b vr4, vr0, vr20
+ vmaddwod.h.bu.b vr5, vr1, vr20
+ vmaddwod.h.bu.b vr6, vr2, vr20
+ vmaddwod.h.bu.b vr7, vr3, vr20
+
+ vssran.bu.h vr0, vr4, vr9 //vec0
+ vssran.bu.h vr1, vr5, vr9 //vec0
+ vssran.bu.h vr2, vr6, vr9 //vec1
+ vssran.bu.h vr3, vr7, vr9 //vec1
+
+ vilvl.d vr4, vr1, vr0
+ vilvl.d vr6, vr3, vr2
+
+ vbsrl.v vr5, vr4, 8
+ vbsrl.v vr7, vr6, 8
+
+ fst.d f4, a0, 0
+ fstx.d f5, a0, a2
+ fstx.d f6, a0, t0
+ fstx.d f7, a0, t1
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS8 //stop after 8 rows when a3 < 16
+ add.d a1, a1, t2 //a1/a0 -> row 8
+ add.d a0, a0, t2
+ add.d t4, a1, t2 //t4/t5 -> row 12
+ add.d t5, a0, t2
+//rows 8-15: eight rows handled in one pass
+ fld.d f0, a1, 0 //src0
+ fldx.d f1, a1, a2 //src1
+ fldx.d f2, a1, t0 //src2
+ fldx.d f3, a1, t1 //src3
+ fld.d f4, t4, 0 //src4
+ fldx.d f5, t4, a2 //src5
+ fldx.d f6, t4, t0 //src6
+ fldx.d f7, t4, t1 //src7
+ fld.d f10, a0, 0 //src10
+ fldx.d f11, a0, a2 //src11
+ fldx.d f12, a0, t0 //src12
+ fldx.d f13, a0, t1 //src13
+ fld.d f14, t5, 0 //src14
+ fldx.d f15, t5, a2 //src15
+ fldx.d f16, t5, t0 //src16
+ fldx.d f17, t5, t1 //src17
+
+ vilvl.d vr0, vr1, vr0 //src0
+ vilvl.d vr2, vr3, vr2 //src2
+ vilvl.d vr4, vr5, vr4 //src4
+ vilvl.d vr6, vr7, vr6 //src6
+ vilvl.d vr10, vr11, vr10 //dst0
+ vilvl.d vr12, vr13, vr12 //dst2
+ vilvl.d vr14, vr15, vr14 //dst4
+ vilvl.d vr16, vr17, vr16 //dst6
+
+ vilvl.b vr1, vr10, vr0 //vec0.0
+ vilvh.b vr3, vr10, vr0 //vec0.1
+ vilvl.b vr5, vr12, vr2 //vec2.0
+ vilvh.b vr7, vr12, vr2 //vec2.1
+ vilvl.b vr11, vr14, vr4 //vec4.0
+ vilvh.b vr13, vr14, vr4 //vec4.1
+ vilvl.b vr15, vr16, vr6 //vec6.0
+ vilvh.b vr17, vr16, vr6 //vec6.1
+
+ vmov vr0, vr8
+ vmov vr2, vr8
+ vmov vr4, vr8
+ vmov vr6, vr8
+ vmaddwev.h.bu.b vr0, vr1, vr20
+ vmaddwev.h.bu.b vr2, vr3, vr20
+ vmaddwev.h.bu.b vr4, vr5, vr20
+ vmaddwev.h.bu.b vr6, vr7, vr20
+ vmaddwod.h.bu.b vr0, vr1, vr20
+ vmaddwod.h.bu.b vr2, vr3, vr20
+ vmaddwod.h.bu.b vr4, vr5, vr20
+ vmaddwod.h.bu.b vr6, vr7, vr20
+
+ vmov vr10, vr8
+ vmov vr12, vr8
+ vmov vr14, vr8
+ vmov vr16, vr8
+
+ vmaddwev.h.bu.b vr10, vr11, vr20
+ vmaddwev.h.bu.b vr12, vr13, vr20
+ vmaddwev.h.bu.b vr14, vr15, vr20
+ vmaddwev.h.bu.b vr16, vr17, vr20
+ vmaddwod.h.bu.b vr10, vr11, vr20
+ vmaddwod.h.bu.b vr12, vr13, vr20
+ vmaddwod.h.bu.b vr14, vr15, vr20
+ vmaddwod.h.bu.b vr16, vr17, vr20
+
+ vssran.bu.h vr1, vr0, vr9 //vec0
+ vssran.bu.h vr3, vr2, vr9 //vec0
+ vssran.bu.h vr5, vr4, vr9 //vec2
+ vssran.bu.h vr7, vr6, vr9 //vec2
+
+ vssran.bu.h vr11, vr10, vr9 //vec4
+ vssran.bu.h vr13, vr12, vr9 //vec4
+ vssran.bu.h vr15, vr14, vr9 //vec6
+ vssran.bu.h vr17, vr16, vr9 //vec6
+//vr0 = rows 8|9, vr2 = rows 10|11, vr10 = rows 12|13, vr12 = rows 14|15
+ vilvl.d vr0, vr3, vr1
+ vilvl.d vr2, vr7, vr5
+ vilvl.d vr10, vr13, vr11
+ vilvl.d vr12, vr17, vr15
+
+ vbsrl.v vr1, vr0, 8
+ vbsrl.v vr3, vr2, 8
+ vbsrl.v vr11, vr10, 8
+ vbsrl.v vr13, vr12, 8
+
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a2
+ fstx.d f2, a0, t0
+ fstx.d f3, a0, t1
+ fst.d f10, t5, 0
+ fstx.d f11, t5, a2
+ fstx.d f12, t5, t0
+ fstx.d f13, t5, t1
+.END_BIWEIGHT_H264_PIXELS8:
+endfunc
+
+function ff_biweight_h264_pixels8_8_lasx //256-bit variant: blend 8-byte rows of a1 into a0; 4 rows if a3 < 8, 8 rows if a3 < 16, else 16 rows
+ slli.d t0, a2, 1 //t0 = 2 * a2 (a2 is the row step for both pointers)
+ slli.d t2, a2, 2 //t2 = 4 * a2
+ add.d t1, t0, a2 //t1 = 3 * a2
+ addi.d a7, a7, 1
+ ori a7, a7, 1
+ sll.d a7, a7, a4 //a7 = ((a7 + 1) | 1) << a4
+ addi.d a4, a4, 1 //final right shift is a4 + 1
+ addi.d t3, zero, 8 //first row-count threshold
+//NOTE(review): a5/a6 are replicated as bytes and used as the signed .b operand below; a weight of 128 would wrap to -128 - confirm caller range
+ xvreplgr2vr.b xr0, a6 //tmp0
+ xvreplgr2vr.b xr1, a5 //tmp1
+ xvreplgr2vr.h xr8, a7 //offset
+ xvreplgr2vr.h xr9, a4 //denom
+ xvilvh.b xr20, xr1, xr0 //wgt: even bytes = a6 (pairs with a1 pixels), odd bytes = a5 (pairs with a0 pixels)
+//rows 0-3: 8 bytes per row from a1 (f0-f3) and a0 (f10-f13)
+ fld.d f0, a1, 0 //src0
+ fldx.d f1, a1, a2 //src1
+ fldx.d f2, a1, t0 //src2
+ fldx.d f3, a1, t1 //src3
+ fld.d f10, a0, 0 //src10
+ fldx.d f11, a0, a2 //src11
+ fldx.d f12, a0, t0 //src12
+ fldx.d f13, a0, t1 //src13
+//pack two 8-byte rows into each 128-bit register
+ vilvl.d vr4, vr1, vr0 //src0
+ vilvl.d vr5, vr3, vr2 //src2
+ vilvl.d vr6, vr11, vr10 //dst0
+ vilvl.d vr7, vr13, vr12 //dst2
+//pack four rows per 256-bit register: rows 0|1 in the low 128 bits, rows 2|3 in the high 128 bits
+ xvpermi.q xr5, xr4, 0x20
+ xvpermi.q xr7, xr6, 0x20
+//interleave a1/a0 bytes: xr0 = rows 0 and 2, xr1 = rows 1 and 3
+ xvilvl.b xr0, xr7, xr5
+ xvilvh.b xr1, xr7, xr5
+//accumulators start at the offset (xr8), then add the even-byte and odd-byte products with xr20
+ xmov xr2, xr8
+ xmov xr3, xr8
+ xvmaddwev.h.bu.b xr2, xr0, xr20
+ xvmaddwev.h.bu.b xr3, xr1, xr20
+ xvmaddwod.h.bu.b xr2, xr0, xr20
+ xvmaddwod.h.bu.b xr3, xr1, xr20
+//arithmetic shift right by xr9, saturate to unsigned bytes (per 128-bit lane)
+ xvssran.bu.h xr4, xr2, xr9 //vec0
+ xvssran.bu.h xr5, xr3, xr9 //vec2
+//xr0 low lane = rows 0|1, high lane = rows 2|3; the permute and byte shifts put each row in a low 64-bit slot
+ xvilvl.d xr0, xr5, xr4
+ xvpermi.d xr2, xr0, 0x4E
+ vbsrl.v vr1, vr0, 8
+ vbsrl.v vr3, vr2, 8
+
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a2
+ fstx.d f2, a0, t0
+ fstx.d f3, a0, t1
+//stop after 4 rows when a3 < 8
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS8_LASX
+ addi.d t3, zero, 16 //next row-count threshold
+ add.d a1, a1, t2 //advance both pointers by 4 rows
+ add.d a0, a0, t2
+//rows 4-7: same sequence as rows 0-3
+ fld.d f0, a1, 0 //src0
+ fldx.d f1, a1, a2 //src1
+ fldx.d f2, a1, t0 //src2
+ fldx.d f3, a1, t1 //src3
+ fld.d f10, a0, 0 //src10
+ fldx.d f11, a0, a2 //src11
+ fldx.d f12, a0, t0 //src12
+ fldx.d f13, a0, t1 //src13
+
+ vilvl.d vr4, vr1, vr0 //src0
+ vilvl.d vr5, vr3, vr2 //src2
+ vilvl.d vr6, vr11, vr10 //dst0
+ vilvl.d vr7, vr13, vr12 //dst2
+
+ xvpermi.q xr5, xr4, 0x20
+ xvpermi.q xr7, xr6, 0x20
+
+ xvilvl.b xr0, xr7, xr5
+ xvilvh.b xr1, xr7, xr5
+
+ xmov xr2, xr8
+ xmov xr3, xr8
+ xvmaddwev.h.bu.b xr2, xr0, xr20
+ xvmaddwev.h.bu.b xr3, xr1, xr20
+ xvmaddwod.h.bu.b xr2, xr0, xr20
+ xvmaddwod.h.bu.b xr3, xr1, xr20
+
+ xvssran.bu.h xr4, xr2, xr9 //vec0
+ xvssran.bu.h xr5, xr3, xr9 //vec2
+
+ xvilvl.d xr0, xr5, xr4
+ xvpermi.d xr2, xr0, 0x4E
+ vbsrl.v vr1, vr0, 8
+ vbsrl.v vr3, vr2, 8
+
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a2
+ fstx.d f2, a0, t0
+ fstx.d f3, a0, t1
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS8_LASX //stop after 8 rows when a3 < 16
+ add.d a1, a1, t2 //a1/a0 -> row 8
+ add.d a0, a0, t2
+ add.d t4, a1, t2 //t4/t5 -> row 12
+ add.d t5, a0, t2
+//rows 8-15: eight rows handled in one pass
+ fld.d f0, a1, 0 //src0
+ fldx.d f1, a1, a2 //src1
+ fldx.d f2, a1, t0 //src2
+ fldx.d f3, a1, t1 //src3
+ fld.d f4, t4, 0 //src4
+ fldx.d f5, t4, a2 //src5
+ fldx.d f6, t4, t0 //src6
+ fldx.d f7, t4, t1 //src7
+ fld.d f10, a0, 0 //dst0
+ fldx.d f11, a0, a2 //dst1
+ fldx.d f12, a0, t0 //dst2
+ fldx.d f13, a0, t1 //dst3
+ fld.d f14, t5, 0 //dst4
+ fldx.d f15, t5, a2 //dst5
+ fldx.d f16, t5, t0 //dst6
+ fldx.d f17, t5, t1 //dst7
+
+ vilvl.d vr0, vr1, vr0 //src0
+ vilvl.d vr2, vr3, vr2 //src2
+ vilvl.d vr4, vr5, vr4 //src4
+ vilvl.d vr6, vr7, vr6 //src6
+ vilvl.d vr10, vr11, vr10 //dst0
+ vilvl.d vr12, vr13, vr12 //dst2
+ vilvl.d vr14, vr15, vr14 //dst4
+ vilvl.d vr16, vr17, vr16 //dst6
+//four rows per 256-bit register
+ xvpermi.q xr2, xr0, 0x20
+ xvpermi.q xr6, xr4, 0x20
+ xvpermi.q xr12, xr10, 0x20
+ xvpermi.q xr16, xr14, 0x20
+
+ xvilvl.b xr0, xr12, xr2
+ xvilvh.b xr1, xr12, xr2
+ xvilvl.b xr10, xr16, xr6
+ xvilvh.b xr11, xr16, xr6
+
+ xmov xr2, xr8
+ xmov xr3, xr8
+ xmov xr12, xr8
+ xmov xr13, xr8
+ xvmaddwev.h.bu.b xr2, xr0, xr20
+ xvmaddwev.h.bu.b xr3, xr1, xr20
+ xvmaddwev.h.bu.b xr12, xr10, xr20
+ xvmaddwev.h.bu.b xr13, xr11, xr20
+ xvmaddwod.h.bu.b xr2, xr0, xr20
+ xvmaddwod.h.bu.b xr3, xr1, xr20
+ xvmaddwod.h.bu.b xr12, xr10, xr20
+ xvmaddwod.h.bu.b xr13, xr11, xr20
+
+ xvssran.bu.h xr4, xr2, xr9 //vec0
+ xvssran.bu.h xr5, xr3, xr9 //vec2
+ xvssran.bu.h xr14, xr12, xr9 //vec0
+ xvssran.bu.h xr15, xr13, xr9 //vec2
+
+ xvilvl.d xr0, xr5, xr4
+ xvilvl.d xr10, xr15, xr14
+ xvpermi.d xr2, xr0, 0x4E
+ xvpermi.d xr12, xr10, 0x4E
+ vbsrl.v vr1, vr0, 8
+ vbsrl.v vr3, vr2, 8
+ vbsrl.v vr11, vr10, 8
+ vbsrl.v vr13, vr12, 8
+
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a2
+ fstx.d f2, a0, t0
+ fstx.d f3, a0, t1
+ fst.d f10, t5, 0
+ fstx.d f11, t5, a2
+ fstx.d f12, t5, t0
+ fstx.d f13, t5, t1
+.END_BIWEIGHT_H264_PIXELS8_LASX:
+endfunc
+
+//LSX optimization is enough for this function.
+function ff_biweight_h264_pixels4_8_lsx //blend 4-byte rows of a1 into a0 with two weights; 2 rows if a3 < 4, 4 rows if a3 < 8, else 8 rows
+ slli.d t0, a2, 1 //t0 = 2 * a2 (a2 is the row step for both pointers)
+ slli.d t2, a2, 2 //t2 = 4 * a2
+ add.d t1, t0, a2 //t1 = 3 * a2
+ addi.d a7, a7, 1
+ ori a7, a7, 1
+ sll.d a7, a7, a4 //a7 = ((a7 + 1) | 1) << a4
+ addi.d a4, a4, 1 //final right shift is a4 + 1
+ addi.d t3, zero, 4 //first row-count threshold
+//NOTE(review): a5/a6 are replicated as bytes and used as the signed .b operand below; a weight of 128 would wrap to -128 - confirm caller range
+ vreplgr2vr.b vr0, a6 //tmp0
+ vreplgr2vr.b vr1, a5 //tmp1
+ vreplgr2vr.h vr8, a7 //offset
+ vreplgr2vr.h vr9, a4 //denom
+ vilvh.b vr20, vr1, vr0 //wgt: even bytes = a6 (pairs with a1 pixels), odd bytes = a5 (pairs with a0 pixels)
+//rows 0-1: 4 bytes per row; both rows are packed into one register
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fld.s f10, a0, 0
+ fldx.s f11, a0, a2
+ vilvl.w vr2, vr1, vr0 //a1 rows 0|1
+ vilvl.w vr12, vr11, vr10 //a0 rows 0|1
+ vilvl.b vr0, vr12, vr2 //interleave a1/a0 bytes
+//accumulator starts at the offset (vr8), then adds the even-byte and odd-byte products with vr20
+ vmov vr1, vr8
+ vmaddwev.h.bu.b vr1, vr0, vr20
+ vmaddwod.h.bu.b vr1, vr0, vr20
+//arithmetic shift right by vr9, saturate to unsigned bytes; row 1 is moved down by 4 bytes for fstx.s
+ vssran.bu.h vr1, vr1, vr9 //vec0
+ vbsrl.v vr2, vr1, 4
+ fst.s f1, a0, 0
+ fstx.s f2, a0, a2
+//stop after 2 rows when a3 < 4
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS4
+ addi.d t3, zero, 8 //next row-count threshold
+ fldx.s f0, a1, t0 //rows 2-3, addressed via the 2x/3x row offsets
+ fldx.s f1, a1, t1
+ fldx.s f10, a0, t0
+ fldx.s f11, a0, t1
+ vilvl.w vr2, vr1, vr0
+ vilvl.w vr12, vr11, vr10
+ vilvl.b vr0, vr12, vr2
+
+ vmov vr1, vr8
+ vmaddwev.h.bu.b vr1, vr0, vr20
+ vmaddwod.h.bu.b vr1, vr0, vr20
+
+ vssran.bu.h vr1, vr1, vr9 //vec0
+ vbsrl.v vr2, vr1, 4
+ fstx.s f1, a0, t0
+ fstx.s f2, a0, t1
+//stop after 4 rows when a3 < 8
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS4
+ add.d a1, a1, t2 //advance both pointers by 4 rows
+ add.d a0, a0, t2
+ fld.s f0, a1, 0 //rows 4-7
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t0
+ fldx.s f3, a1, t1
+ fld.s f10, a0, 0
+ fldx.s f11, a0, a2
+ fldx.s f12, a0, t0
+ fldx.s f13, a0, t1
+ vilvl.w vr4, vr1, vr0
+ vilvl.w vr5, vr3, vr2
+ vilvl.w vr14, vr11, vr10
+ vilvl.w vr15, vr13, vr12
+//vr0 = byte pairs for rows 4|5, vr10 = byte pairs for rows 6|7
+ vilvl.b vr0, vr14, vr4
+ vilvl.b vr10, vr15, vr5
+
+ vmov vr1, vr8
+ vmov vr11, vr8
+ vmaddwev.h.bu.b vr1, vr0, vr20
+ vmaddwev.h.bu.b vr11, vr10, vr20
+ vmaddwod.h.bu.b vr1, vr0, vr20
+ vmaddwod.h.bu.b vr11, vr10, vr20
+
+ vssran.bu.h vr0, vr1, vr9 //vec0
+ vssran.bu.h vr10, vr11, vr9 //vec0
+ vbsrl.v vr2, vr0, 4
+ vbsrl.v vr12, vr10, 4
+
+ fst.s f0, a0, 0
+ fstx.s f2, a0, a2
+ fstx.s f10, a0, t0
+ fstx.s f12, a0, t1
+.END_BIWEIGHT_H264_PIXELS4:
+endfunc
+
+function ff_weight_h264_pixels16_8_lsx //in-place weighting of 16-byte rows at a0: sat8((pixel * a4 + (a5 << a3)) >> a3, rounded); 8 rows, or 16 when a2 == 16
+ slli.d t0, a1, 1 //t0 = 2 * a1 (a1 is the row step)
+ slli.d t2, a1, 2 //t2 = 4 * a1
+ add.d t1, t0, a1 //t1 = 3 * a1
+ addi.d t3, zero, 16 //row count that enables the second half
+
+ sll.d a5, a5, a3 //pre-scale the offset by the shift amount
+ vreplgr2vr.h vr20, a4 //weight
+ vreplgr2vr.h vr8, a5 //offset
+ vreplgr2vr.h vr9, a3 //log2_denom
+ vldi vr23, 0 //zero vector used to widen bytes to halfwords
+//load rows 0-7 (t4 = a0 + 4 rows)
+ add.d t4, a0, t2
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vldx vr2, a0, t0
+ vldx vr3, a0, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a1
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+//zero-extend pixels 0-7 of each row to halfwords
+ vilvl.b vr10, vr23, vr0
+ vilvl.b vr11, vr23, vr1
+ vilvl.b vr12, vr23, vr2
+ vilvl.b vr13, vr23, vr3
+ vilvl.b vr14, vr23, vr4
+ vilvl.b vr15, vr23, vr5
+ vilvl.b vr16, vr23, vr6
+ vilvl.b vr17, vr23, vr7
+//multiply by the weight, then add the offset with signed saturation
+ vmul.h vr10, vr10, vr20
+ vmul.h vr11, vr11, vr20
+ vmul.h vr12, vr12, vr20
+ vmul.h vr13, vr13, vr20
+ vmul.h vr14, vr14, vr20
+ vmul.h vr15, vr15, vr20
+ vmul.h vr16, vr16, vr20
+ vmul.h vr17, vr17, vr20
+ vsadd.h vr10, vr8, vr10
+ vsadd.h vr11, vr8, vr11
+ vsadd.h vr12, vr8, vr12
+ vsadd.h vr13, vr8, vr13
+ vsadd.h vr14, vr8, vr14
+ vsadd.h vr15, vr8, vr15
+ vsadd.h vr16, vr8, vr16
+ vsadd.h vr17, vr8, vr17
+//same for pixels 8-15 of each row (vr0-vr3 are reused for rows 4-7)
+ vilvh.b vr18, vr23, vr0
+ vilvh.b vr19, vr23, vr1
+ vilvh.b vr21, vr23, vr2
+ vilvh.b vr22, vr23, vr3
+ vilvh.b vr0, vr23, vr4
+ vilvh.b vr1, vr23, vr5
+ vilvh.b vr2, vr23, vr6
+ vilvh.b vr3, vr23, vr7
+ vmul.h vr18, vr18, vr20
+ vmul.h vr19, vr19, vr20
+ vmul.h vr21, vr21, vr20
+ vmul.h vr22, vr22, vr20
+ vmul.h vr0, vr0, vr20
+ vmul.h vr1, vr1, vr20
+ vmul.h vr2, vr2, vr20
+ vmul.h vr3, vr3, vr20
+ vsadd.h vr18, vr8, vr18
+ vsadd.h vr19, vr8, vr19
+ vsadd.h vr21, vr8, vr21
+ vsadd.h vr22, vr8, vr22
+ vsadd.h vr0, vr8, vr0
+ vsadd.h vr1, vr8, vr1
+ vsadd.h vr2, vr8, vr2
+ vsadd.h vr3, vr8, vr3
+//rounding arithmetic shift right by vr9, saturate to unsigned bytes; 8 result bytes land in the low half
+ vssrarn.bu.h vr10, vr10, vr9
+ vssrarn.bu.h vr11, vr11, vr9
+ vssrarn.bu.h vr12, vr12, vr9
+ vssrarn.bu.h vr13, vr13, vr9
+ vssrarn.bu.h vr14, vr14, vr9
+ vssrarn.bu.h vr15, vr15, vr9
+ vssrarn.bu.h vr16, vr16, vr9
+ vssrarn.bu.h vr17, vr17, vr9
+ vssrarn.bu.h vr4, vr18, vr9
+ vssrarn.bu.h vr5, vr19, vr9
+ vssrarn.bu.h vr6, vr21, vr9
+ vssrarn.bu.h vr7, vr22, vr9
+ vssrarn.bu.h vr0, vr0, vr9
+ vssrarn.bu.h vr1, vr1, vr9
+ vssrarn.bu.h vr2, vr2, vr9
+ vssrarn.bu.h vr3, vr3, vr9
+//join the two 8-byte halves of each row into one 16-byte row
+ vilvl.d vr10, vr4, vr10
+ vilvl.d vr11, vr5, vr11
+ vilvl.d vr12, vr6, vr12
+ vilvl.d vr13, vr7, vr13
+ vilvl.d vr14, vr0, vr14
+ vilvl.d vr15, vr1, vr15
+ vilvl.d vr16, vr2, vr16
+ vilvl.d vr17, vr3, vr17
+//store rows 0-7 back to a0
+ vst vr10, a0, 0
+ vstx vr11, a0, a1
+ vstx vr12, a0, t0
+ vstx vr13, a0, t1
+ vst vr14, t4, 0
+ vstx vr15, t4, a1
+ vstx vr16, t4, t0
+ vstx vr17, t4, t1
+//rows 8-15 are processed only when a2 == 16
+ bne a2, t3, .END_WEIGHT_H264_PIXELS16_8
+ add.d a0, t4, t2 //a0 -> row 8
+ add.d t4, a0, t2 //t4 -> row 12
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vldx vr2, a0, t0
+ vldx vr3, a0, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a1
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+//same widen / multiply / add / narrow sequence as for rows 0-7
+ vilvl.b vr10, vr23, vr0
+ vilvl.b vr11, vr23, vr1
+ vilvl.b vr12, vr23, vr2
+ vilvl.b vr13, vr23, vr3
+ vilvl.b vr14, vr23, vr4
+ vilvl.b vr15, vr23, vr5
+ vilvl.b vr16, vr23, vr6
+ vilvl.b vr17, vr23, vr7
+
+ vmul.h vr10, vr10, vr20
+ vmul.h vr11, vr11, vr20
+ vmul.h vr12, vr12, vr20
+ vmul.h vr13, vr13, vr20
+ vmul.h vr14, vr14, vr20
+ vmul.h vr15, vr15, vr20
+ vmul.h vr16, vr16, vr20
+ vmul.h vr17, vr17, vr20
+ vsadd.h vr10, vr8, vr10
+ vsadd.h vr11, vr8, vr11
+ vsadd.h vr12, vr8, vr12
+ vsadd.h vr13, vr8, vr13
+ vsadd.h vr14, vr8, vr14
+ vsadd.h vr15, vr8, vr15
+ vsadd.h vr16, vr8, vr16
+ vsadd.h vr17, vr8, vr17
+
+ vilvh.b vr18, vr23, vr0
+ vilvh.b vr19, vr23, vr1
+ vilvh.b vr21, vr23, vr2
+ vilvh.b vr22, vr23, vr3
+ vilvh.b vr0, vr23, vr4
+ vilvh.b vr1, vr23, vr5
+ vilvh.b vr2, vr23, vr6
+ vilvh.b vr3, vr23, vr7
+ vmul.h vr18, vr18, vr20
+ vmul.h vr19, vr19, vr20
+ vmul.h vr21, vr21, vr20
+ vmul.h vr22, vr22, vr20
+ vmul.h vr0, vr0, vr20
+ vmul.h vr1, vr1, vr20
+ vmul.h vr2, vr2, vr20
+ vmul.h vr3, vr3, vr20
+ vsadd.h vr18, vr8, vr18
+ vsadd.h vr19, vr8, vr19
+ vsadd.h vr21, vr8, vr21
+ vsadd.h vr22, vr8, vr22
+ vsadd.h vr0, vr8, vr0
+ vsadd.h vr1, vr8, vr1
+ vsadd.h vr2, vr8, vr2
+ vsadd.h vr3, vr8, vr3
+
+ vssrarn.bu.h vr10, vr10, vr9
+ vssrarn.bu.h vr11, vr11, vr9
+ vssrarn.bu.h vr12, vr12, vr9
+ vssrarn.bu.h vr13, vr13, vr9
+ vssrarn.bu.h vr14, vr14, vr9
+ vssrarn.bu.h vr15, vr15, vr9
+ vssrarn.bu.h vr16, vr16, vr9
+ vssrarn.bu.h vr17, vr17, vr9
+ vssrarn.bu.h vr4, vr18, vr9
+ vssrarn.bu.h vr5, vr19, vr9
+ vssrarn.bu.h vr6, vr21, vr9
+ vssrarn.bu.h vr7, vr22, vr9
+ vssrarn.bu.h vr0, vr0, vr9
+ vssrarn.bu.h vr1, vr1, vr9
+ vssrarn.bu.h vr2, vr2, vr9
+ vssrarn.bu.h vr3, vr3, vr9
+
+ vilvl.d vr10, vr4, vr10
+ vilvl.d vr11, vr5, vr11
+ vilvl.d vr12, vr6, vr12
+ vilvl.d vr13, vr7, vr13
+ vilvl.d vr14, vr0, vr14
+ vilvl.d vr15, vr1, vr15
+ vilvl.d vr16, vr2, vr16
+ vilvl.d vr17, vr3, vr17
+//store rows 8-15
+ vst vr10, a0, 0
+ vstx vr11, a0, a1
+ vstx vr12, a0, t0
+ vstx vr13, a0, t1
+ vst vr14, t4, 0
+ vstx vr15, t4, a1
+ vstx vr16, t4, t0
+ vstx vr17, t4, t1
+.END_WEIGHT_H264_PIXELS16_8:
+endfunc
+
+function ff_weight_h264_pixels16_8_lasx //256-bit variant: in-place weighting of 16-byte rows at a0; 8 rows, or 16 when a2 == 16
+ slli.d t0, a1, 1 //t0 = 2 * a1 (a1 is the row step)
+ slli.d t2, a1, 2 //t2 = 4 * a1
+ add.d t1, t0, a1 //t1 = 3 * a1
+ addi.d t3, zero, 16 //row count that enables the second half
+
+ sll.d a5, a5, a3 //pre-scale the offset by the shift amount
+ xvreplgr2vr.h xr20, a4 //weight
+ xvreplgr2vr.h xr8, a5 //offset
+ xvreplgr2vr.h xr9, a3 //log2_denom
+//load rows 0-7 (t4 = a0 + 4 rows)
+ add.d t4, a0, t2
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vldx vr2, a0, t0
+ vldx vr3, a0, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a1
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+//zero-extend the 16 loaded bytes of each row to 16 halfwords, multiply by the weight, add the offset (saturating)
+ vext2xv.hu.bu xr0, xr0
+ vext2xv.hu.bu xr1, xr1
+ vext2xv.hu.bu xr2, xr2
+ vext2xv.hu.bu xr3, xr3
+ vext2xv.hu.bu xr4, xr4
+ vext2xv.hu.bu xr5, xr5
+ vext2xv.hu.bu xr6, xr6
+ vext2xv.hu.bu xr7, xr7
+ xvmul.h xr10, xr0, xr20
+ xvmul.h xr11, xr1, xr20
+ xvmul.h xr12, xr2, xr20
+ xvmul.h xr13, xr3, xr20
+ xvmul.h xr14, xr4, xr20
+ xvmul.h xr15, xr5, xr20
+ xvmul.h xr16, xr6, xr20
+ xvmul.h xr17, xr7, xr20
+ xvsadd.h xr10, xr8, xr10
+ xvsadd.h xr11, xr8, xr11
+ xvsadd.h xr12, xr8, xr12
+ xvsadd.h xr13, xr8, xr13
+ xvsadd.h xr14, xr8, xr14
+ xvsadd.h xr15, xr8, xr15
+ xvsadd.h xr16, xr8, xr16
+ xvsadd.h xr17, xr8, xr17
+//rounding arithmetic shift right by xr9, saturate to unsigned bytes (per 128-bit lane)
+ xvssrarn.bu.h xr10, xr10, xr9
+ xvssrarn.bu.h xr11, xr11, xr9
+ xvssrarn.bu.h xr12, xr12, xr9
+ xvssrarn.bu.h xr13, xr13, xr9
+ xvssrarn.bu.h xr14, xr14, xr9
+ xvssrarn.bu.h xr15, xr15, xr9
+ xvssrarn.bu.h xr16, xr16, xr9
+ xvssrarn.bu.h xr17, xr17, xr9
+//0xD8 moves 64-bit element 2 next to element 0, so the low 128 bits hold the full 16-byte row
+ xvpermi.d xr10, xr10, 0xD8
+ xvpermi.d xr11, xr11, 0xD8
+ xvpermi.d xr12, xr12, 0xD8
+ xvpermi.d xr13, xr13, 0xD8
+ xvpermi.d xr14, xr14, 0xD8
+ xvpermi.d xr15, xr15, 0xD8
+ xvpermi.d xr16, xr16, 0xD8
+ xvpermi.d xr17, xr17, 0xD8
+//store rows 0-7 back to a0
+ vst vr10, a0, 0
+ vstx vr11, a0, a1
+ vstx vr12, a0, t0
+ vstx vr13, a0, t1
+ vst vr14, t4, 0
+ vstx vr15, t4, a1
+ vstx vr16, t4, t0
+ vstx vr17, t4, t1
+//rows 8-15 are processed only when a2 == 16
+ bne a2, t3, .END_WEIGHT_H264_PIXELS16_8_LASX
+ add.d a0, t4, t2 //a0 -> row 8
+ add.d t4, a0, t2 //t4 -> row 12
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vldx vr2, a0, t0
+ vldx vr3, a0, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a1
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+//same widen / multiply / add / narrow sequence as for rows 0-7
+ vext2xv.hu.bu xr0, xr0
+ vext2xv.hu.bu xr1, xr1
+ vext2xv.hu.bu xr2, xr2
+ vext2xv.hu.bu xr3, xr3
+ vext2xv.hu.bu xr4, xr4
+ vext2xv.hu.bu xr5, xr5
+ vext2xv.hu.bu xr6, xr6
+ vext2xv.hu.bu xr7, xr7
+ xvmul.h xr10, xr0, xr20
+ xvmul.h xr11, xr1, xr20
+ xvmul.h xr12, xr2, xr20
+ xvmul.h xr13, xr3, xr20
+ xvmul.h xr14, xr4, xr20
+ xvmul.h xr15, xr5, xr20
+ xvmul.h xr16, xr6, xr20
+ xvmul.h xr17, xr7, xr20
+ xvsadd.h xr10, xr8, xr10
+ xvsadd.h xr11, xr8, xr11
+ xvsadd.h xr12, xr8, xr12
+ xvsadd.h xr13, xr8, xr13
+ xvsadd.h xr14, xr8, xr14
+ xvsadd.h xr15, xr8, xr15
+ xvsadd.h xr16, xr8, xr16
+ xvsadd.h xr17, xr8, xr17
+
+ xvssrarn.bu.h xr10, xr10, xr9
+ xvssrarn.bu.h xr11, xr11, xr9
+ xvssrarn.bu.h xr12, xr12, xr9
+ xvssrarn.bu.h xr13, xr13, xr9
+ xvssrarn.bu.h xr14, xr14, xr9
+ xvssrarn.bu.h xr15, xr15, xr9
+ xvssrarn.bu.h xr16, xr16, xr9
+ xvssrarn.bu.h xr17, xr17, xr9
+
+ xvpermi.d xr10, xr10, 0xD8
+ xvpermi.d xr11, xr11, 0xD8
+ xvpermi.d xr12, xr12, 0xD8
+ xvpermi.d xr13, xr13, 0xD8
+ xvpermi.d xr14, xr14, 0xD8
+ xvpermi.d xr15, xr15, 0xD8
+ xvpermi.d xr16, xr16, 0xD8
+ xvpermi.d xr17, xr17, 0xD8
+//store rows 8-15
+ vst vr10, a0, 0
+ vstx vr11, a0, a1
+ vstx vr12, a0, t0
+ vstx vr13, a0, t1
+ vst vr14, t4, 0
+ vstx vr15, t4, a1
+ vstx vr16, t4, t0
+ vstx vr17, t4, t1
+.END_WEIGHT_H264_PIXELS16_8_LASX:
+endfunc
+
+function ff_weight_h264_pixels8_8_lsx //in-place weighting of 8-byte rows at a0; 4 rows if a2 < 8, 8 rows if a2 < 16, else 16 rows
+ slli.d t0, a1, 1 //t0 = 2 * a1 (a1 is the row step)
+ slli.d t2, a1, 2 //t2 = 4 * a1
+ add.d t1, t0, a1 //t1 = 3 * a1
+ addi.d t3, zero, 8 //first row-count threshold
+
+ sll.d a5, a5, a3 //pre-scale the offset by the shift amount
+ vreplgr2vr.h vr20, a4 //weight
+ vreplgr2vr.h vr8, a5 //offset
+ vreplgr2vr.h vr9, a3 //log2_denom
+ vldi vr21, 0 //zero vector used to widen bytes to halfwords
+//rows 0-3: load 8 bytes per row
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a1
+ fldx.d f2, a0, t0
+ fldx.d f3, a0, t1
+//zero-extend to halfwords
+ vilvl.b vr10, vr21, vr0
+ vilvl.b vr11, vr21, vr1
+ vilvl.b vr12, vr21, vr2
+ vilvl.b vr13, vr21, vr3
+//multiply by the weight, then add the offset with signed saturation
+ vmul.h vr10, vr10, vr20
+ vmul.h vr11, vr11, vr20
+ vmul.h vr12, vr12, vr20
+ vmul.h vr13, vr13, vr20
+ vsadd.h vr0, vr8, vr10
+ vsadd.h vr1, vr8, vr11
+ vsadd.h vr2, vr8, vr12
+ vsadd.h vr3, vr8, vr13
+//rounding arithmetic shift right by vr9, saturate to unsigned bytes; 8 result bytes land in the low half
+ vssrarn.bu.h vr0, vr0, vr9
+ vssrarn.bu.h vr1, vr1, vr9
+ vssrarn.bu.h vr2, vr2, vr9
+ vssrarn.bu.h vr3, vr3, vr9
+
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a1
+ fstx.d f2, a0, t0
+ fstx.d f3, a0, t1
+//stop after 4 rows when a2 < 8
+ blt a2, t3, .END_WEIGHT_H264_PIXELS8
+ add.d a0, a0, t2 //a0 -> row 4
+ addi.d t3, zero, 16 //next row-count threshold
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a1
+ fldx.d f2, a0, t0
+ fldx.d f3, a0, t1
+//rows 4-7: same sequence as rows 0-3
+ vilvl.b vr10, vr21, vr0
+ vilvl.b vr11, vr21, vr1
+ vilvl.b vr12, vr21, vr2
+ vilvl.b vr13, vr21, vr3
+
+ vmul.h vr10, vr10, vr20
+ vmul.h vr11, vr11, vr20
+ vmul.h vr12, vr12, vr20
+ vmul.h vr13, vr13, vr20
+ vsadd.h vr0, vr8, vr10
+ vsadd.h vr1, vr8, vr11
+ vsadd.h vr2, vr8, vr12
+ vsadd.h vr3, vr8, vr13
+
+ vssrarn.bu.h vr0, vr0, vr9
+ vssrarn.bu.h vr1, vr1, vr9
+ vssrarn.bu.h vr2, vr2, vr9
+ vssrarn.bu.h vr3, vr3, vr9
+
+ fst.d f0, a0, 0
+ fstx.d f1, a0, a1
+ fstx.d f2, a0, t0
+ fstx.d f3, a0, t1
+ blt a2, t3, .END_WEIGHT_H264_PIXELS8 //stop after 8 rows when a2 < 16
+ add.d a0, a0, t2 //a0 -> row 8
+ add.d t4, a0, t2 //t4 -> row 12
+//rows 8-15: eight rows handled in one pass
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a1
+ fldx.d f2, a0, t0
+ fldx.d f3, a0, t1
+ fld.d f4, t4, 0
+ fldx.d f5, t4, a1
+ fldx.d f6, t4, t0
+ fldx.d f7, t4, t1
+
+ vilvl.b vr10, vr21, vr0
+ vilvl.b vr11, vr21, vr1
+ vilvl.b vr12, vr21, vr2
+ vilvl.b vr13, vr21, vr3
+ vilvl.b vr14, vr21, vr4
+ vilvl.b vr15, vr21, vr5
+ vilvl.b vr16, vr21, vr6
+ vilvl.b vr17, vr21, vr7
+
+ vmul.h vr0, vr10, vr20
+ vmul.h vr1, vr11, vr20
+ vmul.h vr2, vr12, vr20
+ vmul.h vr3, vr13, vr20
+ vmul.h vr4, vr14, vr20
+ vmul.h vr5, vr15, vr20
+ vmul.h vr6, vr16, vr20
+ vmul.h vr7, vr17, vr20
+
+ vsadd.h vr0, vr8, vr0
+ vsadd.h vr1, vr8, vr1
+ vsadd.h vr2, vr8, vr2
+ vsadd.h vr3, vr8, vr3
+ vsadd.h vr4, vr8, vr4
+ vsadd.h vr5, vr8, vr5
+ vsadd.h vr6, vr8, vr6
+ vsadd.h vr7, vr8, vr7
+
+ vssrarn.bu.h vr10, vr0, vr9
+ vssrarn.bu.h vr11, vr1, vr9
+ vssrarn.bu.h vr12, vr2, vr9
+ vssrarn.bu.h vr13, vr3, vr9
+ vssrarn.bu.h vr14, vr4, vr9
+ vssrarn.bu.h vr15, vr5, vr9
+ vssrarn.bu.h vr16, vr6, vr9
+ vssrarn.bu.h vr17, vr7, vr9
+//store rows 8-15
+ fst.d f10, a0, 0
+ fstx.d f11, a0, a1
+ fstx.d f12, a0, t0
+ fstx.d f13, a0, t1
+ fst.d f14, t4, 0
+ fstx.d f15, t4, a1
+ fstx.d f16, t4, t0
+ fstx.d f17, t4, t1
+.END_WEIGHT_H264_PIXELS8:
+endfunc
+
+function ff_weight_h264_pixels8_8_lasx //256-bit variant: in-place weighting of 8-byte rows at a0, two rows per register; 4 rows if a2 < 8, 8 if a2 < 16, else 16
+ slli.d t0, a1, 1 //t0 = 2 * a1 (a1 is the row step)
+ slli.d t2, a1, 2 //t2 = 4 * a1
+ add.d t1, t0, a1 //t1 = 3 * a1
+ addi.d t3, zero, 8 //first row-count threshold
+
+ sll.d a5, a5, a3 //pre-scale the offset by the shift amount
+ xvreplgr2vr.h xr20, a4 //weight
+ xvreplgr2vr.h xr8, a5 //offset
+ xvreplgr2vr.h xr9, a3 //log2_denom
+//rows 0-3: load 8 bytes per row, pack two rows per 128 bits, zero-extend to 16 halfwords
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a1
+ fldx.d f2, a0, t0
+ fldx.d f3, a0, t1
+ vilvl.d vr4, vr1, vr0
+ vilvl.d vr5, vr3, vr2
+ vext2xv.hu.bu xr6, xr4
+ vext2xv.hu.bu xr7, xr5
+//multiply by the weight, then add the offset with signed saturation
+ xvmul.h xr11, xr6, xr20
+ xvmul.h xr13, xr7, xr20
+ xvsadd.h xr1, xr8, xr11
+ xvsadd.h xr3, xr8, xr13
+//rounding shift right by xr9 and saturate to bytes; 0x2 copies 64-bit element 2 (the second row) into element 0
+ xvssrarn.bu.h xr1, xr1, xr9
+ xvssrarn.bu.h xr3, xr3, xr9
+ xvpermi.d xr2, xr1, 0x2
+ xvpermi.d xr4, xr3, 0x2
+
+ fst.d f1, a0, 0
+ fstx.d f2, a0, a1
+ fstx.d f3, a0, t0
+ fstx.d f4, a0, t1
+//stop after 4 rows when a2 < 8
+ blt a2, t3, .END_WEIGHT_H264_PIXELS8_LASX
+ add.d a0, a0, t2 //a0 -> row 4
+ addi.d t3, zero, 16 //next row-count threshold
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a1
+ fldx.d f2, a0, t0
+ fldx.d f3, a0, t1
+ vilvl.d vr4, vr1, vr0
+ vilvl.d vr5, vr3, vr2
+ vext2xv.hu.bu xr6, xr4
+ vext2xv.hu.bu xr7, xr5
+//rows 4-7: same sequence as rows 0-3
+ xvmul.h xr11, xr6, xr20
+ xvmul.h xr13, xr7, xr20
+ xvsadd.h xr1, xr8, xr11
+ xvsadd.h xr3, xr8, xr13
+
+ xvssrarn.bu.h xr1, xr1, xr9
+ xvssrarn.bu.h xr3, xr3, xr9
+ xvpermi.d xr2, xr1, 0x2
+ xvpermi.d xr4, xr3, 0x2
+
+ fst.d f1, a0, 0
+ fstx.d f2, a0, a1
+ fstx.d f3, a0, t0
+ fstx.d f4, a0, t1
+//stop after 8 rows when a2 < 16
+ blt a2, t3, .END_WEIGHT_H264_PIXELS8_LASX
+ add.d a0, a0, t2 //a0 -> row 8
+ add.d t4, a0, t2 //t4 -> row 12
+//rows 8-15: eight rows handled in one pass
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a1
+ fldx.d f2, a0, t0
+ fldx.d f3, a0, t1
+ fld.d f4, t4, 0
+ fldx.d f5, t4, a1
+ fldx.d f6, t4, t0
+ fldx.d f7, t4, t1
+
+ vilvl.d vr10, vr1, vr0
+ vilvl.d vr11, vr3, vr2
+ vilvl.d vr12, vr5, vr4
+ vilvl.d vr13, vr7, vr6
+ vext2xv.hu.bu xr10, xr10
+ vext2xv.hu.bu xr11, xr11
+ vext2xv.hu.bu xr12, xr12
+ vext2xv.hu.bu xr13, xr13
+
+ xvmul.h xr0, xr10, xr20
+ xvmul.h xr1, xr11, xr20
+ xvmul.h xr2, xr12, xr20
+ xvmul.h xr3, xr13, xr20
+
+ xvsadd.h xr0, xr8, xr0
+ xvsadd.h xr1, xr8, xr1
+ xvsadd.h xr2, xr8, xr2
+ xvsadd.h xr3, xr8, xr3
+
+ xvssrarn.bu.h xr10, xr0, xr9
+ xvssrarn.bu.h xr12, xr1, xr9
+ xvssrarn.bu.h xr14, xr2, xr9
+ xvssrarn.bu.h xr16, xr3, xr9
+ xvpermi.d xr11, xr10, 0x2
+ xvpermi.d xr13, xr12, 0x2
+ xvpermi.d xr15, xr14, 0x2
+ xvpermi.d xr17, xr16, 0x2
+//store rows 8-15
+ fst.d f10, a0, 0
+ fstx.d f11, a0, a1
+ fstx.d f12, a0, t0
+ fstx.d f13, a0, t1
+ fst.d f14, t4, 0
+ fstx.d f15, t4, a1
+ fstx.d f16, t4, t0
+ fstx.d f17, t4, t1
+.END_WEIGHT_H264_PIXELS8_LASX:
+endfunc
+
+//LSX optimization is enough for this function.
+function ff_weight_h264_pixels4_8_lsx // In-place weighted prediction on 4-byte rows: a0=block, a1=stride, a2=row count, a3=log2_denom, a4=weight, a5=offset
+ add.d t0, a0, a1 // t0 = address of row 1
+ addi.d t3, zero, 4 // first early-exit threshold for a2
+
+ sll.d a5, a5, a3 // offset <<= log2_denom
+ vreplgr2vr.h vr20, a4 //weight, replicated to every halfword
+ vreplgr2vr.h vr8, a5 //offset (already shifted), replicated to every halfword
+ vreplgr2vr.h vr9, a3 //log2_denom, used as the per-halfword shift count
+ vldi vr21, 0 // zero vector used to widen bytes to halfwords
+
+ fld.s f0, a0, 0 // row 0 (4 bytes)
+ fldx.s f1, a0, a1 // row 1
+ vilvl.w vr4, vr1, vr0 // low 8 bytes = row0 followed by row1
+ vilvl.b vr5, vr21, vr4 // zero-extend those 8 bytes to halfwords
+ vmul.h vr10, vr5, vr20 // pixel * weight
+ vsadd.h vr0, vr8, vr10 // + offset, signed saturating
+ vssrarn.bu.h vr0, vr0, vr9 // rounding shift right by log2_denom, saturate to unsigned bytes
+
+ fst.s f0, a0, 0 // row 0 from word 0
+ vstelm.w vr0, t0, 0, 1 // row 1 from word 1
+ blt a2, t3, .END_WEIGHT_H264_PIXELS4 // a2 < 4: only two rows to process
+ add.d a0, t0, a1 // a0 = row 2
+ addi.d t3, zero, 8 // next early-exit threshold
+ fld.s f0, a0, 0 // row 2
+ fldx.s f1, a0, a1 // row 3
+ add.d t0, a0, a1 // t0 = address of row 3
+ vilvl.w vr4, vr1, vr0
+ vilvl.b vr5, vr21, vr4
+
+ vmul.h vr10, vr5, vr20
+ vsadd.h vr0, vr8, vr10
+ vssrarn.bu.h vr0, vr0, vr9
+
+ fst.s f0, a0, 0 // row 2
+ vstelm.w vr0, t0, 0, 1 // row 3
+ blt a2, t3, .END_WEIGHT_H264_PIXELS4 // a2 < 8: four rows were enough
+ add.d a0, t0, a1 // a0 = row 4
+ add.d t0, a0, a1 // t0 = row 5
+ add.d t1, t0, a1 // t1 = row 6
+ add.d t2, t1, a1 // t2 = row 7
+
+ fld.s f0, a0, 0 // rows 4-7
+ fld.s f1, t0, 0
+ fld.s f2, t1, 0
+ fld.s f3, t2, 0
+
+ vilvl.w vr4, vr1, vr0 // rows 4 and 5 in the low 8 bytes
+ vilvl.w vr5, vr3, vr2 // rows 6 and 7 in the low 8 bytes
+ vilvl.b vr6, vr21, vr4 // widen to halfwords
+ vilvl.b vr7, vr21, vr5
+
+ vmul.h vr10, vr6, vr20 // pixel * weight
+ vmul.h vr11, vr7, vr20
+ vsadd.h vr0, vr8, vr10 // + offset, signed saturating
+ vsadd.h vr1, vr8, vr11
+ vssrarn.bu.h vr10, vr0, vr9 // round, shift, narrow to unsigned bytes
+ vssrarn.bu.h vr11, vr1, vr9
+
+ fst.s f10, a0, 0 // row 4
+ vstelm.w vr10, t0, 0, 1 // row 5
+ fst.s f11, t1, 0 // row 6
+ vstelm.w vr11, t2, 0, 1 // row 7
+.END_WEIGHT_H264_PIXELS4:
+endfunc
+
+function ff_h264_add_pixels4_8_lsx // Adds 16 halfwords at a1 to a 4x4 byte block at a0 (row stride a2), then zeroes the 32 bytes at a1
+ slli.d t0, a2, 1 // t0 = 2 * stride
+ add.d t1, t0, a2 // t1 = 3 * stride
+ vld vr0, a1, 0 // halfwords 0-7
+ vld vr1, a1, 16 // halfwords 8-15
+ vldi vr2, 0 // zero vector: widening operand now, clear value later
+ fld.s f3, a0, 0 // rows 0-3 of the byte block, 4 bytes each
+ fldx.s f4, a0, a2
+ fldx.s f5, a0, t0
+ fldx.s f6, a0, t1
+ vilvl.w vr7, vr4, vr3 // low 8 bytes = row0 followed by row1
+ vilvl.w vr8, vr6, vr5 // low 8 bytes = row2 followed by row3
+ vilvl.b vr9, vr2, vr7 // zero-extend bytes to halfwords
+ vilvl.b vr10, vr2, vr8
+ vadd.h vr11, vr0, vr9 // halfword add, wrapping (no saturation)
+ vadd.h vr12, vr1, vr10
+ vpickev.b vr0, vr12, vr11 // keep the low byte of each halfword: 16 result bytes, rows 0-3
+ vbsrl.v vr3, vr0, 4 // shift row 1 into the low word
+ vbsrl.v vr4, vr0, 8 // shift row 2 into the low word
+ vbsrl.v vr5, vr0, 12 // shift row 3 into the low word
+ fst.s f0, a0, 0 // write rows 0-3 back
+ fstx.s f3, a0, a2
+ fstx.s f4, a0, t0
+ fstx.s f5, a0, t1
+ vst vr2, a1, 0 // clear the 32-byte halfword buffer
+ vst vr2, a1, 16
+endfunc
+
+function ff_h264_add_pixels8_8_lsx // Adds 64 halfwords at a1 to an 8x8 byte block at a0 (row stride a2), then zeroes the 128 bytes at a1
+ slli.d t0, a2, 1 // t0 = 2 * stride
+ slli.d t2, a2, 2 // t2 = 4 * stride
+ add.d t1, t0, a2 // t1 = 3 * stride
+ add.d t3, a0, t2 // t3 = address of row 4
+ vldi vr0, 0 // zero vector: widening operand now, clear value later
+ vld vr1, a1, 0 // eight vectors of 8 halfwords, one per row
+ vld vr2, a1, 16
+ vld vr3, a1, 32
+ vld vr4, a1, 48
+ vld vr5, a1, 64
+ vld vr6, a1, 80
+ vld vr7, a1, 96
+ vld vr8, a1, 112
+ fld.d f10, a0, 0 // rows 0-3 of the byte block, 8 bytes each
+ fldx.d f11, a0, a2
+ fldx.d f12, a0, t0
+ fldx.d f13, a0, t1
+ fld.d f14, t3, 0 // rows 4-7
+ fldx.d f15, t3, a2
+ fldx.d f16, t3, t0
+ fldx.d f17, t3, t1
+ vilvl.b vr10, vr0, vr10 // zero-extend each row to 8 halfwords
+ vilvl.b vr11, vr0, vr11
+ vilvl.b vr12, vr0, vr12
+ vilvl.b vr13, vr0, vr13
+ vilvl.b vr14, vr0, vr14
+ vilvl.b vr15, vr0, vr15
+ vilvl.b vr16, vr0, vr16
+ vilvl.b vr17, vr0, vr17
+ vadd.h vr1, vr1, vr10 // halfword add, wrapping (no saturation)
+ vadd.h vr2, vr2, vr11
+ vadd.h vr3, vr3, vr12
+ vadd.h vr4, vr4, vr13
+ vadd.h vr5, vr5, vr14
+ vadd.h vr6, vr6, vr15
+ vadd.h vr7, vr7, vr16
+ vadd.h vr8, vr8, vr17
+ vpickev.b vr10, vr2, vr1 // low byte of each halfword: even row in the low 8 bytes, odd row in the high 8 bytes
+ vpickev.b vr12, vr4, vr3
+ vpickev.b vr14, vr6, vr5
+ vpickev.b vr16, vr8, vr7
+ vbsrl.v vr11, vr10, 8 // move the odd row into the low 8 bytes for fst.d
+ vbsrl.v vr13, vr12, 8
+ vbsrl.v vr15, vr14, 8
+ vbsrl.v vr17, vr16, 8
+ vst vr0, a1, 0 // clear the 128-byte halfword buffer
+ vst vr0, a1, 16
+ vst vr0, a1, 32
+ vst vr0, a1, 48
+ vst vr0, a1, 64
+ vst vr0, a1, 80
+ vst vr0, a1, 96
+ vst vr0, a1, 112
+ fst.d f10, a0, 0 // write rows 0-3 back
+ fstx.d f11, a0, a2
+ fstx.d f12, a0, t0
+ fstx.d f13, a0, t1
+ fst.d f14, t3, 0 // write rows 4-7 back
+ fstx.d f15, t3, a2
+ fstx.d f16, t3, t0
+ fstx.d f17, t3, t1
+endfunc
+
+const cnst_value // Two 16-byte vectors loaded into vr1/vr2 by ff_h264_loop_filter_strength_lsx when its field argument is non-zero
+.byte 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2 // vr1: value subtracted (unsigned, saturating) after the bias is added; alternates per byte
+.byte 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1 // vr2: bias added to each byte difference; alternates per byte
+endconst
+
+function ff_h264_loop_filter_strength_lsx // Writes strengths 0/1/2 as halfwords to the table at a0 in two passes; a1/a2/a3 = input tables (presumably nnz/ref/mv - confirm against the C prototype), a4 = two-set comparison flag, a5 = edges, a6 = step, a7 = mask for pass 2, stack = mask_mv1, field
+ vldi vr0, 0 // vr0 = 0, used for clearing and for byte->halfword widening
+ ldptr.w t0, sp, 0 //mask_mv1
+ ldptr.w t1, sp, 8 //field
+ beqz t1, .FIELD // field == 0: take the uniform constants below the .FIELD label
+ la.local t2, cnst_value // field != 0: alternating per-byte constants
+ vld vr1, t2, 0 // vr1 = 6,2,6,2,...
+ vld vr2, t2, 16 // vr2 = 3,1,3,1,...
+ b .END_FIELD
+.FIELD:
+ vldi vr1, 0x06 // vr1 = 6 in every byte
+ vldi vr2, 0x03 // vr2 = 3 in every byte
+.END_FIELD:
+ vldi vr3, 0x01 // vr3 = 1 in every byte, used to clamp flags to 0/1
+ slli.d a6, a6, 3 //step <<= 3
+ slli.d a5, a5, 3 //edges <<= 3
+ move t3, zero // t3 = pass-1 loop counter (scaled by 8 like edges/step)
+ slli.d t4, a6, 2 // t4 = scaled step * 4: per-iteration advance of the a3 cursor
+ move t5, a2 // t5 = cursor into the a2 byte table
+ move t6, a3 // t6 = cursor into the a3 halfword table
+ move t7, a1 // t7 = cursor into the a1 byte table
+ move t8, a0 // t8 = output cursor
+ slli.d t0, t0, 3 // mask_mv1 <<= 3 so it can be tested against the scaled counter
+.ITERATION_FIR:
+ bge t3, a5, .END_ITERATION_FIR // pass 1 ends once the counter reaches edges << 3
+ vand.v vr20, vr20, vr0 // vr20 = 0 (AND with the zero vector)
+ and t2, t0, t3
+ bnez t2, .MASK_MV_FIR // counter hits the mask: skip the a2/a3 comparison, vr20 stays 0
+ beqz a4, .BIDIR_FIR // a4 == 0: single-set comparison at .BIDIR_FIR; otherwise compare both sets below
+ vld vr4, t5, 4 // a2 entries at +4 (set A) and +44 (set B, 40 bytes on)
+ vld vr5, t5, 44
+ vld vr6, t5, 12 // a2 entries at +12 (set A) and +52 (set B)
+ vld vr7, t5, 52
+ vilvl.w vr4, vr5, vr4 // word0 = set A at +4, word1 = set B at +44
+ vilvl.w vr6, vr6, vr6 // duplicate set A at +12 into word0 and word1
+ vilvl.w vr7, vr7, vr7 // duplicate set B at +52 into word0 and word1
+ vshuf4i.h vr5, vr4, 0x4e // swap adjacent words: word0 = set B at +44, word1 = set A at +4
+ vsub.b vr6, vr6, vr4 // non-zero byte where the entries differ
+ vsub.b vr7, vr7, vr5
+ vor.v vr6, vr6, vr7 // word0 = same-set mismatches, word1 = cross-set mismatches
+ vld vr10, t6, 16 // a3 entries: set A at +16 and +48, set B at +176 and +208 (160 bytes on)
+ vld vr11, t6, 48
+ vld vr12, t6, 208
+ vld vr8, t6, 176
+ vsub.h vr13, vr10, vr11 // halfword differences for all four pairings
+ vsub.h vr14, vr10, vr12
+ vsub.h vr15, vr8, vr11
+ vsub.h vr16, vr8, vr12
+ vssrarni.b.h vr14, vr13, 0 // saturate to signed bytes and pack: low half from vr13, high half from vr14
+ vssrarni.b.h vr16, vr15, 0
+ vadd.b vr14, vr2, vr14 // add the bias from vr2
+ vadd.b vr16, vr2, vr16
+ vssub.bu vr14, vr14, vr1 // unsigned saturating subtract of vr1: non-zero only when the biased difference exceeds vr1
+ vssub.bu vr16, vr16, vr1
+ vssrarni.b.h vr14, vr14, 0 // collapse each byte pair to one byte; a non-zero pair stays non-zero
+ vssrarni.b.h vr16, vr16, 0
+ vor.v vr20, vr6, vr14 // merge a2 mismatches with the a3 threshold flags
+ vshuf4i.h vr16, vr16, 0x4e // swap adjacent words so the pairings line up with vr20
+ vor.v vr20, vr20, vr16
+ vshuf4i.h vr21, vr20, 0x4e // swapped copy: same-set and cross-set results exchanged
+ vmin.bu vr20, vr20, vr21 // keep the smaller of the two pairings per byte
+ b .MASK_MV_FIR
+.BIDIR_FIR:
+ vld vr4, t5, 4 // a2 entries at +4 and +12 (set A only)
+ vld vr5, t5, 12
+ vld vr10, t6, 16 // a3 entries at +16 and +48 (set A only)
+ vld vr11, t6, 48
+ vsub.h vr12, vr11, vr10 // halfword differences
+ vssrarni.b.h vr12, vr12, 0 // saturate to signed bytes and pack
+ vadd.b vr13, vr12, vr2 // add the bias from vr2
+ vssub.bu vr14, vr13, vr1 // non-zero only when the biased difference exceeds vr1
+ vsat.h vr15, vr14, 7 // saturate each halfword to the signed 8-bit range
+ vpickev.b vr20, vr15, vr15 // take the low byte of each halfword
+ vsub.b vr6, vr5, vr4 // non-zero byte where the a2 entries differ
+ vor.v vr20, vr20, vr6
+.MASK_MV_FIR:
+ vld vr4, t7, 12 // a1 entries at +12 and +4; only the low 4 result bytes reach the store below
+ vld vr5, t7, 4
+ vor.v vr6, vr4, vr5 // non-zero if either a1 entry is non-zero
+ vmin.bu vr6, vr6, vr3 // clamp to 0/1
+ vmin.bu vr20, vr20, vr3 // clamp the a2/a3 flag to 0/1
+ vslli.h vr6, vr6, 1 // a1 flag becomes 0/2
+ vmax.bu vr6, vr20, vr6 // strength = max of the two flags: 0, 1 or 2
+ vilvl.b vr7, vr0, vr6 // zero-extend to halfwords
+ add.d t3, t3, a6 // counter += scaled step
+ fst.d f7, t8, 32 // store 4 halfwords at output cursor + 32
+ add.d t5, t5, a6 // advance the cursors by the scaled step (a3 cursor by 4x)
+ add.d t6, t6, t4
+ add.d t7, t7, a6
+ add.d t8, t8, a6
+ b .ITERATION_FIR
+.END_ITERATION_FIR:
+ move t3, zero // pass 2: counter restarts at 0
+ addi.d a5, zero, 32 // pass 2 always runs to 32 in steps of 8 (four iterations)
+ vldi vr21, 0xff // mask for the vand.v at the top of the pass-2 loop
+ move t5, a2 // reset the cursors
+ move t6, a3
+ move t7, a1
+ move t8, a0
+ slli.d a7, a7, 3 // a7 <<= 3 so it can be tested against the scaled counter
+.ITERATION_SEC:
+ bge t3, a5, .END_ITERATION_SEC
+ vand.v vr20, vr20, vr21 // NOTE(review): vldi 0xff looks like all-ones bytes, so vr20 would carry over from the previous iteration instead of being cleared as in pass 1 - confirm intended
+ and t2, a7, t3
+ bnez t2, .MASK_MV_SEC // counter hits the mask: skip the a2/a3 comparison
+ beqz a4, .BIDIR_SEC // a4 == 0: single-set comparison
+ vld vr4, t5, 11 // same scheme as pass 1, comparing +12 with +11 instead of +4
+ vld vr5, t5, 51
+ vld vr6, t5, 12
+ vld vr7, t5, 52
+ vilvl.w vr4, vr5, vr4 // word0 = set A at +11, word1 = set B at +51
+ vilvl.w vr6, vr6, vr6
+ vilvl.w vr7, vr7, vr7
+ vshuf4i.h vr5, vr4, 0x4e // swap adjacent words
+ vsub.b vr6, vr6, vr4 // non-zero byte where the entries differ
+ vsub.b vr7, vr7, vr5
+ vor.v vr6, vr6, vr7
+ vld vr10, t6, 44 // a3 entries: set A at +44 and +48, set B at +204 and +208
+ vld vr11, t6, 48
+ vld vr12, t6, 208
+ vld vr8, t6, 204
+ vsub.h vr13, vr10, vr11 // halfword differences for all four pairings
+ vsub.h vr14, vr10, vr12
+ vsub.h vr15, vr8, vr11
+ vsub.h vr16, vr8, vr12
+ vssrarni.b.h vr14, vr13, 0 // saturate to signed bytes and pack
+ vssrarni.b.h vr16, vr15, 0
+ vadd.b vr14, vr2, vr14 // add the bias from vr2
+ vadd.b vr16, vr2, vr16
+ vssub.bu vr14, vr14, vr1 // non-zero only when the biased difference exceeds vr1
+ vssub.bu vr16, vr16, vr1
+ vssrarni.b.h vr14, vr14, 0 // collapse each byte pair to one byte
+ vssrarni.b.h vr16, vr16, 0
+ vor.v vr20, vr6, vr14
+ vshuf4i.h vr16, vr16, 0x4e
+ vor.v vr20, vr20, vr16
+ vshuf4i.h vr22, vr20, 0x4e // swapped copy; vr22 is used so the vr21 mask survives
+ vmin.bu vr20, vr20, vr22 // keep the smaller of the two pairings per byte
+ b .MASK_MV_SEC
+.BIDIR_SEC:
+ vld vr4, t5, 11 // a2 entries at +11 and +12 (set A only)
+ vld vr5, t5, 12
+ vld vr10, t6, 44 // a3 entries at +44 and +48 (set A only)
+ vld vr11, t6, 48
+ vsub.h vr12, vr11, vr10
+ vssrarni.b.h vr12, vr12, 0 // saturate to signed bytes and pack
+ vadd.b vr13, vr12, vr2
+ vssub.bu vr14, vr13, vr1
+ vssrarni.b.h vr14, vr14, 0 // collapse each byte pair to one byte
+ vsub.b vr6, vr5, vr4 // non-zero byte where the a2 entries differ
+ vor.v vr20, vr14, vr6
+.MASK_MV_SEC:
+ vld vr4, t7, 12 // a1 entries at +12 and +11
+ vld vr5, t7, 11
+ vor.v vr6, vr4, vr5
+ vmin.bu vr6, vr6, vr3 // clamp to 0/1
+ vmin.bu vr20, vr20, vr3
+ vslli.h vr6, vr6, 1 // a1 flag becomes 0/2
+ vmax.bu vr6, vr20, vr6 // strength = 0, 1 or 2
+ vilvl.b vr7, vr0, vr6 // zero-extend to halfwords
+ addi.d t3, t3, 8
+ fst.d f7, t8, 0 // store 4 halfwords at the output cursor (first 32 bytes of the table)
+ addi.d t5, t5, 8 // fixed advance: 8 bytes in a1/a2/output, 32 bytes in a3
+ addi.d t6, t6, 32
+ addi.d t7, t7, 8
+ addi.d t8, t8, 8
+ b .ITERATION_SEC
+.END_ITERATION_SEC:
+ vld vr4, a0, 0 // reload the 16 halfwords written by pass 2
+ vld vr5, a0, 16
+ vilvh.d vr6, vr4, vr4 // split out the second group of 4 halfwords
+ vilvh.d vr7, vr5, vr5 // split out the fourth group of 4 halfwords
+ LSX_TRANSPOSE4x4_H vr4, vr6, vr5, vr7, vr6, vr7, vr8, vr9, vr10, vr11 // macro defined outside this view; presumably a 4x4 halfword transpose into vr6-vr9 - confirm
+ vilvl.d vr4, vr7, vr6 // repack the four result groups into two vectors
+ vilvl.d vr5, vr9, vr8
+ vst vr4, a0, 0 // overwrite the first 32 bytes of the output
+ vst vr5, a0, 16
+endfunc
diff --git a/libavcodec/loongarch/h264dsp_init_loongarch.c b/libavcodec/loongarch/h264dsp_init_loongarch.c
index f8616a7db5..d97c3a86eb 100644
--- a/libavcodec/loongarch/h264dsp_init_loongarch.c
+++ b/libavcodec/loongarch/h264dsp_init_loongarch.c
@@ -29,21 +29,44 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
int cpu_flags = av_get_cpu_flags();
if (have_lsx(cpu_flags)) {
+ if (chroma_format_idc <= 1)
+ c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lsx;
if (bit_depth == 8) {
c->h264_idct_add = ff_h264_idct_add_8_lsx;
c->h264_idct8_add = ff_h264_idct8_add_8_lsx;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_lsx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
- if (chroma_format_idc <= 1)
+ if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
- else
+ c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_8_lsx;
+ c->h264_h_loop_filter_chroma_intra = ff_h264_h_lpf_chroma_intra_8_lsx;
+ } else
c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;
c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
+
+ c->h264_add_pixels4_clear = ff_h264_add_pixels4_8_lsx;
+ c->h264_add_pixels8_clear = ff_h264_add_pixels8_8_lsx;
+ c->h264_v_loop_filter_luma = ff_h264_v_lpf_luma_8_lsx;
+ c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_8_lsx;
+ c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_8_lsx;
+ c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_8_lsx;
+ c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_8_lsx;
+
+ c->h264_v_loop_filter_chroma_intra = ff_h264_v_lpf_chroma_intra_8_lsx;
+
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_lsx;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lsx;
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lsx;
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_lsx;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_lsx;
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_lsx;
+ c->h264_idct8_add = ff_h264_idct8_add_8_lsx;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
}
}
#if HAVE_LASX
@@ -57,23 +80,13 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_8_lasx;
c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_8_lasx;
c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_8_lasx;
- c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_8_lasx;
-
- if (chroma_format_idc <= 1)
- c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_8_lasx;
- c->h264_v_loop_filter_chroma_intra = ff_h264_v_lpf_chroma_intra_8_lasx;
-
- if (chroma_format_idc <= 1)
- c->h264_h_loop_filter_chroma_intra = ff_h264_h_lpf_chroma_intra_8_lasx;
/* Weighted MC */
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_lasx;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_lasx;
- c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_lasx;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_lasx;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
- c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lasx;
c->h264_idct8_add = ff_h264_idct8_add_8_lasx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
diff --git a/libavcodec/loongarch/h264dsp_lasx.c b/libavcodec/loongarch/h264dsp_lasx.c
index 7b2b8ff0f0..5205cc849f 100644
--- a/libavcodec/loongarch/h264dsp_lasx.c
+++ b/libavcodec/loongarch/h264dsp_lasx.c
@@ -67,10 +67,10 @@
void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
int alpha_in, int beta_in, int8_t *tc)
{
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_4x = img_width << 2;
- ptrdiff_t img_width_8x = img_width << 3;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
+ int img_width_2x = img_width << 1;
+ int img_width_4x = img_width << 2;
+ int img_width_8x = img_width << 3;
+ int img_width_3x = img_width_2x + img_width;
__m256i tmp_vec0, bs_vec;
__m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
0x0101010100000000, 0x0303030302020202};
@@ -244,8 +244,8 @@ void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
int alpha_in, int beta_in, int8_t *tc)
{
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_3x = img_width + img_width_2x;
+ int img_width_2x = img_width << 1;
+ int img_width_3x = img_width + img_width_2x;
__m256i tmp_vec0, bs_vec;
__m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
0x0101010100000000, 0x0303030302020202};
@@ -363,184 +363,6 @@ void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
}
}
-void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
- int alpha_in, int beta_in, int8_t *tc)
-{
- __m256i tmp_vec0, bs_vec;
- __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
- __m256i zero = __lasx_xvldi(0);
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_4x = img_width << 2;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
-
- tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
- tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
- bs_vec = __lasx_xvslti_b(tc_vec, 0);
- bs_vec = __lasx_xvxori_b(bs_vec, 255);
- bs_vec = __lasx_xvandi_b(bs_vec, 1);
- bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);
-
- if (__lasx_xbnz_v(bs_vec)) {
- uint8_t *src = data - 2;
- __m256i p1_org, p0_org, q0_org, q1_org;
- __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
- __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
- __m256i is_bs_greater_than0;
-
- is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
-
- {
- __m256i row0, row1, row2, row3, row4, row5, row6, row7;
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
- src, img_width_3x, row0, row1, row2, row3);
- src += img_width_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
- src, img_width_3x, row4, row5, row6, row7);
- src -= img_width_4x;
- /* LASX_TRANSPOSE8x4_B */
- DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
- row7, row5, p1_org, p0_org, q0_org, q1_org);
- row0 = __lasx_xvilvl_b(p0_org, p1_org);
- row1 = __lasx_xvilvl_b(q1_org, q0_org);
- row3 = __lasx_xvilvh_w(row1, row0);
- row2 = __lasx_xvilvl_w(row1, row0);
- p1_org = __lasx_xvpermi_d(row2, 0x00);
- p0_org = __lasx_xvpermi_d(row2, 0x55);
- q0_org = __lasx_xvpermi_d(row3, 0x00);
- q1_org = __lasx_xvpermi_d(row3, 0x55);
- }
-
- p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
- p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
- q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
- alpha = __lasx_xvreplgr2vr_b(alpha_in);
- beta = __lasx_xvreplgr2vr_b(beta_in);
-
- is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
- is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
- is_less_than = is_less_than_alpha & is_less_than_beta;
- is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
- is_less_than = is_less_than_beta & is_less_than;
- is_less_than = is_less_than & is_bs_greater_than0;
-
- if (__lasx_xbnz_v(is_less_than)) {
- __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
- p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
- p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
- q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
- q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
- {
- __m256i tc_h, neg_thresh_h, p0_h, q0_h;
-
- neg_thresh_h = __lasx_xvneg_b(tc_vec);
- neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
- tc_h = __lasx_vext2xv_hu_bu(tc_vec);
-
- AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
- neg_thresh_h, tc_h, p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
- p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
- p0_h, q0_h);
- p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
- q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
- }
-
- p0_org = __lasx_xvilvl_b(q0_org, p0_org);
- src = data - 1;
- __lasx_xvstelm_h(p0_org, src, 0, 0);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 1);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 2);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 3);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 4);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 5);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 6);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 7);
- }
- }
-}
-
-void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
- int alpha_in, int beta_in, int8_t *tc)
-{
- int img_width_2x = img_width << 1;
- __m256i tmp_vec0, bs_vec;
- __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
- __m256i zero = __lasx_xvldi(0);
-
- tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
- tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
- bs_vec = __lasx_xvslti_b(tc_vec, 0);
- bs_vec = __lasx_xvxori_b(bs_vec, 255);
- bs_vec = __lasx_xvandi_b(bs_vec, 1);
- bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);
-
- if (__lasx_xbnz_v(bs_vec)) {
- __m256i p1_org, p0_org, q0_org, q1_org;
- __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
- __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
- __m256i is_bs_greater_than0;
-
- alpha = __lasx_xvreplgr2vr_b(alpha_in);
- beta = __lasx_xvreplgr2vr_b(beta_in);
-
- DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
- p1_org, p0_org);
- DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
-
- is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
- p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
- p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
- q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
- is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
- is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
- is_less_than = is_less_than_alpha & is_less_than_beta;
- is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
- is_less_than = is_less_than_beta & is_less_than;
- is_less_than = is_less_than & is_bs_greater_than0;
-
- if (__lasx_xbnz_v(is_less_than)) {
- __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
- p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
- p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
- q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
- q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
- {
- __m256i neg_thresh_h, tc_h, p0_h, q0_h;
-
- neg_thresh_h = __lasx_xvneg_b(tc_vec);
- neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
- tc_h = __lasx_vext2xv_hu_bu(tc_vec);
-
- AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
- neg_thresh_h, tc_h, p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
- p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
- p0_h, q0_h);
- p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
- q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
- __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
- __lasx_xvstelm_d(q0_h, data, 0, 0);
- }
- }
- }
-}
-
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
q3_or_p3_org_in, p1_or_q1_org_in, \
p2_or_q2_org_in, q1_or_p1_org_in, \
@@ -584,9 +406,9 @@ void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
int alpha_in, int beta_in)
{
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_4x = img_width << 2;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
+ int img_width_2x = img_width << 1;
+ int img_width_4x = img_width << 2;
+ int img_width_3x = img_width_2x + img_width;
uint8_t *src = data - 4;
__m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
__m256i is_less_than, is_less_than_beta, is_less_than_alpha;
@@ -760,8 +582,8 @@ void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
int alpha_in, int beta_in)
{
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
+ int img_width_2x = img_width << 1;
+ int img_width_3x = img_width_2x + img_width;
uint8_t *src = data - img_width_2x;
__m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
__m256i is_less_than, is_less_than_beta, is_less_than_alpha;
@@ -877,1160 +699,6 @@ void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
}
}
-void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
- int alpha_in, int beta_in)
-{
- uint8_t *src = data - 2;
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_4x = img_width << 2;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
- __m256i p1_org, p0_org, q0_org, q1_org;
- __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
- __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
-
- {
- __m256i row0, row1, row2, row3, row4, row5, row6, row7;
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
- img_width_3x, row0, row1, row2, row3);
- src += img_width_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
- img_width_3x, row4, row5, row6, row7);
-
- /* LASX_TRANSPOSE8x4_B */
- DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
- p1_org, p0_org, q0_org, q1_org);
- row0 = __lasx_xvilvl_b(p0_org, p1_org);
- row1 = __lasx_xvilvl_b(q1_org, q0_org);
- row3 = __lasx_xvilvh_w(row1, row0);
- row2 = __lasx_xvilvl_w(row1, row0);
- p1_org = __lasx_xvpermi_d(row2, 0x00);
- p0_org = __lasx_xvpermi_d(row2, 0x55);
- q0_org = __lasx_xvpermi_d(row3, 0x00);
- q1_org = __lasx_xvpermi_d(row3, 0x55);
- }
-
- alpha = __lasx_xvreplgr2vr_b(alpha_in);
- beta = __lasx_xvreplgr2vr_b(beta_in);
-
- p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
- p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
- q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
- is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
- is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
- is_less_than = is_less_than_alpha & is_less_than_beta;
- is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
- is_less_than = is_less_than_beta & is_less_than;
-
- if (__lasx_xbnz_v(is_less_than)) {
- __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
- p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
- p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
- q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
- q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
- AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
- AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
- DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
- p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
- q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
- }
- p0_org = __lasx_xvilvl_b(q0_org, p0_org);
- src = data - 1;
- __lasx_xvstelm_h(p0_org, src, 0, 0);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 1);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 2);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 3);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 4);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 5);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 6);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 7);
-}
-
-void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
- int alpha_in, int beta_in)
-{
- ptrdiff_t img_width_2x = img_width << 1;
- __m256i p1_org, p0_org, q0_org, q1_org;
- __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
- __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
-
- alpha = __lasx_xvreplgr2vr_b(alpha_in);
- beta = __lasx_xvreplgr2vr_b(beta_in);
-
- p1_org = __lasx_xvldx(data, -img_width_2x);
- p0_org = __lasx_xvldx(data, -img_width);
- DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
-
- p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
- p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
- q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
- is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
- is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
- is_less_than = is_less_than_alpha & is_less_than_beta;
- is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
- is_less_than = is_less_than_beta & is_less_than;
-
- if (__lasx_xbnz_v(is_less_than)) {
- __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
- p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
- p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
- q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
- q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
- AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
- AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
- DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
- p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
- q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
- __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
- __lasx_xvstelm_d(q0_h, data, 0, 0);
- }
-}
-
-void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
- int log2_denom, int weight_dst,
- int weight_src, int offset_in)
-{
- __m256i wgt;
- __m256i src0, src1, src2, src3;
- __m256i dst0, dst1, dst2, dst3;
- __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m256i denom, offset;
- int stride_2x = stride << 1;
- int stride_4x = stride << 2;
- int stride_3x = stride_2x + stride;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src += stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
- 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
- dst -= stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
- 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
-
- DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
- src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
- dst0, dst1, dst2, dst3);
- DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec0, vec2, vec4, vec6);
- DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec1, vec3, vec5, vec7);
-
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
- offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
-
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- tmp2 = __lasx_xvsra_h(tmp2, denom);
- tmp3 = __lasx_xvsra_h(tmp3, denom);
- tmp4 = __lasx_xvsra_h(tmp4, denom);
- tmp5 = __lasx_xvsra_h(tmp5, denom);
- tmp6 = __lasx_xvsra_h(tmp6, denom);
- tmp7 = __lasx_xvsra_h(tmp7, denom);
-
- DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
- tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
- dst0, dst1, dst2, dst3);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst0, dst, 0, 2);
- __lasx_xvstelm_d(dst0, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst1, dst, 0, 2);
- __lasx_xvstelm_d(dst1, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst2, dst, 0, 0);
- __lasx_xvstelm_d(dst2, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst2, dst, 0, 2);
- __lasx_xvstelm_d(dst2, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst3, dst, 0, 0);
- __lasx_xvstelm_d(dst3, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst3, dst, 0, 2);
- __lasx_xvstelm_d(dst3, dst, 8, 3);
- dst += stride;
-
- if (16 == height) {
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src += stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
- tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
- dst -= stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
- tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
-
- DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
- src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
- dst0, dst1, dst2, dst3);
- DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec0, vec2, vec4, vec6);
- DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec1, vec3, vec5, vec7);
-
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
- offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
-
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- tmp2 = __lasx_xvsra_h(tmp2, denom);
- tmp3 = __lasx_xvsra_h(tmp3, denom);
- tmp4 = __lasx_xvsra_h(tmp4, denom);
- tmp5 = __lasx_xvsra_h(tmp5, denom);
- tmp6 = __lasx_xvsra_h(tmp6, denom);
- tmp7 = __lasx_xvsra_h(tmp7, denom);
-
- DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
- tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
- tmp6, dst0, dst1, dst2, dst3);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst0, dst, 0, 2);
- __lasx_xvstelm_d(dst0, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst1, dst, 0, 2);
- __lasx_xvstelm_d(dst1, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst2, dst, 0, 0);
- __lasx_xvstelm_d(dst2, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst2, dst, 0, 2);
- __lasx_xvstelm_d(dst2, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst3, dst, 0, 0);
- __lasx_xvstelm_d(dst3, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst3, dst, 0, 2);
- __lasx_xvstelm_d(dst3, dst, 8, 3);
- }
-}
-
-static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0, vec1;
- __m256i src0, dst0;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
- vec0 = __lasx_xvilvl_b(dst0, src0);
- vec1 = __lasx_xvilvh_b(dst0, src0);
- DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- tmp0, tmp1);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
- dst0 = __lasx_xvpickev_b(tmp1, tmp0);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0, vec1, vec2, vec3;
- __m256i src0, src1, dst0, dst1;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
- uint8_t* dst_tmp = dst;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- tmp0 = __lasx_xvld(dst_tmp, 0);
- DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
- tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
- dst_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
- DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
- src0, src1, dst0, dst1);
- DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
- DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- tmp2 = __lasx_xvsra_h(tmp2, denom);
- tmp3 = __lasx_xvsra_h(tmp3, denom);
- DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
- tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
-}
-
-static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
- uint8_t* dst_tmp = dst;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
- DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
- src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
- dst0, dst1, dst2, dst3);
- DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec0, vec2, vec4, vec6);
- DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec1, vec3, vec5, vec7);
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG3(__lasx_xvdp2add_h_b,offset, wgt, vec4, offset, wgt, vec5,
- offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- tmp2 = __lasx_xvsra_h(tmp2, denom);
- tmp3 = __lasx_xvsra_h(tmp3, denom);
- tmp4 = __lasx_xvsra_h(tmp4, denom);
- tmp5 = __lasx_xvsra_h(tmp5, denom);
- tmp6 = __lasx_xvsra_h(tmp6, denom);
- tmp7 = __lasx_xvsra_h(tmp7, denom);
- DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
- tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
- dst0, dst1, dst2, dst3)
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(dst2, dst, 0, 0);
- __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(dst3, dst, 0, 0);
- __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
-}
-
-void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
- int log2_denom, int weight_dst,
- int weight_src, int offset)
-{
- if (4 == height) {
- avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
- offset);
- } else if (8 == height) {
- avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
- offset);
- } else {
- avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
- offset);
- }
-}
-
-static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0;
- __m256i src0, dst0;
- __m256i tmp0, tmp1, denom, offset;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
- src0 = __lasx_xvilvl_w(tmp1, tmp0);
- DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
- dst0 = __lasx_xvilvl_w(tmp1, tmp0);
- DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
- vec0 = __lasx_xvilvl_b(dst0, src0);
- tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp0 = __lasx_xvclip255_h(tmp0);
- tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
- __lasx_xvstelm_w(tmp0, dst, 0, 0);
- __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
-}
-
-static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0;
- __m256i src0, dst0;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
- src0 = __lasx_xvilvl_w(tmp1, tmp0);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
- dst0 = __lasx_xvilvl_w(tmp1, tmp0);
- DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
- vec0 = __lasx_xvilvl_b(dst0, src0);
- dst0 = __lasx_xvilvh_b(dst0, src0);
- vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
- tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp0 = __lasx_xvclip255_h(tmp0);
- tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
- __lasx_xvstelm_w(tmp0, dst, 0, 0);
- __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
- __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
- __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
-}
-
-static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0, vec1;
- __m256i src0, dst0;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
- tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
- dst -= stride_4x;
- DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
- tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
- vec0 = __lasx_xvilvl_b(dst0, src0);
- vec1 = __lasx_xvilvh_b(dst0, src0);
- DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- tmp0, tmp1);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
- tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
- __lasx_xvstelm_w(tmp0, dst, 0, 0);
- __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
- __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_w(tmp0, dst, 0, 4);
- __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
- __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
- __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
-}
-
-void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
- int log2_denom, int weight_dst,
- int weight_src, int offset)
-{
- if (2 == height) {
- avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
- weight_dst, offset);
- } else if (4 == height) {
- avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
- weight_dst, offset);
- } else {
- avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
- weight_dst, offset);
- }
-}
-
-void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
- int height, int log2_denom,
- int weight_src, int offset_in)
-{
- uint32_t offset_val;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i zero = __lasx_xvldi(0);
- __m256i src0, src1, src2, src3;
- __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m256i wgt, denom, offset;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src -= stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
- 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
- zero, src3, src0_l, src1_l, src2_l, src3_l);
- DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
- zero, src3, src0_h, src1_h, src2_h, src3_h);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src1_l = __lasx_xvmul_h(wgt, src1_l);
- src1_h = __lasx_xvmul_h(wgt, src1_h);
- src2_l = __lasx_xvmul_h(wgt, src2_l);
- src2_h = __lasx_xvmul_h(wgt, src2_h);
- src3_l = __lasx_xvmul_h(wgt, src3_l);
- src3_h = __lasx_xvmul_h(wgt, src3_h);
- DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
- src1_h, offset, src0_l, src0_h, src1_l, src1_h);
- DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
- src3_h, offset, src2_l, src2_h, src3_l, src3_h);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src1_l = __lasx_xvmaxi_h(src1_l, 0);
- src1_h = __lasx_xvmaxi_h(src1_h, 0);
- src2_l = __lasx_xvmaxi_h(src2_l, 0);
- src2_h = __lasx_xvmaxi_h(src2_h, 0);
- src3_l = __lasx_xvmaxi_h(src3_l, 0);
- src3_h = __lasx_xvmaxi_h(src3_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
- src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
- src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
- src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
- src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
- src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
- __lasx_xvstelm_d(src0_l, src, 0, 0);
- __lasx_xvstelm_d(src0_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src0_l, src, 0, 2);
- __lasx_xvstelm_d(src0_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src1_l, src, 0, 0);
- __lasx_xvstelm_d(src1_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src1_l, src, 0, 2);
- __lasx_xvstelm_d(src1_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src2_l, src, 0, 0);
- __lasx_xvstelm_d(src2_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src2_l, src, 0, 2);
- __lasx_xvstelm_d(src2_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src3_l, src, 0, 0);
- __lasx_xvstelm_d(src3_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src3_l, src, 0, 2);
- __lasx_xvstelm_d(src3_h, src, 8, 2);
- src += stride;
-
- if (16 == height) {
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src -= stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
- tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
- zero, src3, src0_l, src1_l, src2_l, src3_l);
- DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
- zero, src3, src0_h, src1_h, src2_h, src3_h);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src1_l = __lasx_xvmul_h(wgt, src1_l);
- src1_h = __lasx_xvmul_h(wgt, src1_h);
- src2_l = __lasx_xvmul_h(wgt, src2_l);
- src2_h = __lasx_xvmul_h(wgt, src2_h);
- src3_l = __lasx_xvmul_h(wgt, src3_l);
- src3_h = __lasx_xvmul_h(wgt, src3_h);
- DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
- offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
- DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
- offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src1_l = __lasx_xvmaxi_h(src1_l, 0);
- src1_h = __lasx_xvmaxi_h(src1_h, 0);
- src2_l = __lasx_xvmaxi_h(src2_l, 0);
- src2_h = __lasx_xvmaxi_h(src2_h, 0);
- src3_l = __lasx_xvmaxi_h(src3_l, 0);
- src3_h = __lasx_xvmaxi_h(src3_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
- src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
- src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
- src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
- src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
- src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
- __lasx_xvstelm_d(src0_l, src, 0, 0);
- __lasx_xvstelm_d(src0_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src0_l, src, 0, 2);
- __lasx_xvstelm_d(src0_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src1_l, src, 0, 0);
- __lasx_xvstelm_d(src1_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src1_l, src, 0, 2);
- __lasx_xvstelm_d(src1_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src2_l, src, 0, 0);
- __lasx_xvstelm_d(src2_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src2_l, src, 0, 2);
- __lasx_xvstelm_d(src2_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src3_l, src, 0, 0);
- __lasx_xvstelm_d(src3_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src3_l, src, 0, 2);
- __lasx_xvstelm_d(src3_h, src, 8, 2);
- }
-}
-
-static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t offset_in)
-{
- uint32_t offset_val;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i wgt, zero = __lasx_xvldi(0);
- __m256i src0, src0_h, src0_l;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- src0_l = __lasx_xvilvl_b(zero, src0);
- src0_h = __lasx_xvilvh_b(zero, src0);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src0_l = __lasx_xvsadd_h(src0_l, offset);
- src0_h = __lasx_xvsadd_h(src0_h, offset);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
-
- src0 = __lasx_xvpickev_d(src0_h, src0_l);
- __lasx_xvstelm_d(src0, src, 0, 0);
- __lasx_xvstelm_d(src0, src + stride, 0, 1);
- __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
-}
-
-static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
- int32_t src_weight, int32_t offset_in)
-{
- __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
- uint32_t offset_val;
- uint8_t* src_tmp = src;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(src_weight);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
- DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src1_l = __lasx_xvmul_h(wgt, src1_l);
- src1_h = __lasx_xvmul_h(wgt, src1_h);
- DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
- src1_h, offset, src0_l, src0_h, src1_l, src1_h);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src1_l = __lasx_xvmaxi_h(src1_l, 0);
- src1_h = __lasx_xvmaxi_h(src1_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
- src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
-
- DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
- __lasx_xvstelm_d(src0, src, 0, 0);
- __lasx_xvstelm_d(src0, src + stride, 0, 1);
- __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
- src += stride_4x;
- __lasx_xvstelm_d(src1, src, 0, 0);
- __lasx_xvstelm_d(src1, src + stride, 0, 1);
- __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
-}
-
-static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t src_weight,
- int32_t offset_in)
-{
- __m256i src0, src1, src2, src3;
- __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
- __m256i zero = __lasx_xvldi(0);
- uint32_t offset_val;
- uint8_t* src_tmp = src;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(src_weight);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
- DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
- src0_l, src1_l, src2_l, src3_l);
- DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
- src0_h, src1_h, src2_h, src3_h);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src1_l = __lasx_xvmul_h(wgt, src1_l);
- src1_h = __lasx_xvmul_h(wgt, src1_h);
- src2_l = __lasx_xvmul_h(wgt, src2_l);
- src2_h = __lasx_xvmul_h(wgt, src2_h);
- src3_l = __lasx_xvmul_h(wgt, src3_l);
- src3_h = __lasx_xvmul_h(wgt, src3_h);
-
- DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
- src1_h, offset, src0_l, src0_h, src1_l, src1_h);
- DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
- src3_h, offset, src2_l, src2_h, src3_l, src3_h);
-
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src1_l = __lasx_xvmaxi_h(src1_l, 0);
- src1_h = __lasx_xvmaxi_h(src1_h, 0);
- src2_l = __lasx_xvmaxi_h(src2_l, 0);
- src2_h = __lasx_xvmaxi_h(src2_h, 0);
- src3_l = __lasx_xvmaxi_h(src3_l, 0);
- src3_h = __lasx_xvmaxi_h(src3_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
- src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
- src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
- src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
- src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
- src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
- DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
- src3_h, src3_l, src0, src1, src2, src3);
-
- __lasx_xvstelm_d(src0, src, 0, 0);
- __lasx_xvstelm_d(src0, src + stride, 0, 1);
- __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
- src += stride_4x;
- __lasx_xvstelm_d(src1, src, 0, 0);
- __lasx_xvstelm_d(src1, src + stride, 0, 1);
- __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
- src += stride_4x;
- __lasx_xvstelm_d(src2, src, 0, 0);
- __lasx_xvstelm_d(src2, src + stride, 0, 1);
- __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
- src += stride_4x;
- __lasx_xvstelm_d(src3, src, 0, 0);
- __lasx_xvstelm_d(src3, src + stride, 0, 1);
- __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
-}
-
-void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
- int height, int log2_denom,
- int weight_src, int offset)
-{
- if (4 == height) {
- avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
- } else if (8 == height) {
- avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
- } else {
- avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
- }
-}
-
-static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t offset_in)
-{
- uint32_t offset_val;
- __m256i wgt, zero = __lasx_xvldi(0);
- __m256i src0, tmp0, tmp1, denom, offset;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
- src0 = __lasx_xvilvl_w(tmp1, tmp0);
- src0 = __lasx_xvilvl_b(zero, src0);
- src0 = __lasx_xvmul_h(wgt, src0);
- src0 = __lasx_xvsadd_h(src0, offset);
- src0 = __lasx_xvmaxi_h(src0, 0);
- src0 = __lasx_xvssrlrn_bu_h(src0, denom);
- __lasx_xvstelm_w(src0, src, 0, 0);
- __lasx_xvstelm_w(src0, src + stride, 0, 1);
-}
-
-static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t offset_in)
-{
- __m256i wgt;
- __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
- uint32_t offset_val;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
- src0 = __lasx_xvilvl_w(tmp1, tmp0);
- src0 = __lasx_vext2xv_hu_bu(src0);
- src0 = __lasx_xvmul_h(wgt, src0);
- src0 = __lasx_xvsadd_h(src0, offset);
- src0 = __lasx_xvmaxi_h(src0, 0);
- src0 = __lasx_xvssrlrn_bu_h(src0, denom);
- __lasx_xvstelm_w(src0, src, 0, 0);
- __lasx_xvstelm_w(src0, src + stride, 0, 1);
- __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
- __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
-}
-
-static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t offset_in)
-{
- __m256i src0, src0_h, src0_l;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
- __m256i wgt, zero = __lasx_xvldi(0);
- uint32_t offset_val;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src -= stride_4x;
- DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
- tmp5, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- src0_l = __lasx_xvilvl_b(zero, src0);
- src0_h = __lasx_xvilvh_b(zero, src0);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src0_l = __lasx_xvsadd_h(src0_l, offset);
- src0_h = __lasx_xvsadd_h(src0_h, offset);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- __lasx_xvstelm_w(src0_l, src, 0, 0);
- __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
- __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
- __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
- src += stride_4x;
- __lasx_xvstelm_w(src0_l, src, 0, 4);
- __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
- __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
- __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
-}
-
-void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
- int height, int log2_denom,
- int weight_src, int offset)
-{
- if (2 == height) {
- avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
- } else if (4 == height) {
- avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
- } else {
- avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
- }
-}
-
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
__m256i src0, dst0, dst1, dst2, dst3, zero;
diff --git a/libavcodec/loongarch/h264dsp_loongarch.h b/libavcodec/loongarch/h264dsp_loongarch.h
index 28dca2b537..e17522dfe0 100644
--- a/libavcodec/loongarch/h264dsp_loongarch.h
+++ b/libavcodec/loongarch/h264dsp_loongarch.h
@@ -47,6 +47,50 @@ void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
int16_t *block, int32_t dst_stride,
const uint8_t nzc[15 * 8]);
+void ff_h264_h_lpf_luma_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_luma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_v_lpf_luma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_lpf_chroma_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_chroma_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_chroma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_v_lpf_chroma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_biweight_h264_pixels16_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int height,
+ int log2_denom, int weight_dst,
+ int weight_src, int offset_in);
+void ff_biweight_h264_pixels8_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int height,
+ int log2_denom, int weight_dst,
+ int weight_src, int offset);
+void ff_biweight_h264_pixels4_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int height,
+ int log2_denom, int weight_dst,
+ int weight_src, int offset);
+void ff_weight_h264_pixels16_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int height, int log2_denom,
+ int weight_src, int offset_in);
+void ff_weight_h264_pixels8_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int height, int log2_denom,
+ int weight_src, int offset);
+void ff_weight_h264_pixels4_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int height, int log2_denom,
+ int weight_src, int offset);
+void ff_h264_add_pixels4_8_lsx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_add_pixels8_8_lsx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_loop_filter_strength_lsx(int16_t bS[2][4][4], uint8_t nnz[40],
+ int8_t ref[2][40], int16_t mv[2][40][2],
+ int bidir, int edges, int step,
+ int mask_mv0, int mask_mv1, int field);
+
#if HAVE_LASX
void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
@@ -56,24 +100,12 @@ void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
int alpha, int beta);
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
int alpha, int beta);
-void ff_h264_h_lpf_chroma_8_lasx(uint8_t *src, ptrdiff_t stride,
- int alpha, int beta, int8_t *tc0);
-void ff_h264_v_lpf_chroma_8_lasx(uint8_t *src, ptrdiff_t stride,
- int alpha, int beta, int8_t *tc0);
-void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
- int alpha, int beta);
-void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
- int alpha, int beta);
-void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
+void ff_biweight_h264_pixels16_8_lasx(unsigned char *dst, unsigned char *src,
+ long int stride, int height,
int log2_denom, int weight_dst,
int weight_src, int offset_in);
-void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
- int log2_denom, int weight_dst,
- int weight_src, int offset);
-void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
+void ff_biweight_h264_pixels8_8_lasx(unsigned char *dst, unsigned char *src,
+ long int stride, int height,
int log2_denom, int weight_dst,
int weight_src, int offset);
void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
@@ -82,9 +114,6 @@ void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
int height, int log2_denom,
int weight_src, int offset);
-void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
- int height, int log2_denom,
- int weight_src, int offset);
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
--
2.20.1
More information about the ffmpeg-devel mailing list