[FFmpeg-devel] [PATCH 3/5] lavc/h264qpel: Add vectorized implementation of luma MC for RISC-V

Arnie Chang arnie.chang at sifive.com
Tue May 9 12:50:28 EEST 2023


Optimize luma motion compensation using RISC-V vector intrinsics.
Performance was evaluated on 720p videos: with both chroma and luma MC
vectorized, the FPS is 1.49x that of the scalar code, while vectorizing
only chroma MC gave a 1.13x speedup.
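
For reference, the kernel being vectorized is the standard H.264 six-tap
(1, -5, 20, 20, -5, 1) half-sample filter. A rough scalar sketch of one
horizontal output sample (names are illustrative, not part of the patch):

    static uint8_t lowpass6(const uint8_t *s)
    {
        int v = (s[-2] + s[3]) - 5 * (s[-1] + s[2]) + 20 * (s[0] + s[1]);
        v = (v + 16) >> 5;                        // round and normalize
        return v < 0 ? 0 : (v > 255 ? 255 : v);   // clip to 8 bits
    }

The RVV code below computes the same expression with widening adds and
vmacc, and lets the rounding narrowing shift (vnclipu) perform the
normalization and clipping.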

Signed-off-by: Arnie Chang <arnie.chang at sifive.com>
---
 libavcodec/h264qpel.c                   |    2 +
 libavcodec/h264qpel.h                   |    1 +
 libavcodec/riscv/Makefile               |    2 +
 libavcodec/riscv/h264_lowpass.h         |  249 +++++
 libavcodec/riscv/h264_mc_luma.c         |  412 ++++++++
 libavcodec/riscv/h264_mc_luma.h         |  101 ++
 libavcodec/riscv/h264_mc_luma_avg16.h   | 1183 +++++++++++++++++++++++
 libavcodec/riscv/h264_mc_luma_avg8.h    |  773 +++++++++++++++
 libavcodec/riscv/h264_mc_luma_put16.h   |  963 ++++++++++++++++++
 libavcodec/riscv/h264_mc_luma_put8.h    |  648 +++++++++++++
 libavcodec/riscv/h264_qpel_init_riscv.c |  107 ++
 libavcodec/riscv/h264_utility.h         |   75 ++
 12 files changed, 4516 insertions(+)
 create mode 100644 libavcodec/riscv/h264_lowpass.h
 create mode 100644 libavcodec/riscv/h264_mc_luma.c
 create mode 100644 libavcodec/riscv/h264_mc_luma.h
 create mode 100644 libavcodec/riscv/h264_mc_luma_avg16.h
 create mode 100644 libavcodec/riscv/h264_mc_luma_avg8.h
 create mode 100644 libavcodec/riscv/h264_mc_luma_put16.h
 create mode 100644 libavcodec/riscv/h264_mc_luma_put8.h
 create mode 100644 libavcodec/riscv/h264_qpel_init_riscv.c
 create mode 100644 libavcodec/riscv/h264_utility.h

diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index 65fef03304..4293fa2a7b 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -108,5 +108,7 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
     ff_h264qpel_init_mips(c, bit_depth);
 #elif ARCH_LOONGARCH64
     ff_h264qpel_init_loongarch(c, bit_depth);
+#elif ARCH_RISCV
+    ff_h264qpel_init_riscv(c, bit_depth);
 #endif
 }
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 0259e8de23..f8425ea116 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -37,5 +37,6 @@ void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth);
+void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264QPEL_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 08b76c93cb..088efa3b1e 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -22,3 +22,5 @@ RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
 
 OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
 RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
+OBJS-$(CONFIG_H264QPEL) += riscv/h264_qpel_init_riscv.o
+RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264_mc_luma.o
diff --git a/libavcodec/riscv/h264_lowpass.h b/libavcodec/riscv/h264_lowpass.h
new file mode 100644
index 0000000000..f416f7429f
--- /dev/null
+++ b/libavcodec/riscv/h264_lowpass.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_LOWPASS_H
+#define AVCODEC_RISCV_H264_LOWPASS_H
+#include <riscv_vector.h>
+
+
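+// Vertical 6-tap (1, -5, 20, 20, -5, 1) low-pass filter: takes seven adjacent
+// rows and produces two filtered rows. Negative intermediates are clamped to
+// zero and the rounding narrowing shift by 5 normalizes back to 8 bits.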
+__attribute__((always_inline)) static void v_lowpass_u8m1(vuint8m1_t *p_dst0, vuint8m1_t *p_dst1, vuint8m1_t row0, vuint8m1_t row1,
+                                                          vuint8m1_t row2, vuint8m1_t row3, vuint8m1_t row4, vuint8m1_t row5,
+                                                          vuint8m1_t row6, int vl)
+{
+    vuint16m2_t dst0 = __riscv_vwaddu_vv_u16m2(row0, row5, vl);
+    vuint16m2_t add00 = __riscv_vwaddu_vv_u16m2(row1, row4, vl);
+    vuint16m2_t add01 = __riscv_vwaddu_vv_u16m2(row2, row3, vl);
+
+    vuint16m2_t dst1 = __riscv_vwaddu_vv_u16m2(row1, row6, vl);
+    vuint16m2_t add10 = __riscv_vwaddu_vv_u16m2(row2, row5, vl);
+    vuint16m2_t add11 = __riscv_vwaddu_vv_u16m2(row3, row4, vl);
+
+    vint16m2_t dst0_s = __riscv_vreinterpret_v_u16m2_i16m2(dst0);
+    vint16m2_t dst1_s = __riscv_vreinterpret_v_u16m2_i16m2(dst1);
+
+    dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add01), vl);
+    dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add00), vl);
+    dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add11), vl);
+    dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add10), vl);
+
+    dst0_s = __riscv_vmax_vx_i16m2(dst0_s, 0, vl);
+    dst1_s = __riscv_vmax_vx_i16m2(dst1_s, 0, vl);
+
+    dst0 = __riscv_vreinterpret_v_i16m2_u16m2(dst0_s);
+    dst1 = __riscv_vreinterpret_v_i16m2_u16m2(dst1_s);
+
+    *p_dst0 = __riscv_vnclipu_wx_u8m1(dst0, 5, vl);
+    *p_dst1 = __riscv_vnclipu_wx_u8m1(dst1, 5, vl);
+}
+
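+// Same vertical 6-tap filter, applied to 16-bit horizontal intermediates and
+// accumulated in 32 bits; the final normalization to 8 bits is left to the
+// caller (u32_to_u8).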
+__attribute__((always_inline)) static void v_lowpass_u32m2(vuint32m2_t *p_dst0, vuint32m2_t *p_dst1, vint16m1_t *p_row0, vint16m1_t *p_row1,
+                                                           vint16m1_t *p_row2, vint16m1_t *p_row3, vint16m1_t *p_row4, vint16m1_t *p_row5,
+                                                           vint16m1_t *p_row6, ptrdiff_t stride, int vl)
+{
+    vint32m2_t dst0_s = __riscv_vwadd_vv_i32m2(*p_row0, *p_row5, vl);
+    vint32m2_t add00 = __riscv_vwadd_vv_i32m2(*p_row1, *p_row4, vl);
+    vint32m2_t add01 = __riscv_vwadd_vv_i32m2(*p_row2, *p_row3, vl);
+
+    vint32m2_t dst1_s = __riscv_vwadd_vv_i32m2(*p_row1, *p_row6, vl);
+    vint32m2_t add10 = __riscv_vwadd_vv_i32m2(*p_row2, *p_row5, vl);
+    vint32m2_t add11 = __riscv_vwadd_vv_i32m2(*p_row3, *p_row4, vl);
+
+    dst0_s = __riscv_vmacc_vx_i32m2(dst0_s, 20, add01, vl);
+    dst0_s = __riscv_vmacc_vx_i32m2(dst0_s, -5, add00, vl);
+    dst1_s = __riscv_vmacc_vx_i32m2(dst1_s, 20, add11, vl);
+    dst1_s = __riscv_vmacc_vx_i32m2(dst1_s, -5, add10, vl);
+
+    dst0_s = __riscv_vmax_vx_i32m2(dst0_s, 0, vl);
+    dst1_s = __riscv_vmax_vx_i32m2(dst1_s, 0, vl);
+
+    *p_dst0 = __riscv_vreinterpret_v_i32m2_u32m2(dst0_s);
+    *p_dst1 = __riscv_vreinterpret_v_i32m2_u32m2(dst1_s);
+}
+
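+// Horizontal 6-tap filter over two consecutive rows, keeping unnormalized
+// 16-bit intermediates for the later vertical pass of the centre (hv) case.
+// Advances *pp_src by two rows.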
+__attribute__((always_inline)) static void h_lowpass_i16m1(vint16m1_t *p_dst0, vint16m1_t *p_dst1, const uint8_t **pp_src, ptrdiff_t stride, int vl)
+{
+    vuint8mf2_t row00 = __riscv_vle8_v_u8mf2(*pp_src - 2, vl);
+    vuint8mf2_t row01 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 1, vl);
+    vuint8mf2_t row02 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 2, vl);
+    vuint8mf2_t row03 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 3, vl);
+    vuint8mf2_t row04 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 4, vl);
+    vuint8mf2_t row05 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 5, vl);
+    *pp_src += stride;
+
+    vuint8mf2_t row10 = __riscv_vle8_v_u8mf2(*pp_src - 2, vl);
+    vuint8mf2_t row11 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 1, vl);
+    vuint8mf2_t row12 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 2, vl);
+    vuint8mf2_t row13 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 3, vl);
+    vuint8mf2_t row14 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 4, vl);
+    vuint8mf2_t row15 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 5, vl);
+    *pp_src += stride;
+
+    vuint16m1_t dst0_u = __riscv_vwaddu_vv_u16m1(row00, row05, vl);
+    vuint16m1_t add00 = __riscv_vwaddu_vv_u16m1(row01, row04, vl);
+    vuint16m1_t add01 = __riscv_vwaddu_vv_u16m1(row02, row03, vl);
+
+    vuint16m1_t dst1_u = __riscv_vwaddu_vv_u16m1(row10, row15, vl);
+    vuint16m1_t add10 = __riscv_vwaddu_vv_u16m1(row11, row14, vl);
+    vuint16m1_t add11 = __riscv_vwaddu_vv_u16m1(row12, row13, vl);
+
+    *p_dst0 = __riscv_vreinterpret_v_u16m1_i16m1(dst0_u);
+    *p_dst1 = __riscv_vreinterpret_v_u16m1_i16m1(dst1_u);
+
+    *p_dst0 = __riscv_vmacc_vx_i16m1(*p_dst0, 20, __riscv_vreinterpret_v_u16m1_i16m1(add01), vl);
+    *p_dst0 = __riscv_vmacc_vx_i16m1(*p_dst0, -5, __riscv_vreinterpret_v_u16m1_i16m1(add00), vl);
+    *p_dst1 = __riscv_vmacc_vx_i16m1(*p_dst1, 20, __riscv_vreinterpret_v_u16m1_i16m1(add11), vl);
+    *p_dst1 = __riscv_vmacc_vx_i16m1(*p_dst1, -5, __riscv_vreinterpret_v_u16m1_i16m1(add10), vl);
+}
+
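+// Horizontal 6-tap filter over two consecutive rows; results are clamped to
+// zero and returned as 16-bit values, the caller applies the rounding shift.
+// Advances *pp_src by two rows.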
+__attribute__((always_inline)) static void h_lowpass_u16m2(vuint16m2_t *p_dst0, vuint16m2_t *p_dst1, const uint8_t **pp_src, ptrdiff_t stride, int vl)
+{
+    vuint8m1_t row00 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+    vuint8m1_t row01 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+    vuint8m1_t row02 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+    vuint8m1_t row03 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+    vuint8m1_t row04 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+    vuint8m1_t row05 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+    *pp_src += stride;
+
+    vuint8m1_t row10 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+    vuint8m1_t row11 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+    vuint8m1_t row12 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+    vuint8m1_t row13 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+    vuint8m1_t row14 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+    vuint8m1_t row15 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+    *pp_src += stride;
+
+    *p_dst0 = __riscv_vwaddu_vv_u16m2(row00, row05, vl);
+    vuint16m2_t add00 = __riscv_vwaddu_vv_u16m2(row01, row04, vl);
+    vuint16m2_t add01 = __riscv_vwaddu_vv_u16m2(row02, row03, vl);
+
+    *p_dst1 = __riscv_vwaddu_vv_u16m2(row10, row15, vl);
+    vuint16m2_t add10 = __riscv_vwaddu_vv_u16m2(row11, row14, vl);
+    vuint16m2_t add11 = __riscv_vwaddu_vv_u16m2(row12, row13, vl);
+
+    vint16m2_t dst0_s = __riscv_vreinterpret_v_u16m2_i16m2(*p_dst0);
+    vint16m2_t dst1_s = __riscv_vreinterpret_v_u16m2_i16m2(*p_dst1);
+
+    dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add01), vl);
+    dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add00), vl);
+    dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add11), vl);
+    dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add10), vl);
+
+    dst0_s = __riscv_vmax_vx_i16m2(dst0_s, 0, vl);
+    dst1_s = __riscv_vmax_vx_i16m2(dst1_s, 0, vl);
+
+    *p_dst0 = __riscv_vreinterpret_v_i16m2_u16m2(dst0_s);
+    *p_dst1 = __riscv_vreinterpret_v_i16m2_u16m2(dst1_s);
+}
+
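+// Horizontal 6-tap filter narrowed to 8 bits and then averaged with the
+// unfiltered centre sample (src[0]), i.e. the quarter-pel position between
+// the full-pel and the horizontal half-pel sample.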
+__attribute__((always_inline)) static void h_lowpass_u8m1_l2src(vuint8m1_t *p_dst0, vuint8m1_t *p_dst1, const uint8_t **pp_src, ptrdiff_t stride, int vl)
+{
+    vuint8m1_t row00 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+    vuint8m1_t row01 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+    vuint8m1_t row02 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+    vuint8m1_t row03 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+    vuint8m1_t row04 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+    vuint8m1_t row05 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+    *pp_src += stride;
+
+    vuint8m1_t row10 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+    vuint8m1_t row11 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+    vuint8m1_t row12 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+    vuint8m1_t row13 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+    vuint8m1_t row14 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+    vuint8m1_t row15 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+    *pp_src += stride;
+
+    vuint16m2_t dst0_u = __riscv_vwaddu_vv_u16m2(row00, row05, vl);
+    vuint16m2_t add00 = __riscv_vwaddu_vv_u16m2(row01, row04, vl);
+    vuint16m2_t add01 = __riscv_vwaddu_vv_u16m2(row02, row03, vl);
+
+    vuint16m2_t dst1_u = __riscv_vwaddu_vv_u16m2(row10, row15, vl);
+    vuint16m2_t add10 = __riscv_vwaddu_vv_u16m2(row11, row14, vl);
+    vuint16m2_t add11 = __riscv_vwaddu_vv_u16m2(row12, row13, vl);
+
+    vint16m2_t dst0_s = __riscv_vreinterpret_v_u16m2_i16m2(dst0_u);
+    vint16m2_t dst1_s = __riscv_vreinterpret_v_u16m2_i16m2(dst1_u);
+
+    dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add01), vl);
+    dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add00), vl);
+    dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add11), vl);
+    dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add10), vl);
+
+    dst0_s = __riscv_vmax_vx_i16m2(dst0_s, 0, vl);
+    dst1_s = __riscv_vmax_vx_i16m2(dst1_s, 0, vl);
+
+    dst0_u = __riscv_vreinterpret_v_i16m2_u16m2(dst0_s);
+    dst1_u = __riscv_vreinterpret_v_i16m2_u16m2(dst1_s);
+
+    *p_dst0 = __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl);
+    *p_dst1 = __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl);
+
+    *p_dst0 = __riscv_vaaddu_vv_u8m1(*p_dst0, row02, vl);
+    *p_dst1 = __riscv_vaaddu_vv_u8m1(*p_dst1, row12, vl);
+}
+
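+// As h_lowpass_u8m1_l2src, but averaged with src[1] (row03) instead of
+// src[0], giving the quarter-pel position on the other side of the half-pel
+// sample.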
+__attribute__((always_inline)) static void h_lowpass_u8m1_l2src_shift(vuint8m1_t *p_dst0, vuint8m1_t *p_dst1, const uint8_t **pp_src, ptrdiff_t stride, int vl)
+{
+    vuint8m1_t row00 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+    vuint8m1_t row01 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+    vuint8m1_t row02 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+    vuint8m1_t row03 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+    vuint8m1_t row04 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+    vuint8m1_t row05 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+    *pp_src += stride;
+
+    vuint8m1_t row10 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+    vuint8m1_t row11 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+    vuint8m1_t row12 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+    vuint8m1_t row13 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+    vuint8m1_t row14 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+    vuint8m1_t row15 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+    *pp_src += stride;
+
+    vuint16m2_t dst0_u = __riscv_vwaddu_vv_u16m2(row00, row05, vl);
+    vuint16m2_t add00 = __riscv_vwaddu_vv_u16m2(row01, row04, vl);
+    vuint16m2_t add01 = __riscv_vwaddu_vv_u16m2(row02, row03, vl);
+
+    vuint16m2_t dst1_u = __riscv_vwaddu_vv_u16m2(row10, row15, vl);
+    vuint16m2_t add10 = __riscv_vwaddu_vv_u16m2(row11, row14, vl);
+    vuint16m2_t add11 = __riscv_vwaddu_vv_u16m2(row12, row13, vl);
+
+    vint16m2_t dst0_s = __riscv_vreinterpret_v_u16m2_i16m2(dst0_u);
+    vint16m2_t dst1_s = __riscv_vreinterpret_v_u16m2_i16m2(dst1_u);
+
+    dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add01), vl);
+    dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add00), vl);
+    dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add11), vl);
+    dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add10), vl);
+
+    dst0_s = __riscv_vmax_vx_i16m2(dst0_s, 0, vl);
+    dst1_s = __riscv_vmax_vx_i16m2(dst1_s, 0, vl);
+
+    dst0_u = __riscv_vreinterpret_v_i16m2_u16m2(dst0_s);
+    dst1_u = __riscv_vreinterpret_v_i16m2_u16m2(dst1_s);
+
+    *p_dst0 = __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl);
+    *p_dst1 = __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl);
+
+    *p_dst0 = __riscv_vaaddu_vv_u8m1(*p_dst0, row03, vl);
+    *p_dst1 = __riscv_vaaddu_vv_u8m1(*p_dst1, row13, vl);
+}
+#endif /* AVCODEC_RISCV_H264_LOWPASS_H */
diff --git a/libavcodec/riscv/h264_mc_luma.c b/libavcodec/riscv/h264_mc_luma.c
new file mode 100644
index 0000000000..4047c0ff4e
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264_mc_luma.h"
+#if HAVE_INTRINSICS_RVV
+#include <riscv_vector.h>
+#include "h264_mc_luma_put16.h"
+#include "h264_mc_luma_avg16.h"
+#include "h264_mc_luma_put8.h"
+#include "h264_mc_luma_avg8.h"
+
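+// Each mcXY function implements one quarter-sample position of the 4x4 MC
+// grid: X is the horizontal quarter-pel offset (0..3) and Y the vertical one,
+// mirroring the generic C versions in h264qpel_template.c.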
+void put_h264_qpel16_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_copy_block16(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel16_v_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel16_v_lowpass(p_dst, p_src, stride, stride);
+}
+
+void put_h264_qpel16_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel16_v_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel16_h_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+    put_h264_qpel16_v_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_v_lowpass(temp, p_src, 16, stride);
+    put_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+    put_h264_qpel16_v_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel16_h_lowpass(p_dst, p_src, stride, stride);
+}
+
+void put_h264_qpel16_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+    put_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel16_hv_lowpass(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel16_h_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+    put_h264_qpel16_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_v_lowpass(temp, p_src + 1, 16, stride);
+    put_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+    put_h264_qpel16_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_copy_block16(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel16_v_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel16_v_lowpass(p_dst, p_src, stride, stride);
+}
+
+void avg_h264_qpel16_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel16_v_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel16_h_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+    avg_h264_qpel16_v_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_v_lowpass(temp, p_src, 16, stride);
+    avg_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+    avg_h264_qpel16_v_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel16_h_lowpass(p_dst, p_src, stride, stride);
+}
+
+void avg_h264_qpel16_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+    avg_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel16_hv_lowpass(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+    avg_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel16_h_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+    avg_h264_qpel16_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_v_lowpass(temp, p_src + 1, 16, stride);
+    avg_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[256] = {0};
+    put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+    avg_h264_qpel16_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 16);
+}
+
+void put_h264_qpel8_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_copy_block8(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel8_v_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel8_v_lowpass(p_dst, p_src, stride, stride);
+}
+
+void put_h264_qpel8_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel8_v_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+    put_h264_qpel8_v_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_v_lowpass(temp, p_src, 8, stride);
+    put_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass(p_dst, p_src, stride, stride);
+}
+
+void put_h264_qpel8_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+    put_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel8_hv_lowpass(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+    put_h264_qpel8_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_v_lowpass(temp, p_src + 1, 8, stride);
+    put_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_copy_block8(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel8_v_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel8_v_lowpass(p_dst, p_src, stride, stride);
+}
+
+void avg_h264_qpel8_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel8_v_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+    avg_h264_qpel8_v_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_v_lowpass(temp, p_src, 8, stride);
+    avg_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+    avg_h264_qpel8_v_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass(p_dst, p_src, stride, stride);
+}
+
+void avg_h264_qpel8_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+    avg_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel8_hv_lowpass(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+    avg_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+    avg_h264_qpel8_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_v_lowpass(temp, p_src + 1, 8, stride);
+    avg_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    uint8_t temp[64] = {0};
+    put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+    avg_h264_qpel8_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 8);
+}
+#endif
diff --git a/libavcodec/riscv/h264_mc_luma.h b/libavcodec/riscv/h264_mc_luma.h
new file mode 100644
index 0000000000..78d7c41a5f
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_H
+#define AVCODEC_RISCV_H264_MC_LUMA_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
+void put_h264_qpel16_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+
+void avg_h264_qpel16_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+
+void put_h264_qpel8_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+
+void avg_h264_qpel8_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+#endif /* HAVE_INTRINSICS_RVV */
+#endif /* AVCODEC_RISCV_H264_MC_LUMA_H */
diff --git a/libavcodec/riscv/h264_mc_luma_avg16.h b/libavcodec/riscv/h264_mc_luma_avg16.h
new file mode 100644
index 0000000000..7f2aacd00d
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma_avg16.h
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_AVG16_H
+#define AVCODEC_RISCV_H264_MC_LUMA_AVG16_H
+#include <riscv_vector.h>
+#include "h264_utility.h"
+#include "h264_lowpass.h"
+
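+// All block routines below are strip-mined: vsetvl returns how many of the
+// remaining columns fit in one vector (vl), the body processes that strip for
+// every row, and the outer loop restarts at the next column offset until the
+// full block width has been covered.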
+__attribute__((always_inline)) static void avg_copy_block16(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 16; j += 8)
+        {
+            vuint8m1_t src0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src7 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+            vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 2, vl);
+            vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 3, vl);
+            vuint8m1_t dst4 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 4, vl);
+            vuint8m1_t dst5 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 5, vl);
+            vuint8m1_t dst6 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 6, vl);
+            vuint8m1_t dst7 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 7, vl);
+
+            dst0 = __riscv_vaaddu_vv_u8m1(dst0, src0, vl);
+            dst1 = __riscv_vaaddu_vv_u8m1(dst1, src1, vl);
+            dst2 = __riscv_vaaddu_vv_u8m1(dst2, src2, vl);
+            dst3 = __riscv_vaaddu_vv_u8m1(dst3, src3, vl);
+            dst4 = __riscv_vaaddu_vv_u8m1(dst4, src4, vl);
+            dst5 = __riscv_vaaddu_vv_u8m1(dst5, src5, vl);
+            dst6 = __riscv_vaaddu_vv_u8m1(dst6, src6, vl);
+            dst7 = __riscv_vaaddu_vv_u8m1(dst7, src7, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst2, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst3, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst4, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst5, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst6, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst7, vl);
+            p_dst_iter += stride;
+        }
+
+        p_src_iter = p_src_begin + vl;
+        p_dst_iter = p_dst_begin + vl;
+        len -= vl;
+    }
+}
+
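+// Horizontal half-pel filter; the filtered rows are merged into the existing
+// destination with an averaging add (vaaddu), which is what distinguishes the
+// avg_ variants from the put_ ones.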
+__attribute__((always_inline)) static void avg_h264_qpel16_h_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 16; j += 2)
+        {
+            vuint16m2_t dst0_u, dst1_u;
+            h_lowpass_u16m2(&dst0_u, &dst1_u, &p_src_iter, src_stride, vl);
+
+            vuint8m1_t dst0_nrw = __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl);
+            vuint8m1_t dst1_nrw = __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl);
+
+            vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+
+            avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0_nrw, vl);
+            avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1_nrw, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += dst_stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += dst_stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
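+// Centre (half-pel, half-pel) position: the horizontal pass keeps 16-bit
+// intermediates (h_lowpass_i16m1), the vertical pass accumulates in 32 bits
+// (v_lowpass_u32m2), and u32_to_u8 narrows the result before it is averaged
+// into the destination. The h_row registers form a sliding window so each
+// source row is filtered only once.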
+__attribute__((always_inline)) static void avg_h264_qpel16_hv_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8mf2(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride << 1);
+
+        vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+        vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+        h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+        vuint32m2_t dst0, dst1;
+        v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+
+        vuint8mf2_t dst0_u8, dst1_u8;
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        vuint8mf2_t avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        vuint8mf2_t avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        vint16m1_t h_row14, h_row15, h_row16, h_row17, h_row18, h_row19, h_row20, h_row21;
+
+        h_lowpass_i16m1(&h_row14, &h_row15, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row16, &h_row17, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row18, &h_row19, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row20, &h_row21, &p_src_iter, stride, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, &h_row19, &h_row20, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
+__attribute__((always_inline)) static void avg_h264_qpel16_hv_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, uint8_t *p_l2_src, ptrdiff_t stride, ptrdiff_t l2_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    const uint8_t *p_l2_src_iter = p_l2_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8mf2(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        const uint8_t *p_l2_src_begin = p_l2_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride << 1);
+
+        vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+        vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+        h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+        vuint32m2_t dst0, dst1;
+        vuint8mf2_t dst0_u8, dst1_u8;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        vint16m1_t h_row14, h_row15, h_row16, h_row17, h_row18, h_row19, h_row20, h_row21;
+        h_lowpass_i16m1(&h_row14, &h_row15, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row16, &h_row17, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row18, &h_row19, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row20, &h_row21, &p_src_iter, stride, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, &h_row19, &h_row20, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        p_l2_src_iter = p_l2_src_begin + vl;
+        len -= vl;
+    }
+}
+
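+// Vertical half-pel filter, averaged into the destination; the seven-row
+// window is rotated through the row0..row6 registers so every source row is
+// loaded only once.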
+__attribute__((always_inline)) static void avg_h264_qpel16_v_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (src_stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 9th, 10th dst
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 11th, 12th dst
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 13th, 14th dst
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 15th, 16th dst
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
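+// As avg_h264_qpel16_v_lowpass, but each pair of filtered rows is first
+// rounding-averaged with the second prediction read from p_l2_src.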
+__attribute__((always_inline)) static void avg_h264_qpel16_v_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, const uint8_t *p_l2_src, int stride, int l2_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    const uint8_t *p_l2_src_iter = p_l2_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        const uint8_t *p_l2_src_begin = p_l2_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        vuint8m1_t l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 9th, 10th dst
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 11th, 12th dst
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 13th, 14th dst
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 15th, 16th dst
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        p_l2_src_iter = p_l2_src_begin + vl;
+        len -= vl;
+    }
+}
+
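+// Vertical lowpass averaged with the unfiltered source row at the same
+// position (row2/row3 of the filter window), then averaged into the
+// destination; no separate l2 buffer is needed.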
+__attribute__((always_inline)) static void avg_h264_qpel16_v_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+        vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 9th, 10th dst
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 11th, 12th dst
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 13th, 14th dst
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 15th, 16th dst
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
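+// 16-wide horizontal lowpass via h_lowpass_u8m1_l2src, with the result
+// averaged into the existing destination two rows at a time.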
+__attribute__((always_inline)) static void avg_h264_qpel16_h_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 16; j += 2)
+        {
+            vuint8m1_t dst0, dst1;
+            h_lowpass_u8m1_l2src(&dst0, &dst1, &p_src_iter, stride, vl);
+
+            vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+            avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+            avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
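+// Variant of the function above built on h_lowpass_u8m1_l2src_shift.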
+__attribute__((always_inline)) static void avg_h264_qpel16_h_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 16; j += 2)
+        {
+            vuint8m1_t dst0, dst1;
+            h_lowpass_u8m1_l2src_shift(&dst0, &dst1, &p_src_iter, stride, vl);
+
+            vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+            avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+            avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
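+// Vertical lowpass averaged with the next source row (row3/row4 of the
+// filter window instead of row2/row3), then averaged into the destination.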
+__attribute__((always_inline)) static void avg_h264_qpel16_v_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+        vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 9th, 10th dst
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 11th, 12th dst
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 13th, 14th dst
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 15th, 16th dst
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+#endif
diff --git a/libavcodec/riscv/h264_mc_luma_avg8.h b/libavcodec/riscv/h264_mc_luma_avg8.h
new file mode 100644
index 0000000000..789bc90c44
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma_avg8.h
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_AVG8_H
+#define AVCODEC_RISCV_H264_MC_LUMA_AVG8_H
+#include <riscv_vector.h>
+#include "h264_utility.h"
+#include "h264_lowpass.h"
+
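+// Rounding-average an 8x8 source block into the destination.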
+__attribute__((always_inline)) static void avg_copy_block8(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        vuint8m1_t src0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t src1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t src2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t src3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t src4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t src5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t src6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t src7 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 2, vl);
+        vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 3, vl);
+        vuint8m1_t dst4 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 4, vl);
+        vuint8m1_t dst5 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 5, vl);
+        vuint8m1_t dst6 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 6, vl);
+        vuint8m1_t dst7 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 7, vl);
+
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, src0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, src1, vl);
+        dst2 = __riscv_vaaddu_vv_u8m1(dst2, src2, vl);
+        dst3 = __riscv_vaaddu_vv_u8m1(dst3, src3, vl);
+        dst4 = __riscv_vaaddu_vv_u8m1(dst4, src4, vl);
+        dst5 = __riscv_vaaddu_vv_u8m1(dst5, src5, vl);
+        dst6 = __riscv_vaaddu_vv_u8m1(dst6, src6, vl);
+        dst7 = __riscv_vaaddu_vv_u8m1(dst7, src7, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst2, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst3, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst4, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst5, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst6, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst7, vl);
+        p_dst_iter += stride;
+
+        p_src_iter = p_src_begin + vl;
+        p_dst_iter = p_dst_begin + vl;
+        len -= vl;
+    }
+}
+
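+// 8-wide horizontal 6-tap lowpass: 16-bit intermediates from h_lowpass_u16m2
+// are narrowed with vnclipu (shift by 5) and averaged into the destination.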
+__attribute__((always_inline)) static void avg_h264_qpel8_h_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 8; j += 2)
+        {
+            vuint16m2_t dst0_u, dst1_u;
+            h_lowpass_u16m2(&dst0_u, &dst1_u, &p_src_iter, src_stride, vl);
+
+            vuint8m1_t dst0_nrw = __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl);
+            vuint8m1_t dst1_nrw = __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl);
+
+            vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+
+            avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0_nrw, vl);
+            avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1_nrw, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += dst_stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += dst_stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
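+// 8-wide 2D lowpass: horizontal filtering into 16-bit rows, vertical
+// filtering with 32-bit accumulation, then narrowing to 8 bits and
+// averaging into the destination.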
+__attribute__((always_inline)) static void avg_h264_qpel8_hv_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8mf2(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride << 1);
+
+        vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+        vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+        h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+        vuint32m2_t dst0, dst1;
+        v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+
+        vuint8mf2_t dst0_u8, dst1_u8;
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        vuint8mf2_t avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        vuint8mf2_t avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+        avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
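+// As avg_h264_qpel8_hv_lowpass, but the narrowed result is additionally
+// combined with the l2 source through avg_average_l2.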
+__attribute__((always_inline)) static void avg_h264_qpel8_hv_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, uint8_t *p_l2_src, ptrdiff_t stride, ptrdiff_t l2_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8mf2(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_l2_src_begin = p_l2_src;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride << 1);
+
+        vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+        vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+        h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+        vuint32m2_t dst0, dst1;
+        vuint8mf2_t dst0_u8, dst1_u8;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        avg_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        p_l2_src = p_l2_src_begin + vl;
+        len -= vl;
+    }
+}
+
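+// 8-row variant of avg_h264_qpel16_v_lowpass.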
+__attribute__((always_inline)) static void avg_h264_qpel8_v_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (src_stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += dst_stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
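+// 8-row variant of avg_h264_qpel16_v_lowpass_l2.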
+__attribute__((always_inline)) static void avg_h264_qpel8_v_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, const uint8_t *p_l2_src, int stride, int l2_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+        p_l2_src += l2_stride;
+        vuint8m1_t l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+        p_l2_src += l2_stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+        p_l2_src += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+        p_l2_src += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+        p_l2_src += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+        p_l2_src += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+        p_l2_src += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+        p_l2_src += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
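+// 8-row variant of avg_h264_qpel16_v_lowpass_l2src.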
+__attribute__((always_inline)) static void avg_h264_qpel8_v_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+        vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
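+// 8-row variant of avg_h264_qpel16_h_lowpass_l2src.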
+__attribute__((always_inline)) static void avg_h264_qpel8_h_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 8; j += 2)
+        {
+            vuint8m1_t dst0, dst1;
+            h_lowpass_u8m1_l2src(&dst0, &dst1, &p_src_iter, stride, vl);
+
+            vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+            avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+            avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
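+// 8-row variant of avg_h264_qpel16_h_lowpass_l2src_shift.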
+__attribute__((always_inline)) static void avg_h264_qpel8_h_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 8; j += 2)
+        {
+            vuint8m1_t dst0, dst1;
+            h_lowpass_u8m1_l2src_shift(&dst0, &dst1, &p_src_iter, stride, vl);
+
+            vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+            avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+            avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
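+// 8-row variant of avg_h264_qpel16_v_lowpass_l2src_shift.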
+__attribute__((always_inline)) static void avg_h264_qpel8_v_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+        vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+        avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+        avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+        avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+        avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_luma_put16.h b/libavcodec/riscv/h264_mc_luma_put16.h
new file mode 100644
index 0000000000..5a03507b0a
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma_put16.h
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_PUT16_H
+#define AVCODEC_RISCV_H264_MC_LUMA_PUT16_H
+#include <riscv_vector.h>
+#include "h264_lowpass.h"
+#include "h264_utility.h"
+
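+// straight 16x16 block copy, eight rows per inner iteration; the outer loop
+// strip-mines the 16-pixel width in case the active vl is smaller than 16.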
+__attribute__((always_inline)) static void put_copy_block16(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 16; j += 8)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t row7 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, row0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, row1, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, row2, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, row3, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, row4, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, row5, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, row6, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, row7, vl);
+            p_dst_iter += stride;
+        }
+
+        p_src_iter = p_src_begin + vl;
+        p_dst_iter = p_dst_begin + vl;
+        len -= vl;
+    }
+}
+
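+// 16-wide horizontal 6-tap lowpass: h_lowpass_u16m2 keeps the filter sums in
+// 16 bits and vnclipu_wx(.., 5, ..) narrows them back to pixels with a
+// saturating, rounding shift by 5.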
+__attribute__((always_inline)) static void put_h264_qpel16_h_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 16; j += 2)
+        {
+            vuint16m2_t dst0_u, dst1_u;
+            h_lowpass_u16m2(&dst0_u, &dst1_u, &p_src_iter, src_stride, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl), vl);
+            p_dst_iter += dst_stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl), vl);
+            p_dst_iter += dst_stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
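+// centre (hv) position, 16-wide: the horizontal pass keeps 16-bit intermediates
+// (h_lowpass_i16m1), the vertical pass widens to 32 bits, and u32_to_u8 narrows
+// the final sums back to pixels; e8mf2 keeps the widened data within m1/m2
+// register groups.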
+__attribute__((always_inline)) static void put_h264_qpel16_hv_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while(len > 0)
+    {
+        int vl = __riscv_vsetvl_e8mf2(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride << 1);
+
+        vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+        vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+        h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+        vuint32m2_t dst0, dst1;
+        v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+
+        vuint8mf2_t dst0_u8, dst1_u8;
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        vint16m1_t h_row14, h_row15, h_row16, h_row17, h_row18, h_row19, h_row20, h_row21;
+
+        h_lowpass_i16m1(&h_row14, &h_row15, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row16, &h_row17, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row18, &h_row19, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row20, &h_row21, &p_src_iter, stride, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, &h_row19, &h_row20, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
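+// hv lowpass combined with a second prediction: each put_average_l2 call blends
+// the narrowed filter output with two rows read from p_l2_src and writes two
+// destination rows, advancing both iterators.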
+__attribute__((always_inline)) static void put_h264_qpel16_hv_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, uint8_t *p_l2_src, ptrdiff_t stride, ptrdiff_t l2_stride)
+{
+    const uint8_t *p_src_iter = p_src - (stride << 1);
+    const uint8_t *p_l2_src_iter = p_l2_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8mf2(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+        const uint8_t *p_l2_src_begin = p_l2_src_iter;
+
+        vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+        vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+        h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+        vuint32m2_t dst0, dst1;
+        vuint8mf2_t dst0_u8, dst1_u8;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        vint16m1_t h_row14, h_row15, h_row16, h_row17, h_row18, h_row19, h_row20, h_row21;
+        h_lowpass_i16m1(&h_row14, &h_row15, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row16, &h_row17, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row18, &h_row19, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row20, &h_row21, &p_src_iter, stride, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, &h_row19, &h_row20, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        p_l2_src_iter = p_l2_src_begin + vl;
+        len -= vl;
+    }
+}
+
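+// 16-wide vertical 6-tap lowpass: seven rows are kept live in row0..row6 and
+// rotated, so only two new rows are loaded per pair of output rows.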
+__attribute__((always_inline)) static void put_h264_qpel16_v_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (src_stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 9th, 10th dst
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 11th, 12th dst
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 13th, 14th dst
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 15th, 16th dst
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
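+// vertical lowpass plus a rounding average with a second prediction: two rows
+// from p_l2_src are loaded and vaaddu'd into each output pair before storing.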
+__attribute__((always_inline)) static void put_h264_qpel16_v_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, const uint8_t *p_l2_src, int stride, int l2_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    const uint8_t *p_l2_src_iter = p_l2_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        const uint8_t *p_l2_src_begin = p_l2_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        vuint8m1_t l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 9th, 10th dst
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 11th, 12th dst
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 13th, 14th dst
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 15th, 16th dst
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        p_l2_src_iter = p_l2_src_begin + vl;
+        len -= vl;
+    }
+}
+
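+// vertical lowpass averaged with the source itself: each output pair is
+// averaged with the source rows aligned with those outputs (row2/row3 for the
+// first pair, given the initial -2*stride offset).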
+__attribute__((always_inline)) static void put_h264_qpel16_v_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 9th, 10th dst
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 11th, 12th dst
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 13th, 14th dst
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 15th, 16th dst
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel16_h_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 16; j += 2)
+        {
+            vuint8m1_t dst0, dst1;
+            h_lowpass_u8m1_l2src(&dst0, &dst1, &p_src_iter, stride, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+            p_dst_iter += stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel16_h_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 16; j += 2)
+        {
+            vuint8m1_t dst0, dst1;
+            h_lowpass_u8m1_l2src_shift(&dst0, &dst1, &p_src_iter, stride, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+            p_dst_iter += stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
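+// same as the l2src variant above, except the average uses the source row one
+// line below each output row (row3/row4 for the first pair).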
+__attribute__((always_inline)) static void put_h264_qpel16_v_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 16;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 9th, 10th dst
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 11th, 12th dst
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 13th, 14th dst
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 15th, 16th dst
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+#endif /* AVCODEC_RISCV_H264_MC_LUMA_PUT16_H */
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_luma_put8.h b/libavcodec/riscv/h264_mc_luma_put8.h
new file mode 100644
index 0000000000..d1cfb90f80
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma_put8.h
@@ -0,0 +1,648 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_PUT8_H
+#define AVCODEC_RISCV_H264_MC_LUMA_PUT8_H
+#include <riscv_vector.h>
+#include "h264_lowpass.h"
+#include "h264_utility.h"
+
+__attribute__((always_inline)) static void put_copy_block8(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row7 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        __riscv_vse8_v_u8m1(p_dst_iter, row0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, row1, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, row2, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, row3, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, row4, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, row5, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, row6, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, row7, vl);
+        p_dst_iter += stride;
+
+        p_src_iter = p_src_begin + vl;
+        p_dst_iter = p_dst_begin + vl;
+        len -= vl;
+    }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel8_h_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 8; j += 2)
+        {
+            vuint16m2_t dst0_u, dst1_u;
+            h_lowpass_u16m2(&dst0_u, &dst1_u, &p_src_iter, src_stride, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl), vl);
+            p_dst_iter += dst_stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl), vl);
+            p_dst_iter += dst_stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
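+// 8-wide counterpart of put_h264_qpel16_hv_lowpass: same widening scheme
+// (e8mf2 elements, 16-bit horizontal sums, 32-bit vertical sums), with four
+// output row pairs per strip instead of eight.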
+__attribute__((always_inline)) static void put_h264_qpel8_hv_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8mf2(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride << 1);
+
+        vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+        vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+        h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+        vuint32m2_t dst0, dst1;
+        v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+
+        vuint8mf2_t dst0_u8, dst1_u8;
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+        p_dst_iter += stride;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel8_hv_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, uint8_t *p_l2_src, ptrdiff_t stride, ptrdiff_t l2_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_l2_src_iter = p_l2_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8mf2(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_l2_src_begin = p_l2_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride << 1);
+
+        vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+        vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+        h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+        h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+        vuint32m2_t dst0, dst1;
+        vuint8mf2_t dst0_u8, dst1_u8;
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+        u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+        put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        p_l2_src_iter = p_l2_src_begin + vl;
+        len -= vl;
+    }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel8_v_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (src_stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += dst_stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += src_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += dst_stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
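+// 8-wide vertical lowpass blended with a second prediction read from p_l2_src
+// (two rows per output pair); the l2 pointer is strip-mined together with src
+// and dst.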
+__attribute__((always_inline)) static void put_h264_qpel8_v_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, const uint8_t *p_l2_src, int stride, int l2_stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    const uint8_t *p_l2_src_iter = p_l2_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        const uint8_t *p_l2_src_begin = p_l2_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        vuint8m1_t l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+        l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+        p_l2_src_iter += l2_stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        p_l2_src = p_l2_src_begin + vl; // advance the L2 source to the next column chunk as well
+        len -= vl;
+    }
+}
+
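+// As put_h264_qpel8_v_lowpass_l2, but the second prediction is the source
+// itself: each filtered row is averaged with the co-located full-pel source
+// row (rows 2 and 3 of the first seven-row window), giving the vertical
+// quarter-pel sample between that row and the half-pel result.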
+__attribute__((always_inline)) static void put_h264_qpel8_v_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
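+// Horizontal counterpart of put_h264_qpel8_v_lowpass_l2src:
+// h_lowpass_u8m1_l2src filters and averages two rows per call,
+// so four calls cover the 8 output rows.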
+__attribute__((always_inline)) static void put_h264_qpel8_h_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 8; j += 2)
+        {
+            vuint8m1_t dst0, dst1;
+            h_lowpass_u8m1_l2src(&dst0, &dst1, &p_src_iter, stride, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+            p_dst_iter += stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
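+// Same as put_h264_qpel8_h_lowpass_l2src but, by analogy with the vertical
+// _shift variant below, h_lowpass_u8m1_l2src_shift averages against the
+// source shifted by one sample, i.e. the other horizontal quarter-pel
+// position.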
+__attribute__((always_inline)) static void put_h264_qpel8_h_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < 8; j += 2)
+        {
+            vuint8m1_t dst0, dst1;
+            h_lowpass_u8m1_l2src_shift(&dst0, &dst1, &p_src_iter, stride, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+            p_dst_iter += stride;
+        }
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+
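+// As put_h264_qpel8_v_lowpass_l2src, but the average is taken with the source
+// row one line further down (rows 3 and 4 of the first seven-row window),
+// giving the other vertical quarter-pel position.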
+__attribute__((always_inline)) static void put_h264_qpel8_v_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+    const uint8_t *p_src_iter = p_src;
+    uint8_t *p_dst_iter = p_dst;
+    int len = 8;
+
+    while (len > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(len);
+        const uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        p_src_iter -= (stride * 2);
+
+        vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        vuint8m1_t dst0, dst1;
+        v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 3rd, 4th dst
+        row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 5th, 6th dst
+        row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        // 7th, 8th dst
+        row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+        row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+        p_src_iter += stride;
+
+        v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+        dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+        dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+        __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+        p_dst_iter += stride;
+        __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+        p_dst_iter += stride;
+
+        p_dst_iter = p_dst_begin + vl;
+        p_src_iter = p_src_begin + vl;
+        len -= vl;
+    }
+}
+#endif
diff --git a/libavcodec/riscv/h264_qpel_init_riscv.c b/libavcodec/riscv/h264_qpel_init_riscv.c
new file mode 100644
index 0000000000..582a4a64dd
--- /dev/null
+++ b/libavcodec/riscv/h264_qpel_init_riscv.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/h264qpel.h"
+#include "h264_mc_luma.h"
+
+av_cold void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth)
+{
+#if HAVE_INTRINSICS_RVV
+    const int high_bit_depth = bit_depth > 8;
+    int cpu_flags = av_get_cpu_flags();
+
+    // the functions below operate on 8-bit element vectors, so Zve32x
+    // (reported as AV_CPU_FLAG_RVV_I32) is the minimum requirement
+    if (!high_bit_depth && (cpu_flags & AV_CPU_FLAG_RVV_I32))
+    {
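+        // pixels_tab[0] covers 16x16 blocks, pixels_tab[1] 8x8 blocks;
+        // the second index is the quarter-pel position x + 4 * y of mc{x}{y}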
+        c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][1] = put_h264_qpel16_mc10_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][2] = put_h264_qpel16_mc20_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][3] = put_h264_qpel16_mc30_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][4] = put_h264_qpel16_mc01_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][5] = put_h264_qpel16_mc11_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][6] = put_h264_qpel16_mc21_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][7] = put_h264_qpel16_mc31_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][8] = put_h264_qpel16_mc02_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][9] = put_h264_qpel16_mc12_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][10] = put_h264_qpel16_mc22_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][11] = put_h264_qpel16_mc32_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][12] = put_h264_qpel16_mc03_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][13] = put_h264_qpel16_mc13_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][14] = put_h264_qpel16_mc23_8_rvv;
+        c->put_h264_qpel_pixels_tab[0][15] = put_h264_qpel16_mc33_8_rvv;
+
+        c->put_h264_qpel_pixels_tab[1][0] = put_h264_qpel8_mc00_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][1] = put_h264_qpel8_mc10_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][2] = put_h264_qpel8_mc20_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][3] = put_h264_qpel8_mc30_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][4] = put_h264_qpel8_mc01_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][5] = put_h264_qpel8_mc11_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][6] = put_h264_qpel8_mc21_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][7] = put_h264_qpel8_mc31_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][8] = put_h264_qpel8_mc02_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][9] = put_h264_qpel8_mc12_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][10] = put_h264_qpel8_mc22_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][11] = put_h264_qpel8_mc32_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][12] = put_h264_qpel8_mc03_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][13] = put_h264_qpel8_mc13_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][14] = put_h264_qpel8_mc23_8_rvv;
+        c->put_h264_qpel_pixels_tab[1][15] = put_h264_qpel8_mc33_8_rvv;
+
+        c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][1] = avg_h264_qpel16_mc10_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][2] = avg_h264_qpel16_mc20_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][3] = avg_h264_qpel16_mc30_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][4] = avg_h264_qpel16_mc01_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][5] = avg_h264_qpel16_mc11_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][6] = avg_h264_qpel16_mc21_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][7] = avg_h264_qpel16_mc31_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][8] = avg_h264_qpel16_mc02_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][9] = avg_h264_qpel16_mc12_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][10] = avg_h264_qpel16_mc22_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][11] = avg_h264_qpel16_mc32_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][12] = avg_h264_qpel16_mc03_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][13] = avg_h264_qpel16_mc13_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][14] = avg_h264_qpel16_mc23_8_rvv;
+        c->avg_h264_qpel_pixels_tab[0][15] = avg_h264_qpel16_mc33_8_rvv;
+
+        c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][1] = avg_h264_qpel8_mc10_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][2] = avg_h264_qpel8_mc20_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][3] = avg_h264_qpel8_mc30_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][4] = avg_h264_qpel8_mc01_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][5] = avg_h264_qpel8_mc11_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][6] = avg_h264_qpel8_mc21_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][7] = avg_h264_qpel8_mc31_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][8] = avg_h264_qpel8_mc02_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][9] = avg_h264_qpel8_mc12_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][10] = avg_h264_qpel8_mc22_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][11] = avg_h264_qpel8_mc32_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][12] = avg_h264_qpel8_mc03_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][13] = avg_h264_qpel8_mc13_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][14] = avg_h264_qpel8_mc23_8_rvv;
+        c->avg_h264_qpel_pixels_tab[1][15] = avg_h264_qpel8_mc33_8_rvv;
+    }
+#endif
+}
diff --git a/libavcodec/riscv/h264_utility.h b/libavcodec/riscv/h264_utility.h
new file mode 100644
index 0000000000..31029a44ae
--- /dev/null
+++ b/libavcodec/riscv/h264_utility.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_UTILITY_H
+#define AVCODEC_RISCV_H264_UTILITY_H
+#include <riscv_vector.h>
+
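+// Narrow two 32-bit accumulators from the combined horizontal + vertical
+// 6-tap pass down to 8-bit pixels: the first vnclipu performs a saturating
+// right shift by 10 (the normalisation of both filter passes) into 16 bits,
+// the second clips the result to 8 bits.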
+__attribute__((always_inline)) static void u32_to_u8(vuint8mf2_t *p_u8_1, vuint8mf2_t *p_u8_2,
+                                                     vuint32m2_t i32_1, vuint32m2_t i32_2, int vl)
+{
+    vuint16m1_t u16_1 = __riscv_vnclipu_wx_u16m1(i32_1, 10, vl);
+    vuint16m1_t u16_2 = __riscv_vnclipu_wx_u16m1(i32_2, 10, vl);
+    *p_u8_1 = __riscv_vnclipu_wx_u8mf2(u16_1, 0, vl);
+    *p_u8_2 = __riscv_vnclipu_wx_u8mf2(u16_2, 0, vl);
+}
+
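+// Load two rows of the second prediction, average them with the two filtered
+// rows using a rounded halving add, and store the results to the destination.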
+__attribute__((always_inline)) static void put_average_l2(uint8_t **pp_dst, uint8_t **pp_l2_src,
+                                                          ptrdiff_t dst_stride, ptrdiff_t l2_stride,
+                                                          vuint8mf2_t src_row0, vuint8mf2_t src_row1, int vl)
+{
+    vuint8mf2_t l2_row0 = __riscv_vle8_v_u8mf2(*pp_l2_src, vl);
+    *pp_l2_src += l2_stride;
+    vuint8mf2_t l2_row1 = __riscv_vle8_v_u8mf2(*pp_l2_src, vl);
+    *pp_l2_src += l2_stride;
+
+    src_row0 = __riscv_vaaddu_vv_u8mf2(src_row0, l2_row0, vl);
+    src_row1 = __riscv_vaaddu_vv_u8mf2(src_row1, l2_row1, vl);
+
+    __riscv_vse8_v_u8mf2(*pp_dst, src_row0, vl);
+    *pp_dst += dst_stride;
+    __riscv_vse8_v_u8mf2(*pp_dst, src_row1, vl);
+    *pp_dst += dst_stride;
+}
+
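+// Like put_average_l2, but additionally averages with the pixels already in
+// the destination, as required by the avg_* motion compensation variants.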
+__attribute__((always_inline)) static void avg_average_l2(uint8_t **pp_dst, uint8_t **pp_l2_src,
+                                                          ptrdiff_t dst_stride, ptrdiff_t l2_stride,
+                                                          vuint8mf2_t src_row0, vuint8mf2_t src_row1, int vl)
+{
+    vuint8mf2_t l2_row0 = __riscv_vle8_v_u8mf2(*pp_l2_src, vl);
+    *pp_l2_src += l2_stride;
+    vuint8mf2_t l2_row1 = __riscv_vle8_v_u8mf2(*pp_l2_src, vl);
+    *pp_l2_src += l2_stride;
+
+    vuint8mf2_t dst0 = __riscv_vle8_v_u8mf2(*pp_dst, vl);
+    vuint8mf2_t dst1 = __riscv_vle8_v_u8mf2(*pp_dst + dst_stride, vl);
+
+    src_row0 = __riscv_vaaddu_vv_u8mf2(src_row0, l2_row0, vl);
+    src_row1 = __riscv_vaaddu_vv_u8mf2(src_row1, l2_row1, vl);
+
+    src_row0 = __riscv_vaaddu_vv_u8mf2(src_row0, dst0, vl);
+    src_row1 = __riscv_vaaddu_vv_u8mf2(src_row1, dst1, vl);
+
+    __riscv_vse8_v_u8mf2(*pp_dst, src_row0, vl);
+    *pp_dst += dst_stride;
+    __riscv_vse8_v_u8mf2(*pp_dst, src_row1, vl);
+    *pp_dst += dst_stride;
+}
+#endif
-- 
2.17.1


