[FFmpeg-devel] [PATCH 5/5] lavc/h264pred: Add vectorized implementation of intra prediction for RISC-V
Arnie Chang
arnie.chang at sifive.com
Tue May 9 12:50:30 EEST 2023
Optimize intra prediction using RISC-V vector intrinsics.
Although intra prediction is not a computational hotspot in the decoder,
vectorizing it still improves FPS by a further 1%, as measured on 720p video.
Signed-off-by: Arnie Chang <arnie.chang at sifive.com>
---
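Note for reviewers (not part of the patch): every new function below follows the
same RVV strip-mining pattern -- set vl from the remaining width, process one
chunk, advance by vl. The sketch here is only an illustration of that pattern;
the function and parameter names are made up, it assumes a compiler that defines
__riscv_v_intrinsic, and it is not FFmpeg API.

#include <stddef.h>
#include <stdint.h>

#if defined(__riscv_v_intrinsic)
#include <riscv_vector.h>

/* Illustrative sketch only. Vertical prediction: replicate the row above the
 * block into every row, handling 'width' pixels in vector-length-sized chunks
 * so the same code works for any VLEN. */
static void vert_pred_sketch(uint8_t *src, ptrdiff_t stride,
                             int width, int height)
{
    while (width > 0) {
        int vl = __riscv_vsetvl_e8m1(width);            /* elements this pass */
        vuint8m1_t top = __riscv_vle8_v_u8m1(src - stride, vl);
        for (int y = 0; y < height; y++)                /* replicate downwards */
            __riscv_vse8_v_u8m1(src + y * stride, top, vl);
        src   += vl;                                    /* next chunk of columns */
        width -= vl;
    }
}
#endif

The functions in the patch unroll the per-row loop by hand instead of using a
for loop, but the vsetvl / advance-by-vl structure is the same.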
libavcodec/h264pred.c | 2 +
libavcodec/h264pred.h | 3 +-
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/h264_pred.c | 884 ++++++++++++++++++++++++
libavcodec/riscv/h264_pred.h | 53 ++
libavcodec/riscv/h264_pred_init_riscv.c | 67 ++
6 files changed, 1010 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/h264_pred.c
create mode 100644 libavcodec/riscv/h264_pred.h
create mode 100644 libavcodec/riscv/h264_pred_init_riscv.c
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index 25f9995a0b..f4ad02c326 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -598,5 +598,7 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
ff_h264_pred_init_mips(h, codec_id, bit_depth, chroma_format_idc);
#elif ARCH_LOONGARCH
ff_h264_pred_init_loongarch(h, codec_id, bit_depth, chroma_format_idc);
+#elif ARCH_RISCV
+ ff_h264_pred_init_riscv(h, codec_id, bit_depth, chroma_format_idc);
#endif
}
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index cb008548fc..8ac5088b34 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -126,5 +126,6 @@ void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
const int bit_depth, const int chroma_format_idc);
void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
const int bit_depth, const int chroma_format_idc);
-
+void ff_h264_pred_init_riscv(H264PredContext *h, int codec_id,
+ const int bit_depth, const int chroma_format_idc);
#endif /* AVCODEC_H264PRED_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 4d54bf35e9..9f7d7289f3 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -28,3 +28,5 @@ OBJS-$(CONFIG_H264DSP) += riscv/h264_dsp_init_riscv.o
RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_weighted_sum.o
RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_inloop.o
RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_idct.o
+OBJS-$(CONFIG_H264PRED) += riscv/h264_pred_init_riscv.o
+RVV-OBJS-$(CONFIG_H264PRED) += riscv/h264_pred.o
diff --git a/libavcodec/riscv/h264_pred.c b/libavcodec/riscv/h264_pred.c
new file mode 100644
index 0000000000..0ba114d906
--- /dev/null
+++ b/libavcodec/riscv/h264_pred.c
@@ -0,0 +1,884 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264_pred.h"
+
+#if HAVE_INTRINSICS_RVV
+#include <riscv_vector.h>
+void pred8x8_vert_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ int width = 8;
+
+ while (width > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(width);
+ uint8_t *p_src_iter_next = p_src_iter + vl;
+
+ vuint8m1_t top = __riscv_vle8_v_u8m1(p_src_iter - stride, vl);
+
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+
+ width -= vl;
+ p_src_iter = p_src_iter_next;
+ }
+}
+
+void pred8x8_hor_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ int width = 8;
+
+ while (width > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(width);
+ vuint8m1_t left = __riscv_vlse8_v_u8m1(p_src_iter - 1, stride, vl);
+
+ __riscv_vssseg8e8_v_u8m1(p_src_iter, stride, left, left, left, left, left, left, left, left, vl);
+
+ width -= vl;
+ p_src_iter = p_src_iter + vl * stride;
+ }
+}
+
+void pred8x8_plane_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ int vl = __riscv_vsetvl_e8mf2(4);
+
+ const uint8_t index_data[] = {3, 2, 1, 0};
+ const int16_t weight1_data[] = {1, 2, 3, 4};
+ const int16_t weight2_data[] = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ vuint8mf2_t index = __riscv_vle8_v_u8mf2(index_data, vl);
+
+ vuint8mf2_t h_half2 = __riscv_vle8_v_u8mf2(p_src - stride + 4, vl);
+ vuint8mf2_t h_half1 = __riscv_vle8_v_u8mf2(p_src - stride - 1, vl);
+ h_half1 = __riscv_vrgather_vv_u8mf2(h_half1, index, vl);
+
+ vuint8mf2_t v_half2 = __riscv_vlse8_v_u8mf2(p_src - 1 + 4 * stride, stride, vl);
+ vuint8mf2_t v_half1 = __riscv_vlse8_v_u8mf2(p_src - 1 - stride, stride, vl);
+ v_half1 = __riscv_vrgather_vv_u8mf2(v_half1, index, vl);
+
+ vint16m1_t h_half2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(h_half2, 0, vl));
+ vint16m1_t h_half1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(h_half1, 0, vl));
+
+ vint16m1_t v_half2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(v_half2, 0, vl));
+ vint16m1_t v_half1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(v_half1, 0, vl));
+
+ // calculate H
+ vint16m1_t h = __riscv_vsub_vv_i16m1(h_half2_w, h_half1_w, vl);
+ vint16m1_t weight1 = __riscv_vle16_v_i16m1(weight1_data, vl);
+ h = __riscv_vmul_vv_i16m1(h, weight1, vl);
+
+ // calculate V
+ vint16m1_t v = __riscv_vsub_vv_i16m1(v_half2_w, v_half1_w, vl);
+ v = __riscv_vmul_vv_i16m1(v, weight1, vl);
+
+ vint32m1_t v_sum = __riscv_vmv_v_x_i32m1(0, vl); // zero the reduction accumulators
+ vint32m1_t h_sum = __riscv_vmv_v_x_i32m1(0, vl);
+ v_sum = __riscv_vwredsum_vs_i16m1_i32m1(v, v_sum, vl);
+ h_sum = __riscv_vwredsum_vs_i16m1_i32m1(h, h_sum, vl);
+
+ int32_t h_sum_scalar = __riscv_vmv_x_s_i32m1_i32(h_sum);
+ h_sum_scalar = (17 * h_sum_scalar + 16) >> 5;
+ int32_t v_sum_scalar = __riscv_vmv_x_s_i32m1_i32(v_sum);
+ v_sum_scalar = (17 * v_sum_scalar + 16) >> 5;
+
+ // linear combination of H, V, and src
+ int32_t a = ((p_src[7 * stride - 1] + p_src[-stride + 7] + 1) << 4) - (3 * (v_sum_scalar + h_sum_scalar));
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+ vint16m1_t weight2 = __riscv_vle16_v_i16m1(weight2_data, 8);
+ vint16m1_t h_weighted = __riscv_vmv_v_x_i16m1(h_sum_scalar, 8);
+ h_weighted = __riscv_vmul_vv_i16m1(h_weighted, weight2, 8);
+
+ vint16m1_t result1 = __riscv_vadd_vx_i16m1(h_weighted, a, 8);
+ result1 = __riscv_vmax_vx_i16m1(result1, 0, 8);
+ a += v_sum_scalar;
+
+ vint16m1_t result2 = __riscv_vadd_vx_i16m1(h_weighted, a, 8);
+ result2 = __riscv_vmax_vx_i16m1(result2, 0, 8);
+ a += v_sum_scalar;
+
+ vint16m1_t result3 = __riscv_vadd_vx_i16m1(h_weighted, a, 8);
+ result3 = __riscv_vmax_vx_i16m1(result3, 0, 8);
+ a += v_sum_scalar;
+
+ vint16m1_t result4 = __riscv_vadd_vx_i16m1(h_weighted, a, 8);
+ result4 = __riscv_vmax_vx_i16m1(result4, 0, 8);
+ a += v_sum_scalar;
+
+ vint16m1_t result5 = __riscv_vadd_vx_i16m1(h_weighted, a, 8);
+ result5 = __riscv_vmax_vx_i16m1(result5, 0, 8);
+ a += v_sum_scalar;
+
+ vint16m1_t result6 = __riscv_vadd_vx_i16m1(h_weighted, a, 8);
+ result6 = __riscv_vmax_vx_i16m1(result6, 0, 8);
+ a += v_sum_scalar;
+
+ vint16m1_t result7 = __riscv_vadd_vx_i16m1(h_weighted, a, 8);
+ result7 = __riscv_vmax_vx_i16m1(result7, 0, 8);
+ a += v_sum_scalar;
+
+ vint16m1_t result8 = __riscv_vadd_vx_i16m1(h_weighted, a, 8);
+ result8 = __riscv_vmax_vx_i16m1(result8, 0, 8);
+ a += v_sum_scalar;
+
+ vuint8mf2_t result1_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result1), 5, 8);
+ vuint8mf2_t result2_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result2), 5, 8);
+ vuint8mf2_t result3_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result3), 5, 8);
+ vuint8mf2_t result4_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result4), 5, 8);
+ vuint8mf2_t result5_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result5), 5, 8);
+ vuint8mf2_t result6_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result6), 5, 8);
+ vuint8mf2_t result7_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result7), 5, 8);
+ vuint8mf2_t result8_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result8), 5, 8);
+
+ __riscv_vse8_v_u8mf2(p_src_iter, result1_n, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result2_n, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result3_n, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result4_n, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result5_n, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result6_n, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result7_n, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result8_n, 8);
+ p_src_iter += stride;
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+void pred8x8_128_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ int width = 8;
+
+ while (width > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(width);
+
+ vuint8m1_t dc = __riscv_vmv_v_x_u8m1(128, vl);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ __riscv_vse8_v_u8m1(p_src_iter + stride, dc, vl);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 2, dc, vl);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 3, dc, vl);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 4, dc, vl);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 5, dc, vl);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 6, dc, vl);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 7, dc, vl);
+
+ width -= vl;
+ p_src_iter = p_src_iter + vl;
+ }
+}
+
+void pred8x8_top_dc_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+
+ const uint8_t index_data[] = {0, 0, 0, 0, 1, 1, 1, 1};
+
+ vuint8m1_t top0, top1, top2, top3;
+ __riscv_vlseg4e8_v_u8m1(&top0, &top1, &top2, &top3, p_src - stride, 2);
+
+ vuint16m2_t sum1 = __riscv_vwaddu_vv_u16m2(top0, top1, 2);
+ vuint16m2_t sum2 = __riscv_vwaddu_vv_u16m2(top2, top3, 2);
+ vuint16m2_t sum = __riscv_vadd_vv_u16m2(sum1, sum2, 2);
+
+ vuint8m1_t dc01 = __riscv_vnclipu_wx_u8m1(sum, 2, 2);
+
+ vuint8m1_t index = __riscv_vle8_v_u8m1(index_data, 8);
+ dc01 = __riscv_vrgather_vv_u8m1(dc01, index, 8);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01, 8);
+ __riscv_vse8_v_u8m1(p_src_iter + stride, dc01, 8);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 2, dc01, 8);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 3, dc01, 8);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 4, dc01, 8);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 5, dc01, 8);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 6, dc01, 8);
+ __riscv_vse8_v_u8m1(p_src_iter + stride * 7, dc01, 8);
+}
+
+void pred8x8_left_dc_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+
+ int dc0_data = (p_src[-1] + p_src[-1 + stride] + p_src[-1 + 2 * stride] + p_src[-1 + 3 * stride] + 2) >> 2;
+ int dc2_data = (p_src[-1 + 4 * stride] + p_src[-1 + 5 * stride] + p_src[-1 + 6 * stride] + p_src[-1 + 7 * stride] + 2) >> 2;
+
+ vuint8m1_t dc0 = __riscv_vmv_v_x_u8m1(dc0_data, 8);
+ vuint8m1_t dc2 = __riscv_vmv_v_x_u8m1(dc2_data, 8);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc0, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc0, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc0, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc0, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc2, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc2, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc2, 8);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc2, 8);
+}
+
+void pred8x8_dc_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ uint8_t *p_top = p_src - stride;
+ uint8_t *p_left = p_src - 1;
+
+ uint16_t dc0 = p_top[0] + p_top[1] + p_top[2] + p_top[3];
+ uint16_t dc1 = p_top[4] + p_top[5] + p_top[6] + p_top[7];
+
+ dc0 += (p_left[0] + p_left[stride] + p_left[stride * 2] + p_left[stride * 3]);
+ uint16_t dc2 = p_left[stride * 4] + p_left[stride * 5] + p_left[stride * 6] + p_left[stride * 7];
+
+ dc0 = (dc0 + 4) >> 3;
+ uint16_t dc3 = (dc1 + dc2 + 4) >> 3;
+ dc1 = (dc1 + 2) >> 2;
+ dc2 = (dc2 + 2) >> 2;
+
+ uint8_t weight_data[] = {0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF};
+ vuint8m1_t weight = __riscv_vle8_v_u8m1(weight_data, 8);
+ vuint8m1_t weight2 = __riscv_vxor_vx_u8m1(weight, 0xFF, 8);
+
+ vuint8m1_t dc1_splat = __riscv_vmv_v_x_u8m1(dc1, 8);
+ vuint8m1_t dc3_splat = __riscv_vmv_v_x_u8m1(dc3, 8);
+
+ vuint8m1_t dc0_splat = __riscv_vmv_v_x_u8m1(dc0, 8);
+ vuint8m1_t dc2_splat = __riscv_vmv_v_x_u8m1(dc2, 8);
+
+ dc0_splat = __riscv_vand_vv_u8m1(dc0_splat, weight2, 8);
+ dc1_splat = __riscv_vand_vv_u8m1(dc1_splat, weight, 8);
+ vuint8m1_t dc01_splat = __riscv_vor_vv_u8m1(dc0_splat, dc1_splat, 8);
+
+ dc2_splat = __riscv_vand_vv_u8m1(dc2_splat, weight2, 8);
+ dc3_splat = __riscv_vand_vv_u8m1(dc3_splat, weight, 8);
+ vuint8m1_t dc23_splat = __riscv_vor_vv_u8m1(dc2_splat, dc3_splat, 8);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc23_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc23_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc23_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc23_splat, 8);
+}
+
+void pred8x8_l0t_dc_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint16_t mask_data[] = {0xFFFF, 0, 0, 0, 0, 0, 0, 0};
+ const uint8_t index_data[] = {0, 0, 0, 0, 4, 4, 4, 4};
+ const uint8_t shift_data[] = {3, 3, 3, 3, 2, 2, 2, 2};
+
+ uint8_t *p_src_iter = p_src;
+ uint8_t *p_left = p_src - 1;
+ uint8_t *p_top = p_src - stride;
+
+ uint16_t left_sum = p_left[0] + p_left[stride] + p_left[stride << 1] + p_left[(stride << 1) + stride];
+
+ vuint8m1_t top = __riscv_vle8_v_u8m1(p_top, 8);
+
+ vuint8m1_t top_shift1 = __riscv_vslidedown_vx_u8m1(top, 1, 8);
+ vuint16m2_t dc01 = __riscv_vwaddu_vv_u16m2(top, top_shift1, 8);
+ vuint16m2_t top_shift2 = __riscv_vslidedown_vx_u16m2(dc01, 2, 8);
+ dc01 = __riscv_vadd_vv_u16m2(dc01, top_shift2, 8);
+
+ vuint16m2_t mask = __riscv_vle16_v_u16m2(mask_data, 8);
+ vuint16m2_t dc021 = __riscv_vmv_v_x_u16m2(left_sum, 8);
+ dc021 = __riscv_vand_vv_u16m2(dc021, mask, 8);
+ dc021 = __riscv_vadd_vv_u16m2(dc021, dc01 , 8);
+
+ vuint8m1_t shift = __riscv_vle8_v_u8m1(shift_data, 8);
+ vuint8m1_t dc01_splat = __riscv_vnclipu_wx_u8m1(dc01, 2, 8);
+ vuint8m1_t dc021_splat = __riscv_vnclipu_wv_u8m1(dc021, shift, 8);
+
+ vuint8m1_t index = __riscv_vle8_v_u8m1(index_data, 8);
+ dc01_splat = __riscv_vrgather_vv_u8m1(dc01_splat, index, 8);
+ dc021_splat = __riscv_vrgather_vv_u8m1(dc021_splat, index, 8);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc021_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc021_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc021_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc021_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01_splat, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01_splat, 8);
+}
+
+void pred8x8_0lt_dc_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint16_t mask_data[] = {0, 0, 0, 0, 0xFFFF, 0, 0, 0};
+ const uint8_t index_data[] = {0, 0, 0, 0, 4, 4, 4, 4};
+ const uint8_t shift_data[] = {2, 2, 2, 2, 3, 3, 3, 3};
+
+ uint8_t *p_src_iter = p_src;
+ uint8_t *p_left = p_src - 1 + (stride << 2);
+ uint8_t *p_top = p_src - stride;
+
+ uint16_t left2_sum = p_left[0] + p_left[stride] + p_left[stride << 1] + p_left[(stride << 1) + stride];
+
+ vuint8m1_t top = __riscv_vle8_v_u8m1(p_top, 8);
+
+ vuint8m1_t top_shift1 = __riscv_vslidedown_vx_u8m1(top, 1, 8);
+ vuint16m2_t top_sum = __riscv_vwaddu_vv_u16m2(top, top_shift1, 8);
+ vuint16m2_t top_shift2 = __riscv_vslidedown_vx_u16m2(top_sum, 2, 8);
+ top_sum = __riscv_vadd_vv_u16m2(top_sum, top_shift2, 8);
+
+ vuint16m2_t mask = __riscv_vle16_v_u16m2(mask_data, 8);
+
+ vuint16m2_t dc23_sum = __riscv_vand_vv_u16m2(top_sum, mask, 8);
+ dc23_sum = __riscv_vadd_vx_u16m2(dc23_sum, left2_sum , 8);
+
+ vuint8m1_t shift = __riscv_vle8_v_u8m1(shift_data, 8);
+ vuint8m1_t dc01 = __riscv_vnclipu_wx_u8m1(top_sum, 2, 8);
+ vuint8m1_t dc23 = __riscv_vnclipu_wv_u8m1(dc23_sum, shift, 8);
+
+ vuint8m1_t index = __riscv_vle8_v_u8m1(index_data, 8);
+ dc01 = __riscv_vrgather_vv_u8m1(dc01, index, 8);
+ dc23 = __riscv_vrgather_vv_u8m1(dc23, index, 8);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc01, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc23, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc23, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc23, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc23, 8);
+}
+
+void pred8x8_l00_dc_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ uint8_t *p_left = p_src - 1;
+
+ uint16_t left_sum = p_left[0] + p_left[stride] + p_left[stride << 1] + p_left[(stride << 1) + stride];
+
+ vuint8m1_t dc0 = __riscv_vmv_v_x_u8m1((left_sum + 2) >> 2, 8);
+ vuint8m1_t dc128 = __riscv_vmv_v_x_u8m1(128, 8);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc0, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc0, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc0, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc0, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc128, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc128, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc128, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc128, 8);
+}
+
+void pred8x8_0l0_dc_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ uint8_t *p_left2 = p_src - 1 + (stride << 2);
+
+ uint16_t left_sum = p_left2[0] + p_left2[stride] + p_left2[stride << 1] + p_left2[(stride << 1) + stride];
+
+ vuint8m1_t dc2 = __riscv_vmv_v_x_u8m1((left_sum + 2) >> 2, 8);
+ vuint8m1_t dc128 = __riscv_vmv_v_x_u8m1(128, 8);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc128, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc128, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc128, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc128, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc2, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc2, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc2, 8);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc2, 8);
+}
+
+void pred16x16_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+ vuint8m1_t left = __riscv_vlse8_v_u8m1(p_src_iter - 1, stride, 16);
+ vuint8m1_t top = __riscv_vle8_v_u8m1(p_src_iter - stride, 16);
+
+ vuint16m1_t sum = __riscv_vmv_v_x_u16m1(0, 8); // zero the reduction accumulator
+
+ sum = __riscv_vwredsumu_vs_u8m1_u16m1(left, sum, 16);
+ sum = __riscv_vwredsumu_vs_u8m1_u16m1(top, sum, 16);
+
+ vuint16m1_t sum_n = __riscv_vssrl_vx_u16m1(sum, 5, 8);
+ vuint8m1_t dc_splat = __riscv_vrgather_vx_u8m1(__riscv_vreinterpret_v_u16m1_u8m1(sum_n), 0, 16);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+}
+
+void pred16x16_left_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+ vuint8m1_t left = __riscv_vlse8_v_u8m1(p_src_iter - 1, stride, 16);
+
+ vuint16m1_t sum = __riscv_vmv_v_x_u16m1(0, 16); // zero the reduction accumulator
+ sum = __riscv_vwredsumu_vs_u8m1_u16m1(left, sum, 16);
+
+ vuint16m1_t dc = __riscv_vssrl_vx_u16m1(sum, 4, 8);
+ vuint8m1_t dc_splat = __riscv_vrgather_vx_u8m1(__riscv_vreinterpret_v_u16m1_u8m1(dc), 0, 16);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+}
+
+void pred16x16_top_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+ vuint8m1_t top = __riscv_vle8_v_u8m1(p_src_iter - stride, 16);
+
+ vuint16m1_t sum = __riscv_vmv_v_x_u16m1(0, 16); // zero the reduction accumulator
+ sum = __riscv_vwredsumu_vs_u8m1_u16m1(top, sum, 16);
+
+ vuint16m1_t dc = __riscv_vssrl_vx_u16m1(sum, 4, 8);
+ vuint8m1_t dc_splat = __riscv_vrgather_vx_u8m1(__riscv_vreinterpret_v_u16m1_u8m1(dc), 0, 16);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc_splat, 16);
+}
+
+void pred16x16_128_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ int width = 16;
+
+ while (width > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(width);
+ uint8_t *p_src_iter_next = p_src_iter + vl;
+
+ vuint8m1_t dc = __riscv_vmv_v_x_u8m1(128, vl);
+
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, dc, vl);
+
+ width -= vl;
+ p_src_iter = p_src_iter_next;
+ }
+}
+
+void pred16x16_vert_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ int width = 16;
+
+ while (width > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(width);
+ uint8_t *p_src_iter_next = p_src_iter + vl;
+
+ vuint8m1_t top = __riscv_vle8_v_u8m1(p_src_iter - stride, vl);
+
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8m1(p_src_iter, top, vl);
+
+ width -= vl;
+ p_src_iter = p_src_iter_next;
+ }
+}
+
+void pred16x16_hor_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t *p_src_iter = p_src;
+ int width = 16;
+
+ while (width > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(width);
+ vuint8m1_t left = __riscv_vlse8_v_u8m1(p_src_iter - 1, stride, vl);
+
+ __riscv_vssseg8e8_v_u8m1(p_src_iter, stride, left, left, left, left, left, left, left, left, vl);
+ __riscv_vssseg8e8_v_u8m1(p_src_iter + 8, stride, left, left, left, left, left, left, left, left, vl);
+
+ width -= vl;
+ p_src_iter = p_src_iter + vl * stride;
+ }
+}
+
+void pred16x16_plane_8_rvv(uint8_t *p_src, ptrdiff_t stride)
+{
+ int i = 0;
+ uint8_t *p_src_iter = p_src;
+ int vl = __riscv_vsetvl_e8mf2(8);
+
+ const uint8_t index_data[] = {7, 6, 5, 4, 3, 2, 1, 0};
+ const int16_t weight2_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ vuint8mf2_t index = __riscv_vle8_v_u8mf2(index_data, vl);
+ vuint16m1_t index_w = __riscv_vwaddu_vx_u16m1(index, 0, vl);
+
+ vuint8mf2_t h_half2 = __riscv_vle8_v_u8mf2(p_src - stride + 8, vl);
+ vuint8mf2_t h_half1 = __riscv_vle8_v_u8mf2(p_src - stride - 1, vl);
+ h_half1 = __riscv_vrgather_vv_u8mf2(h_half1, index, vl);
+
+ vuint8mf2_t v_half2 = __riscv_vlse8_v_u8mf2(p_src - 1 + 8 * stride, stride, vl);
+ vuint8mf2_t v_half1 = __riscv_vlse8_v_u8mf2(p_src - 1 - stride, stride, vl);
+ v_half1 = __riscv_vrgather_vv_u8mf2(v_half1, index, vl);
+
+ vint16m1_t h_half2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(h_half2, 0, vl));
+ vint16m1_t h_half1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(h_half1, 0, vl));
+
+ vint16m1_t v_half2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(v_half2, 0, vl));
+ vint16m1_t v_half1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(v_half1, 0, vl));
+
+ // calculate H
+ vint16m1_t h = __riscv_vsub_vv_i16m1(h_half2_w, h_half1_w, vl);
+ vint16m1_t weight = __riscv_vrsub_vx_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(index_w), 8, vl);
+ h = __riscv_vmul_vv_i16m1(h, weight, vl);
+
+ // calculate V
+ vint16m1_t v = __riscv_vsub_vv_i16m1(v_half2_w, v_half1_w, vl);
+ v = __riscv_vmul_vv_i16m1(v, weight, vl);
+
+ vint32m1_t v_sum = __riscv_vmv_v_x_i32m1(0, vl); // zero the reduction accumulators
+ vint32m1_t h_sum = __riscv_vmv_v_x_i32m1(0, vl);
+ v_sum = __riscv_vwredsum_vs_i16m1_i32m1(v, v_sum, vl);
+ h_sum = __riscv_vwredsum_vs_i16m1_i32m1(h, h_sum, vl);
+
+ int32_t h_sum_scalar = __riscv_vmv_x_s_i32m1_i32(h_sum);
+ h_sum_scalar = (5 * h_sum_scalar + 32) >> 6;
+ int32_t v_sum_scalar = __riscv_vmv_x_s_i32m1_i32(v_sum);
+ v_sum_scalar = (5 * v_sum_scalar + 32) >> 6;
+
+ // linear combination of H, V, and src
+ int32_t a = ((p_src[15 * stride - 1] + p_src[-stride + 15] + 1) << 4) - (7 * (v_sum_scalar + h_sum_scalar));
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+ vint16m1_t weight2 = __riscv_vle16_v_i16m1(weight2_data, 16);
+ vint16m1_t h_weighted = __riscv_vmv_v_x_i16m1(h_sum_scalar, 16);
+ h_weighted = __riscv_vmul_vv_i16m1(h_weighted, weight2, 16);
+
+ for (i = 0; i < 16; i += 8)
+ {
+ vint16m1_t result1 = __riscv_vadd_vx_i16m1(h_weighted, a, 16);
+ result1 = __riscv_vmax_vx_i16m1(result1, 0, 16);
+ a += v_sum_scalar;
+
+ vint16m1_t result2 = __riscv_vadd_vx_i16m1(h_weighted, a, 16);
+ result2 = __riscv_vmax_vx_i16m1(result2, 0, 16);
+ a += v_sum_scalar;
+
+ vint16m1_t result3 = __riscv_vadd_vx_i16m1(h_weighted, a, 16);
+ result3 = __riscv_vmax_vx_i16m1(result3, 0, 16);
+ a += v_sum_scalar;
+
+ vint16m1_t result4 = __riscv_vadd_vx_i16m1(h_weighted, a, 16);
+ result4 = __riscv_vmax_vx_i16m1(result4, 0, 16);
+ a += v_sum_scalar;
+
+ vint16m1_t result5 = __riscv_vadd_vx_i16m1(h_weighted, a, 16);
+ result5 = __riscv_vmax_vx_i16m1(result5, 0, 16);
+ a += v_sum_scalar;
+
+ vint16m1_t result6 = __riscv_vadd_vx_i16m1(h_weighted, a, 16);
+ result6 = __riscv_vmax_vx_i16m1(result6, 0, 16);
+ a += v_sum_scalar;
+
+ vint16m1_t result7 = __riscv_vadd_vx_i16m1(h_weighted, a, 16);
+ result7 = __riscv_vmax_vx_i16m1(result7, 0, 16);
+ a += v_sum_scalar;
+
+ vint16m1_t result8 = __riscv_vadd_vx_i16m1(h_weighted, a, 16);
+ result8 = __riscv_vmax_vx_i16m1(result8, 0, 16);
+ a += v_sum_scalar;
+
+ vuint8mf2_t result1_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result1), 5, 16);
+ vuint8mf2_t result2_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result2), 5, 16);
+ vuint8mf2_t result3_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result3), 5, 16);
+ vuint8mf2_t result4_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result4), 5, 16);
+ vuint8mf2_t result5_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result5), 5, 16);
+ vuint8mf2_t result6_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result6), 5, 16);
+ vuint8mf2_t result7_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result7), 5, 16);
+ vuint8mf2_t result8_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result8), 5, 16);
+
+ __riscv_vse8_v_u8mf2(p_src_iter, result1_n, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result2_n, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result3_n, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result4_n, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result5_n, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result6_n, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result7_n, 16);
+ p_src_iter += stride;
+ __riscv_vse8_v_u8mf2(p_src_iter, result8_n, 16);
+ p_src_iter += stride;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+#endif
diff --git a/libavcodec/riscv/h264_pred.h b/libavcodec/riscv/h264_pred.h
new file mode 100644
index 0000000000..a3f1f6167f
--- /dev/null
+++ b/libavcodec/riscv/h264_pred.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_PRED_H
+#define AVCODEC_RISCV_H264_PRED_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
+void pred8x8_vert_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_hor_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_plane_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_128_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_top_dc_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_left_dc_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_dc_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_l0t_dc_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_0lt_dc_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_l00_dc_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred8x8_0l0_dc_rvv(uint8_t *p_src, ptrdiff_t stride);
+
+void pred16x16_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred16x16_top_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred16x16_left_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred16x16_128_dc_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred16x16_vert_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred16x16_hor_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+void pred16x16_plane_8_rvv(uint8_t *p_src, ptrdiff_t stride);
+#endif
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_pred_init_riscv.c b/libavcodec/riscv/h264_pred_init_riscv.c
new file mode 100644
index 0000000000..3be83613b0
--- /dev/null
+++ b/libavcodec/riscv/h264_pred_init_riscv.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264pred.h"
+#include "config.h"
+#include "h264_pred.h"
+
+static av_cold void h264_pred_init_riscv(H264PredContext *h, int codec_id,
+ const int bit_depth,
+ const int chroma_format_idc)
+{
+#if HAVE_INTRINSICS_RVV
+ if (bit_depth == 8) {
+ if (chroma_format_idc <= 1) {
+ h->pred8x8[VERT_PRED8x8 ] = pred8x8_vert_8_rvv;
+ h->pred8x8[HOR_PRED8x8 ] = pred8x8_hor_8_rvv;
+ if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+ h->pred8x8[PLANE_PRED8x8] = pred8x8_plane_8_rvv;
+ h->pred8x8[DC_128_PRED8x8 ] = pred8x8_128_dc_8_rvv;
+ if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+ codec_id != AV_CODEC_ID_VP8) {
+ h->pred8x8[TOP_DC_PRED8x8 ] = pred8x8_top_dc_rvv;
+ h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = pred8x8_l0t_dc_rvv;
+ h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = pred8x8_0lt_dc_rvv;
+ h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = pred8x8_l00_dc_rvv;
+ h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = pred8x8_0l0_dc_rvv;
+ }
+ }
+
+ h->pred16x16[DC_PRED8x8 ] = pred16x16_dc_8_rvv;
+ h->pred16x16[VERT_PRED8x8 ] = pred16x16_vert_8_rvv;
+ h->pred16x16[HOR_PRED8x8 ] = pred16x16_hor_8_rvv;
+ h->pred16x16[LEFT_DC_PRED8x8] = pred16x16_left_dc_8_rvv;
+ h->pred16x16[TOP_DC_PRED8x8 ] = pred16x16_top_dc_8_rvv;
+ h->pred16x16[DC_128_PRED8x8 ] = pred16x16_128_dc_8_rvv;
+ }
+#endif
+}
+
+av_cold void ff_h264_pred_init_riscv(H264PredContext *h, int codec_id,
+ const int bit_depth, const int chroma_format_idc)
+{
+#if HAVE_INTRINSICS_RVV
+ h264_pred_init_riscv(h, codec_id, bit_depth, chroma_format_idc);
+#endif
+}
--
2.17.1