[FFmpeg-devel] [PATCH 2/2] lavc/vvc_mc: R-V V dmvr
uk7b at foxmail.com
uk7b at foxmail.com
Sat Sep 28 12:41:25 EEST 2024
From: sunyuechi <sunyuechi at iscas.ac.cn>
k230 banana_f3
dmvr_8_12x20_c: 626.5 ( 1.00x) 621.7 ( 1.00x)
dmvr_8_12x20_rvv_i32: 126.3 ( 4.96x) 79.9 ( 7.78x)
dmvr_8_20x12_c: 608.0 ( 1.00x) 652.9 ( 1.00x)
dmvr_8_20x12_rvv_i32: 135.5 ( 4.49x) 90.4 ( 7.22x)
dmvr_8_20x20_c: 1006.0 ( 1.00x) 1079.9 ( 1.00x)
dmvr_8_20x20_rvv_i32: 228.3 ( 4.41x) 142.4 ( 7.58x)
dmvr_h_8_12x20_c: 2005.8 ( 1.00x) 2007.2 ( 1.00x)
dmvr_h_8_12x20_rvv_i32: 274.5 ( 7.31x) 184.2 (10.90x)
dmvr_h_8_20x12_c: 1987.5 ( 1.00x) 2006.9 ( 1.00x)
dmvr_h_8_20x12_rvv_i32: 302.3 ( 6.58x) 173.7 (11.56x)
dmvr_h_8_20x20_c: 3302.3 ( 1.00x) 3340.4 ( 1.00x)
dmvr_h_8_20x20_rvv_i32: 487.5 ( 6.77x) 267.4 (12.49x)
dmvr_hv_8_12x20_c: 3607.8 ( 1.00x) 3600.7 ( 1.00x)
dmvr_hv_8_12x20_rvv_i32: 459.8 ( 7.85x) 371.7 ( 9.69x)
dmvr_hv_8_20x12_c: 3626.3 ( 1.00x) 3621.7 ( 1.00x)
dmvr_hv_8_20x12_rvv_i32: 422.8 ( 8.58x) 298.7 (12.13x)
dmvr_hv_8_20x20_c: 5931.8 ( 1.00x) 5934.4 ( 1.00x)
dmvr_hv_8_20x20_rvv_i32: 672.5 ( 8.82x) 475.9 (12.47x)
dmvr_v_8_12x20_c: 2154.0 ( 1.00x) 2152.9 ( 1.00x)
dmvr_v_8_12x20_rvv_i32: 274.5 ( 7.85x) 183.9 (11.71x)
dmvr_v_8_20x12_c: 2774.5 ( 1.00x) 2152.9 ( 1.00x)
dmvr_v_8_20x12_rvv_i32: 302.3 ( 9.18x) 173.7 (12.40x)
dmvr_v_8_20x20_c: 3552.0 ( 1.00x) 3590.4 ( 1.00x)
dmvr_v_8_20x20_rvv_i32: 487.5 ( 7.29x) 267.4 (13.43x)
---
libavcodec/riscv/vvc/vvc_mc_rvv.S | 139 +++++++++++++++++++++++++++++
libavcodec/riscv/vvc/vvcdsp_init.c | 22 +++++
2 files changed, 161 insertions(+)
diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
index 18532616d9..61fe840c4d 100644
--- a/libavcodec/riscv/vvc/vvc_mc_rvv.S
+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
@@ -285,3 +285,142 @@ endfunc
func_w_avg 128
func_w_avg 256
#endif
+
+// dmvr: shared body of the unfiltered variant (reached from the
+// ff_vvc_dmvr_8_rvv_* stubs, which end up in c->inter.dmvr[0][0]).
+// Every source byte is widened to 16 bits and multiplied by 4.
+// Register use follows the C prototype declared in vvcdsp_init.c:
+// a0 = dst (int16_t *), a1 = src, a2 = src_stride, a3 = height.
+// The vector length and element width are configured by the entry stub
+// before it jumps here (presumably vl = 12 or 20 8-bit elements; the
+// vsetvlstatic8 macro is defined elsewhere).
+// Four rows are handled per iteration and the loop only exits when a3
+// reaches exactly zero, so a3 has to be a non-zero multiple of 4.
+func dmvr zve32x, zbb, zba
+ lpad 0
+ li t0, 4 // scale factor used by vwmulu.vx below
+1:
+ add t1, a1, a2 // t1 = source row 1
+ addi t4, a0, 128*2 // t4 = dst row 1 (dst rows are 128*2 bytes apart)
+ add t2, t1, a2 // t2 = source row 2
+ addi t5, a0, 128*2*2 // t5 = dst row 2
+ add t3, t2, a2 // t3 = source row 3
+ addi t6, a0, 128*2*3 // t6 = dst row 3
+ vle8.v v0, (a1)
+ vle8.v v4, (t1)
+ vle8.v v8, (t2)
+ vle8.v v12, (t3)
+ addi a3, a3, -4 // four rows consumed per iteration
+ vwmulu.vx v16, v0, t0 // widening unsigned multiply: 8-bit * 4 -> 16-bit
+ vwmulu.vx v20, v4, t0
+ vwmulu.vx v24, v8, t0
+ vwmulu.vx v28, v12, t0
+ vse16.v v16, (a0)
+ vse16.v v20, (t4)
+ vse16.v v24, (t5)
+ vse16.v v28, (t6)
+ sh2add a1, a2, a1 // a1 += 4 * src_stride
+ add a0, a0, 128*2*4 // a0 += 4 dst rows
+ bnez a3, 1b
+ ret
+endfunc
+
+// dmvr_h_v mn, type
+// Shared loop body of the one-dimensional two-tap DMVR filters.
+// mn: register holding the filter index (a4 for dmvr_h, a5 for dmvr_v).
+// type: h filters along a row (taps at x and x + 1); any other value
+// filters down a column (taps at row y and row y + 1).
+// Uses a0 = dst, a1 = src, a2 = src_stride, a3 = height, matching the C
+// prototype in vvcdsp_init.c. Two output rows are written per iteration and
+// the loop only exits when a3 reaches exactly zero, so a3 has to be a
+// non-zero even number.
+// Each output is (f0 * p0 + f1 * p1) >> 2 with the shift rounded according
+// to vxrm. The entry stub is expected to have selected 16-bit elements and
+// to have written vxrm before jumping here.
+// Vector groups used: v0, v2, v4, v8 (8-bit loads), v12..v28 (16-bit).
+.macro dmvr_h_v mn, type
+ lla t4, ff_vvc_inter_luma_dmvr_filters
+ sh1add t4, \mn, t4 // table entries are two bytes apart
+ lbu t5, (t4) // f0
+ lbu t6, 1(t4) // f1
+1:
+ add t0, a1, a2 // t0 = second source row
+.ifc \type,h
+ // Both taps of an output row come from the same source row, so the second
+ // output row has to read the second source row (t0 and t0 + 1) instead of
+ // a further one-pixel shift of the first row.
+ addi t1, a1, 1 // first row, x + 1
+ addi t3, t0, 1 // second row, x + 1
+.else
+ add t3, t0, a2 // t3 = third source row
+.endif
+ vle8.v v0, (a1)
+ vle8.v v4, (t0)
+ vle8.v v8, (t3)
+.ifc \type,h
+ vle8.v v2, (t1)
+.endif
+ addi a3, a3, -2
+ vzext.vf2 v12, v0
+ vzext.vf2 v16, v4
+ vzext.vf2 v20, v8
+.ifc \type,h
+ vzext.vf2 v28, v2
+.endif
+ addi t2, a0, 128*2 // t2 = second dst row
+ vmul.vx v12, v12, t5
+ vmul.vx v24, v16, t5
+.ifc \type,h
+ vmacc.vx v12, t6, v28 // row 0: f0 * p[x] + f1 * p[x + 1]
+.else
+ vmacc.vx v12, t6, v16 // row 0: f0 * row0 + f1 * row1
+.endif
+ vmacc.vx v24, t6, v20 // row 1: h uses second row x, x + 1; v uses row1, row2
+ vssrl.vi v12, v12, 2
+ vssrl.vi v24, v24, 2
+ vse16.v v12, (a0)
+ vse16.v v24, (t2)
+ add a0, a0, 128*4 // a0 += 2 dst rows
+ sh1add a1, a2, a1 // a1 += 2 * src_stride
+ bnez a3, 1b
+ ret
+.endm
+
+// dmvr_h: shared body of the horizontal variant. Expands dmvr_h_v in h mode
+// with the filter index taken from a4 (mx in the C prototype).
+func dmvr_h zve32x, zbb, zba
+ lpad 0
+ dmvr_h_v a4, h
+endfunc
+
+// dmvr_v: shared body of the vertical variant. Expands dmvr_h_v in v mode
+// with the filter index taken from a5 (my in the C prototype).
+func dmvr_v zve32x, zbb, zba
+ lpad 0
+ dmvr_h_v a5, v
+endfunc
+
+// dmvr_load_h dst, filter0, filter1
+// Loads the row at a1 and the same row one byte further on, zero-extends
+// both to 16 bits and leaves (filter0 * p[x] + filter1 * p[x + 1]) >> 2 in
+// dst, with the shift rounded according to vxrm.
+// a1 itself is not advanced. Clobbers a6, v2 and the vector groups starting
+// at v4 and v8; dst is first used as the 8-bit load target and then
+// overwritten with the 16-bit result.
+.macro dmvr_load_h dst, filter0, filter1
+ addi a6, a1, 1 // a6 = address of p[x + 1]
+ vle8.v \dst, (a1)
+ vle8.v v2, (a6)
+ vzext.vf2 v4, \dst
+ vzext.vf2 v8, v2
+ vmul.vx \dst, v4, \filter0
+ vmacc.vx \dst, \filter1, v8
+ vssrl.vi \dst, \dst, 2
+.endm
+
+// dmvr_hv: shared body of the two-dimensional variant (horizontal two-tap
+// filter on every row, then a vertical two-tap filter across consecutive
+// filtered rows).
+// a0 = dst, a1 = src, a2 = src_stride, a3 = height, a4 = mx, a5 = my, as in
+// the C prototype in vvcdsp_init.c; a6 is overwritten by dmvr_load_h.
+// One output row is written per iteration; a3 + 1 source rows are read in
+// total because each output needs the filtered row below it.
+// The entry stub is expected to have selected 16-bit elements and to have
+// written vxrm before jumping here.
+func dmvr_hv zve32x, zbb, zba
+ lpad 0
+ lla t0, ff_vvc_inter_luma_dmvr_filters
+ sh1add t1, a4, t0 // horizontal filter entry (entries are two bytes apart)
+ sh1add t2, a5, t0 // vertical filter entry
+ lbu t3, (t1) // filter[mx][0]
+ lbu t4, 1(t1) // filter[mx][1]
+ lbu t5, (t2) // filter[my][0]
+ lbu t6, 1(t2) // filter[my][1]
+ dmvr_load_h v12, t3, t4 // v12 = horizontally filtered first row
+ add a1, a1, a2
+1:
+ vmul.vx v28, v12, t5 // previous filtered row * filter[my][0]
+ addi a3, a3, -1
+ dmvr_load_h v12, t3, t4 // v12 = horizontally filtered next row, kept for the next pass
+ vmacc.vx v28, t6, v12 // + next filtered row * filter[my][1]
+ vssrl.vi v28, v28, 4 // shift right by 4, rounded according to vxrm
+ vse16.v v28, (a0)
+ add a1, a1, a2
+ addi a0, a0, 128*2 // next dst row
+ bnez a3, 1b
+ ret
+endfunc
+
+// func_dmvr vlen, name
+// Emits the entry point ff_vvc_<name>_8_rvv_<vlen>. The stub inspects a6
+// (width in the C prototype): 20 selects the 20-wide vector setup, every
+// other value falls through to the 12-wide setup. It then jumps to the
+// shared body called <name>, which returns directly to the original caller.
+// For name == dmvr the setup goes through vsetvlstatic8; for the filtering
+// variants it goes through vsetvlstatic16 and vxrm is written with 0
+// (round-to-nearest-up in the RVV specification) for their vssrl shifts.
+// vsetvlstatic8 / vsetvlstatic16 are defined elsewhere in the file.
+// t0 is used as scratch.
+.macro func_dmvr vlen, name
+func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x, zbb, zba
+ lpad 0
+ li t0, 20
+ beq a6, t0, DMVR20\vlen\name // width == 20 ?
+ .ifc \name, dmvr
+ vsetvlstatic8 12, \vlen
+ .else
+ csrwi vxrm, 0
+ vsetvlstatic16 12, \vlen
+ .endif
+ j \name
+DMVR20\vlen\name:
+ .ifc \name, dmvr
+ vsetvlstatic8 20, \vlen
+ .else
+ csrwi vxrm, 0
+ vsetvlstatic16 20, \vlen
+ .endif
+ j \name
+endfunc
+.endm
+
+// Instantiate the four entry points once for vlen = 256 and once for
+// vlen = 128 (eight ff_vvc_*_8_rvv_* symbols in total).
+.irp vlen,256,128
+func_dmvr \vlen, dmvr
+func_dmvr \vlen, dmvr_h
+func_dmvr \vlen, dmvr_v
+func_dmvr \vlen, dmvr_hv
+.endr
diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
index ac1e7dda7d..7df3ce58db 100644
--- a/libavcodec/riscv/vvc/vvcdsp_init.c
+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
@@ -37,6 +37,26 @@ void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,
AVG_PROTOTYPES(8, rvv_128)
AVG_PROTOTYPES(8, rvv_256)
+/*
+ * Declare the four assembly DMVR entry points for bit depth `bd` and
+ * implementation suffix `opt` (unfiltered, _h, _v and _hv variants).
+ * All four share the same signature. The last declaration carries no
+ * line-continuation backslash, so the macro ends with it and cannot
+ * absorb whatever line follows.
+ */
+#define DMVR_PROTOTYPES(bd, opt) \
+void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
+ int height, intptr_t mx, intptr_t my, int width); \
+void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
+ int height, intptr_t mx, intptr_t my, int width); \
+void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
+ int height, intptr_t mx, intptr_t my, int width); \
+void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
+ int height, intptr_t mx, intptr_t my, int width);
+
+DMVR_PROTOTYPES(8, rvv_128)
+DMVR_PROTOTYPES(8, rvv_256)
+
+/*
+ * DMVR_FN builds the symbol name ff_vvc_<kind>_<bd>_<opt>.
+ * DMVR_INIT stores the four DMVR entry points for bit depth `bd` and
+ * suffix `opt` into c->inter.dmvr; `c` is taken from the scope of the
+ * code that uses the macro.
+ */
+#define DMVR_FN(kind, bd, opt) ff_vvc_##kind##_##bd##_##opt
+#define DMVR_INIT(bd, opt) do { \
+ /* first index 0: unfiltered and _h variants */ \
+ c->inter.dmvr[0][0] = DMVR_FN(dmvr, bd, opt); \
+ c->inter.dmvr[0][1] = DMVR_FN(dmvr_h, bd, opt); \
+ /* first index 1: _v and _hv variants */ \
+ c->inter.dmvr[1][0] = DMVR_FN(dmvr_v, bd, opt); \
+ c->inter.dmvr[1][1] = DMVR_FN(dmvr_hv, bd, opt); \
+} while (0)
+
void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
{
#if HAVE_RVV
@@ -51,6 +71,7 @@ void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
# if (__riscv_xlen == 64)
c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
# endif
+ DMVR_INIT(8, rvv_256);
break;
default:
break;
@@ -63,6 +84,7 @@ void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
# if (__riscv_xlen == 64)
c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
# endif
+ DMVR_INIT(8, rvv_128);
break;
default:
break;
--
2.46.2
More information about the ffmpeg-devel
mailing list