[FFmpeg-devel] [PATCH] lavu/float_dsp: avoid reg-stride in R-V V fmul_window
Rémi Denis-Courmont
remi at remlab.net
Sat Sep 30 22:04:36 EEST 2023
---
libavutil/riscv/float_dsp_rvv.S | 45 ++++++++++++++++++---------------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/libavutil/riscv/float_dsp_rvv.S b/libavutil/riscv/float_dsp_rvv.S
index 7e9e84d526..91b70bf148 100644
--- a/libavutil/riscv/float_dsp_rvv.S
+++ b/libavutil/riscv/float_dsp_rvv.S
@@ -75,32 +75,37 @@ endfunc
func ff_vector_fmul_window_rvv, zve32f
// a0: dst, a1: src0, a2: src1, a3: window, a4: length
- addi t0, a4, -1
- add t1, t0, a4
- sh2add a2, t0, a2
- sh2add t0, t1, a0
- sh2add t3, t1, a3
- li t1, -4 // byte stride
+ vsetvli t0, zero, e16, m4, ta, ma // vl = VLMAX; t0 is overwritten below. NOTE(review): loop reads v0 at e16/m2 only, so m2 looks sufficient here - confirm
+ sh2add a2, a4, a2 // a2 = src1 + 4 * length (one past the end of src1)
+ vid.v v0 // v0[i] = i
+ sh3add t3, a4, a3 // t3 = window + 8 * length (one past the end of window)
+ vadd.vi v0, v0, 1 // v0[i] = i + 1, loop-invariant
+ sh3add t0, a4, a0 // t0 = dst + 8 * length (one past the end of dst)
1:
- vsetvli t2, a4, e32, m4, ta, ma
- vle32.v v16, (a1)
+ vsetvli t2, a4, e16, m2, ta, ma // t2 = vl for this pass; e16 because the gather indices are 16-bit
slli t4, t2, 2
- vlse32.v v20, (a2), t1
+ vrsub.vx v2, v0, t2 // v2[i] = vl - 1 - i: indices that reverse a vl-element chunk
+ sub t3, t3, t4 // step the upper window pointer back by vl elements
+ vsetvli zero, zero, e32, m4, ta, ma // 32-bit elements; same SEW/LMUL ratio as e16/m2, so vl is kept
+ sub a2, a2, t4 // step the src1 pointer back by vl elements
+ vle32.v v8, (t3) // v8 = upper window chunk, memory order
+ sub t0, t0, t4 // step the upper dst pointer back by vl elements
+ vle32.v v4, (a2) // v4 = src1 chunk, memory order
sub a4, a4, t2
- vle32.v v24, (a3)
+ vrgatherei16.vv v28, v8, v2 // v28 = upper window chunk, element order reversed
+ vle32.v v16, (a1) // v16 = src0 chunk
add a1, a1, t4
- vlse32.v v28, (t3), t1
- sub a2, a2, t4
- vfmul.vv v0, v16, v28
+ vrgatherei16.vv v20, v4, v2 // v20 = src1 chunk, element order reversed
+ vle32.v v24, (a3) // v24 = lower window chunk
add a3, a3, t4
- vfmul.vv v8, v16, v24
- sub t3, t3, t4
- vfnmsac.vv v0, v20, v24
- vfmacc.vv v8, v20, v28
- vse32.v v0, (a0)
+ vfmul.vv v12, v16, v28 // v12 = src0 * reversed upper window
+ vfmul.vv v16, v16, v24 // v16 = src0 * lower window
+ vfnmsac.vv v12, v20, v24 // v12 -= reversed src1 * lower window
+ vfmacc.vv v16, v20, v28 // v16 += reversed src1 * reversed upper window
+ vrgatherei16.vv v8, v16, v2 // reverse v16 so it can be stored with a unit-stride store
+ vse32.v v12, (a0) // lower half of dst, ascending
add a0, a0, t4
- vsse32.v v8, (t0), t1
- sub t0, t0, t4
+ vse32.v v8, (t0) // upper half of dst, at the stepped-back pointer
bnez a4, 1b
ret
--
2.42.0
More information about the ffmpeg-devel
mailing list