[FFmpeg-devel] [PATCH 1/2] lavc/ac3dsp: R-V V sum_square_butterfly_int32
Rémi Denis-Courmont
remi at remlab.net
Mon Apr 29 22:21:43 EEST 2024
ac3_sum_square_bufferfly_int32_c: 61.0
ac3_sum_square_bufferfly_int32_rvv_i64: 14.7
---
libavcodec/riscv/ac3dsp_init.c | 6 +++++
libavcodec/riscv/ac3dsp_rvv.S | 41 ++++++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+)
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index b9e14d56ca..be5e153fac 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -28,6 +28,8 @@
void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, size_t len);
+void ff_sum_square_butterfly_int32_rvv(int64_t *, const int32_t *,
+ const int32_t *, int);
av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
{
@@ -39,6 +41,10 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
c->extract_exponents = ff_extract_exponents_rvb;
if (flags & AV_CPU_FLAG_RVV_F32)
c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+# if __riscv_xlen >= 64
+ if (flags & AV_CPU_FLAG_RVV_I64)
+ c->sum_square_butterfly_int32 = ff_sum_square_butterfly_int32_rvv;
+# endif
}
#endif
}
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
index b8d32c4677..dd0b4cd797 100644
--- a/libavcodec/riscv/ac3dsp_rvv.S
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -37,3 +37,44 @@ func ff_float_to_fixed24_rvv, zve32f
ret
endfunc
+
+#if __riscv_xlen >= 64
+func ff_sum_square_butterfly_int32_rvv, zve64x // a0: int64_t out[4]; a1, a2: int32_t inputs; a3: element count
+ vsetvli t0, zero, e64, m8, ta, ma // VLMAX at LMUL=8 so that two moves clear v0-v15
+ vmv.v.x v0, zero // clear 64-bit accumulator groups v0-v3 and v4-v7
+ vmv.v.x v8, zero // clear 64-bit accumulator groups v8-v11 and v12-v15
+1:
+ vsetvli t0, a3, e32, m2, tu, ma // t0 = elements handled this pass; tail-undisturbed keeps accumulator lanes past vl
+ vle32.v v16, (a1) // v16 = next t0 words of the first input
+ sub a3, a3, t0 // a3 = elements still to process
+ vle32.v v20, (a2) // v20 = next t0 words of the second input
+ sh2add a1, t0, a1 // a1 += 4 * t0 (advance first input pointer)
+ vadd.vv v24, v16, v20 // v24 = first + second (32-bit)
+ sh2add a2, t0, a2 // a2 += 4 * t0 (advance second input pointer)
+ vsub.vv v28, v16, v20 // v28 = first - second (32-bit)
+ vwmacc.vv v0, v16, v16 // v0-v3 += first * first, widened to 64 bits
+ vwmacc.vv v4, v20, v20 // v4-v7 += second * second
+ vwmacc.vv v8, v24, v24 // v8-v11 += (first + second)^2
+ vwmacc.vv v12, v28, v28 // v12-v15 += (first - second)^2
+ bnez a3, 1b // loop until every element has been consumed
+
+ vsetvli t0, zero, e64, m4, ta, ma // full-length 64-bit LMUL=4 groups for the horizontal reductions
+ vmv.s.x v16, zero // zero scalar seed for reduction 0
+ vmv.s.x v17, zero // zero scalar seed for reduction 1
+ vredsum.vs v16, v0, v16 // v16[0] = sum of all lanes of v0-v3
+ vmv.s.x v18, zero // zero scalar seed for reduction 2
+ vredsum.vs v17, v4, v17 // v17[0] = sum of all lanes of v4-v7
+ vmv.s.x v19, zero // zero scalar seed for reduction 3
+ vredsum.vs v18, v8, v18 // v18[0] = sum of all lanes of v8-v11
+ vmv.x.s t0, v16 // t0 = sum of first * first
+ vredsum.vs v19, v12, v19 // v19[0] = sum of all lanes of v12-v15
+ vmv.x.s t1, v17 // t1 = sum of second * second
+ sd t0, (a0) // out[0]
+ vmv.x.s t2, v18 // t2 = sum of (first + second)^2
+ sd t1, 8(a0) // out[1]
+ vmv.x.s t3, v19 // t3 = sum of (first - second)^2
+ sd t2, 16(a0) // out[2]
+ sd t3, 24(a0) // out[3]
+ ret
+endfunc
+#endif
--
2.43.0
More information about the ffmpeg-devel
mailing list