[FFmpeg-devel] [PATCH 4/6] lavc/ac3dsp: RISC-V V ac3_sum_square_butterfly_float
Peiting Shen
shenpeiting at eswincomputing.com
Thu Jun 15 13:36:43 EEST 2023
From: Shen Peiting <shenpeiting at eswincomputing.com>
Optimize the scalar float sum_square_butterfly computation using RVV instructions.
Benchmarks on Spike (cycles):
len=128
ac3_sum_square_butterfly_float_c: 7986
ac3_sum_square_butterfly_float_rvv: 146
len=1280
ac3_sum_square_butterfly_float_c: 79410
ac3_sum_square_butterfly_float_rvv: 1154
Co-authored-by: Yang Xiaojun <yangxiaojun at eswincomputing.com>
Co-authored-by: Huang Xing <huangxing1 at eswincomputing.com>
Co-authored-by: Zeng Fanchen <zengfanchen at eswincomputing.com>
Signed-off-by: Shen Peiting <shenpeiting at eswincomputing.com>
---
libavcodec/riscv/ac3dsp_init.c | 6 ++++
libavcodec/riscv/ac3dsp_rvv.S | 54 ++++++++++++++++++++++++++++++++++
2 files changed, 60 insertions(+)
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 4fd4abe83e..d3aa20623a 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -30,6 +30,10 @@ void ff_ac3_sum_square_butterfly_int32_rvv(int64_t sum[4],
const int32_t *coef0,
const int32_t *coef1,
int len);
+void ff_ac3_sum_square_butterfly_float_rvv(float sum[4], /* out: four sums of squares */
+ const float *coef0, /* in: first input, len floats */
+ const float *coef1, /* in: second input, len floats */
+ int len); /* number of elements read from each input */
av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
{
@@ -39,6 +43,8 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
c->ac3_exponent_min = ff_ac3_exponent_min_rvv;
c->float_to_fixed24 = ff_float_to_fixed24_rvv;
}
+ if (flags & AV_CPU_FLAG_RVV_F32)
+ c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_rvv;
#if (__riscv_xlen >= 64)
if (flags & AV_CPU_FLAG_RVV_I64)
c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_rvv;
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
index 4e0d238f85..05a4d44938 100644
--- a/libavcodec/riscv/ac3dsp_rvv.S
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -116,3 +116,57 @@ func ff_ac3_sum_square_butterfly_int32_rvv, zve64x
addi a0, a0, 8
ret
endfunc
+
+
+func ff_ac3_sum_square_butterfly_float_rvv, zve32f
+        // a0 = float sum[4] (out), a1 = coef0, a2 = coef1, a3 = len
+        //   sum[0] = sum(coef0^2)            sum[1] = sum(coef1^2)
+        //   sum[2] = sum((coef0 + coef1)^2)  sum[3] = sum((coef0 - coef1)^2)
+        //
+        // fcsr is deliberately left alone: the dynamic rounding mode and the
+        // accrued exception flags belong to the caller's FP environment.
+        //
+        // Clear the four accumulator groups (v16-v31) over the full VLMAX, so
+        // lanes that a short iteration never reaches hold +0.0 instead of
+        // stale register contents when the final reduction sums every lane.
+        // A zero len consequently produces four zero sums.
+        vsetvli     t0, zero, e32, m8, ta, ma
+        vmv.v.i     v16, 0
+        vmv.v.i     v24, 0
+1:
+        // tu: accumulator lanes at and beyond vl must stay undisturbed.
+        vsetvli     t0, a3, e32, m4, tu, ma
+        vle32.v     v0, (a1)
+        vle32.v     v4, (a2)
+        sub         a3, a3, t0              // elements still to process
+        slli        t0, t0, 2               // vl floats -> byte offset
+        vfadd.vv    v8, v0, v4              // coef0 + coef1
+        vfsub.vv    v12, v0, v4             // coef0 - coef1
+        add         a1, a1, t0
+        add         a2, a2, t0
+        vfmacc.vv   v16, v0, v0
+        vfmacc.vv   v20, v4, v4
+        vfmacc.vv   v24, v8, v8
+        vfmacc.vv   v28, v12, v12
+        bnez        a3, 1b
+
+        // Sum every lane of each accumulator group; element 0 of v0 supplies
+        // the +0.0 seed and element 0 of v4..v7 receives the four results.
+        vsetvli     t0, zero, e32, m4, ta, ma
+        vmv.s.x     v0, zero
+        vfredusum.vs v4, v16, v0
+        vfredusum.vs v5, v20, v0
+        vfredusum.vs v6, v24, v0
+        vfredusum.vs v7, v28, v0
+
+        // Store one float per result into sum[0..3].
+        vsetivli    zero, 1, e32, m1, ta, ma
+        vse32.v     v4, (a0)
+        addi        a0, a0, 4
+        vse32.v     v5, (a0)
+        addi        a0, a0, 4
+        vse32.v     v6, (a0)
+        addi        a0, a0, 4
+        vse32.v     v7, (a0)
+        ret
+endfunc
--
2.17.1
More information about the ffmpeg-devel
mailing list