[FFmpeg-devel] [PATCH v2 2/2] avcodec/loongarch: add LSX optimization for aac audio encode
pengxu
pengxu at loongson.cn
Thu Apr 18 10:36:09 EEST 2024
Add functions:
ff_abs_pow34_lsx
ff_aac_quantize_bands_lsx
./ffmpeg -f s16le -ac 2 -i ../../1.pcm -c:a aac -f null -
before:37.5x
after:48.1x
---
libavcodec/aacencdsp.h | 3 +
libavcodec/loongarch/Makefile | 2 +
.../loongarch/aacencdsp_init_loongarch.c | 33 +++
libavcodec/loongarch/aacencdsp_loongarch.S | 254 ++++++++++++++++++
libavcodec/loongarch/aacencdsp_loongarch.h | 35 +++
5 files changed, 327 insertions(+)
create mode 100644 libavcodec/loongarch/aacencdsp_init_loongarch.c
create mode 100644 libavcodec/loongarch/aacencdsp_loongarch.S
create mode 100644 libavcodec/loongarch/aacencdsp_loongarch.h
diff --git a/libavcodec/aacencdsp.h b/libavcodec/aacencdsp.h
index 67836d8cf7..5db27a95a9 100644
--- a/libavcodec/aacencdsp.h
+++ b/libavcodec/aacencdsp.h
@@ -34,6 +34,7 @@ typedef struct AACEncDSPContext {
void ff_aacenc_dsp_init_riscv(AACEncDSPContext *s);
void ff_aacenc_dsp_init_x86(AACEncDSPContext *s);
+void ff_aacenc_dsp_init_loongarch(AACEncDSPContext *s);
static inline void abs_pow34_v(float *out, const float *in, const int size)
{
@@ -66,6 +67,8 @@ static inline void ff_aacenc_dsp_init(AACEncDSPContext *s)
ff_aacenc_dsp_init_riscv(s);
#elif ARCH_X86
ff_aacenc_dsp_init_x86(s);
+#elif ARCH_LOONGARCH64
+ ff_aacenc_dsp_init_loongarch(s);
#endif
}
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07da2964e4..068fd61810 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_init_loongarch.o
OBJS-$(CONFIG_IDCTDSP) += loongarch/idctdsp_init_loongarch.o
OBJS-$(CONFIG_VIDEODSP) += loongarch/videodsp_init.o
OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
+OBJS-$(CONFIG_AAC_ENCODER) += loongarch/aacencdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
loongarch/h264_deblock_lasx.o
@@ -38,3 +39,4 @@ LSX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel.o \
loongarch/h264qpel_lsx.o
LSX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma.o
LSX-OBJS-$(CONFIG_H264PRED) += loongarch/h264intrapred.o
+LSX-OBJS-$(CONFIG_AAC_ENCODER) += loongarch/aacencdsp_loongarch.o
diff --git a/libavcodec/loongarch/aacencdsp_init_loongarch.c b/libavcodec/loongarch/aacencdsp_init_loongarch.c
new file mode 100644
index 0000000000..5f67a5857d
--- /dev/null
+++ b/libavcodec/loongarch/aacencdsp_init_loongarch.c
@@ -0,0 +1,33 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * Contributed by PengXu <pengxu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "aacencdsp_loongarch.h"
+
+av_cold void ff_aacenc_dsp_init_loongarch(AACEncDSPContext *s)
+{
+    const int flags = av_get_cpu_flags();
+    /* Without LSX the function pointers already stored in *s are kept. */
+    if (!have_lsx(flags))
+        return;
+    s->abs_pow34   = ff_abs_pow34_lsx;          /* sqrt(|x| * sqrt(|x|)) kernel */
+    s->quant_bands = ff_aac_quantize_bands_lsx; /* band quantizer kernel */
+}
\ No newline at end of file
diff --git a/libavcodec/loongarch/aacencdsp_loongarch.S b/libavcodec/loongarch/aacencdsp_loongarch.S
new file mode 100644
index 0000000000..b80bb98aa9
--- /dev/null
+++ b/libavcodec/loongarch/aacencdsp_loongarch.S
@@ -0,0 +1,254 @@
+/*
+ * LoongArch LASX/LSX optimized AAC encoder DSP functions
+ *
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * Contributed by PengXu <pengxu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+
+/* void ff_abs_pow34_lsx(float *out, const float *in, const int size); */
+// a0: out, a1: in, a2: size.  Writes out[i] = sqrt(|in[i]| * sqrt(|in[i]|)).
+function ff_abs_pow34_lsx
+    move            t0, zero                // t0: number of 4-float groups done
+    move            t1, zero                // t1: byte offset shared by in and out
+
+    srai.d          t2, a2, 2               // t2: size / 4
+    beq             zero, t2, .FAPL02       // no full group: go to the tails
+
+.FAPL01: /* four floats per iteration */
+    add.d           t3, a1, t1
+    fld.s           f0, t3, 0x00
+    fld.s           f1, t3, 0x04
+    fld.s           f2, t3, 0x08
+    fld.s           f3, t3, 0x0c
+
+    fabs.s          f0, f0
+    fabs.s          f1, f1
+    fabs.s          f2, f2
+    fabs.s          f3, f3
+    // pack the four absolute values into vr0 (f0 already occupies lane 0)
+    vextrins.w      vr0, vr1, 0x10
+    vextrins.w      vr0, vr2, 0x20
+    vextrins.w      vr0, vr3, 0x30
+
+    vfsqrt.s        vr4, vr0                // sqrt(|x|)
+    vfmul.s         vr5, vr0, vr4           // |x| * sqrt(|x|)
+    vfsqrt.s        vr6, vr5                // sqrt(|x| * sqrt(|x|))
+
+    vstx            vr6, a0, t1
+
+    addi.d          t1, t1, 16
+    addi.d          t0, t0, 1
+    blt             t0, t2, .FAPL01
+
+.FAPL02: /* size & 2: two more floats, scalar */
+    andi            t0, a2, 2
+    beq             zero, t0, .FAPL03
+
+    add.d           t3, a1, t1
+    add.d           t4, a0, t1
+
+    fld.s           f0, t3, 0x00
+    fld.s           f1, t3, 0x04
+
+    fabs.s          f0, f0
+    fabs.s          f1, f1
+
+    fsqrt.s         f2, f0
+    fsqrt.s         f3, f1
+
+    fmul.s          f4, f0, f2
+    fmul.s          f5, f1, f3
+
+    fsqrt.s         f6, f4
+    fsqrt.s         f7, f5
+    // store both results to out
+    fst.s           f6, t4, 0x00
+    fst.s           f7, t4, 0x04
+
+    addi.d          t1, t1, 8
+
+.FAPL03: /* size & 1: last float */
+    andi            t0, a2, 1
+    beq             zero, t0, .FAPL04
+
+    fldx.s          f0, a1, t1
+
+    fabs.s          f0, f0
+    fsqrt.s         f2, f0
+    fmul.s          f4, f0, f2
+    fsqrt.s         f6, f4
+    // store the result to out
+    fstx.s          f6, a0, t1
+
+    addi.d          t1, t1, 4
+
+.FAPL04:
+endfunc
+
+
+
+/* void ff_aac_quantize_bands_lsx(int *out, const float *in, const float *scaled,
+                                  int size, int is_signed, int maxval, const float Q34,
+                                  const float rounding) */
+// out[i] = trunc(min(scaled[i] * Q34 + rounding, (float)maxval)), negated when
+// is_signed != 0 and in[i] < 0.
+// Arguments:
+//   a0: out      a1: in          a2: scaled
+//   a3: size     a4: is_signed   a5: maxval
+//   f0: Q34      f1: rounding
+// .FAQBL01 handles four elements per iteration, .FAQBL02 the size & 2 pair and
+// .FAQBL03 the size & 1 element.  The two tails load only the bytes they use,
+// so nothing beyond the last element of scaled/in is read.
+function ff_aac_quantize_bands_lsx
+    move            t0, zero                // t0: number of 4-element groups done
+    move            t1, zero                // t1: byte offset shared by scaled/in/out
+
+    vpermi.w        vr0, vr0, 0x00          // splat Q34
+    vpermi.w        vr1, vr1, 0x00          // splat rounding
+
+    srai.d          t2, a3, 2               // t2: size / 4
+    beq             zero, t2, .FAQBL02      // no full group: go to the tails
+
+.FAQBL01: /* four elements per iteration */
+    vldx            vr2, a2, t1             // scaled[i .. i+3]
+    vfmul.s         vr3, vr2, vr0           // qc = scaled * Q34
+    vfadd.s         vr4, vr3, vr1           // qc + rounding
+
+    movgr2fr.w      f5, a5
+    ffint.s.w       f5, f5                  // (float)maxval
+    vpermi.w        vr5, vr5, 0x00          // splat (float)maxval
+    vfmin.s         vr6, vr4, vr5           // clamp to maxval
+    vfrintrz.s      vr7, vr6                // round toward zero, still float
+
+    beq             a4, zero, .S4ISEND      // is_signed == 0: keep magnitudes
+
+    fsub.s          f8, f0, f0              // f8 = Q34 - Q34, used as 0.0f
+    vshuf4i.w       vr8, vr8, 0x00
+    vldx            vr9, a1, t1             // in[i .. i+3]; f9 is in[i]
+    vextrins.w      vr10, vr9, 0x01         // f10 = in[i+1]
+    vextrins.w      vr11, vr9, 0x02         // f11 = in[i+2]
+    vextrins.w      vr12, vr9, 0x03         // f12 = in[i+3]
+.S4IS00: /* negate lane 0 when in[i] < 0 */
+    fcmp.clt.s      $fcc0, f9, f8
+    bceqz           $fcc0, .S4IS01
+    vextrins.w      vr13, vr7, 0x00
+    fneg.s          f13, f13
+    vextrins.w      vr7, vr13, 0x00
+.S4IS01: /* negate lane 1 when in[i+1] < 0 */
+    fcmp.clt.s      $fcc1, f10, f8
+    bceqz           $fcc1, .S4IS02
+    vextrins.w      vr13, vr7, 0x01
+    fneg.s          f13, f13
+    vextrins.w      vr7, vr13, 0x10
+.S4IS02: /* negate lane 2 when in[i+2] < 0 */
+    fcmp.clt.s      $fcc2, f11, f8
+    bceqz           $fcc2, .S4IS03
+    vextrins.w      vr13, vr7, 0x02
+    fneg.s          f13, f13
+    vextrins.w      vr7, vr13, 0x20
+.S4IS03: /* negate lane 3 when in[i+3] < 0 */
+    fcmp.clt.s      $fcc3, f12, f8
+    bceqz           $fcc3, .S4ISEND
+    vextrins.w      vr13, vr7, 0x03
+    fneg.s          f13, f13
+    vextrins.w      vr7, vr13, 0x30
+.S4ISEND:
+    vftintrz.w.s    vr14, vr7               // float -> int32
+    vstx            vr14, a0, t1
+    addi.d          t1, t1, 16
+    addi.d          t0, t0, 1
+    blt             t0, t2, .FAQBL01
+
+.FAQBL02: /* size & 2: two more elements */
+    andi            t2, a3, 2
+    beq             $r0, t2, .FAQBL03
+
+    fldx.d          f2, a2, t1              // 8 bytes only: scaled[i], scaled[i+1]
+    vfmul.s         vr3, vr2, vr0           // qc (lanes 2 and 3 are unused)
+    vfadd.s         vr4, vr3, vr1
+
+    movgr2fr.w      f5, a5
+    ffint.s.w       f5, f5                  // (float)maxval
+    vpermi.w        vr5, vr5, 0x00          // splat (float)maxval
+    vfmin.s         vr6, vr4, vr5
+    vfrintrz.s      vr7, vr6                // round toward zero, still float
+
+    beq             a4, zero, .S2ISEND
+
+    fsub.s          f8, f0, f0              // f8 = Q34 - Q34, used as 0.0f
+    vshuf4i.w       vr8, vr8, 0x00
+    fldx.d          f9, a1, t1              // 8 bytes only: in[i], in[i+1]
+    vextrins.w      vr10, vr9, 0x01         // f10 = in[i+1]
+.S2IS00: /* negate lane 0 when in[i] < 0 */
+    fcmp.clt.s      $fcc0, f9, f8
+    bceqz           $fcc0, .S2IS01
+    vextrins.w      vr13, vr7, 0x00
+    fneg.s          f13, f13
+    vextrins.w      vr7, vr13, 0x00
+.S2IS01: /* negate lane 1 when in[i+1] < 0 */
+    fcmp.clt.s      $fcc1, f10, f8
+    bceqz           $fcc1, .S2ISEND
+    vextrins.w      vr13, vr7, 0x01
+    fneg.s          f13, f13
+    vextrins.w      vr7, vr13, 0x10
+.S2ISEND:
+    vftintrz.w.s    vr14, vr7
+    vpickve2gr.w    t3, vr14, 0
+    vpickve2gr.w    t4, vr14, 1
+    add.d           t7, a0, t1
+    st.w            t3, t7, 0x00
+    st.w            t4, t7, 0x04
+    addi.d          t1, t1, 8
+
+.FAQBL03: /* size & 1: last element */
+    andi            t2, a3, 1
+    beq             $r0, t2, .FAQBL04
+
+    fldx.s          f2, a2, t1              // 4 bytes only: scaled[i]
+    vfmul.s         vr3, vr2, vr0           // qc (only lane 0 is used)
+    vfadd.s         vr4, vr3, vr1
+
+    movgr2fr.w      f5, a5
+    ffint.s.w       f5, f5                  // (float)maxval
+    vpermi.w        vr5, vr5, 0x00          // splat (float)maxval
+    vfmin.s         vr6, vr4, vr5
+    vfrintrz.s      vr7, vr6                // round toward zero, still float
+
+    beq             a4, zero, .S1ISEND
+
+    fsub.s          f8, f0, f0              // f8 = Q34 - Q34, used as 0.0f
+    vshuf4i.w       vr8, vr8, 0x00
+    fldx.s          f9, a1, t1              // 4 bytes only: in[i]
+.S1IS00: /* negate lane 0 when in[i] < 0 */
+    fcmp.clt.s      $fcc0, f9, f8
+    bceqz           $fcc0, .S1ISEND
+    vextrins.w      vr13, vr7, 0x00
+    fneg.s          f13, f13
+    vextrins.w      vr7, vr13, 0x00
+.S1ISEND:
+    vftintrz.w.s    vr14, vr7
+    vpickve2gr.w    t3, vr14, 0
+    stx.w           t3, a0, t1
+    addi.d          t1, t1, 4
+
+.FAQBL04:
+endfunc
\ No newline at end of file
diff --git a/libavcodec/loongarch/aacencdsp_loongarch.h b/libavcodec/loongarch/aacencdsp_loongarch.h
new file mode 100644
index 0000000000..076cd4d247
--- /dev/null
+++ b/libavcodec/loongarch/aacencdsp_loongarch.h
@@ -0,0 +1,35 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (c) 2024 Loongson Technology Corporation Limited
+ * Contributed by PengXu <pengxu at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_AACENC_H
+#define AVCODEC_LOONGARCH_AACENC_H
+
+#include "libavutil/float_dsp.h"
+#include "libavutil/loongarch/cpu.h"
+#include "libavcodec/aacenc.h"
+
+void ff_abs_pow34_lsx(float *out, const float *in, const int size); /* per-element sqrt(|x| * sqrt(|x|)); body in aacencdsp_loongarch.S */
+void ff_aac_quantize_bands_lsx(int *out, const float *in, const float *scaled, /* in is consulted for its sign only; scaled is multiplied by Q34 */
+ int size, int is_signed, int maxval, const float Q34, /* maxval: upper clamp applied before conversion to int */
+ const float rounding); /* rounding: added to scaled[i] * Q34 before the clamp */
+
+#endif /* AVCODEC_LOONGARCH_AACENC_H */
\ No newline at end of file
--
2.20.1
More information about the ffmpeg-devel
mailing list