[FFmpeg-cvslog] aarch64: NEON float (i)MDCT
Janne Grunau
git at videolan.org
Tue Apr 22 23:33:45 CEST 2014
ffmpeg | branch: master | Janne Grunau <janne-libav at jannau.net> | Tue Apr 15 18:35:57 2014 +0200| [ee2bc5974fe64fd214f52574400ae01c85f4b855] | committer: Janne Grunau
aarch64: NEON float (i)MDCT
Approximately as fast as the ARM NEON version on Apple's A7.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ee2bc5974fe64fd214f52574400ae01c85f4b855
---
libavcodec/aarch64/Makefile | 1 +
libavcodec/aarch64/fft_init_aarch64.c | 10 +
libavcodec/aarch64/mdct_neon.S | 323 +++++++++++++++++++++++++++++++++
3 files changed, 334 insertions(+)
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 01f618f..ec1fd1e 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -18,3 +18,4 @@ NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
+NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c
index caa5a0d..589e82d 100644
--- a/libavcodec/aarch64/fft_init_aarch64.c
+++ b/libavcodec/aarch64/fft_init_aarch64.c
@@ -26,6 +26,10 @@
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
av_cold void ff_fft_init_aarch64(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -33,5 +37,11 @@ av_cold void ff_fft_init_aarch64(FFTContext *s)
if (have_neon(cpu_flags)) {
s->fft_permute = ff_fft_permute_neon;
s->fft_calc = ff_fft_calc_neon;
+#if CONFIG_MDCT
+ s->imdct_calc = ff_imdct_calc_neon;
+ s->imdct_half = ff_imdct_half_neon;
+ s->mdct_calc = ff_mdct_calc_neon;
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
}
}
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
new file mode 100644
index 0000000..bccd832
--- /dev/null
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -0,0 +1,323 @@
+/*
+ * AArch64 NEON optimised MDCT
+ * Copyright (c) 2009 Mans Rullgard <mans at mansr.com>
+ * Copyright (c) 2014 Janne Grunau <janne-libav at jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_imdct_half_neon, export=1
+ sub sp, sp, #32
+ stp x19, x20, [sp]
+ str x30, [sp, #16]
+ mov x12, #1
+ ldr w14, [x0, #28] // mdct_bits
+ ldr x4, [x0, #32] // tcos
+ ldr x3, [x0, #8] // revtab
+ lsl x12, x12, x14 // n = 1 << nbits
+ lsr x14, x12, #2 // n4 = n >> 2
+ add x7, x2, x12, lsl #1
+ mov x12, #-16
+ sub x7, x7, #16
+
+ ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
+ ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x
+ rev64 v17.2s, v17.2s
+ ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2
+ fmul v6.2s, v17.2s, v2.2s
+ fmul v7.2s, v0.2s, v2.2s
+1:
+ subs x14, x14, #2
+ ldr w6, [x3], #4
+ fmul v4.2s, v0.2s, v3.2s
+ fmul v5.2s, v17.2s, v3.2s
+ fsub v4.2s, v6.2s, v4.2s
+ fadd v5.2s, v5.2s, v7.2s
+ ubfm x8, x6, #16, #31
+ ubfm x6, x6, #0, #15
+ add x8, x1, x8, lsl #3
+ add x6, x1, x6, lsl #3
+ b.eq 2f
+ ld2 {v16.2s,v17.2s}, [x7], x12
+ ld2 {v0.2s,v1.2s}, [x2], #16
+ rev64 v17.2s, v17.2s
+ ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2
+ fmul v6.2s, v17.2s, v2.2s
+ fmul v7.2s, v0.2s, v2.2s
+ st2 {v4.s,v5.s}[0], [x6]
+ st2 {v4.s,v5.s}[1], [x8]
+ b 1b
+2:
+ st2 {v4.s,v5.s}[0], [x6]
+ st2 {v4.s,v5.s}[1], [x8]
+
+ mov x19, x0
+ mov x20, x1
+ bl X(ff_fft_calc_neon)
+
+ mov x12, #1
+ ldr w14, [x19, #28] // mdct_bits
+ ldr x4, [x19, #32] // tcos
+ lsl x12, x12, x14 // n = 1 << nbits
+ lsr x14, x12, #3 // n8 = n >> 3
+
+ add x4, x4, x14, lsl #3
+ add x6, x20, x14, lsl #3
+ sub x1, x4, #16
+ sub x3, x6, #16
+
+ mov x7, #-16
+ mov x8, x6
+ mov x0, x3
+
+ ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0
+ ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
+ ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
+3:
+ subs x14, x14, #2
+ fmul v7.2s, v0.2s, v17.2s
+ ld2 {v18.2s,v19.2s},[x4], #16 // d17=c2,c3 d19=s2,s3
+ fmul v4.2s, v1.2s, v17.2s
+ fmul v6.2s, v21.2s, v19.2s
+ fmul v5.2s, v20.2s, v19.2s
+ fmul v22.2s, v1.2s, v16.2s
+ fmul v23.2s, v21.2s, v18.2s
+ fmul v24.2s, v0.2s, v16.2s
+ fmul v25.2s, v20.2s, v18.2s
+ fadd v7.2s, v7.2s, v22.2s
+ fadd v5.2s, v5.2s, v23.2s
+ fsub v4.2s, v4.2s, v24.2s
+ fsub v6.2s, v6.2s, v25.2s
+ b.eq 4f
+ ld2 {v0.2s,v1.2s}, [x3], x7
+ ld2 {v20.2s,v21.2s},[x6], #16
+ ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
+ rev64 v5.2s, v5.2s
+ rev64 v7.2s, v7.2s
+ st2 {v4.2s,v5.2s}, [x0], x7
+ st2 {v6.2s,v7.2s}, [x8], #16
+ b 3b
+4:
+ rev64 v5.2s, v5.2s
+ rev64 v7.2s, v7.2s
+ st2 {v4.2s,v5.2s}, [x0]
+ st2 {v6.2s,v7.2s}, [x8]
+
+ ldp x19, x20, [sp]
+ ldr x30, [sp, #16]
+ add sp, sp, #32
+
+ ret
+endfunc
+
+function ff_imdct_calc_neon, export=1
+ sub sp, sp, #32
+ stp x19, x20, [sp]
+ str x30, [sp, #16]
+ ldr w3, [x0, #28] // mdct_bits
+ mov x19, #1
+ mov x20, x1
+ lsl x19, x19, x3
+ add x1, x1, x19
+
+ bl X(ff_imdct_half_neon)
+
+ add x0, x20, x19, lsl #2
+ add x1, x20, x19, lsl #1
+ sub x0, x0, #8
+ sub x2, x1, #16
+ mov x3, #-16
+ mov x6, #-8
+1:
+ ld1 {v0.4s}, [x2], x3
+ prfum pldl1keep, [x0, #-16]
+ rev64 v0.4s, v0.4s
+ ld1 {v2.2s,v3.2s}, [x1], #16
+ fneg v4.4s, v0.4s
+ prfum pldl1keep, [x2, #-16]
+ rev64 v2.2s, v2.2s
+ rev64 v3.2s, v3.2s
+ ext v4.16b, v4.16b, v4.16b, #8
+ st1 {v2.2s}, [x0], x6
+ st1 {v3.2s}, [x0], x6
+ st1 {v4.4s}, [x20], #16
+ subs x19, x19, #16
+ b.gt 1b
+
+ ldp x19, x20, [sp], #16
+ ldr x30, [sp], #16
+
+ ret
+endfunc
+
+
+function ff_mdct_calc_neon, export=1
+ sub sp, sp, #32
+ stp x19, x20, [sp]
+ str x30, [sp, #16]
+
+ mov x12, #1
+ ldr w14, [x0, #28] // mdct_bits
+ ldr x4, [x0, #32] // tcos
+ ldr x3, [x0, #8] // revtab
+ lsl x14, x12, x14 // n = 1 << nbits
+ add x7, x2, x14 // in4u
+ sub x9, x7, #16 // in4d
+ add x2, x7, x14, lsl #1 // in3u
+ add x8, x9, x14, lsl #1 // in3d
+ add x5, x4, x14, lsl #1
+ sub x5, x5, #16
+ sub x3, x3, #4
+ mov x12, #-16
+ lsr x13, x14, #1
+
+ ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
+ ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
+ ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
+ rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
+ rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
+ ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
+ fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
+ ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
+ rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
+ rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
+ ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
+ fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
+ fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
+ fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
+1:
+ fmul v7.2s, v0.2s, v21.2s // I*s
+ ldr w10, [x3, x13]
+ fmul v6.2s, v2.2s, v20.2s // -R*c
+ ldr w6, [x3, #4]!
+ fmul v4.2s, v2.2s, v21.2s // -R*s
+ fmul v5.2s, v0.2s, v20.2s // I*c
+ fmul v24.2s, v16.2s, v30.2s // R*c
+ fmul v25.2s, v18.2s, v31.2s // -I*s
+ fmul v22.2s, v16.2s, v31.2s // R*s
+ fmul v23.2s, v18.2s, v30.2s // I*c
+ subs x14, x14, #16
+ subs x13, x13, #8
+ fsub v6.2s, v6.2s, v7.2s // -R*c-I*s
+ fadd v7.2s, v4.2s, v5.2s // -R*s+I*c
+ fsub v24.2s, v25.2s, v24.2s // I*s-R*c
+ fadd v25.2s, v22.2s, v23.2s // R*s-I*c
+ b.eq 1f
+ mov x12, #-16
+ ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
+ ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
+ fneg v7.2s, v7.2s // R*s-I*c
+ ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
+ rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
+ rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
+ ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
+ fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
+ ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
+ rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
+ rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
+ ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
+ fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
+ fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
+ fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
+ ubfm x12, x6, #16, #31
+ ubfm x6, x6, #0, #15
+ add x12, x1, x12, lsl #3
+ add x6, x1, x6, lsl #3
+ st2 {v6.s,v7.s}[0], [x6]
+ st2 {v6.s,v7.s}[1], [x12]
+ ubfm x6, x10, #16, #31
+ ubfm x10, x10, #0, #15
+ add x6 , x1, x6, lsl #3
+ add x10, x1, x10, lsl #3
+ st2 {v24.s,v25.s}[0], [x10]
+ st2 {v24.s,v25.s}[1], [x6]
+ b 1b
+1:
+ fneg v7.2s, v7.2s // R*s-I*c
+ ubfm x12, x6, #16, #31
+ ubfm x6, x6, #0, #15
+ add x12, x1, x12, lsl #3
+ add x6, x1, x6, lsl #3
+ st2 {v6.s,v7.s}[0], [x6]
+ st2 {v6.s,v7.s}[1], [x12]
+ ubfm x6, x10, #16, #31
+ ubfm x10, x10, #0, #15
+ add x6 , x1, x6, lsl #3
+ add x10, x1, x10, lsl #3
+ st2 {v24.s,v25.s}[0], [x10]
+ st2 {v24.s,v25.s}[1], [x6]
+
+ mov x19, x0
+ mov x20, x1
+ bl X(ff_fft_calc_neon)
+
+ mov x12, #1
+ ldr w14, [x19, #28] // mdct_bits
+ ldr x4, [x19, #32] // tcos
+ lsl x12, x12, x14 // n = 1 << nbits
+ lsr x14, x12, #3 // n8 = n >> 3
+
+ add x4, x4, x14, lsl #3
+ add x6, x20, x14, lsl #3
+ sub x1, x4, #16
+ sub x3, x6, #16
+
+ mov x7, #-16
+ mov x8, x6
+ mov x0, x3
+
+ ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0
+ ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3
+ ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
+1:
+ subs x14, x14, #2
+ fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0
+ ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3
+ fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0
+ fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3
+ fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3
+ fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0
+ fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3
+ fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3
+ fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0
+ fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0
+ fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3
+ fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3
+ fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0
+ fneg v4.2s, v4.2s
+ fneg v6.2s, v6.2s
+ b.eq 1f
+ ld2 {v0.2s, v1.2s}, [x3], x7
+ ld2 {v20.2s,v21.2s}, [x6], #16
+ ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
+ rev64 v5.2s, v5.2s
+ rev64 v7.2s, v7.2s
+ st2 {v4.2s,v5.2s}, [x0], x7
+ st2 {v6.2s,v7.2s}, [x8], #16
+ b 1b
+1:
+ rev64 v5.2s, v5.2s
+ rev64 v7.2s, v7.2s
+ st2 {v4.2s,v5.2s}, [x0]
+ st2 {v6.2s,v7.2s}, [x8]
+
+ ldp x19, x20, [sp], #16
+ ldr x30, [sp], #16
+ ret
+endfunc
More information about the ffmpeg-cvslog
mailing list