[FFmpeg-cvslog] Revert "x86: fft: convert sse inline asm to yasm"
Nicolas George
git at videolan.org
Tue Jun 26 13:07:06 CEST 2012
ffmpeg | branch: master | Nicolas George <nicolas.george at normalesup.org> | Tue Jun 26 13:00:14 2012 +0200| [fd91a3ec44de38251b2c15e03e26d14e983c4e44] | committer: Nicolas George
Revert "x86: fft: convert sse inline asm to yasm"
This reverts commit 82992604706144910f4a2f875d48cfc66c1b70d7.
It breaks shared builds on x86_64.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=fd91a3ec44de38251b2c15e03e26d14e983c4e44
---
libavcodec/x86/Makefile | 1 +
libavcodec/x86/fft_mmx.asm | 139 ++++----------------------------------------
libavcodec/x86/fft_sse.c | 110 +++++++++++++++++++++++++++++++++++
3 files changed, 121 insertions(+), 129 deletions(-)
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 8acbd07..f633cf6 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -43,6 +43,7 @@ YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
+YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
$(YASM-OBJS-FFT-yes)
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 7d046df..e1b485b 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -47,10 +47,6 @@ struc FFTContext
.mdctbits: resd 1
.tcos: pointer 1
.tsin: pointer 1
- .fftperm: pointer 1
- .fftcalc: pointer 1
- .imdctcalc:pointer 1
- .imdcthalf:pointer 1
endstruc
%define M_SQRT1_2 0.70710678118654752440
@@ -69,7 +65,6 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0
%assign i 16
@@ -538,16 +533,6 @@ DEFINE_ARGS z, w, n, o1, o3
rep ret
%endmacro
-%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
- lea r2, [dispatch_tab%1]
- mov r2, [r2 + (%2q-2)*gprsize]
-%ifdef PIC
- lea r3, [$$]
- add r2, r3
-%endif
- call r2
-%endmacro ; FFT_DISPATCH
-
INIT_YMM avx
%if HAVE_AVX
@@ -564,14 +549,6 @@ INIT_YMM avx
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
-
-cglobal fft_calc, 2,5,8
- mov r3d, [r0 + FFTContext.nbits]
- mov r0, r1
- mov r1, r3
- FFT_DISPATCH _interleave %+ SUFFIX, r1
- REP_RET
-
%endif
INIT_XMM sse
@@ -589,112 +566,6 @@ INIT_XMM sse
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0
-cglobal fft_calc, 2,5,8
- mov r3d, [r0 + FFTContext.nbits]
- PUSH r1
- PUSH r3
- mov r0, r1
- mov r1, r3
- FFT_DISPATCH _interleave %+ SUFFIX, r1
- POP rcx
- POP r4
- cmp rcx, 4
- jg .end
- mov r2, -1
- add rcx, 3
- shl r2, cl
- sub r4, r2
-.loop
- movaps xmm0, [r4 + r2]
- movaps xmm1, xmm0
- unpcklps xmm0, [r4 + r2 + 16]
- unpckhps xmm1, [r4 + r2 + 16]
- movaps [r4 + r2], xmm0
- movaps [r4 + r2 + 16], xmm1
- add r2, 32
- jl .loop
-.end:
- REP_RET
-
-cextern_naked memcpy
-
-cglobal fft_permute, 2,7,1
- mov r4, [r0 + FFTContext.revtab]
- mov r5, [r0 + FFTContext.tmpbuf]
- mov ecx, [r0 + FFTContext.nbits]
- mov r2, 1
- shl r2, cl
- xor r0, r0
-%if ARCH_X86_32
- mov r1, r1m
-%endif
-.loop:
- movaps xmm0, [r1 + 8*r0]
- movzx r6, word [r4 + 2*r0]
- movzx r3, word [r4 + 2*r0 + 2]
- movlps [r5 + 8*r6], xmm0
- movhps [r5 + 8*r3], xmm0
- add r0, 2
- cmp r0, r2
- jl .loop
- shl r2, 3
-%if ARCH_X86_64
- mov r0, r1
- mov r1, r5
-%else
- push r2
- push r5
- push r1
-%endif
-%if ARCH_X86_64 && WIN64 == 0
- jmp memcpy
-%else
- call memcpy
-%if ARCH_X86_32
- add esp, 12
-%endif
- REP_RET
-%endif
-
-cglobal imdct_calc, 3,5,3
- mov r3d, [r0 + FFTContext.mdctsize]
- mov r4, [r0 + FFTContext.imdcthalf]
- add r1, r3
- PUSH r3
- PUSH r1
-%if ARCH_X86_32
- push r2
- push r1
- push r0
-%else
- sub rsp, 8
-%endif
- call r4
-%if ARCH_X86_32
- add esp, 12
-%else
- add rsp, 8
-%endif
- POP r1
- POP r3
- lea r0, [r1 + 2*r3]
- mov r2, r3
- sub r3, 16
- neg r2
- movaps xmm2, [ps_m1m1m1m1]
-.loop:
- movaps xmm0, [r1 + r3]
- movaps xmm1, [r0 + r2]
- shufps xmm0, xmm0, 0x1b
- shufps xmm1, xmm1, 0x1b
- xorps xmm0, xmm2
- movaps [r0 + r3], xmm1
- movaps [r1 + r2], xmm0
- sub r3, 16
- add r2, 16
- jl .loop
- REP_RET
-
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
@@ -712,6 +583,16 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define SECTION_REL
%endif
+%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
+ lea r2, [dispatch_tab%1]
+ mov r2, [r2 + (%2q-2)*gprsize]
+%ifdef PIC
+ lea r3, [$$]
+ add r2, r3
+%endif
+ call r2
+%endmacro ; FFT_DISPATCH
+
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
diff --git a/libavcodec/x86/fft_sse.c b/libavcodec/x86/fft_sse.c
new file mode 100644
index 0000000..13b992f
--- /dev/null
+++ b/libavcodec/x86/fft_sse.c
@@ -0,0 +1,110 @@
+/*
+ * FFT/MDCT transform with SSE optimizations
+ * Copyright (c) 2008 Loren Merritt
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "fft.h"
+#include "config.h"
+
+DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
+ { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
+
+void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
+
+#if HAVE_AVX
+void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
+{
+ ff_fft_dispatch_interleave_avx(z, s->nbits);
+}
+#endif
+
+void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
+{
+ int n = 1 << s->nbits;
+
+ ff_fft_dispatch_interleave_sse(z, s->nbits);
+
+ if(n <= 16) {
+ x86_reg i = -8*n;
+ __asm__ volatile(
+ "1: \n"
+ "movaps (%0,%1), %%xmm0 \n"
+ "movaps %%xmm0, %%xmm1 \n"
+ "unpcklps 16(%0,%1), %%xmm0 \n"
+ "unpckhps 16(%0,%1), %%xmm1 \n"
+ "movaps %%xmm0, (%0,%1) \n"
+ "movaps %%xmm1, 16(%0,%1) \n"
+ "add $32, %0 \n"
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(z+n)
+ :"memory"
+ );
+ }
+}
+
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
+{
+ int n = 1 << s->nbits;
+ int i;
+ for(i=0; i<n; i+=2) {
+ __asm__ volatile(
+ "movaps %2, %%xmm0 \n"
+ "movlps %%xmm0, %0 \n"
+ "movhps %%xmm0, %1 \n"
+ :"=m"(s->tmp_buf[s->revtab[i]]),
+ "=m"(s->tmp_buf[s->revtab[i+1]])
+ :"m"(z[i])
+ );
+ }
+ memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
+}
+
+void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+ x86_reg j, k;
+ long n = s->mdct_size;
+ long n4 = n >> 2;
+
+ s->imdct_half(s, output + n4, input);
+
+ j = -n;
+ k = n-16;
+ __asm__ volatile(
+ "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
+ "1: \n"
+ "movaps (%2,%1), %%xmm0 \n"
+ "movaps (%3,%0), %%xmm1 \n"
+ "shufps $0x1b, %%xmm0, %%xmm0 \n"
+ "shufps $0x1b, %%xmm1, %%xmm1 \n"
+ "xorps %%xmm7, %%xmm0 \n"
+ "movaps %%xmm1, (%3,%1) \n"
+ "movaps %%xmm0, (%2,%0) \n"
+ "sub $16, %1 \n"
+ "add $16, %0 \n"
+ "jl 1b \n"
+ :"+r"(j), "+r"(k)
+ :"r"(output+n4), "r"(output+n4*3)
+ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
+ );
+}
More information about the ffmpeg-cvslog
mailing list