[FFmpeg-cvslog] lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX
Justin Ruggles
git at videolan.org
Thu May 10 23:33:13 CEST 2012
ffmpeg | branch: master | Justin Ruggles <justin.ruggles at gmail.com> | Thu May 3 15:23:32 2012 -0400| [5cc6d5244d4ec89b3ac855abff4a3d19caee22f1] | committer: Justin Ruggles
lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX
The current SSE version is slower than the MMX version on Athlon64 and Sandy
Bridge, but the SSE4 and AVX versions are faster on Sandy Bridge.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5cc6d5244d4ec89b3ac855abff4a3d19caee22f1
---
libavresample/x86/audio_convert.asm | 30 ++++++++++++++++--------------
libavresample/x86/audio_convert_init.c | 13 +++++++++----
libavutil/x86/x86util.asm | 7 +++----
3 files changed, 28 insertions(+), 22 deletions(-)
diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm
index 809c5d1..ba59f33 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
mova m3, [srcq+src3q]
mova m4, [srcq+src4q]
mova m5, [srcq+src5q]
-%if cpuflag(sse)
+%if cpuflag(sse4)
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
- movaps m6, m4
- shufps m4, m0, q3210
+ blendps m6, m4, m0, 1100b
movlhps m0, m2
- movhlps m6, m2
- movaps [dstq ], m0
- movaps [dstq+16], m4
- movaps [dstq+32], m6
-
- movaps m6, m5
- shufps m5, m1, q3210
+ movhlps m4, m2
+ blendps m2, m5, m1, 1100b
movlhps m1, m3
- movhlps m6, m3
+ movhlps m5, m3
+
+ movaps [dstq ], m0
+ movaps [dstq+16], m6
+ movaps [dstq+32], m4
movaps [dstq+48], m1
- movaps [dstq+64], m5
- movaps [dstq+80], m6
+ movaps [dstq+64], m2
+ movaps [dstq+80], m5
%else ; mmx
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
@@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
-INIT_XMM sse
+INIT_XMM sse4
+CONV_FLTP_TO_FLT_6CH
+%if HAVE_AVX
+INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
+%endif
diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c
index 6883f10..206aede 100644
--- a/libavresample/x86/audio_convert_init.c
+++ b/libavresample/x86/audio_convert_init.c
@@ -22,8 +22,9 @@
#include "libavutil/cpu.h"
#include "libavresample/audio_convert.h"
-extern void ff_conv_fltp_to_flt_6ch_mmx(float *dst, float *const *src, int len);
-extern void ff_conv_fltp_to_flt_6ch_sse(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
{
@@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
}
- if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+ if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
- 6, 16, 4, "SSE", ff_conv_fltp_to_flt_6ch_sse);
+ 6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
+ }
+ if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
+ ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
+ 6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
}
#endif
}
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 55f4a93..508f24e 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -42,10 +42,9 @@
%endmacro
%macro SBUTTERFLYPS 3
- movaps m%3, m%1
- unpcklps m%1, m%2
- unpckhps m%3, m%2
- SWAP %2, %3
+ unpcklps m%3, m%1, m%2
+ unpckhps m%1, m%1, m%2
+ SWAP %1, %3, %2
%endmacro
%macro TRANSPOSE4x4B 5
More information about the ffmpeg-cvslog
mailing list