[FFmpeg-cvslog] lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX

Thu May 10 23:33:13 CEST 2012

ffmpeg | branch: master | Justin Ruggles <justin.ruggles at gmail.com> | Thu May  3 15:23:32 2012 -0400| [5cc6d5244d4ec89b3ac855abff4a3d19caee22f1] | committer: Justin Ruggles

lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX

The current SSE version is slower than the MMX version on Athlon64 and Sandy
Bridge, but the SSE4 and AVX versions are faster than MMX on Sandy Bridge.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5cc6d5244d4ec89b3ac855abff4a3d19caee22f1
---

 libavresample/x86/audio_convert.asm    |   30 ++++++++++++++++--------------
 libavresample/x86/audio_convert_init.c |   13 +++++++++----
 libavutil/x86/x86util.asm              |    7 +++----
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm
index 809c5d1..ba59f33 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
     mova      m3, [srcq+src3q]
     mova      m4, [srcq+src4q]
     mova      m5, [srcq+src5q]
-%if cpuflag(sse)
+%if cpuflag(sse4)
     SBUTTERFLYPS 0, 1, 6
     SBUTTERFLYPS 2, 3, 6
     SBUTTERFLYPS 4, 5, 6
 
-    movaps    m6, m4
-    shufps    m4, m0, q3210
+    blendps   m6, m4, m0, 1100b
     movlhps   m0, m2
-    movhlps   m6, m2
-    movaps [dstq   ], m0
-    movaps [dstq+16], m4
-    movaps [dstq+32], m6
-
-    movaps    m6, m5
-    shufps    m5, m1, q3210
+    movhlps   m4, m2
+    blendps   m2, m5, m1, 1100b
     movlhps   m1, m3
-    movhlps   m6, m3
+    movhlps   m5, m3
+
+    movaps [dstq   ], m0
+    movaps [dstq+16], m6
+    movaps [dstq+32], m4
     movaps [dstq+48], m1
-    movaps [dstq+64], m5
-    movaps [dstq+80], m6
+    movaps [dstq+64], m2
+    movaps [dstq+80], m5
 %else ; mmx
     SBUTTERFLY dq, 0, 1, 6
     SBUTTERFLY dq, 2, 3, 6
@@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
 
 INIT_MMX mmx
 CONV_FLTP_TO_FLT_6CH
-INIT_XMM sse
+INIT_XMM sse4
+CONV_FLTP_TO_FLT_6CH
+%if HAVE_AVX
+INIT_XMM avx
 CONV_FLTP_TO_FLT_6CH
+%endif
diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c
index 6883f10..206aede 100644
--- a/libavresample/x86/audio_convert_init.c
+++ b/libavresample/x86/audio_convert_init.c
@@ -22,8 +22,9 @@
 #include "libavutil/cpu.h"
 #include "libavresample/audio_convert.h"
 
-extern void ff_conv_fltp_to_flt_6ch_mmx(float *dst, float *const *src, int len);
-extern void ff_conv_fltp_to_flt_6ch_sse(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
+extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
 
 av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
 {
@@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
                                   6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
     }
-    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
-                                  6, 16, 4, "SSE", ff_conv_fltp_to_flt_6ch_sse);
+                                  6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
+    }
+    if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
+        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
+                                  6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
     }
 #endif
 }
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 55f4a93..508f24e 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -42,10 +42,9 @@
 %endmacro
 
 %macro SBUTTERFLYPS 3
-    movaps   m%3, m%1
-    unpcklps m%1, m%2
-    unpckhps m%3, m%2
-    SWAP %2, %3
+    unpcklps m%3, m%1, m%2
+    unpckhps m%1, m%1, m%2
+    SWAP %1, %3, %2
 %endmacro
 
 %macro TRANSPOSE4x4B 5