[FFmpeg-devel] [PATCH] x86/swr: replace sse4 instructions in pack_6ch with sse ones
James Almer
jamrial at gmail.com
Tue Nov 4 22:11:27 CET 2014
There's no benefit from using blendps here except on CPUs with AVX, where
it's faster than shufps according to Intel's documentation.
As such, rename the sse4 functions to sse/sse2 and use shufps instead.
Signed-off-by: James Almer <jamrial at gmail.com>
---
libswresample/x86/audio_convert.asm | 31 +++++++++++++++++++++----------
libswresample/x86/audio_convert_init.c | 23 ++++++++++++-----------
2 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/libswresample/x86/audio_convert.asm b/libswresample/x86/audio_convert.asm
index b6e9e5d..d77e934 100644
--- a/libswresample/x86/audio_convert.asm
+++ b/libswresample/x86/audio_convert.asm
@@ -245,15 +245,27 @@ pack_6ch_%2_to_%1_u_int %+ SUFFIX
mov%3 m4, [srcq+src4q]
mov%3 m5, [srcq+src5q]
%7 x,x,x,x,m7,x
-%if cpuflag(sse4)
+%if cpuflag(sse)
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
+%if cpuflag(avx)
blendps m6, m4, m0, 1100b
+%else
+ movaps m6, m4
+ shufps m4, m0, q3210
+ SWAP 4,6
+%endif
movlhps m0, m2
movhlps m4, m2
+%if cpuflag(avx)
blendps m2, m5, m1, 1100b
+%else
+ movaps m2, m5
+ shufps m5, m1, q3210
+ SWAP 2,5
+%endif
movlhps m1, m3
movhlps m5, m3
@@ -380,6 +392,10 @@ CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
+INIT_XMM sse
+PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
+PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
+
INIT_XMM sse2
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
@@ -431,6 +447,10 @@ UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
@@ -440,15 +460,6 @@ UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
-INIT_XMM sse4
-PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
-PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
-
-PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
-PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
-PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
-PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
-
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
diff --git a/libswresample/x86/audio_convert_init.c b/libswresample/x86/audio_convert_init.c
index a26cdf6..769575d 100644
--- a/libswresample/x86/audio_convert_init.c
+++ b/libswresample/x86/audio_convert_init.c
@@ -58,7 +58,12 @@ MULTI_CAPS_FUNC(SSE2, sse2)
ac->simd_f = ff_pack_6ch_float_to_float_a_mmx;
}
}
-
+ if(EXTERNAL_SSE(mm_flags)) {
+ if(channels == 6) {
+ if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+ ac->simd_f = ff_pack_6ch_float_to_float_a_sse;
+ }
+ }
if(EXTERNAL_SSE2(mm_flags)) {
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
ac->simd_f = ff_int32_to_float_a_sse2;
@@ -105,6 +110,12 @@ MULTI_CAPS_FUNC(SSE2, sse2)
if( out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLT)
ac->simd_f = ff_unpack_2ch_float_to_int16_a_sse2;
}
+ if(channels == 6) {
+ if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P)
+ ac->simd_f = ff_pack_6ch_int32_to_float_a_sse2;
+ if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP)
+ ac->simd_f = ff_pack_6ch_float_to_int32_a_sse2;
+ }
}
if(EXTERNAL_SSSE3(mm_flags)) {
if(channels == 2) {
@@ -116,16 +127,6 @@ MULTI_CAPS_FUNC(SSE2, sse2)
ac->simd_f = ff_unpack_2ch_int16_to_float_a_ssse3;
}
}
- if(EXTERNAL_SSE4(mm_flags)) {
- if(channels == 6) {
- if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
- ac->simd_f = ff_pack_6ch_float_to_float_a_sse4;
- if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P)
- ac->simd_f = ff_pack_6ch_int32_to_float_a_sse4;
- if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP)
- ac->simd_f = ff_pack_6ch_float_to_int32_a_sse4;
- }
- }
if(EXTERNAL_AVX(mm_flags)) {
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
ac->simd_f = ff_int32_to_float_a_avx;
--
2.0.4
More information about the ffmpeg-devel
mailing list