[FFmpeg-devel] [PATCH 5/6] x86: lossless audio: SSE4 madd 32bits
Christophe Gisquet
christophe.gisquet at gmail.com
Mon Apr 18 15:07:30 CEST 2016
The only user so far is 24-bit wmalossless. The few samples tested show an
order of 8, so more unrolling or an AVX2 version does not make sense.
Timings: 72 -> 49 cycles
---
libavcodec/x86/lossless_audiodsp.asm | 38 +++++++++++++++++++++++++++++++++
libavcodec/x86/lossless_audiodsp_init.c | 7 ++++++
2 files changed, 45 insertions(+)
diff --git a/libavcodec/x86/lossless_audiodsp.asm b/libavcodec/x86/lossless_audiodsp.asm
index 5597dad..1e295de 100644
--- a/libavcodec/x86/lossless_audiodsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -155,3 +155,41 @@ SCALARPRODUCT_LOOP 0
HADDD m6, m0
movd eax, m6
RET
+
+%macro SCALARPRODUCT32 0
+; int32_t ff_scalarproduct_and_madd_int32(int32_t *v1, const int32_t *v2,
+;                       const int32_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
+    movd    m7, mulm                        ; only 4 args are in registers; mul is read via mulm
+    SPLATD  m7                              ; m7 = mul replicated in every dword lane
+    pxor    m6, m6                          ; m6 = per-lane accumulators for sum(v1[i] * v2[i])
+    lea     v1q, [v1q + 4*orderq]           ; order counts int32 elements, so scale it by 4
+    lea     v2q, [v2q + 4*orderq]           ; to move each pointer to the end of its array
+    lea     v3q, [v3q + 4*orderq]
+    neg     orderq                          ; element index now runs from -order up to 0
+.loop:
+    movu    m0, [v2q + 4*orderq]
+    movu    m1, [v2q + 4*orderq + mmsize]
+    mova    m4, [v1q + 4*orderq]            ; v1 is accessed with aligned loads and stores
+    mova    m5, [v1q + 4*orderq + mmsize]
+    movu    m2, [v3q + 4*orderq]
+    movu    m3, [v3q + 4*orderq + mmsize]
+    pmulld  m0, m4                          ; m0/m1 = v1 * v2, using v1 before it is updated
+    pmulld  m1, m5
+    pmulld  m2, m7                          ; m2/m3 = mul * v3
+    pmulld  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddd   m2, m4                          ; m2/m3 = v1 + mul * v3
+    paddd   m3, m5
+    mova    [v1q + 4*orderq], m2
+    mova    [v1q + 4*orderq + mmsize], m3
+    add     orderq, mmsize/2                ; 2 vectors of mmsize/4 dwords per iteration
+    jl      .loop
+    HADDD   m6, m0                          ; horizontal sum of the accumulator lanes
+    movd    eax, m6                         ; return the scalar product
+    RET
+%endmacro
+
+INIT_XMM sse4                               ; pmulld needs SSE4.1
+SCALARPRODUCT32
diff --git a/libavcodec/x86/lossless_audiodsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c
index 197173c..85306cb 100644
--- a/libavcodec/x86/lossless_audiodsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -31,6 +31,10 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
+int32_t ff_scalarproduct_and_madd_int32_sse4(int32_t *v1, const int32_t *v2,
+ const int32_t *v3,
+ int order, int mul);
+
av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
#if HAVE_YASM
@@ -45,5 +49,8 @@ av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
if (EXTERNAL_SSSE3(cpu_flags) &&
!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
#endif
}
--
2.8.1
More information about the ffmpeg-devel
mailing list