[FFmpeg-devel] [PATCH 2/2] x86/aacps: add ff_ps_stereo_interpolate_ipdopd_sse3()
James Almer
jamrial at gmail.com
Tue May 23 22:01:18 EEST 2017
About 2x faster than the C version.
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/aacpsdsp.asm | 51 ++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/aacpsdsp_init.c | 4 ++++
2 files changed, 55 insertions(+)
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index e92cbbce08..bb8a7f5df0 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -117,6 +117,57 @@ align 16
.ret:
REP_RET
+;***************************************************************************
+;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+; float h[2][4], float h_step[2][4],
+; int len);
+;***************************************************************************
+INIT_XMM sse3 ; 128-bit XMM code path, SSE3 (movddup/addsubps) required
+cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n ; 5 args, 5 GPRs, up to 10 XMM regs (m8/m9 only on x86-64)
+ cmp nd, 0
+ jle .ret ; nothing to do for len <= 0
+ movaps m0, [hq] ; m0 = h[0][0..3] (aligned load; coefficients applied to the un-swapped samples)
+ movaps m1, [hq+mmsize] ; m1 = h[1][0..3] (coefficients applied to the re/im-swapped samples)
+%if ARCH_X86_64
+ movaps m8, [h_stepq] ; x86-64: keep both step rows in registers for the whole loop
+ movaps m9, [h_stepq+mmsize]
+ %define H_STEP0 m8
+ %define H_STEP1 m9
+%else ; x86-32: only xmm0-7 are available, so add the steps straight from memory
+ %define H_STEP0 [h_stepq]
+ %define H_STEP1 [h_stepq+mmsize]
+%endif
+ shl nd, 3 ; n = len * 8 = byte size of len {re, im} float pairs
+ add lq, nq ; l and r now point one past the last sample ...
+ add rq, nq
+ neg nq ; ... and n is a negative byte offset that counts up to 0
+
+align 16
+.loop:
+ addps m0, H_STEP0 ; h[0][*] += h_step[0][*] (stepped before use)
+ addps m1, H_STEP1 ; h[1][*] += h_step[1][*]
+ movddup m2, [lq+nq] ; m2 = { l_re, l_im, l_re, l_im }
+ movddup m3, [rq+nq] ; m3 = { r_re, r_im, r_re, r_im }
+ shufps m4, m2, m2, q2301 ; m4 = { l_im, l_re, l_im, l_re }
+ shufps m5, m3, m3, q2301 ; m5 = { r_im, r_re, r_im, r_re }
+ unpcklps m6, m0, m0 ; m6 = { h00, h00, h01, h01 }
+ unpckhps m7, m0, m0 ; m7 = { h02, h02, h03, h03 }
+ mulps m2, m6 ; m2 = { h00*l_re, h00*l_im, h01*l_re, h01*l_im }
+ mulps m3, m7 ; m3 = { h02*r_re, h02*r_im, h03*r_re, h03*r_im }
+ unpcklps m6, m1, m1 ; m6 = { h10, h10, h11, h11 }
+ unpckhps m7, m1, m1 ; m7 = { h12, h12, h13, h13 }
+ mulps m4, m6 ; m4 = { h10*l_im, h10*l_re, h11*l_im, h11*l_re }
+ mulps m5, m7 ; m5 = { h12*r_im, h12*r_re, h13*r_im, h13*r_re }
+ addps m2, m3 ; m2 = h0x*l + h0x*r partial sums
+ addsubps m2, m4 ; even (re) lanes -= h1x*l_im, odd (im) lanes += h1x*l_re
+ addsubps m2, m5 ; even (re) lanes -= h1x*r_im, odd (im) lanes += h1x*r_re; both terms must be applied to m2 separately, folding m5 into m4 first flips the sign of h1x*r_im
+ movsd [lq+nq], m2 ; low 64 bits  -> l[i] = { re, im }
+ movhps [rq+nq], m2 ; high 64 bits -> r[i] = { re, im }
+ add nq, 8 ; advance one {re, im} pair
+ jl .loop ; loop while the offset is still negative
+.ret:
+ REP_RET
+
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
index f6d6c039c3..767ae6588e 100644
--- a/libavcodec/x86/aacpsdsp_init.c
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -37,6 +37,9 @@ void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
+void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len);
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
@@ -50,6 +53,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
if (EXTERNAL_SSE3(cpu_flags)) {
s->add_squares = ff_ps_add_squares_sse3;
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
+ s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
}
}
--
2.12.1
More information about the ffmpeg-devel
mailing list