[FFmpeg-devel] [PATCH] x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse, sse4}
James Almer
jamrial at gmail.com
Mon Jun 12 21:09:27 EEST 2017
About 2x faster than the C version.
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/aacpsdsp.asm | 123 +++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/aacpsdsp_init.c | 8 +++
libavutil/x86/x86util.asm | 15 +++--
3 files changed, 140 insertions(+), 6 deletions(-)
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index f7f22f274c..cdcadefcdc 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -172,6 +172,129 @@ align 16
.ret:
REP_RET
+;***********************************************************
+;void ff_ps_hybrid_synthesis_deint_<opt>(float out[2][38][64],
+; float (*in)[32][2],
+; int i, int len)
+;***********************************************************
+%macro HYBRID_SYNTHESIS_DEINT 0 ; de-interleave re/im pairs: out[0][n][k] = in[k][n][0], out[1][n][k] = in[k][n][1] for k = i..63, n = 0..31
+cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
+%if cpuflag(sse4)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+ movsxdifnidn iq, id ; x86inc helper (not defined in this file view); presumably widens the int argument i to pointer width - confirm
+ mov lend, 32 << 3 ; 256 bytes = one in[k] row (32 re/im float pairs) = one out[c][n] row (64 floats); the caller's len value is never read
+ lea outq, [outq+iq*4] ; outq = &out[0][0][i]
+ mov tmpd, id
+ shl tmpd, 8 ; tmp = i * 256 (byte offset of row i in "in")
+ add inq, tmpq ; inq = &in[i][0][0]
+ mov tmpd, 64
+ sub tmpd, id
+ mov id, tmpd ; i = 64 - i: number of output columns still to write
+
+ test id, 1
+ jne .loop4 ; odd count: write a single column first
+ test id, 2
+ jne .loop8 ; count % 4 == 2: write a pair of columns first
+
+align 16
+.loop16: ; four columns (rows in[k..k+3]) per outer iteration; NOTE(review): movaps stores assume outq is 16-byte aligned here - confirm out's alignment
+ mov out0q, outq ; out0q walks out[0]
+ mov out1q, 38*64*4 ; byte size of one out[c] plane (38 rows * 64 floats)
+ add out1q, out0q ; out1q walks the same position in out[1]
+ mov tmpd, lend ; tmp = bytes of the current "in" row still to read
+
+.inner_loop16: ; handles two n values (n, n+1) per iteration
+ movaps m0, [inq] ; in[k][n..n+1] = re0 im0 re1 im1
+ movaps m1, [inq+lenq] ; in[k+1][n..n+1]
+ movaps m2, [inq+lenq*2] ; in[k+2][n..n+1]
+ movaps m3, [inq+3*32*2*4] ; in[k+3][n..n+1] (3 rows * 256 bytes)
+ TRANSPOSE4x4PS 0, 1, 2, 3, 4 ; 4x4 float transpose, m4 is scratch: m0 = re(n), m1 = im(n), m2 = re(n+1), m3 = im(n+1), each for k..k+3
+ movaps [out0q], m0 ; out[0][n][k..k+3]
+ movaps [out1q], m1 ; out[1][n][k..k+3]
+ movaps [out0q+lenq], m2 ; out[0][n+1][k..k+3]
+ movaps [out1q+lenq], m3 ; out[1][n+1][k..k+3]
+ lea out0q, [out0q+lenq*2] ; advance both output pointers by two rows
+ lea out1q, [out1q+lenq*2]
+ add inq, mmsize ; next two re/im pairs of each input row
+ sub tmpd, mmsize
+ jg .inner_loop16
+ add outq, 16 ; next four output columns
+ add inq, 3*32*2*4 ; the inner loop advanced inq by one row; skip the other three
+ sub id, 4
+ jg .loop16
+ RET
+
+align 16
+.loop8: ; two columns (rows in[k], in[k+1]); entered at most once per call
+ mov out0q, outq
+ mov out1q, 38*64*4 ; byte size of one out[c] plane
+ add out1q, out0q
+ mov tmpd, lend
+
+.inner_loop8: ; handles two n values per iteration
+ movaps m0, [inq] ; in[k][n..n+1]
+ movaps m1, [inq+lenq] ; in[k+1][n..n+1]
+ SBUTTERFLYPS 0, 1, 2 ; definition not shown here; presumably interleaves the floats of m0 and m1 - confirm
+ SBUTTERFLYPD 0, 1, 2 ; intended result: m0 = re values, m1 = im values; low half = n, high half = n+1
+ MOVH [out0q], m0 ; MOVH (movsd or movlps) stores the low 64 bits: out[0][n][k..k+1]
+ MOVH [out1q], m1 ; out[1][n][k..k+1]
+ movhps [out0q+lenq], m0 ; high 64 bits: out[0][n+1][k..k+1]
+ movhps [out1q+lenq], m1 ; out[1][n+1][k..k+1]
+ lea out0q, [out0q+lenq*2]
+ lea out1q, [out1q+lenq*2]
+ add inq, mmsize
+ sub tmpd, mmsize
+ jg .inner_loop8
+ add outq, 8 ; next two output columns
+ add inq, lenq ; inner loop advanced one row; skip the second row consumed above
+ sub id, 2
+ jg .loop16 ; remaining count is now a multiple of 4
+ RET
+
+align 16
+.loop4: ; one column (row in[k]); entered at most once per call
+ mov out0q, outq
+ mov out1q, 38*64*4 ; byte size of one out[c] plane
+ add out1q, out0q
+ mov tmpd, lend
+
+.inner_loop4: ; handles two n values per iteration
+ movaps m0, [inq] ; in[k][n..n+1] = re0 im0 re1 im1
+ movss [out0q], m0 ; out[0][n][k] = re0
+%if cpuflag(sse4)
+ extractps [out1q], m0, 1 ; out[1][n][k] = im0
+ extractps [out0q+lenq], m0, 2 ; out[0][n+1][k] = re1
+ extractps [out1q+lenq], m0, 3 ; out[1][n+1][k] = im1
+%else
+ movhlps m1, m0 ; low half of m1 = re1 im1
+ movss [out0q+lenq], m1 ; out[0][n+1][k] = re1
+ shufps m0, m0, 0xb1 ; swap within each pair: im0 re0 im1 re1
+ movss [out1q], m0 ; out[1][n][k] = im0
+ movhlps m1, m0 ; low half of m1 = im1 re1
+ movss [out1q+lenq], m1 ; out[1][n+1][k] = im1
+%endif
+ lea out0q, [out0q+lenq*2]
+ lea out1q, [out1q+lenq*2]
+ add inq, mmsize
+ sub tmpd, mmsize
+ jg .inner_loop4
+ add outq, 4 ; next output column; inq already points at the next input row
+ sub id, 1
+ test id, 2
+ jne .loop8 ; count % 4 == 2: do a pair of columns next
+ cmp id, 4
+ jge .loop16 ; at least four columns left
+ RET
+%endmacro
+
+INIT_XMM sse
+HYBRID_SYNTHESIS_DEINT
+INIT_XMM sse4
+HYBRID_SYNTHESIS_DEINT
+
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
index 767ae6588e..25e089c395 100644
--- a/libavcodec/x86/aacpsdsp_init.c
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -40,6 +40,10 @@ void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
+void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
+ int i, int len);
+void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
+ int i, int len);
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
@@ -48,6 +52,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
if (EXTERNAL_SSE(cpu_flags)) {
s->add_squares = ff_ps_add_squares_sse;
s->mul_pair_single = ff_ps_mul_pair_single_sse;
+ s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
}
if (EXTERNAL_SSE3(cpu_flags)) {
@@ -56,4 +61,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
}
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
+ }
}
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index fe9a727e22..cc7d272cad 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -71,6 +71,12 @@
SWAP %1, %3, %2
%endmacro
+%macro SBUTTERFLYPD 3 ; args are register numbers: a, b, tmp - interleaves the 64-bit halves of m<a> and m<b>
+ movlhps m%3, m%1, m%2 ; tmp = { a.low64, b.low64 }
+ movhlps m%2, m%2, m%1 ; b = { a.high64, b.high64 }
+ SWAP %1, %3 ; x86inc SWAP (not shown here; presumably renames registers rather than moving data): a now refers to { a.low64, b.low64 }
+%endmacro
+
%macro TRANSPOSE4x4B 5
SBUTTERFLY bw, %1, %2, %5
SBUTTERFLY bw, %3, %4, %5
@@ -117,12 +123,9 @@
%macro TRANSPOSE4x4PS 5
SBUTTERFLYPS %1, %2, %5
SBUTTERFLYPS %3, %4, %5
- movlhps m%5, m%1, m%3
- movhlps m%3, m%1
- SWAP %5, %1
- movlhps m%5, m%2, m%4
- movhlps m%4, m%2
- SWAP %5, %2, %3
+ SBUTTERFLYPD %1, %3, %5 ; interleave 64-bit halves of registers %1 and %3, using %5 as scratch
+ SBUTTERFLYPD %2, %4, %5 ; same for registers %2 and %4
+ SWAP %2, %3 ; exchange the middle two outputs (stands in for the removed three-way SWAP %5, %2, %3)
%endmacro
%macro TRANSPOSE8x4D 9-11
--
2.13.0
More information about the ffmpeg-devel
mailing list