[FFmpeg-devel] [PATCH] x86/dcadec: add ff_lfe_fir1_float_{sse3, avx}
James Almer
jamrial at gmail.com
Mon Feb 22 22:43:06 CET 2016
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/dcadsp.asm | 86 ++++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/dcadsp_init.c | 9 ++++-
2 files changed, 94 insertions(+), 1 deletion(-)
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index fb13957..bea834f 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -201,3 +201,89 @@ LFE_FIR0_FLOAT
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif
+
+%macro LFE_FIR1_FLOAT 0 ; emits lfe_fir1_float for whichever instruction set the preceding INIT_XMM selected
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2 ; C side: void ff_lfe_fir1_float_<opt>(float *pcm_samples, int32_t *lfe_samples, const float *filter_coeff, ptrdiff_t npcmblocks)
+ shr nblocksd, 2 ; outer-loop count = nblocks / 4 (only the low 32 bits of nblocks are used)
+ sub lfeq, 3*sizeof_float ; back up 3 elements so the 16-byte load at .loop ends on the element lfe pointed at (sizeof_float is defined outside this hunk)
+ mov cnt1d, 64*sizeof_float ; cnt1 = byte size of 64 floats; positive for now, negated below
+ mov cnt2d, 64*sizeof_float-16 ; cnt2 = byte offset of the last 16-byte vector inside a 64-float span
+ lea coeffq, [coeffq+cnt1q*4] ; move coeff past the region that is later indexed with negative cnt1*4
+ add samplesq, cnt1q ; samples += 64 floats: [samplesq+cnt1q] (cnt1 < 0) hits the first 64 floats, [samplesq+cnt2q] the next 64
+ neg cnt1q ; cnt1 = -64*sizeof_float, counts up towards zero
+
+.loop: ; one iteration per LFE input element; produces 128 output floats
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq] ; m4 = four int32 values at lfe converted to float, loaded straight from memory
+ shufps m5, m4, m4, q0123 ; m5 = m4 with its four elements in reversed order
+%elif cpuflag(sse2)
+ movu m4, [lfeq] ; unaligned load of four int32 values
+ cvtdq2ps m4, m4 ; m4 = those values as floats
+ pshufd m5, m4, q0123 ; m5 = m4 with its four elements in reversed order
+%endif
+
+.inner_loop: ; each pass consumes four coefficient vectors and stores one vector at samples+cnt1 and one at samples+cnt2
+%if ARCH_X86_64
+ movaps m6, [coeffq+cnt1q*4 ] ; m6..m9 = four coefficient vectors, kept in registers for reuse with m4 below
+ movaps m7, [coeffq+cnt1q*4+16]
+ movaps m8, [coeffq+cnt1q*4+32]
+ movaps m9, [coeffq+cnt1q*4+48]
+ mulps m0, m5, m6 ; m0..m3 = reversed LFE vector * each coefficient vector
+ mulps m1, m5, m7
+ mulps m2, m5, m8
+ mulps m3, m5, m9
+%else
+ movaps m6, [coeffq+cnt1q*4 ] ; this path stays within m0..m7, so only two coefficient vectors are cached
+ movaps m7, [coeffq+cnt1q*4+16]
+ mulps m0, m5, m6
+ mulps m1, m5, m7
+ mulps m2, m5, [coeffq+cnt1q*4+32] ; remaining two vectors are multiplied from memory (and read again below)
+ mulps m3, m5, [coeffq+cnt1q*4+48]
+%endif
+
+ haddps m0, m1 ; three horizontal adds collapse the four product vectors ...
+ haddps m2, m3
+ haddps m0, m2 ; ... into m0 = { sum(m0), sum(m1), sum(m2), sum(m3) }
+ movaps [samplesq+cnt1q], m0 ; aligned store of four outputs, walking forward through the first 64 floats
+
+%if ARCH_X86_64
+ mulps m6, m4 ; same four coefficient vectors, now times the non-reversed LFE vector
+ mulps m7, m4
+ mulps m8, m4
+ mulps m9, m4
+
+ haddps m6, m7 ; same reduction as above, result lands in m6
+ haddps m8, m9
+ haddps m6, m8
+ movaps [samplesq+cnt2q], m6 ; aligned store of four outputs into the second 64 floats
+%else
+ mulps m6, m4 ; cached coefficient vectors times the non-reversed LFE vector
+ mulps m7, m4
+ mulps m2, m4, [coeffq+cnt1q*4+32] ; re-read the two vectors that were not cached
+ mulps m3, m4, [coeffq+cnt1q*4+48]
+
+ haddps m6, m7 ; same reduction as above, result lands in m6
+ haddps m2, m3
+ haddps m6, m2
+ movaps [samplesq+cnt2q], m6 ; aligned store of four outputs into the second 64 floats
+%endif
+
+ sub cnt2d, 16 ; second-half store position walks backward one vector
+ add cnt1q, 16 ; first-half store position walks forward; this add sets the flags tested by jl
+ jl .inner_loop ; repeat while cnt1 is still negative
+
+ add lfeq, sizeof_float ; advance to the next LFE input element
+ add samplesq, 128*sizeof_float ; advance output by the 128 floats just written
+ mov cnt1q, -64*sizeof_float ; reset both counters to their start-of-block values
+ mov cnt2d, 64*sizeof_float-16
+ sub nblocksd, 1
+ jg .loop ; bottom-tested: the body has already run once even when nblocks >> 2 was 0
+ RET
+%endmacro
+
+INIT_XMM sse3 ; baseline build; the macro body uses haddps, which is an SSE3 instruction
+LFE_FIR1_FLOAT ; presumably assembled via the %elif cpuflag(sse2) branch; declared on the C side as ff_lfe_fir1_float_sse3
+%if HAVE_AVX_EXTERNAL ; only emit the AVX build when the build configuration enables it
+INIT_XMM avx ; still XMM-width registers, with the cpuflag(avx) branch of the macro selected
+LFE_FIR1_FLOAT ; declared on the C side as ff_lfe_fir1_float_avx
+%endif
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index bfe13e5..fc10fb8 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,10 +23,13 @@
#define LFE_FIR_FLOAT_FUNC(opt) \
void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
+ const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
const float *filter_coeff, ptrdiff_t npcmblocks);
LFE_FIR_FLOAT_FUNC(sse)
LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3)
LFE_FIR_FLOAT_FUNC(avx)
LFE_FIR_FLOAT_FUNC(fma3)
@@ -38,8 +41,12 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
- if (EXTERNAL_AVX(cpu_flags))
+ if (EXTERNAL_SSE3(cpu_flags))
+ s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+ if (EXTERNAL_AVX(cpu_flags)) {
s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+ s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
+ }
if (EXTERNAL_FMA3(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}
--
2.7.0
More information about the ffmpeg-devel
mailing list