[FFmpeg-devel] [PATCH] x86/dcadec: add ff_lfe_fir1_float_{sse3, avx}

James Almer jamrial at gmail.com
Mon Feb 22 22:43:06 CET 2016


Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavcodec/x86/dcadsp.asm    | 86 ++++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/dcadsp_init.c |  9 ++++-
 2 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index fb13957..bea834f 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -201,3 +201,89 @@ LFE_FIR0_FLOAT
 INIT_XMM fma3
 LFE_FIR0_FLOAT
 %endif
+
+%macro LFE_FIR1_FLOAT 0 ; 4-tap LFE interpolation FIR: 128 floats out per int32 in
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr nblocksd, 2                     ; outer iterations = nblocks / 4 (body runs at least once)
+    sub     lfeq, 3*sizeof_float        ; lfeq -> first of the 4 consecutive inputs read per iteration
+    mov    cnt1d, 64*sizeof_float
+    mov    cnt2d, 64*sizeof_float-16    ; second-half store offset, walks downwards
+    lea   coeffq, [coeffq+cnt1q*4]      ; coeffq -> one past the 256th coefficient
+    add samplesq, cnt1q                 ; samplesq -> start of the second 64 outputs
+    neg    cnt1q                        ; cnt1 counts up from -64*sizeof_float to 0
+
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq]                ; m4 = 4 inputs as float, lane 0 = [lfeq]
+    shufps    m5, m4, m4, q0123         ; m5 = m4 with its lanes reversed
+%elif cpuflag(sse2)
+    movu      m4, [lfeq]                ; lfeq may be unaligned
+    cvtdq2ps  m4, m4
+    pshufd    m5, m4, q0123             ; m5 = m4 with its lanes reversed
+%endif
+
+.inner_loop:                            ; 16 coefficients -> 4 first-half + 4 second-half outputs
+%if ARCH_X86_64
+    movaps    m6, [coeffq+cnt1q*4   ]   ; keep the 4 coefficient rows for the second half
+    movaps    m7, [coeffq+cnt1q*4+16]
+    movaps    m8, [coeffq+cnt1q*4+32]
+    movaps    m9, [coeffq+cnt1q*4+48]
+    mulps     m0, m5, m6
+    mulps     m1, m5, m7
+    mulps     m2, m5, m8
+    mulps     m3, m5, m9
+%else
+    movaps    m6, [coeffq+cnt1q*4   ]   ; only 8 xmm regs: rows 2 and 3 are re-read below
+    movaps    m7, [coeffq+cnt1q*4+16]
+    mulps     m0, m5, m6
+    mulps     m1, m5, m7
+    mulps     m2, m5, [coeffq+cnt1q*4+32]
+    mulps     m3, m5, [coeffq+cnt1q*4+48]
+%endif
+
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2                    ; m0 = [sum(m0), sum(m1), sum(m2), sum(m3)]
+    movaps [samplesq+cnt1q], m0
+
+%if ARCH_X86_64
+    mulps     m6, m4
+    mulps     m7, m4
+    mulps     m8, m4
+    mulps     m9, m4
+    ; cnt2 walks down, so the row sums are packed in reverse row order
+    haddps    m9, m8
+    haddps    m7, m6
+    haddps    m9, m7                    ; m9 = [sum(m9), sum(m8), sum(m7), sum(m6)]
+    movaps [samplesq+cnt2q], m9
+%else
+    mulps     m6, m4
+    mulps     m7, m4
+    mulps     m2, m4, [coeffq+cnt1q*4+32]
+    mulps     m3, m4, [coeffq+cnt1q*4+48]
+    ; cnt2 walks down, so the row sums are packed in reverse row order
+    haddps    m3, m2
+    haddps    m7, m6
+    haddps    m3, m7                    ; m3 = [sum(m3), sum(m2), sum(m7), sum(m6)]
+    movaps [samplesq+cnt2q], m3
+%endif
+
+    sub    cnt2d, 16
+    add    cnt1q, 16
+    jl .inner_loop                      ; until cnt1 reaches 0
+
+    add     lfeq, sizeof_float          ; slide the 4-input window by one sample
+    add samplesq, 128*sizeof_float      ; next block of 128 outputs
+    mov    cnt1q, -64*sizeof_float      ; rewind both inner-loop offsets
+    mov    cnt2d,  64*sizeof_float-16
+    sub nblocksd, 1
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
+%endif
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index bfe13e5..fc10fb8 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,10 +23,13 @@
 
 #define LFE_FIR_FLOAT_FUNC(opt)                                               \
 void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
+                             const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
                              const float *filter_coeff, ptrdiff_t npcmblocks);
 
 LFE_FIR_FLOAT_FUNC(sse)
 LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3)
 LFE_FIR_FLOAT_FUNC(avx)
 LFE_FIR_FLOAT_FUNC(fma3)
 
@@ -38,8 +41,12 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
         s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
     if (EXTERNAL_SSE2(cpu_flags))
         s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
-    if (EXTERNAL_AVX(cpu_flags))
+    if (EXTERNAL_SSE3(cpu_flags))
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+    if (EXTERNAL_AVX(cpu_flags)) {
         s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
+    }
     if (EXTERNAL_FMA3(cpu_flags))
         s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
 }
-- 
2.7.0



More information about the ffmpeg-devel mailing list