[FFmpeg-devel] [PATCH 10/10] dcadsp: x86: SSE implementation of decode_hf

Fri Feb 14 17:00:54 CET 2014

Benchmarks on x86 Arrandale (N/A: the plain SSE version is only built on
x86-32):
        C  SSE SSE2 SSE4
win32: 260 162  119  104
win64: 242 N/A   89   72
---
 libavcodec/x86/dca.h         | 55 --------------------------------------------
 libavcodec/x86/dcadsp.asm    | 53 ++++++++++++++++++++++++++++++------------
 libavcodec/x86/dcadsp_init.c | 18 ++++++++++-----
 3 files changed, 50 insertions(+), 76 deletions(-)
 delete mode 100644 libavcodec/x86/dca.h

diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
deleted file mode 100644
index 6415129..0000000
--- a/libavcodec/x86/dca.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet at gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#if ARCH_X86_64 && HAVE_SSE2_INLINE
-# include "libavutil/x86/asm.h"
-# include "libavutil/mem.h"
-#include "libavcodec/dcadsp.h"
-
-# define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
-                                     float *dst, const int8_t *src, int scale)
-{
-    DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
-    __asm__ volatile (
-        "cvtsi2ss        %2, %%xmm0 \n\t"
-        "mulss           %3, %%xmm0 \n\t"
-        "movq          (%1), %%xmm1 \n\t"
-        "punpcklbw   %%xmm1, %%xmm1 \n\t"
-        "movaps      %%xmm1, %%xmm2 \n\t"
-        "punpcklwd   %%xmm1, %%xmm1 \n\t"
-        "punpckhwd   %%xmm2, %%xmm2 \n\t"
-        "psrad          $24, %%xmm1 \n\t"
-        "psrad          $24, %%xmm2 \n\t"
-        "shufps  $0, %%xmm0, %%xmm0 \n\t"
-        "cvtdq2ps    %%xmm1, %%xmm1 \n\t"
-        "cvtdq2ps    %%xmm2, %%xmm2 \n\t"
-        "mulps       %%xmm0, %%xmm1 \n\t"
-        "mulps       %%xmm0, %%xmm2 \n\t"
-        "movaps      %%xmm1,  0(%0) \n\t"
-        "movaps      %%xmm2, 16(%0) \n\t"
-        :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
-        XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
-    );
-}
-
-#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 9aa7ab6..8c2f62a 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -26,18 +26,35 @@ pf_inv16:  times 4 dd 0x3D800000 ; 1/16
 
 SECTION_TEXT
 
-; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale)
-%macro INT8X8_FMUL_INT32 0
-cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
-    cvtsi2ss    m0, scalem
+; void decode_hf(float dst[DCA_SUBBANDS][8],
+;                const int32_t vq_num[DCA_SUBBANDS],
+;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
+;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
+%macro DECODE_HF 0
+cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
+    lea       srcq, [srcq + offsetq]
+    shl     startq, 2
+    mov    offsetd, endm
+%define DICT offsetq
+    shl    offsetq, 2
+    mov       endm, offsetq
+.loop:
+%if ARCH_X86_64
+    mov    offsetd, [scaleq + 2*startq]
+    cvtsi2ss    m0, offsetd
+%else
+    cvtsi2ss    m0, [scaleq + 2*startq]
+%endif
+    mov    offsetd, [numq + startq]
     mulss       m0, [pf_inv16]
+    shl       DICT, 5
     shufps      m0, m0, 0
 %if cpuflag(sse2)
 %if cpuflag(sse4)
-    pmovsxbd    m1, [srcq+0]
-    pmovsxbd    m2, [srcq+4]
+    pmovsxbd    m1, [srcq + DICT + 0]
+    pmovsxbd    m2, [srcq + DICT + 4]
 %else
-    movq        m1, [srcq]
+    movq        m1, [srcq + DICT]
     punpcklbw   m1, m1
     mova        m2, m1
     punpcklwd   m1, m1
@@ -48,8 +65,8 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
     cvtdq2ps    m1, m1
     cvtdq2ps    m2, m2
 %else
-    movd       mm0, [srcq+0]
-    movd       mm1, [srcq+4]
+    movd       mm0, [srcq + DICT + 0]
+    movd       mm1, [srcq + DICT + 4]
     punpcklbw  mm0, mm0
     punpcklbw  mm1, mm1
     movq       mm2, mm0
@@ -67,27 +84,33 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
     cvtpi2ps    m3, mm2
     cvtpi2ps    m4, mm3
     shufps      m0, m0, 0
-    emms
     shufps      m1, m3, q1010
     shufps      m2, m4, q1010
 %endif
     mulps       m1, m0
     mulps       m2, m0
-    mova [dstq+ 0], m1
-    mova [dstq+16], m2
+    mova [dstq + 8*startq +  0], m1
+    mova [dstq + 8*startq + 16], m2
+    add     startq, 4
+    cmp     startq, endm
+    jl       .loop
+.end:
+%if notcpuflag(sse2)
+    emms
+%endif
     REP_RET
 %endmacro
 
 %if ARCH_X86_32
 INIT_XMM sse
-INT8X8_FMUL_INT32
+DECODE_HF
 %endif
 
 INIT_XMM sse2
-INT8X8_FMUL_INT32
+DECODE_HF
 
 INIT_XMM sse4
-INT8X8_FMUL_INT32
+DECODE_HF
 
 ; %1=v0/v1  %2=in1  %3=in2
 %macro FIR_LOOP 2-3
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 06c31a0..f2b845b 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,9 +23,15 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dcadsp.h"
 
-void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
-void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
-void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
+void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+                      const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                      int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
+void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+                       const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                       int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
+void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+                       const int8_t hf_vq[1024][32], intptr_t vq_offset,
+                       int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
 void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
 void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
 
@@ -35,18 +41,18 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 
     if (EXTERNAL_SSE(cpu_flags)) {
 #if ARCH_X86_32
-        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
+        s->decode_hf = ff_decode_hf_sse;
 #endif
         s->lfe_fir[0]        = ff_dca_lfe_fir0_sse;
         s->lfe_fir[1]        = ff_dca_lfe_fir1_sse;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
-        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2;
+        s->decode_hf = ff_decode_hf_sse2;
     }
 
     if (EXTERNAL_SSE4(cpu_flags)) {
-        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4;
+        s->decode_hf = ff_decode_hf_sse4;
     }
 }
 
-- 
1.8.0.msysgit.0