[FFmpeg-cvslog] ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents().

Justin Ruggles git at videolan.org
Sat Jul 2 03:38:05 CEST 2011


ffmpeg | branch: master | Justin Ruggles <justin.ruggles at gmail.com> | Thu Jun 30 17:48:44 2011 -0400| [f99a5ef92e5aba87a2d861822274147c994041d5] | committer: Justin Ruggles

ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents().

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f99a5ef92e5aba87a2d861822274147c994041d5
---

 libavcodec/x86/ac3dsp.asm   |  102 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_mmx.c |    9 ++++
 2 files changed, 111 insertions(+), 0 deletions(-)

diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 6892ec2..c1b0906 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -32,6 +32,11 @@ cextern ac3_bap_bits
 pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
 pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
 
+; used in ff_ac3_extract_exponents()
+pd_1:   times 4 dd 1
+pd_151: times 4 dd 151
+pb_shuf_4dwb: db 0, 4, 8, 12
+
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -346,3 +351,100 @@ cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
     movd       eax, m0
     add        eax, sumd
     RET
+
+;------------------------------------------------------------------------------
+; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
+;------------------------------------------------------------------------------
+
+%macro PABSD_MMX 2 ; src/dst, tmp
+    pxor     %2, %2
+    pcmpgtd  %2, %1
+    pxor     %1, %2
+    psubd    %1, %2
+%endmacro
+
+%macro PABSD_SSSE3 1-2 ; src/dst, unused
+    pabsd    %1, %1
+%endmacro
+
+%ifdef HAVE_AMD3DNOW
+INIT_MMX
+cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len
+    add      expq, lenq
+    lea     coefq, [coefq+4*lenq]
+    neg      lenq
+    movq       m3, [pd_1]
+    movq       m4, [pd_151]
+.loop:
+    movq       m0, [coefq+4*lenq  ]
+    movq       m1, [coefq+4*lenq+8]
+    PABSD_MMX  m0, m2
+    PABSD_MMX  m1, m2
+    pslld      m0, 1
+    por        m0, m3
+    pi2fd      m2, m0
+    psrld      m2, 23
+    movq       m0, m4
+    psubd      m0, m2
+    pslld      m1, 1
+    por        m1, m3
+    pi2fd      m2, m1
+    psrld      m2, 23
+    movq       m1, m4
+    psubd      m1, m2
+    packssdw   m0, m0
+    packuswb   m0, m0
+    packssdw   m1, m1
+    packuswb   m1, m1
+    punpcklwd  m0, m1
+    movd  [expq+lenq], m0
+    add      lenq, 4
+    jl .loop
+    REP_RET
+%endif
+
+%macro AC3_EXTRACT_EXPONENTS 1
+cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
+    add     expq, lenq
+    lea    coefq, [coefq+4*lenq]
+    neg     lenq
+    mova      m2, [pd_1]
+    mova      m3, [pd_151]
+%ifidn %1, ssse3 ;
+    movd      m4, [pb_shuf_4dwb]
+%endif
+.loop:
+    ; move 4 32-bit coefs to xmm0
+    mova      m0, [coefq+4*lenq]
+    ; absolute value
+    PABSD     m0, m1
+    ; convert to float and extract exponents
+    pslld     m0, 1
+    por       m0, m2
+    cvtdq2ps  m1, m0
+    psrld     m1, 23
+    mova      m0, m3
+    psubd     m0, m1
+    ; move the lowest byte in each of 4 dwords to the low dword
+%ifidn %1, ssse3
+    pshufb    m0, m4
+%else
+    packssdw  m0, m0
+    packuswb  m0, m0
+%endif
+    movd  [expq+lenq], m0
+
+    add     lenq, 4
+    jl .loop
+    REP_RET
+%endmacro
+
+%ifdef HAVE_SSE
+INIT_XMM
+%define PABSD PABSD_MMX
+AC3_EXTRACT_EXPONENTS sse2
+%ifdef HAVE_SSSE3
+%define PABSD PABSD_SSSE3
+AC3_EXTRACT_EXPONENTS ssse3
+%endif
+%endif
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 2664736..692d240 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -44,6 +44,10 @@ extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned i
 
 extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
 
+extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
+extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
+extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
+
 av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
 {
     int mm_flags = av_get_cpu_flags();
@@ -56,6 +60,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
         c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
     }
     if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
+        c->extract_exponents = ff_ac3_extract_exponents_3dnow;
         if (!bit_exact) {
             c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
         }
@@ -72,6 +77,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
         c->float_to_fixed24 = ff_float_to_fixed24_sse2;
         c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
+        c->extract_exponents = ff_ac3_extract_exponents_sse2;
         if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
             c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
             c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
@@ -79,6 +85,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
     }
     if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
+        if (!(mm_flags & AV_CPU_FLAG_ATOM)) {
+            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
+        }
     }
 #endif
 }



More information about the ffmpeg-cvslog mailing list