[FFmpeg-cvslog] x86: dsputil: Move ff_apply_window_int16_* bits to ac3dsp, where they belong

Sat Apr 5 19:43:12 CEST 2014

ffmpeg | branch: master | Diego Biurrun <diego at biurrun.de> | Wed Mar 26 04:41:28 2014 -0700| [57b5b84e208ad61ffdd74ad849bed212deb92bc5] | committer: Diego Biurrun

x86: dsputil: Move ff_apply_window_int16_* bits to ac3dsp, where they belong

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=57b5b84e208ad61ffdd74ad849bed212deb92bc5
---

 libavcodec/x86/ac3dsp.asm  |  131 ++++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/dsputil.asm |  130 -------------------------------------------
 2 files changed, 131 insertions(+), 130 deletions(-)

diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index d0ab3e7..817d5a3 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -35,6 +35,10 @@ pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
 pd_1:   times 4 dd 1
 pd_151: times 4 dd 151
 
+; used in ff_apply_window_int16()
+pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
+pd_16384: times 4 dd 16384
+
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -419,3 +423,130 @@ AC3_EXTRACT_EXPONENTS
 INIT_XMM ssse3
 AC3_EXTRACT_EXPONENTS
 %endif
+
+;-----------------------------------------------------------------------------
+; void ff_apply_window_int16(int16_t *output, const int16_t *input,
+;                            const int16_t *window, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro REVERSE_WORDS 1-2
+%if cpuflag(ssse3) && notcpuflag(atom)
+    pshufb  %1, %2
+%elif cpuflag(sse2)
+    pshuflw  %1, %1, 0x1B
+    pshufhw  %1, %1, 0x1B
+    pshufd   %1, %1, 0x4E
+%elif cpuflag(mmxext)
+    pshufw   %1, %1, 0x1B
+%endif
+%endmacro
+
+%macro MUL16FIXED 3
+%if cpuflag(ssse3) ; dst, src, unused
+; dst = ((dst * src) + (1<<14)) >> 15
+    pmulhrsw   %1, %2
+%elif cpuflag(mmxext) ; dst, src, temp
+; dst = (dst * src) >> 15
+; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
+; in from the pmullw result.
+    mova    %3, %1
+    pmulhw  %1, %2
+    pmullw  %3, %2
+    psrlw   %3, 15
+    psllw   %1, 1
+    por     %1, %3
+%endif
+%endmacro
+
+%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
+%if %1
+cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
+%else
+cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
+%endif
+    lea     offset2q, [offsetq-mmsize]
+%if cpuflag(ssse3) && notcpuflag(atom)
+    mova          m5, [pb_revwords]
+    ALIGN 16
+%elif %1
+    mova          m5, [pd_16384]
+%endif
+.loop:
+%if cpuflag(ssse3)
+    ; This version does the 16x16->16 multiplication in-place without expanding
+    ; to 32-bit. The ssse3 version is bit-identical.
+    mova          m0, [windowq+offset2q]
+    mova          m1, [ inputq+offset2q]
+    pmulhrsw      m1, m0
+    REVERSE_WORDS m0, m5
+    pmulhrsw      m0, [ inputq+offsetq ]
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m0
+%elif %1
+    ; This version expands 16-bit to 32-bit, multiplies by the window,
+    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
+    ; save to the output. The window is reversed for the second half.
+    mova          m3, [windowq+offset2q]
+    mova          m4, [ inputq+offset2q]
+    pxor          m0, m0
+    punpcklwd     m0, m3
+    punpcklwd     m1, m4
+    pmaddwd       m0, m1
+    paddd         m0, m5
+    psrad         m0, 15
+    pxor          m2, m2
+    punpckhwd     m2, m3
+    punpckhwd     m1, m4
+    pmaddwd       m2, m1
+    paddd         m2, m5
+    psrad         m2, 15
+    packssdw      m0, m2
+    mova  [outputq+offset2q], m0
+    REVERSE_WORDS m3
+    mova          m4, [ inputq+offsetq]
+    pxor          m0, m0
+    punpcklwd     m0, m3
+    punpcklwd     m1, m4
+    pmaddwd       m0, m1
+    paddd         m0, m5
+    psrad         m0, 15
+    pxor          m2, m2
+    punpckhwd     m2, m3
+    punpckhwd     m1, m4
+    pmaddwd       m2, m1
+    paddd         m2, m5
+    psrad         m2, 15
+    packssdw      m0, m2
+    mova  [outputq+offsetq], m0
+%else
+    ; This version does the 16x16->16 multiplication in-place without expanding
+    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
+    ; therefore are not bit-identical to the C version.
+    mova          m0, [windowq+offset2q]
+    mova          m1, [ inputq+offset2q]
+    mova          m2, [ inputq+offsetq ]
+    MUL16FIXED    m1, m0, m3
+    REVERSE_WORDS m0
+    MUL16FIXED    m2, m0, m3
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m2
+%endif
+    add      offsetd, mmsize
+    sub     offset2d, mmsize
+    jae .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 0
+INIT_XMM sse2
+APPLY_WINDOW_INT16 0
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 1
+INIT_XMM sse2
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3, atom
+APPLY_WINDOW_INT16 1
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 414d212..a8c8fda 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -27,8 +27,6 @@ pb_zzzzzzzz77777777: times 8 db -1
 pb_7: times 8 db 7
 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
-pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
-pd_16384: times 4 dd 16384
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
 SECTION_TEXT
@@ -205,134 +203,6 @@ SCALARPRODUCT_LOOP 0
     RET
 
 
-;-----------------------------------------------------------------------------
-; void ff_apply_window_int16(int16_t *output, const int16_t *input,
-;                            const int16_t *window, unsigned int len)
-;-----------------------------------------------------------------------------
-
-%macro REVERSE_WORDS 1-2
-%if cpuflag(ssse3) && notcpuflag(atom)
-    pshufb  %1, %2
-%elif cpuflag(sse2)
-    pshuflw  %1, %1, 0x1B
-    pshufhw  %1, %1, 0x1B
-    pshufd   %1, %1, 0x4E
-%elif cpuflag(mmxext)
-    pshufw   %1, %1, 0x1B
-%endif
-%endmacro
-
-%macro MUL16FIXED 3
-%if cpuflag(ssse3) ; dst, src, unused
-; dst = ((dst * src) + (1<<14)) >> 15
-    pmulhrsw   %1, %2
-%elif cpuflag(mmxext) ; dst, src, temp
-; dst = (dst * src) >> 15
-; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
-; in from the pmullw result.
-    mova    %3, %1
-    pmulhw  %1, %2
-    pmullw  %3, %2
-    psrlw   %3, 15
-    psllw   %1, 1
-    por     %1, %3
-%endif
-%endmacro
-
-%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
-%if %1
-cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
-%else
-cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
-%endif
-    lea     offset2q, [offsetq-mmsize]
-%if cpuflag(ssse3) && notcpuflag(atom)
-    mova          m5, [pb_revwords]
-    ALIGN 16
-%elif %1
-    mova          m5, [pd_16384]
-%endif
-.loop:
-%if cpuflag(ssse3)
-    ; This version does the 16x16->16 multiplication in-place without expanding
-    ; to 32-bit. The ssse3 version is bit-identical.
-    mova          m0, [windowq+offset2q]
-    mova          m1, [ inputq+offset2q]
-    pmulhrsw      m1, m0
-    REVERSE_WORDS m0, m5
-    pmulhrsw      m0, [ inputq+offsetq ]
-    mova  [outputq+offset2q], m1
-    mova  [outputq+offsetq ], m0
-%elif %1
-    ; This version expands 16-bit to 32-bit, multiplies by the window,
-    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
-    ; save to the output. The window is reversed for the second half.
-    mova          m3, [windowq+offset2q]
-    mova          m4, [ inputq+offset2q]
-    pxor          m0, m0
-    punpcklwd     m0, m3
-    punpcklwd     m1, m4
-    pmaddwd       m0, m1
-    paddd         m0, m5
-    psrad         m0, 15
-    pxor          m2, m2
-    punpckhwd     m2, m3
-    punpckhwd     m1, m4
-    pmaddwd       m2, m1
-    paddd         m2, m5
-    psrad         m2, 15
-    packssdw      m0, m2
-    mova  [outputq+offset2q], m0
-    REVERSE_WORDS m3
-    mova          m4, [ inputq+offsetq]
-    pxor          m0, m0
-    punpcklwd     m0, m3
-    punpcklwd     m1, m4
-    pmaddwd       m0, m1
-    paddd         m0, m5
-    psrad         m0, 15
-    pxor          m2, m2
-    punpckhwd     m2, m3
-    punpckhwd     m1, m4
-    pmaddwd       m2, m1
-    paddd         m2, m5
-    psrad         m2, 15
-    packssdw      m0, m2
-    mova  [outputq+offsetq], m0
-%else
-    ; This version does the 16x16->16 multiplication in-place without expanding
-    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
-    ; therefore are not bit-identical to the C version.
-    mova          m0, [windowq+offset2q]
-    mova          m1, [ inputq+offset2q]
-    mova          m2, [ inputq+offsetq ]
-    MUL16FIXED    m1, m0, m3
-    REVERSE_WORDS m0
-    MUL16FIXED    m2, m0, m3
-    mova  [outputq+offset2q], m1
-    mova  [outputq+offsetq ], m2
-%endif
-    add      offsetd, mmsize
-    sub     offset2d, mmsize
-    jae .loop
-    REP_RET
-%endmacro
-
-INIT_MMX mmxext
-APPLY_WINDOW_INT16 0
-INIT_XMM sse2
-APPLY_WINDOW_INT16 0
-
-INIT_MMX mmxext
-APPLY_WINDOW_INT16 1
-INIT_XMM sse2
-APPLY_WINDOW_INT16 1
-INIT_XMM ssse3
-APPLY_WINDOW_INT16 1
-INIT_XMM ssse3, atom
-APPLY_WINDOW_INT16 1
-
-
 ; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
 ;                                           const uint8_t *diff, int w,
 ;                                           int *left, int *left_top)