[FFmpeg-devel] [PATCH] SSE2 version of vf_idet's filter_line()

Wed Sep 3 17:50:32 CEST 2014

Michael,

On Wed, Sep 3, 2014 at 4:29 PM, Michael Niedermayer <michaelni at gmx.at>
wrote:

> On Wed, Sep 03, 2014 at 11:42:10AM +0200, Pascal Massimino wrote:
> > On Wed, Sep 3, 2014 at 11:32 AM, Benoit Fouet <benoit.fouet at free.fr>
> wrote:
> >
> > > Hi,
> > >
> > > ----- Mail original -----
> > > > Hi,
> > > >
> > > >
> > > > updated patch, sorry for the broken format in the previous one. Hope
> > > > it's
> > > > ok now.
> > > >
> > >
> > > This is just missing the new header file vf_idet.h
> > >
> >
> > that's embarrassing...
> > Attached, new one.
>
> [...]
> > diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
> > new file mode 100644
> > index 0000000..180f88e
> > --- /dev/null
> > +++ b/libavfilter/x86/vf_idet.asm
> > @@ -0,0 +1,146 @@
> > +;;
> *****************************************************************************
> > +;; * x86-optimized functions for idet filter
> > +;; *
> > +;; * This file is part of FFmpeg.
> > +;; *
> > +;; * FFmpeg is free software; you can redistribute it and/or modify
> > +;; * it under the terms of the GNU General Public License as published
> by
> > +;; * the Free Software Foundation; either version 2 of the License, or
> > +;; * (at your option) any later version.
> > +;; *
> > +;; * FFmpeg is distributed in the hope that it will be useful,
> > +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > +;; * GNU General Public License for more details.
> > +;; *
> > +;; * You should have received a copy of the GNU General Public License
> along
> > +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> > +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> > +;;
> ******************************************************************************
> > +
> > +%include "libavutil/x86/x86util.asm"
> > +
> > +SECTION .text
> > +
> > +%macro DECLARE_VAR 2
> > +    %define  %1 %2
> > +    %define  %1d %2d
> > +%endmacro
> > +
> > +;; Mappings for common variables
> > +DECLARE_VAR index, r5
> > +DECLARE_VAR total, r4
> > +DECLARE_VAR tmp1,  r6
>
> Maybe I am missing something, but why don't you add the 3 named
> identifiers to the end of the cglobal line instead of these
> DECLARE_VAR* ? (This would also need d/q postfixes on the uses of the vars.)
>

Indeed, this just shows my ignorance of how cglobal works exactly.
-> Fixed; I actually got rid of the 'total' and 'tmp1' variables.

>
>
> > +
> > +; Implementation that does 8 bytes at a time using single-word operations.
> > +%macro IDET_FILTER_LINE 0
> > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width
> > +    xor       index, index
> > +%define   m_zero m7
> > +%define   m_sum  m6
> > +    pxor      m_sum, m_sum
> > +    pxor      m_zero, m_zero
> > +
> > +.loop:
> > +    movq      m0, [aq+index*1]
> > +    movq      m1, m0
> > +    punpcklbw m0, m_zero
> > +    punpckhbw m1, m_zero
> > +
> > +    movq      m3, [cq+index*1]
> > +    movq      m4, m3
> > +    punpcklbw m3, m_zero
> > +    punpckhbw m4, m_zero
> > +
> > +    paddsw    m0, m3
> > +    paddsw    m1, m4
> > +
> > +    movq      m3, [bq+index*1]
> > +    movq      m4, m3
> > +    punpcklbw m3, m_zero
> > +    punpckhbw m4, m_zero
> > +
>
> > +    psllw     m3, 0x1
> > +    psllw     m4, 0x1
>
> paddw might be faster
>

done

>
>
> > +    psubsw    m0, m3
> > +    psubsw    m1, m4
> > +
> > +    ABS1      m0, m5
> > +    ABS1      m1, m5
> > +    paddw     m0, m1
> > +    movq      m1, m0
> > +    punpcklwd m0, m_zero
> > +    punpckhwd m1, m_zero
> > +    paddd     m0, m1
> > +    paddd     m_sum, m0
> > +
> > +    add       index, 0x8
> > +    CMP       widthd, indexd
> > +    jg        .loop
> > +
> > +    movd      totald, m_sum
> > +    psrlq     m_sum, 0x20
> > +    movd      tmp1d, m_sum
> > +    add       totald, tmp1d
> > +    mov       eax, totald
> > +    RET
> > +%endmacro
> > +
> > +%if ARCH_X86_32
> > +INIT_MMX mmxext
> > +IDET_FILTER_LINE
> > +%endif
> > +
> > +INIT_MMX mmx
> > +IDET_FILTER_LINE
>
> Shouldn't the mmx version be under ARCH_X86_32 too,
> if the intent is to not build functions that won't be needed for
> x86_64?
>

Indeed; moved the mmx version under ARCH_X86_32 as well.

>
>
> > +
> > +;; SSE2 8-bit implementation that does 16-bytes at a time:
> > +;;
> > +;; const int w2 = w >> 4
> > +;; for (int i = 0; i < w2; ++i) {
> > +;;   const __m128i A = _mm_loadu_si128(&p_a[i])
> > +;;   const __m128i B = _mm_loadu_si128(&p_b[i])
> > +;;   const __m128i C = _mm_loadu_si128(&p_c[i])
> > +;;   const __m128i ab = _mm_subs_epu8(A, B)
> > +;;   const __m128i ba = _mm_subs_epu8(B, A)
> > +;;   const __m128i bc = _mm_subs_epu8(B, C)
> > +;;   const __m128i cb = _mm_subs_epu8(C, B)
> > +;;   const __m128i s1 = _mm_sad_epu8(ab, bc)
> > +;;   const __m128i s2 = _mm_sad_epu8(ba, cb)
> > +;;   result1 = _mm_add_epi64(result1, s1)
> > +;;   result2 = _mm_add_epi64(result2, s2)
> > +;; }
> > +INIT_XMM sse2
> > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width
> > +    xor       index, index
> > +    pxor      m0, m0
> > +    pxor      m1, m1
> > +
> > +.sse2_loop:
> > +    movdqu    m2, [bq+index*1]  ; B
> > +    movdqu    m3, [aq+index*1]  ; A
> > +    movdqa    m5, m2
> > +    movdqa    m6, m2
> > +    movdqa    m4, m3
> > +    psubusb   m5, m3            ; ba
> > +
> > +    movdqu    m3, [cq+index*1]  ; C
> > +    add       index, 0x10
> > +    psubusb   m4, m2            ; ab
> > +    CMP       indexd, widthd
> > +
> > +    psubusb   m6, m3            ; bc
> > +    psubusb   m3, m2            ; cb
> > +
> > +    psadbw    m4, m6            ; |ab - bc|
> > +    paddq     m0, m4
> > +    psadbw    m5, m3            ; |ba - cb|
> > +    paddq     m1, m5
> > +    jl       .sse2_loop
> > +
> > +    paddq     m0, m1
> > +    movhlps   m1, m0
> > +    paddq     m0, m1
> > +    movd      total, m0
>
> > +    mov       eax, totald
>
> this is unneeded, m0 could be stored into eax directly
>

Removed this step in both the mmx and sse2 versions.

-> new patch attached.

/skal
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-MMX-MMXEXT-SSE2-implementation-of-idet-s-filter_line.patch
Type: text/x-patch
Size: 12559 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140903/bdce55e7/attachment.bin>