[FFmpeg-devel] [PATCH] avfilter/vf_w3fdif: add x86 SIMD

Thu Oct 8 20:22:58 CEST 2015

On 10/8/15, James Almer <jamrial at gmail.com> wrote:
> On 10/8/2015 2:02 PM, Paul B Mahol wrote:
>> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
>> new file mode 100644
>> index 0000000..96b61d7
>> --- /dev/null
>> +++ b/libavfilter/x86/vf_w3fdif.asm
>> @@ -0,0 +1,284 @@
>> +;*****************************************************************************
>> +;* x86-optimized functions for w3fdif filter
>> +;*
>> +;* Copyright (c) 2015 Paul B Mahol
>> +;*
>> +;* This file is part of FFmpeg.
>> +;*
>> +;* FFmpeg is free software; you can redistribute it and/or
>> +;* modify it under the terms of the GNU Lesser General Public
>> +;* License as published by the Free Software Foundation; either
>> +;* version 2.1 of the License, or (at your option) any later version.
>> +;*
>> +;* FFmpeg is distributed in the hope that it will be useful,
>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +;* Lesser General Public License for more details.
>> +;*
>> +;* You should have received a copy of the GNU Lesser General Public
>> +;* License along with FFmpeg; if not, write to the Free Software
>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> +;******************************************************************************
>> +
>> +%include "libavutil/x86/x86util.asm"
>> +
>> +SECTION_RODATA
>> +
>> +pd_0: times 4 dd 0
>
> Just use pxor to zero a register.
>
>> +pd_2_23: times 4 dd 256*256*128
>> +
>> +SECTION .text
>> +
>> +INIT_XMM sse4
>> +cglobal w3fdif_scale, 3, 3, 3, 0, out_pixel, work_pixel, linesize
>> +    mova                  m1, [pd_0]
>> +    mova                  m2, [pd_2_23]
>> +    shr            linesized, 2
>> +
>> +    .loop
>> +    mova                         m0, [work_pixelq]
>> +    pmaxsd                       m0, m1
>> +    pminsd                       m0, m2
>
> You can emulate these two using sse2 instructions. See CLIPD_SSE2 (using
> float conversion) and CLIPD_MMX in x86util.asm
>
>> +    psrld                        m0, 15
>> +    packusdw                     m0, m0
>> +    packuswb                     m0, m0
>> +    movd               [out_pixelq], m0
>> +    add                  out_pixelq, mmsize/4
>> +    add                 work_pixelq, mmsize
>> +    sub                   linesized, 1
>> +    jg .loop
>> +REP_RET
>> +
>> +INIT_XMM sse2
>> +cglobal w3fdif_simple_low, 4, 6, 5, 0, work_line, in_lines_cur0, coef,
>> linesize
>> +    movd                  m0, [coefq+0]
>> +    movd                  m1, [coefq+2]
>
> movd m1, [coefq]
> SPLATW m0, m1, 0
> SPLATW m1, m1, 1
>
>> +    SPLATW                m0, m0
>> +    SPLATW                m1, m1
>> +    shr            linesized, 3
>> +    mov                  r4q, 0
>> +    mov                  r5q, [in_lines_cur0q + gprsize]
>> +    mov       in_lines_cur0q, [in_lines_cur0q]
>> +    %define   in_lines_cur1q  r5q
>> +
>> +    .loop
>> +    movh                            m2, [in_lines_cur0q+r4q]
>> +    movh                            m3, [in_lines_cur1q+r4q]
>> +    pxor                            m4, m4
>> +    punpcklbw                       m2, m4
>> +    punpcklbw                       m3, m4
>> +    SBUTTERFLY                      wd, 2, 3, 4
>> +    pmaddwd                         m2, m0
>> +    pmaddwd                         m3, m1
>> +    mova            [work_lineq+r4q*4], m2
>> +    mova     [work_lineq+r4q*4+mmsize], m3
>> +    add                            r4q, 8
>> +    sub                      linesized, 1
>> +    jg .loop
>> +REP_RET
>> +
>> +cglobal w3fdif_simple_high, 5, 10, 8, 0, work_line, in_lines_cur0,
>> in_lines_adj0, coef, linesize
>
> This is clearly not x86_32 friendly, so you will either have to get it
> working using 7 regs, or mark it as x86_64 only.
>
>> +    movd                  m0, [coefq+0]
>> +    movd                  m1, [coefq+2]
>> +    movd                  m2, [coefq+4]
>
> movq m2, [coefq]
> SPLATW m0, m2, 0
> SPLATW m1, m2, 1
> SPLATW m2, m2, 2
>
> And so for every function.
>
>> +    SPLATW                m0, m0
>> +    SPLATW                m1, m1
>> +    SPLATW                m2, m2
>> +    SBUTTERFLY            wd, 0, 1, 7
>> +    shr            linesized, 3
>
> Seems pointless if the only other instruction using this reg is a sub at the
> end of the loop.
> Can't you do the neg trick on linesize and use that as part of the effective
> addresses inside the loop, instead of a zeroed r5q?
>
>> +    mov                  r5q, 0
>> +    mov                  r7q, [in_lines_cur0q+gprsize*2]
>> +    mov                  r6q, [in_lines_cur0q+gprsize]
>> +    mov       in_lines_cur0q, [in_lines_cur0q]
>> +    %define   in_lines_cur1q  r6q
>> +    %define   in_lines_cur2q  r7q
>
> Instead of defining their names here, just name them in the cglobal line.
> You can name registers there that aren't function arguments just fine.
>
> Both the above suggestions apply to other functions as well.

I prefer this approach; it's more readable.

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>