[FFmpeg-devel] [PATCH] avfilter/vf_w3fdif: add x86 SIMD

Fri Oct 9 16:20:45 CEST 2015

On 10/9/15, Ronald S. Bultje <rsbultje at gmail.com> wrote:
> Hi,
>
> looking better already!
>
> On Fri, Oct 9, 2015 at 3:58 AM, Paul B Mahol <onemda at gmail.com> wrote:
>
>> +INIT_XMM sse4
>> +cglobal w3fdif_scale, 3, 3, 3, 0, out_pixel, work_pixel, linesize
>> +    pxor                  m1, m1
>> +    mova                  m2, [pd_2_23]
>> +    shr            linesized, 2
>>
>
> Is linesize guaranteed to be a multiple of 4? If not, just sub 4 below and
> remove this line.

Done.

>
> +    .loop
>> +    mova                         m0, [work_pixelq]
>> +    pmaxsd                       m0, m1
>>
>
> Not necessary if you use packssdw instead of packusdw and use psrad instead
> of psrld.

Done.

>
> +    pminsd                       m0, m2
>>
>
> Why? packusdw+packuswb already clip within the same range. This line is
> probably unnecessary.
>
> +    psrld                        m0, 15
>> +    packusdw                     m0, m0
>>
>
> If you use packssdw, this works on sse2 instead of sse4. It doesn't affect
> the output, since you clip using packusbw right after.

Done.

>
>
>> +    packuswb                     m0, m0
>> +    movd               [out_pixelq], m0
>> +    add                  out_pixelq, mmsize/4
>> +    add                 work_pixelq, mmsize
>> +    sub                   linesized, 1
>> +    jg .loop
>> +REP_RET
>>
>
> In some cases, like here, where the prototype for each function is
> different, it's sometimes helpful for observers (like me) if you put the
> function prototype as a comment directly above the cglobal line, so that we
> know the types of arguments. E.g.:
>
> ; void ff_w3fdif_scale_<opt>(uint8_t *out_pixel, int32_t *work_pixel,
> ptrdiff_t linesize);
>
> Now, as for assembly, you'll notice you're not using a lot of registers
> here, and your registers are there's a lot of dependency on m0, so I'd
> unroll this x2 and do two pixels at a time. That also allows you to store
> using movh insetad of movd.

Done.

>
> +INIT_XMM sse2
>> +cglobal w3fdif_simple_low, 4, 6, 5, 0, work_line, in_lines_cur0, coef,
>> linesize
>> +    movd                  m1, [coefq+0]
>> +    SPLATW                m0, m1, 0
>> +    SPLATW                m1, m1, 1
>> +    mov                  r4q, 0
>> +    mov                  r5q, [in_lines_cur0q + gprsize]
>> +    mov       in_lines_cur0q, [in_lines_cur0q]
>> +    %define   in_lines_cur1q  r5q
>>
>
> You can use coef as in_lines_cur1q, then the function uses 5 instead of 6
> GPRs. Then, don't use define, but use x86inc's DEFINE_ARGS:
>
> cglobal funcname, 4, 5, 5, 0, work_line, in_lines_cur0, coef, linesize,
> offset
>     use coef
>     DEFINE_ARGS work_line, in_lines_cur0, in_lines_cur1, linesize, offset
>
> and then use offsetq instead of r4q, and in_lines_cur1q is now r3 instead
> of r5.
>
> +    .loop
>>
>
> Labels should be -4 indented:

Done.

>
> .loop:
>     code
>     dec cntr
>     jg .loop
>
>
>> +    movh                            m2, [in_lines_cur0q+r4q]
>> +    movh                            m3, [in_lines_cur1q+r4q]
>> +    pxor                            m4, m4
>> +    punpcklbw                       m2, m4
>> +    punpcklbw                       m3, m4
>> +    SBUTTERFLY                      wd, 2, 3, 4
>> +    pmaddwd                         m2, m0
>> +    pmaddwd                         m3, m1
>> +    mova            [work_lineq+r4q*4], m2
>> +    mova     [work_lineq+r4q*4+mmsize], m3
>> +    add                            r4q, 8
>
> +    sub                      linesized, 8
>>
>
> You're using two counters here, you should be able to merge them and use
> another argument (plus another instruction in your inner loop) less than
> you do now.
>
>
>> +cglobal w3fdif_simple_high, 5, 10, 8, 0, work_line, in_lines_cur0,
>> in_lines_adj0, coef, linesize
>> +    movq                  m2, [coefq+0]
>> +    SPLATW                m0, m2, 0
>> +    SPLATW                m1, m2, 1
>> +    SPLATW                m2, m2, 2
>> +    SBUTTERFLY            wd, 0, 1, 7
>> +    mov                  r5q, 0
>> +    mov                  r7q, [in_lines_cur0q+gprsize*2]
>> +    mov                  r6q, [in_lines_cur0q+gprsize]
>> +    mov       in_lines_cur0q, [in_lines_cur0q]
>> +    %define   in_lines_cur1q  r6q
>> +    %define   in_lines_cur2q  r7q
>> +    mov                  r9q, [in_lines_adj0q+gprsize*2]
>> +    mov                  r8q, [in_lines_adj0q+gprsize]
>> +    mov       in_lines_adj0q, [in_lines_adj0q]
>> +    %define   in_lines_adj1q  r8q
>> +    %define   in_lines_adj2q  r9q
>>
>
> Same as above, use DEFINE_ARGS, not %define, and reuse coef so you use one
> GPR less.
>
> +    .loop
>>
>
> Indent.
>
> +    movh                            m3, [in_lines_cur0q+r5q]
>> +    movh                            m4, [in_lines_cur1q+r5q]
>> +    pxor                            m7, m7
>> +    punpcklbw                       m3, m7
>> +    punpcklbw                       m4, m7
>> +    SBUTTERFLY                      wd, 3, 4, 7
>> +    pmaddwd                         m3, m0
>> +    pmaddwd                         m4, m1
>> +    movh                            m5, [in_lines_adj0q+r5q]
>> +    movh                            m6, [in_lines_adj1q+r5q]
>> +    pxor                            m7, m7
>> +    punpcklbw                       m5, m7
>> +    punpcklbw                       m6, m7
>> +    SBUTTERFLY                      wd, 5, 6, 7
>> +    pmaddwd                         m5, m0
>> +    pmaddwd                         m6, m1
>> +    paddd                           m3, m5
>> +    paddd                           m4, m6
>> +    movh                            m5, [in_lines_cur2q+r5q]
>> +    movh                            m6, [in_lines_adj2q+r5q]
>> +    pxor                            m7, m7
>> +    punpcklbw                       m5, m7
>> +    punpcklbw                       m6, m7
>> +    SBUTTERFLY                      wd, 5, 6, 7
>> +    pmaddwd                         m5, m2
>> +    pmaddwd                         m6, m2
>> +    paddd                           m3, m5
>> +    paddd                           m4, m6
>> +    movu                            m5, [work_lineq+r5q*4]
>> +    movu                            m6, [work_lineq+r5q*4+mmsize]
>> +    paddd                           m3, m5
>> +    paddd                           m4, m6
>> +    mova            [work_lineq+r5q*4], m3
>> +    mova     [work_lineq+r5q*4+mmsize], m4
>> +    add                            r5q, 8
>> +    sub                      linesized, 8
>>
>
> Merge counters.
>
> +cglobal w3fdif_complex_low, 4, 8, 9, 0, work_line, in_lines_cur0, coef,
>> linesize
>>
> [..]
>
>> +cglobal w3fdif_complex_high, 5, 14, 10, 0, work_line, in_lines_cur0,
>> in_lines_adj0, coef, linesize
>
>
> Same comments as above.

Rest not done, as it imho complicates already complicated asm a lot.