[FFmpeg-devel] [PATCH] avfilter/vf_overlay: add x86 SIMD

Tue May 1 22:46:29 EEST 2018

On 5/1/2018 5:02 AM, Paul B Mahol wrote:
> Specifically for yuv444, yuv422, yuv420 format when main stream has no alpha, and alpha
> is straight.
> 
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
>  libavfilter/vf_overlay.c          |  75 +++++-------------
>  libavfilter/vf_overlay.h          |  85 +++++++++++++++++++++
>  libavfilter/x86/Makefile          |   2 +
>  libavfilter/x86/vf_overlay.asm    | 157 ++++++++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_overlay_init.c |  63 +++++++++++++++
>  5 files changed, 326 insertions(+), 56 deletions(-)
>  create mode 100644 libavfilter/vf_overlay.h
>  create mode 100644 libavfilter/x86/vf_overlay.asm
>  create mode 100644 libavfilter/x86/vf_overlay_init.c

[...]

> diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm
> new file mode 100644
> index 0000000000..d639cce9e5
> --- /dev/null
> +++ b/libavfilter/x86/vf_overlay.asm
> @@ -0,0 +1,157 @@
> +;*****************************************************************************
> +;* x86-optimized functions for overlay filter
> +;*
> +;* Copyright (C) 2018 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pw_128:   times 8 dw 128
> +pw_255:   times 8 dw 255
> +pw_257:   times 8 dw 257
> +pw_65280: times 8 dw 65280
> +
> +SECTION .text
> +
> +INIT_XMM sse4
> +cglobal overlay_row_44, 6, 8, 6, 0, d, da, s, a, w, alinesize, r, x

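You're not using the alinesize parameter here. Make this 5, 7, 8 (load only
the first five arguments and use seven GPRs in total) and use the register
that would otherwise hold alinesize for r. That way this can work on x86_32.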

Also, the 0 after the xmm register count is pointless. Just remove it.

> +    xor          xq, xq
> +    movsxdifnidn wq, wd
> +    mov          rq, wq
> +    and          rq, mmsize/2 - 1
> +    cmp          wq, mmsize/2
> +    jl .end
> +    sub          wq, rq
> +    mova         m3, [pw_255]
> +    mova         m4, [pw_128]
> +    mova         m5, [pw_257]
> +    .loop0:
> +        pmovzxbw    m0, [sq+xq]
> +        pmovzxbw    m2, [aq+xq]
> +        pmovzxbw    m1, [dq+xq]
> +        pmullw      m0, m2
> +        pxor        m2, m3
> +        pmullw      m1, m2
> +        paddw       m0, m4
> +        paddw       m0, m1
> +        pmulhuw     m0, m5
> +        packuswb    m0, m0
> +        movq   [dq+xq], m0
> +        add         xq, mmsize/2
> +        cmp         xq, wq
> +        jl .loop0
> +
> +    .end:
> +    mov    eax, xd
> +    RET
> +
> +INIT_XMM sse4
> +cglobal overlay_row_22, 6, 8, 8, 0, d, da, s, a, w, al, r, x

Same here with al: it is not used in this function, so don't load it and use
its register for r instead.

> +    xor          xq, xq
> +    movsxdifnidn wq, wd
> +    sub          wq, 1
> +    mov          rq, wq
> +    and          rq, mmsize/2 - 1
> +    cmp          wq, mmsize/2
> +    jl .end
> +    sub          wq, rq
> +    mova         m3, [pw_255]
> +    mova         m4, [pw_128]
> +    mova         m5, [pw_257]
> +    mova         m7, [pw_65280]
> +    .loop0:
> +        pmovzxbw    m0, [sq+xq]
> +        movu        m2, [aq+2*xq]
> +        pand        m2, m3
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        psrlw       m2, 1
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m3
> +        paddw       m2, m6
> +        psrlw       m2, 1
> +        pmovzxbw    m1, [dq+xq]
> +        pmullw      m0, m2
> +        pxor        m2, m3
> +        pmullw      m1, m2
> +        paddw       m0, m4
> +        paddw       m0, m1
> +        pmulhuw     m0, m5
> +        packuswb    m0, m0
> +        movq   [dq+xq], m0
> +        add         xq, mmsize/2
> +        cmp         xq, wq
> +        jl .loop0
> +
> +    .end:
> +    mov    eax, xd
> +    RET
> +
> +INIT_XMM sse4
> +cglobal overlay_row_20, 6, 8, 8, 0, d, da, s, a, w, al, r, x
> +    xor          xq, xq
> +    movsxdifnidn wq, wd
> +    sub          wq, 1
> +    mov          rq, wq
> +    and          rq, mmsize/2 - 1
> +    cmp          wq, mmsize/2
> +    jl .end
> +    sub          wq, rq
> +    mov         daq, aq
> +    add         daq, alq

Use al directly from memory here instead of loading it into a register, and
use the GPR this frees up for r, much like in the functions above.

> +    mova         m3, [pw_255]
> +    mova         m4, [pw_128]
> +    mova         m5, [pw_257]
> +    mova         m7, [pw_65280]
> +    .loop0:
> +        pmovzxbw    m0, [sq+xq]
> +        movu        m2, [aq+2*xq]
> +        pand        m2, m3
> +        movu        m6, [aq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        movu        m6, [daq+2*xq]
> +        pand        m6, m3
> +        paddw       m2, m6
> +        movu        m6, [daq+2*xq]
> +        pand        m6, m7
> +        psrlw       m6, 8
> +        paddw       m2, m6
> +        psrlw       m2, 2
> +        pmovzxbw    m1, [dq+xq]
> +        pmullw      m0, m2
> +        pxor        m2, m3
> +        pmullw      m1, m2
> +        paddw       m0, m4
> +        paddw       m0, m1
> +        pmulhuw     m0, m5
> +        packuswb    m0, m0
> +        movq   [dq+xq], m0
> +        add         xq, mmsize/2
> +        cmp         xq, wq
> +        jl .loop0
> +
> +    .end:
> +    mov    eax, xd
> +    RET
> diff --git a/libavfilter/x86/vf_overlay_init.c b/libavfilter/x86/vf_overlay_init.c
> new file mode 100644
> index 0000000000..865fd035f6
> --- /dev/null
> +++ b/libavfilter/x86/vf_overlay_init.c
> @@ -0,0 +1,63 @@
> +/*
> + * Copyright (c) 2018 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/vf_overlay.h"
> +
> +int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> +                           int w, ptrdiff_t alinesize);
> +
> +int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> +                           int w, ptrdiff_t alinesize);
> +
> +int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> +                           int w, ptrdiff_t alinesize);
> +
> +av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> +        (format == OVERLAY_FORMAT_YUV444 ||
> +         format == OVERLAY_FORMAT_GBRP) &&
> +        alpha_format == 0 && main_has_alpha == 0) {
> +        s->blend_row[0] = ff_overlay_row_44_sse4;
> +        s->blend_row[1] = ff_overlay_row_44_sse4;
> +        s->blend_row[2] = ff_overlay_row_44_sse4;
> +    }
> +
> +    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> +        (format == OVERLAY_FORMAT_YUV420) &&
> +        alpha_format == 0 && main_has_alpha == 0) {
> +        s->blend_row[0] = ff_overlay_row_44_sse4;
> +        s->blend_row[1] = ff_overlay_row_20_sse4;
> +        s->blend_row[2] = ff_overlay_row_20_sse4;
> +    }
> +
> +    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> +        (format == OVERLAY_FORMAT_YUV422) &&
> +        alpha_format == 0 && main_has_alpha == 0) {
> +        s->blend_row[0] = ff_overlay_row_44_sse4;
> +        s->blend_row[1] = ff_overlay_row_22_sse4;
> +        s->blend_row[2] = ff_overlay_row_22_sse4;
> +    }

You can remove all the ARCH_X86_64 checks once the changes described above are made.

> +}
>