[FFmpeg-devel] [PATCH] avfilter/vf_overlay: add x86 SIMD
James Almer
jamrial at gmail.com
Tue May 1 22:46:29 EEST 2018
On 5/1/2018 5:02 AM, Paul B Mahol wrote:
> Specifically for yuv444, yuv422, yuv420 format when main stream has no alpha, and alpha
> is straight.
>
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
> libavfilter/vf_overlay.c | 75 +++++-------------
> libavfilter/vf_overlay.h | 85 +++++++++++++++++++++
> libavfilter/x86/Makefile | 2 +
> libavfilter/x86/vf_overlay.asm | 157 ++++++++++++++++++++++++++++++++++++++
> libavfilter/x86/vf_overlay_init.c | 63 +++++++++++++++
> 5 files changed, 326 insertions(+), 56 deletions(-)
> create mode 100644 libavfilter/vf_overlay.h
> create mode 100644 libavfilter/x86/vf_overlay.asm
> create mode 100644 libavfilter/x86/vf_overlay_init.c
[...]
> diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm
> new file mode 100644
> index 0000000000..d639cce9e5
> --- /dev/null
> +++ b/libavfilter/x86/vf_overlay.asm
> @@ -0,0 +1,157 @@
> +;*****************************************************************************
> +;* x86-optimized functions for overlay filter
> +;*
> +;* Copyright (C) 2018 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pw_128: times 8 dw 128
> +pw_255: times 8 dw 255
> +pw_257: times 8 dw 257
> +pw_65280: times 8 dw 65280
> +
> +SECTION .text
> +
> +INIT_XMM sse4
> +cglobal overlay_row_44, 6, 8, 6, 0, d, da, s, a, w, alinesize, r, x
You're not using the alinesize parameter here. Make this 5, 7, 8 and use
the register that frees up for r. That way this can also work on x86_32.
Also, the 0 after the XMM register count (the stack size argument) is
pointless. Just remove it.
> + xor xq, xq
> + movsxdifnidn wq, wd
> + mov rq, wq
> + and rq, mmsize/2 - 1
> + cmp wq, mmsize/2
> + jl .end
> + sub wq, rq
> + mova m3, [pw_255]
> + mova m4, [pw_128]
> + mova m5, [pw_257]
> + .loop0:
> + pmovzxbw m0, [sq+xq]
> + pmovzxbw m2, [aq+xq]
> + pmovzxbw m1, [dq+xq]
> + pmullw m0, m2
> + pxor m2, m3
> + pmullw m1, m2
> + paddw m0, m4
> + paddw m0, m1
> + pmulhuw m0, m5
> + packuswb m0, m0
> + movq [dq+xq], m0
> + add xq, mmsize/2
> + cmp xq, wq
> + jl .loop0
> +
> + .end:
> + mov eax, xd
> + RET
> +
> +INIT_XMM sse4
> +cglobal overlay_row_22, 6, 8, 8, 0, d, da, s, a, w, al, r, x
Same here with al (the alinesize argument): it is unused as well.
> + xor xq, xq
> + movsxdifnidn wq, wd
> + sub wq, 1
> + mov rq, wq
> + and rq, mmsize/2 - 1
> + cmp wq, mmsize/2
> + jl .end
> + sub wq, rq
> + mova m3, [pw_255]
> + mova m4, [pw_128]
> + mova m5, [pw_257]
> + mova m7, [pw_65280]
> + .loop0:
> + pmovzxbw m0, [sq+xq]
> + movu m2, [aq+2*xq]
> + pand m2, m3
> + movu m6, [aq+2*xq]
> + pand m6, m7
> + psrlw m6, 8
> + paddw m2, m6
> + psrlw m2, 1
> + movu m6, [aq+2*xq]
> + pand m6, m3
> + paddw m2, m6
> + psrlw m2, 1
> + pmovzxbw m1, [dq+xq]
> + pmullw m0, m2
> + pxor m2, m3
> + pmullw m1, m2
> + paddw m0, m4
> + paddw m0, m1
> + pmulhuw m0, m5
> + packuswb m0, m0
> + movq [dq+xq], m0
> + add xq, mmsize/2
> + cmp xq, wq
> + jl .loop0
> +
> + .end:
> + mov eax, xd
> + RET
> +
> +INIT_XMM sse4
> +cglobal overlay_row_20, 6, 8, 8, 0, d, da, s, a, w, al, r, x
> + xor xq, xq
> + movsxdifnidn wq, wd
> + sub wq, 1
> + mov rq, wq
> + and rq, mmsize/2 - 1
> + cmp wq, mmsize/2
> + jl .end
> + sub wq, rq
> + mov daq, aq
> + add daq, alq
Read al directly from memory here, and use the freed GPR for r, much like above.
> + mova m3, [pw_255]
> + mova m4, [pw_128]
> + mova m5, [pw_257]
> + mova m7, [pw_65280]
> + .loop0:
> + pmovzxbw m0, [sq+xq]
> + movu m2, [aq+2*xq]
> + pand m2, m3
> + movu m6, [aq+2*xq]
> + pand m6, m7
> + psrlw m6, 8
> + paddw m2, m6
> + movu m6, [daq+2*xq]
> + pand m6, m3
> + paddw m2, m6
> + movu m6, [daq+2*xq]
> + pand m6, m7
> + psrlw m6, 8
> + paddw m2, m6
> + psrlw m2, 2
> + pmovzxbw m1, [dq+xq]
> + pmullw m0, m2
> + pxor m2, m3
> + pmullw m1, m2
> + paddw m0, m4
> + paddw m0, m1
> + pmulhuw m0, m5
> + packuswb m0, m0
> + movq [dq+xq], m0
> + add xq, mmsize/2
> + cmp xq, wq
> + jl .loop0
> +
> + .end:
> + mov eax, xd
> + RET
> diff --git a/libavfilter/x86/vf_overlay_init.c b/libavfilter/x86/vf_overlay_init.c
> new file mode 100644
> index 0000000000..865fd035f6
> --- /dev/null
> +++ b/libavfilter/x86/vf_overlay_init.c
> @@ -0,0 +1,63 @@
> +/*
> + * Copyright (c) 2018 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/vf_overlay.h"
> +
> +int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> + int w, ptrdiff_t alinesize);
> +
> +int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> + int w, ptrdiff_t alinesize);
> +
> +int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
> + int w, ptrdiff_t alinesize);
> +
> +av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha)
> +{
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> + (format == OVERLAY_FORMAT_YUV444 ||
> + format == OVERLAY_FORMAT_GBRP) &&
> + alpha_format == 0 && main_has_alpha == 0) {
> + s->blend_row[0] = ff_overlay_row_44_sse4;
> + s->blend_row[1] = ff_overlay_row_44_sse4;
> + s->blend_row[2] = ff_overlay_row_44_sse4;
> + }
> +
> + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> + (format == OVERLAY_FORMAT_YUV420) &&
> + alpha_format == 0 && main_has_alpha == 0) {
> + s->blend_row[0] = ff_overlay_row_44_sse4;
> + s->blend_row[1] = ff_overlay_row_20_sse4;
> + s->blend_row[2] = ff_overlay_row_20_sse4;
> + }
> +
> + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) &&
> + (format == OVERLAY_FORMAT_YUV422) &&
> + alpha_format == 0 && main_has_alpha == 0) {
> + s->blend_row[0] = ff_overlay_row_44_sse4;
> + s->blend_row[1] = ff_overlay_row_22_sse4;
> + s->blend_row[2] = ff_overlay_row_22_sse4;
> + }
You can remove all the ARCH_X86_64 checks once the changes described above are made.
> +}
>
More information about the ffmpeg-devel mailing list