[FFmpeg-devel] [PATCH] avfilter/avf_showcqt: cqt_calc optimization on x86
James Almer
jamrial at gmail.com
Tue Jun 7 05:36:03 CEST 2016
On 6/4/2016 4:36 AM, Muhammad Faiz wrote:
> benchmark on x86_64
> cqt_time:
> plain = 3.292 s
> SSE = 1.640 s
> SSE3 = 1.631 s
> AVX = 1.395 s
> FMA3 = 1.271 s
> FMA4 = not available
Try using the START_TIMER and STOP_TIMER macros from libavutil/timer.h to
wrap the s->cqt_calc call in libavfilter/avf_showcqt.c.
They will potentially give more accurate results than the current
UPDATE_TIME(s->cqt_time) check.
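Something like this sketch, keeping whatever arguments the call already
passes (the context field names here are from memory):

    #include "libavutil/timer.h"

    {
        START_TIMER
        s->cqt_calc(s->cqt_result, s->fft_result, s->coeffs,
                    s->cqt_len, s->fft_len);
        STOP_TIMER("cqt_calc")
    }

STOP_TIMER prints average decicycles and run counts to stderr, which
averages out scheduling noise better than wall-clock measurements.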
>
> untested on x86_32
Do you have a sample command to test this? As Michael said, FATE doesn't
cover showcqt.
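For reference, I'd try something along these lines (untested, adapted
from the showcqt documentation):

    ffmpeg -f lavfi -i "sine=frequency=440:duration=10" \
           -filter_complex showcqt -f null -

but a known-good command from you would be better.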
>
> Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
> ---
> libavfilter/avf_showcqt.c | 7 ++
> libavfilter/avf_showcqt.h | 3 +
> libavfilter/x86/Makefile | 2 +
> libavfilter/x86/avf_showcqt.asm | 206 +++++++++++++++++++++++++++++++++++++
> libavfilter/x86/avf_showcqt_init.c | 63 ++++++++++++
> 5 files changed, 281 insertions(+)
> create mode 100644 libavfilter/x86/avf_showcqt.asm
> create mode 100644 libavfilter/x86/avf_showcqt_init.c
>
> diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
> index b88c83c..62d5b09 100644
> --- a/libavfilter/avf_showcqt.c
> +++ b/libavfilter/avf_showcqt.c
> @@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
> w *= sign * (1.0 / s->fft_len);
> s->coeffs[m].val[x - s->coeffs[m].start] = w;
> }
> +
> + if (s->permute_coeffs)
> + s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
> }
>
> av_expr_free(expr);
> @@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
>
> s->cqt_align = 1;
> s->cqt_calc = cqt_calc;
> + s->permute_coeffs = NULL;
> s->draw_sono = draw_sono;
> if (s->format == AV_PIX_FMT_RGB24) {
> s->draw_bar = draw_bar_rgb;
> @@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
> s->update_sono = update_sono_yuv;
> }
>
> + if (ARCH_X86)
> + ff_showcqt_init_x86(s);
> +
> if ((ret = init_cqt(s)) < 0)
> return ret;
>
> diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
> index b945f49..588830f 100644
> --- a/libavfilter/avf_showcqt.h
> +++ b/libavfilter/avf_showcqt.h
> @@ -74,6 +74,7 @@ typedef struct {
> /* callback */
> void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
> int len, int fft_len);
> + void (*permute_coeffs)(float *v, int len);
> void (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
> const ColorFloat *c, int bar_h);
> void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
> @@ -112,4 +113,6 @@ typedef struct {
> int axis;
> } ShowCQTContext;
>
> +void ff_showcqt_init_x86(ShowCQTContext *s);
> +
> #endif
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index 4486b79..b6195f8 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
> OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
> OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
> OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
> +OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o
> OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
> OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
> OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o
> @@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
> ifdef CONFIG_GPL
> YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o
> endif
> +YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o
> YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
> YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o
> YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
> diff --git a/libavfilter/x86/avf_showcqt.asm b/libavfilter/x86/avf_showcqt.asm
> new file mode 100644
> index 0000000..ba30786
> --- /dev/null
> +++ b/libavfilter/x86/avf_showcqt.asm
> @@ -0,0 +1,206 @@
> +;*****************************************************************************
> +;* x86-optimized functions for showcqt filter
> +;*
> +;* Copyright (C) 2016 Muhammad Faiz <mfcc64 at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +%if ARCH_X86_64
> +%define pointer resq
> +%else
> +%define pointer resd
> +%endif
> +
> +struc Coeffs
> + .val: pointer 1
> + .start: resd 1
> + .len: resd 1
> + .sizeof:
> +endstruc
> +
> +%macro EMULATE_HADDPS 3 ; dst, src, tmp
> +%if cpuflag(sse3)
> + haddps %1, %2
> +%else
> + movaps %3, %1
> + shufps %1, %2, q2020
> + shufps %3, %2, q3131
> + addps %1, %3
This is great. Much better and more efficient than other attempts to
emulate haddps scattered across the codebase.
It also makes me wonder whether haddps, a ~5 cycle latency instruction,
is really enough of a win over the combination of a mostly free mov, two
1 cycle shuffles and one 3 cycle add to justify extra function versions
with it as the only difference, at least in cases where there are no
register constraints.
Your benchmarks above suggest it is, although barely, so I'm curious
what the timer.h macros will show.
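For anyone following along, what haddps computes, as a scalar C sketch
(function and variable names mine):

    #include <string.h>

    /* haddps dst, src: dst = { d0+d1, d2+d3, s0+s1, s2+s3 } */
    static void haddps_ref(float dst[4], const float src[4])
    {
        float r[4] = { dst[0] + dst[1], dst[2] + dst[3],
                       src[0] + src[1], src[2] + src[3] };
        memcpy(dst, r, sizeof(r));
    }

The two shufps above gather the even and odd lanes of both operands
({d0,d2,s0,s2} and {d1,d3,s1,s3}), so the final addps produces exactly
that vector.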
> +%endif
> +%endmacro ; EMULATE_HADDPS
> +
> +%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
> +%if cpuflag(fma3) || cpuflag(fma4)
> + fmaddps %1, %2, %3, %4
> +%else
> + mulps %5, %2, %3
> + addps %1, %4, %5
> +%endif
> +%endmacro ; EMULATE_FMADDPS
> +
> +%macro CQT_CALC 9
> +; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
> +; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
> + mov id, xd
> + add id, [coeffsq + Coeffs.start + %9]
> + movaps m%5, [srcq + 8 * iq]
> + movaps m%7, [srcq + 8 * iq + mmsize]
> + shufps m%6, m%5, m%7, q3131
> + shufps m%5, m%5, m%7, q2020
> + sub id, fft_lend
> + EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
> + neg id
Is this supposed to turn a positive value negative? If so it should be
"neg iq": a 32-bit "neg id" zero-extends into the full register, so on
x86_64 the high 32 bits of iq used in the effective addresses below
would be zero instead of sign-extended.
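To illustrate with a hypothetical i = 5 (any 32-bit write zero-extends
into the full 64-bit register on x86_64):

    neg id ; iq = 0x00000000FFFFFFFB, a huge positive index
    neg iq ; iq = 0xFFFFFFFFFFFFFFFB = -5, what [srcq + 8 * iq] expects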
> + EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
> + movups m%5, [srcq + 8 * iq - mmsize + 8]
> + movups m%7, [srcq + 8 * iq - 2*mmsize + 8]
> + %if mmsize == 32
> + vperm2f128 m%5, m%5, m%5, 1
> + vperm2f128 m%7, m%7, m%7, 1
> + %endif
> + shufps m%6, m%5, m%7, q1313
> + shufps m%5, m%5, m%7, q0202
> + EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
> + EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
> +%endmacro ; CQT_CALC
> +
> +%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
> + addps m%5, m%4, m%2
> + subps m%6, m%3, m%1
> + addps m%1, m%3
> + subps m%2, m%4
> + EMULATE_HADDPS m%5, m%6, m%3
> + EMULATE_HADDPS m%1, m%2, m%3
> + EMULATE_HADDPS m%1, m%5, m%2
> + %if mmsize == 32
> + vextractf128 xmm%2, m%1, 1
> + addps xmm%1, xmm%2
> + %endif
> +%endmacro ; CQT_SEPARATE
> +
> +%macro DECLARE_CQT_CALC 0
> +; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
> +%if ARCH_X86_64
> +cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
> + align 16
> + .loop_k:
> + mov xd, [coeffsq + Coeffs.len]
> + xorps m0, m0
> + movaps m1, m0
> + movaps m2, m0
> + mov coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
> + movaps m3, m0
> + movaps m8, m0
> + cmp coeffs_lend, xd
> + movaps m9, m0
> + movaps m10, m0
> + movaps m11, m0
> + cmova coeffs_lend, xd
> + xor xd, xd
> + test coeffs_lend, coeffs_lend
> + jz .check_loop_b
> + mov coeffs_valq, [coeffsq + Coeffs.val]
> + mov coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
> + align 16
> + .loop_ab:
> + movaps m7, [coeffs_valq + 4 * xq]
> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
> + movaps m7, [coeffs_val2q + 4 * xq]
> + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
> + add xd, mmsize/4
> + cmp xd, coeffs_lend
> + jb .loop_ab
> + .check_loop_b:
> + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
> + jae .check_loop_a
> + align 16
> + .loop_b:
> + movaps m7, [coeffs_val2q + 4 * xq]
> + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
> + add xd, mmsize/4
> + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
> + jb .loop_b
> + .loop_end:
> + CQT_SEPARATE 0, 1, 2, 3, 4, 5
> + CQT_SEPARATE 8, 9, 10, 11, 4, 5
> + mulps xmm0, xmm0
> + mulps xmm8, xmm8
> + EMULATE_HADDPS xmm0, xmm8, xmm1
> + movaps [dstq], xmm0
> + sub lend, 2
> + lea dstq, [dstq + 16]
Use add instead.
> + lea coeffsq, [coeffsq + 2*Coeffs.sizeof]
Same here, assuming Coeffs.sizeof is an immediate.
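i.e. something like this (untested; note that unlike lea, add clobbers
the flags, so the sub feeding jnz has to come last):

    add dstq, 16
    add coeffsq, 2 * Coeffs.sizeof
    sub lend, 2
    jnz .loop_k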
> + jnz .loop_k
> + REP_RET
> + align 16
> + .check_loop_a:
> + cmp xd, [coeffsq + Coeffs.len]
> + jae .loop_end
> + align 16
> + .loop_a:
> + movaps m7, [coeffs_valq + 4 * xq]
> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
> + add xd, mmsize/4
> + cmp xd, [coeffsq + Coeffs.len]
> + jb .loop_a
> + jmp .loop_end
> +%else
> +cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
> +%define fft_lend r4m
> + align 16
> + .loop_k:
> + mov xd, [coeffsq + Coeffs.len]
> + xorps m0, m0
> + movaps m1, m0
> + movaps m2, m0
> + movaps m3, m0
> + test xd, xd
> + jz .store
> + mov coeffs_valq, [coeffsq + Coeffs.val]
> + xor xd, xd
> + align 16
> + .loop_x:
> + movaps m7, [coeffs_valq + 4 * xq]
> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
> + add xd, mmsize/4
> + cmp xd, [coeffsq + Coeffs.len]
> + jb .loop_x
> + CQT_SEPARATE 0, 1, 2, 3, 4, 5
> + mulps xmm0, xmm0
> + EMULATE_HADDPS xmm0, xmm0, xmm1
> + .store:
> + movlps [dstq], xmm0
> + sub lend, 1
> + lea dstq, [dstq + 8]
> + lea coeffsq, [coeffsq + Coeffs.sizeof]
Same as above for both of these leas.
> + jnz .loop_k
> + REP_RET
> +%endif ; ARCH_X86_64
> +%endmacro ; DECLARE_CQT_CALC
> +
> +INIT_XMM sse
> +DECLARE_CQT_CALC
> +INIT_XMM sse3
> +DECLARE_CQT_CALC
> +INIT_YMM avx
> +DECLARE_CQT_CALC
> +INIT_YMM fma3
> +DECLARE_CQT_CALC
> +INIT_YMM fma4
All CPUs supporting FMA4 underperform in functions using ymm registers,
so make this version xmm instead (INIT_XMM fma4).
> +DECLARE_CQT_CALC
> diff --git a/libavfilter/x86/avf_showcqt_init.c b/libavfilter/x86/avf_showcqt_init.c
> new file mode 100644
> index 0000000..664c6ac
> --- /dev/null
> +++ b/libavfilter/x86/avf_showcqt_init.c
> @@ -0,0 +1,63 @@
> +/*
> + * Copyright (c) 2016 Muhammad Faiz <mfcc64 at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/avf_showcqt.h"
> +
> +#define DECLARE_CQT_CALC(type) \
> +void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
> + const Coeffs *coeffs, int len, int fft_len)
> +
> +DECLARE_CQT_CALC(sse);
> +DECLARE_CQT_CALC(sse3);
> +DECLARE_CQT_CALC(avx);
> +DECLARE_CQT_CALC(fma3);
> +DECLARE_CQT_CALC(fma4);
> +
> +#define permute_coeffs_0 NULL
> +
> +static void permute_coeffs_01452367(float *v, int len)
> +{
> + int k;
> + for (k = 0; k < len; k += 8) {
> + FFSWAP(float, v[k+2], v[k+4]);
> + FFSWAP(float, v[k+3], v[k+5]);
> + }
> +}
> +
> +av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
> +{
> + int cpuflags = av_get_cpu_flags();
> +
> +#define SELECT_CQT_CALC(type, TYPE, align, perm) \
> +if (EXTERNAL_##TYPE(cpuflags)) { \
> + s->cqt_calc = ff_showcqt_cqt_calc_##type; \
> + s->cqt_align = align; \
> + s->permute_coeffs = permute_coeffs_##perm; \
> +}
> +
> + SELECT_CQT_CALC(sse, SSE, 4, 0);
> + SELECT_CQT_CALC(sse3, SSE3, 4, 0);
> + SELECT_CQT_CALC(avx, AVX, 8, 01452367);
Use AVX_FAST, so this function will not be used on CPUs that set the
AV_CPU_FLAG_AVXSLOW flag.
> + SELECT_CQT_CALC(fma3, FMA3, 8, 01452367);
Same, use FMA3_FAST. The result will then be that the FMA3 version gets
used by Intel CPUs (and hopefully AMD Zen), and the FMA4 one by
Bulldozer-based CPUs.
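With the existing SELECT_CQT_CALC macro that should just be (untested,
assuming the EXTERNAL_AVX_FAST/EXTERNAL_FMA3_FAST variants in
libavutil/x86/cpu.h):

    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);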
> + SELECT_CQT_CALC(fma4, FMA4, 8, 01452367);
> +}
>