[FFmpeg-devel] [PATCH] avfilter/avf_showcqt: cqt_calc optimization on x86
James Almer
jamrial at gmail.com
Tue Jun 7 05:36:03 CEST 2016
On 6/4/2016 4:36 AM, Muhammad Faiz wrote:
> benchmark on x86_64
> cqt_time:
> plain = 3.292 s
> SSE = 1.640 s
> SSE3 = 1.631 s
> AVX = 1.395 s
> FMA3 = 1.271 s
> FMA4 = not available
Try using the START_TIMER and STOP_TIMER macros from libavutil/timer.h to
wrap the s->cqt_calc call in libavfilter/avf_showcqt.c.
They will potentially give more accurate results than the current
UPDATE_TIME(s->cqt_time) check.
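Something like this sketch, keeping whatever arguments the call already
passes (the context field names here are from memory):

    #include "libavutil/timer.h"

    {
        START_TIMER
        s->cqt_calc(s->cqt_result, s->fft_result, s->coeffs,
                    s->cqt_len, s->fft_len);
        STOP_TIMER("cqt_calc")
    }

STOP_TIMER prints average decicycles and run counts to stderr, which
averages out scheduling noise better than wall-clock measurements.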
>
> untested on x86_32
Do you have a sample command to test this? As Michael said, FATE doesn't
cover showcqt.
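For reference, I'd try something along these lines (untested, adapted
from the showcqt documentation):

    ffmpeg -f lavfi -i "sine=frequency=440:duration=10" \
           -filter_complex showcqt -f null -

but a known-good command from you would be better.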
>
> Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
> ---
> libavfilter/avf_showcqt.c | 7 ++
> libavfilter/avf_showcqt.h | 3 +
> libavfilter/x86/Makefile | 2 +
> libavfilter/x86/avf_showcqt.asm | 206 +++++++++++++++++++++++++++++++++++++
> libavfilter/x86/avf_showcqt_init.c | 63 ++++++++++++
> 5 files changed, 281 insertions(+)
> create mode 100644 libavfilter/x86/avf_showcqt.asm
> create mode 100644 libavfilter/x86/avf_showcqt_init.c
>
> diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
> index b88c83c..62d5b09 100644
> --- a/libavfilter/avf_showcqt.c
> +++ b/libavfilter/avf_showcqt.c
> @@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
> w *= sign * (1.0 / s->fft_len);
> s->coeffs[m].val[x - s->coeffs[m].start] = w;
> }
> +
> + if (s->permute_coeffs)
> + s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
> }
>
> av_expr_free(expr);
> @@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
>
> s->cqt_align = 1;
> s->cqt_calc = cqt_calc;
> + s->permute_coeffs = NULL;
> s->draw_sono = draw_sono;
> if (s->format == AV_PIX_FMT_RGB24) {
> s->draw_bar = draw_bar_rgb;
> @@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
> s->update_sono = update_sono_yuv;
> }
>
> + if (ARCH_X86)
> + ff_showcqt_init_x86(s);
> +
> if ((ret = init_cqt(s)) < 0)
> return ret;
>
> diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
> index b945f49..588830f 100644
> --- a/libavfilter/avf_showcqt.h
> +++ b/libavfilter/avf_showcqt.h
> @@ -74,6 +74,7 @@ typedef struct {
> /* callback */
> void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
> int len, int fft_len);
> + void (*permute_coeffs)(float *v, int len);
> void (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
> const ColorFloat *c, int bar_h);
> void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
> @@ -112,4 +113,6 @@ typedef struct {
> int axis;
> } ShowCQTContext;
>
> +void ff_showcqt_init_x86(ShowCQTContext *s);
> +
> #endif
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index 4486b79..b6195f8 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
> OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
> OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
> OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
> +OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o
> OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
> OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
> OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o
> @@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
> ifdef CONFIG_GPL
> YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o
> endif
> +YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o
> YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
> YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o
> YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
> diff --git a/libavfilter/x86/avf_showcqt.asm b/libavfilter/x86/avf_showcqt.asm
> new file mode 100644
> index 0000000..ba30786
> --- /dev/null
> +++ b/libavfilter/x86/avf_showcqt.asm
> @@ -0,0 +1,206 @@
> +;*****************************************************************************
> +;* x86-optimized functions for showcqt filter
> +;*
> +;* Copyright (C) 2016 Muhammad Faiz <mfcc64 at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +%if ARCH_X86_64
> +%define pointer resq
> +%else
> +%define pointer resd
> +%endif
> +
> +struc Coeffs
> + .val: pointer 1
> + .start: resd 1
> + .len: resd 1
> + .sizeof:
> +endstruc
> +
> +%macro EMULATE_HADDPS 3 ; dst, src, tmp
> +%if cpuflag(sse3)
> + haddps %1, %2
> +%else
> + movaps %3, %1
> + shufps %1, %2, q2020
> + shufps %3, %2, q3131
> + addps %1, %3
This is great. Much better and more efficient than other attempts to
emulate haddps scattered across the codebase.
It also makes me wonder whether haddps, a ~5 cycle latency instruction,
is really enough of a win over the combination of a mostly free mov, two
1 cycle shuffles and one 3 cycle add to justify extra function versions
with it as the only difference, at least in cases where there are no
register constraints.
Your benchmarks above suggest it is, although barely, so I'm curious
what the timer.h macros will show.
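For anyone following along, what haddps computes, as a scalar C sketch
(function and variable names mine):

    #include <string.h>

    /* haddps dst, src: dst = { d0+d1, d2+d3, s0+s1, s2+s3 } */
    static void haddps_ref(float dst[4], const float src[4])
    {
        float r[4] = { dst[0] + dst[1], dst[2] + dst[3],
                       src[0] + src[1], src[2] + src[3] };
        memcpy(dst, r, sizeof(r));
    }

The two shufps above gather the even and odd lanes of both operands
({d0,d2,s0,s2} and {d1,d3,s1,s3}), so the final addps produces exactly
that vector.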
> +%endif
> +%endmacro ; EMULATE_HADDPS
> +
> +%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
> +%if cpuflag(fma3) || cpuflag(fma4)
> + fmaddps %1, %2, %3, %4
> +%else
> + mulps %5, %2, %3
> + addps %1, %4, %5
> +%endif
> +%endmacro ; EMULATE_FMADDPS
> +
> +%macro CQT_CALC 9
> +; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
> +; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
> + mov id, xd
> + add id, [coeffsq + Coeffs.start + %9]
> + movaps m%5, [srcq + 8 * iq]
> + movaps m%7, [srcq + 8 * iq + mmsize]
> + shufps m%6, m%5, m%7, q3131
> + shufps m%5, m%5, m%7, q2020
> + sub id, fft_lend
> + EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
> + neg id
Is this supposed to turn a positive value negative? If so it should be
"neg iq": a 32-bit "neg id" zero-extends into the full register, so on
x86_64 the high 32 bits of iq used in the effective addresses below
would be zero instead of sign-extended.
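To illustrate with a hypothetical i = 5 (any 32-bit write zero-extends
into the full 64-bit register on x86_64):

    neg id ; iq = 0x00000000FFFFFFFB, a huge positive index
    neg iq ; iq = 0xFFFFFFFFFFFFFFFB = -5, what [srcq + 8 * iq] expects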
> + EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
> + movups m%5, [srcq + 8 * iq - mmsize + 8]
> + movups m%7, [srcq + 8 * iq - 2*mmsize + 8]
> + %if mmsize == 32
> + vperm2f128 m%5, m%5, m%5, 1
> + vperm2f128 m%7, m%7, m%7, 1
> + %endif
> + shufps m%6, m%5, m%7, q1313
> + shufps m%5, m%5, m%7, q0202
> + EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
> + EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
> +%endmacro ; CQT_CALC
> +
> +%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
> + addps m%5, m%4, m%2
> + subps m%6, m%3, m%1
> + addps m%1, m%3
> + subps m%2, m%4
> + EMULATE_HADDPS m%5, m%6, m%3
> + EMULATE_HADDPS m%1, m%2, m%3
> + EMULATE_HADDPS m%1, m%5, m%2
> + %if mmsize == 32
> + vextractf128 xmm%2, m%1, 1
> + addps xmm%1, xmm%2
> + %endif
> +%endmacro ; CQT_SEPARATE
> +
> +%macro DECLARE_CQT_CALC 0
> +; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
> +%if ARCH_X86_64
> +cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
> + align 16
> + .loop_k:
> + mov xd, [coeffsq + Coeffs.len]
> + xorps m0, m0
> + movaps m1, m0
> + movaps m2, m0
> + mov coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
> + movaps m3, m0
> + movaps m8, m0
> + cmp coeffs_lend, xd
> + movaps m9, m0
> + movaps m10, m0
> + movaps m11, m0
> + cmova coeffs_lend, xd
> + xor xd, xd
> + test coeffs_lend, coeffs_lend
> + jz .check_loop_b
> + mov coeffs_valq, [coeffsq + Coeffs.val]
> + mov coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
> + align 16
> + .loop_ab:
> + movaps m7, [coeffs_valq + 4 * xq]
> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
> + movaps m7, [coeffs_val2q + 4 * xq]
> + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
> + add xd, mmsize/4
> + cmp xd, coeffs_lend
> + jb .loop_ab
> + .check_loop_b:
> + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
> + jae .check_loop_a
> + align 16
> + .loop_b:
> + movaps m7, [coeffs_val2q + 4 * xq]
> + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
> + add xd, mmsize/4
> + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
> + jb .loop_b
> + .loop_end:
> + CQT_SEPARATE 0, 1, 2, 3, 4, 5
> + CQT_SEPARATE 8, 9, 10, 11, 4, 5
> + mulps xmm0, xmm0
> + mulps xmm8, xmm8
> + EMULATE_HADDPS xmm0, xmm8, xmm1
> + movaps [dstq], xmm0
> + sub lend, 2
> + lea dstq, [dstq + 16]
Use add instead.
> + lea coeffsq, [coeffsq + 2*Coeffs.sizeof]
Same here, assuming Coeffs.sizeof is an immediate.
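i.e. something like this (untested; note that unlike lea, add clobbers
the flags, so the sub feeding jnz has to come last):

    add dstq, 16
    add coeffsq, 2 * Coeffs.sizeof
    sub lend, 2
    jnz .loop_k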
> + jnz .loop_k
> + REP_RET
> + align 16
> + .check_loop_a:
> + cmp xd, [coeffsq + Coeffs.len]
> + jae .loop_end
> + align 16
> + .loop_a:
> + movaps m7, [coeffs_valq + 4 * xq]
> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
> + add xd, mmsize/4
> + cmp xd, [coeffsq + Coeffs.len]
> + jb .loop_a
> + jmp .loop_end
> +%else
> +cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
> +%define fft_lend r4m
> + align 16
> + .loop_k:
> + mov xd, [coeffsq + Coeffs.len]
> + xorps m0, m0
> + movaps m1, m0
> + movaps m2, m0
> + movaps m3, m0
> + test xd, xd
> + jz .store
> + mov coeffs_valq, [coeffsq + Coeffs.val]
> + xor xd, xd
> + align 16
> + .loop_x:
> + movaps m7, [coeffs_valq + 4 * xq]
> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
> + add xd, mmsize/4
> + cmp xd, [coeffsq + Coeffs.len]
> + jb .loop_x
> + CQT_SEPARATE 0, 1, 2, 3, 4, 5
> + mulps xmm0, xmm0
> + EMULATE_HADDPS xmm0, xmm0, xmm1
> + .store:
> + movlps [dstq], xmm0
> + sub lend, 1
> + lea dstq, [dstq + 8]
> + lea coeffsq, [coeffsq + Coeffs.sizeof]
Same as above for both of these leas.
> + jnz .loop_k
> + REP_RET
> +%endif ; ARCH_X86_64
> +%endmacro ; DECLARE_CQT_CALC
> +
> +INIT_XMM sse
> +DECLARE_CQT_CALC
> +INIT_XMM sse3
> +DECLARE_CQT_CALC
> +INIT_YMM avx
> +DECLARE_CQT_CALC
> +INIT_YMM fma3
> +DECLARE_CQT_CALC
> +INIT_YMM fma4
All CPUs supporting FMA4 underperform in functions using ymm registers,
so make this version xmm instead (INIT_XMM fma4).
> +DECLARE_CQT_CALC
> diff --git a/libavfilter/x86/avf_showcqt_init.c b/libavfilter/x86/avf_showcqt_init.c
> new file mode 100644
> index 0000000..664c6ac
> --- /dev/null
> +++ b/libavfilter/x86/avf_showcqt_init.c
> @@ -0,0 +1,63 @@
> +/*
> + * Copyright (c) 2016 Muhammad Faiz <mfcc64 at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/avf_showcqt.h"
> +
> +#define DECLARE_CQT_CALC(type) \
> +void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
> + const Coeffs *coeffs, int len, int fft_len)
> +
> +DECLARE_CQT_CALC(sse);
> +DECLARE_CQT_CALC(sse3);
> +DECLARE_CQT_CALC(avx);
> +DECLARE_CQT_CALC(fma3);
> +DECLARE_CQT_CALC(fma4);
> +
> +#define permute_coeffs_0 NULL
> +
> +static void permute_coeffs_01452367(float *v, int len)
> +{
> + int k;
> + for (k = 0; k < len; k += 8) {
> + FFSWAP(float, v[k+2], v[k+4]);
> + FFSWAP(float, v[k+3], v[k+5]);
> + }
> +}
> +
> +av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
> +{
> + int cpuflags = av_get_cpu_flags();
> +
> +#define SELECT_CQT_CALC(type, TYPE, align, perm) \
> +if (EXTERNAL_##TYPE(cpuflags)) { \
> + s->cqt_calc = ff_showcqt_cqt_calc_##type; \
> + s->cqt_align = align; \
> + s->permute_coeffs = permute_coeffs_##perm; \
> +}
> +
> + SELECT_CQT_CALC(sse, SSE, 4, 0);
> + SELECT_CQT_CALC(sse3, SSE3, 4, 0);
> + SELECT_CQT_CALC(avx, AVX, 8, 01452367);
Use AVX_FAST, so this function will not be used on CPUs that set the
AV_CPU_FLAG_AVXSLOW flag.
> + SELECT_CQT_CALC(fma3, FMA3, 8, 01452367);
Same, use FMA3_FAST. The result will then be that the FMA3 version gets
used by Intel CPUs (and hopefully AMD Zen), and the FMA4 one by
Bulldozer-based CPUs.
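With the existing SELECT_CQT_CALC macro that should just be (untested,
assuming the EXTERNAL_AVX_FAST/EXTERNAL_FMA3_FAST variants in
libavutil/x86/cpu.h):

    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);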
> + SELECT_CQT_CALC(fma4, FMA4, 8, 01452367);
> +}
>