[FFmpeg-devel] [PATCH] avfilter/avf_showcqt: cqt_calc optimization on x86
Muhammad Faiz
mfcc64 at gmail.com
Tue Jun 7 11:18:09 CEST 2016
On Tue, Jun 7, 2016 at 10:36 AM, James Almer <jamrial at gmail.com> wrote:
> On 6/4/2016 4:36 AM, Muhammad Faiz wrote:
>> benchmark on x86_64
>> cqt_time:
>> plain = 3.292 s
>> SSE = 1.640 s
>> SSE3 = 1.631 s
>> AVX = 1.395 s
>> FMA3 = 1.271 s
>> FMA4 = not available
>
> Try using the START_TIMER and STOP_TIMER macros to wrap the s->cqt_calc
> call in libavfilter/avf_showcqt.c. It will potentially give more
> accurate results than the current
> UPDATE_TIME(s->cqt_time) check.
>
OK, but I will probably check it privately (without sending a patch).
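For reference, wrapping the call with the timer.h macros would look
roughly like this (just a sketch; "dst", "src" and "len" are placeholder
names, not the actual variables at the call site):

    #include "libavutil/timer.h"

    {
        START_TIMER
        s->cqt_calc(dst, src, s->coeffs, len, s->fft_len);
        STOP_TIMER("cqt_calc")
    }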
>>
>> untested on x86_32
>
> Do you have a sample command to test this? As Michael said, FATE doesn't
> cover showcqt.
>
I check it using the psnr filter (average PSNR above 90 dB means OK):
#!/bin/bash
# example usage: ./psnr-check audio.mp3 yuv420p "-cpuflags -fma3-fma4-avx-sse3"
mkfifo in0.y4m
mkfifo in1.y4m
# this is the new ffmpeg
build_path=$HOME/Documents/sources/ffmpeg/ffmpeg-build
LD_LIBRARY_PATH=$build_path/libavcodec:$build_path/libavdevice:$build_path/libavfilter
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libavformat:$build_path/libavutil
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libpostproc:$build_path/libswresample
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libswscale
export LD_LIBRARY_PATH
$build_path/ffmpeg $3 -i "$1" \
    -filter_complex "showcqt, format=$2, format=yuv444p|yuv422p|yuv420p" \
    -f yuv4mpegpipe -y in0.y4m >/dev/null 2>&1 </dev/null &
# this is the old (installed) ffmpeg
unset LD_LIBRARY_PATH
ffmpeg $3 -i "$1" \
    -filter_complex "showcqt, format=$2, format=yuv444p|yuv422p|yuv420p" \
    -f yuv4mpegpipe -y in1.y4m >/dev/null 2>&1 </dev/null &
ffmpeg -i in0.y4m -i in1.y4m -filter_complex "psnr=f=-" -f null -y /dev/null
>>
>> Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
>> ---
>> libavfilter/avf_showcqt.c | 7 ++
>> libavfilter/avf_showcqt.h | 3 +
>> libavfilter/x86/Makefile | 2 +
>> libavfilter/x86/avf_showcqt.asm | 206 +++++++++++++++++++++++++++++++++++++
>> libavfilter/x86/avf_showcqt_init.c | 63 ++++++++++++
>> 5 files changed, 281 insertions(+)
>> create mode 100644 libavfilter/x86/avf_showcqt.asm
>> create mode 100644 libavfilter/x86/avf_showcqt_init.c
>>
>> diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
>> index b88c83c..62d5b09 100644
>> --- a/libavfilter/avf_showcqt.c
>> +++ b/libavfilter/avf_showcqt.c
>> @@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
>> w *= sign * (1.0 / s->fft_len);
>> s->coeffs[m].val[x - s->coeffs[m].start] = w;
>> }
>> +
>> + if (s->permute_coeffs)
>> + s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
>> }
>>
>> av_expr_free(expr);
>> @@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
>>
>> s->cqt_align = 1;
>> s->cqt_calc = cqt_calc;
>> + s->permute_coeffs = NULL;
>> s->draw_sono = draw_sono;
>> if (s->format == AV_PIX_FMT_RGB24) {
>> s->draw_bar = draw_bar_rgb;
>> @@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
>> s->update_sono = update_sono_yuv;
>> }
>>
>> + if (ARCH_X86)
>> + ff_showcqt_init_x86(s);
>> +
>> if ((ret = init_cqt(s)) < 0)
>> return ret;
>>
>> diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
>> index b945f49..588830f 100644
>> --- a/libavfilter/avf_showcqt.h
>> +++ b/libavfilter/avf_showcqt.h
>> @@ -74,6 +74,7 @@ typedef struct {
>> /* callback */
>> void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
>> int len, int fft_len);
>> + void (*permute_coeffs)(float *v, int len);
>> void (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
>> const ColorFloat *c, int bar_h);
>> void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
>> @@ -112,4 +113,6 @@ typedef struct {
>> int axis;
>> } ShowCQTContext;
>>
>> +void ff_showcqt_init_x86(ShowCQTContext *s);
>> +
>> #endif
>> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
>> index 4486b79..b6195f8 100644
>> --- a/libavfilter/x86/Makefile
>> +++ b/libavfilter/x86/Makefile
>> @@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
>> OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
>> OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
>> OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
>> +OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o
>> OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
>> OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
>> OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o
>> @@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
>> ifdef CONFIG_GPL
>> YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o
>> endif
>> +YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o
>> YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
>> YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o
>> YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
>> diff --git a/libavfilter/x86/avf_showcqt.asm b/libavfilter/x86/avf_showcqt.asm
>> new file mode 100644
>> index 0000000..ba30786
>> --- /dev/null
>> +++ b/libavfilter/x86/avf_showcqt.asm
>> @@ -0,0 +1,206 @@
>> +;*****************************************************************************
>> +;* x86-optimized functions for showcqt filter
>> +;*
>> +;* Copyright (C) 2016 Muhammad Faiz <mfcc64 at gmail.com>
>> +;*
>> +;* This file is part of FFmpeg.
>> +;*
>> +;* FFmpeg is free software; you can redistribute it and/or
>> +;* modify it under the terms of the GNU Lesser General Public
>> +;* License as published by the Free Software Foundation; either
>> +;* version 2.1 of the License, or (at your option) any later version.
>> +;*
>> +;* FFmpeg is distributed in the hope that it will be useful,
>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> +;* Lesser General Public License for more details.
>> +;*
>> +;* You should have received a copy of the GNU Lesser General Public
>> +;* License along with FFmpeg; if not, write to the Free Software
>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> +;******************************************************************************
>> +
>> +%include "libavutil/x86/x86util.asm"
>> +
>> +%if ARCH_X86_64
>> +%define pointer resq
>> +%else
>> +%define pointer resd
>> +%endif
>> +
>> +struc Coeffs
>> + .val: pointer 1
>> + .start: resd 1
>> + .len: resd 1
>> + .sizeof:
>> +endstruc
>> +
>> +%macro EMULATE_HADDPS 3 ; dst, src, tmp
>> +%if cpuflag(sse3)
>> + haddps %1, %2
>> +%else
>> + movaps %3, %1
>> + shufps %1, %2, q2020
>> + shufps %3, %2, q3131
>> + addps %1, %3
>
> This is great. Much better and more efficient than other attempts to
> emulate haddps scattered across the codebase.
> It also makes me wonder if haddps, a ~5 cycle latency instruction, is
> really better than the combination of a mostly free mov, two 1-cycle
> shuffles and one 3-cycle add, enough to justify extra functions with it
> as the only difference, at least in cases where there are no register
> constraints.
>
> Your benchmarks above suggest it is, although barely, so I'm curious
> about what the timer.h macros will show.
>
haddps is probably even slower than shufps+addps on some machines,
but haddps exists, so I use it.
Updated to use SSE3_FAST.
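For anyone comparing the two paths, here is a scalar C view of what both
the real haddps and the shufps+addps emulation compute (illustration
only, not code from the patch):

    /* haddps a, b  ==  shufps q2020 + shufps q3131 + addps:
     * pairwise sums of a in the low half, pairwise sums of b in the
     * high half (haddps itself writes the result back into a) */
    static void haddps_scalar(float dst[4], const float a[4], const float b[4])
    {
        dst[0] = a[0] + a[1];
        dst[1] = a[2] + a[3];
        dst[2] = b[0] + b[1];
        dst[3] = b[2] + b[3];
    }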
>> +%endif
>> +%endmacro ; EMULATE_HADDPS
>> +
>> +%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
>> +%if cpuflag(fma3) || cpuflag(fma4)
>> + fmaddps %1, %2, %3, %4
>> +%else
>> + mulps %5, %2, %3
>> + addps %1, %4, %5
>> +%endif
>> +%endmacro ; EMULATE_FMADDPS
>> +
>> +%macro CQT_CALC 9
>> +; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
>> +; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
>> + mov id, xd
>> + add id, [coeffsq + Coeffs.start + %9]
>> + movaps m%5, [srcq + 8 * iq]
>> + movaps m%7, [srcq + 8 * iq + mmsize]
>> + shufps m%6, m%5, m%7, q3131
>> + shufps m%5, m%5, m%7, q2020
>> + sub id, fft_lend
>> + EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
>> + neg id
>
> Is this supposed to turn a positive value negative? If so, then it should
> be "neg iq", otherwise on x86_64 the high 32 bits of iq used in the
> effective addresses below would be zero.
>
This is the intended behavior: i = fft_len - i, evaluated as int rather than int64_t.
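To spell out the arithmetic (a sketch with placeholder names, not the
reference C implementation):

    static int mirror_index(int x, int start, int fft_len)
    {
        int i = x + start;  /* mov id, xd; add id, [coeffsq + Coeffs.start] */
        i -= fft_len;       /* sub id, fft_lend: negative at this point     */
        i = -i;             /* neg id: i = fft_len - (x + start), positive  */
        /* the 32-bit write zero-extends into iq, so the 64-bit addresses
         * [srcq + 8 * iq - ...] that follow are still correct */
        return i;
    }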
>> + EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
>> + movups m%5, [srcq + 8 * iq - mmsize + 8]
>> + movups m%7, [srcq + 8 * iq - 2*mmsize + 8]
>> + %if mmsize == 32
>> + vperm2f128 m%5, m%5, m%5, 1
>> + vperm2f128 m%7, m%7, m%7, 1
>> + %endif
>> + shufps m%6, m%5, m%7, q1313
>> + shufps m%5, m%5, m%7, q0202
>> + EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
>> + EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
>> +%endmacro ; CQT_CALC
>> +
>> +%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
>> + addps m%5, m%4, m%2
>> + subps m%6, m%3, m%1
>> + addps m%1, m%3
>> + subps m%2, m%4
>> + EMULATE_HADDPS m%5, m%6, m%3
>> + EMULATE_HADDPS m%1, m%2, m%3
>> + EMULATE_HADDPS m%1, m%5, m%2
>> + %if mmsize == 32
>> + vextractf128 xmm%2, m%1, 1
>> + addps xmm%1, xmm%2
>> + %endif
>> +%endmacro ; CQT_SEPARATE
>> +
>> +%macro DECLARE_CQT_CALC 0
>> +; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
>> +%if ARCH_X86_64
>> +cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
>> + align 16
>> + .loop_k:
>> + mov xd, [coeffsq + Coeffs.len]
>> + xorps m0, m0
>> + movaps m1, m0
>> + movaps m2, m0
>> + mov coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
>> + movaps m3, m0
>> + movaps m8, m0
>> + cmp coeffs_lend, xd
>> + movaps m9, m0
>> + movaps m10, m0
>> + movaps m11, m0
>> + cmova coeffs_lend, xd
>> + xor xd, xd
>> + test coeffs_lend, coeffs_lend
>> + jz .check_loop_b
>> + mov coeffs_valq, [coeffsq + Coeffs.val]
>> + mov coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
>> + align 16
>> + .loop_ab:
>> + movaps m7, [coeffs_valq + 4 * xq]
>> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>> + movaps m7, [coeffs_val2q + 4 * xq]
>> + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
>> + add xd, mmsize/4
>> + cmp xd, coeffs_lend
>> + jb .loop_ab
>> + .check_loop_b:
>> + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
>> + jae .check_loop_a
>> + align 16
>> + .loop_b:
>> + movaps m7, [coeffs_val2q + 4 * xq]
>> + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
>> + add xd, mmsize/4
>> + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
>> + jb .loop_b
>> + .loop_end:
>> + CQT_SEPARATE 0, 1, 2, 3, 4, 5
>> + CQT_SEPARATE 8, 9, 10, 11, 4, 5
>> + mulps xmm0, xmm0
>> + mulps xmm8, xmm8
>> + EMULATE_HADDPS xmm0, xmm8, xmm1
>> + movaps [dstq], xmm0
>> + sub lend, 2
>> + lea dstq, [dstq + 16]
>
> Use add
>
>> + lea coeffsq, [coeffsq + 2*Coeffs.sizeof]
>
> Same, assuming sizeof is an immediate.
>
This is an optimization: the leas let me separate the sub from the jnz,
whereas using add would clobber the flags register in between.
Also, lea does not need a REX prefix.
>> + jnz .loop_k
>> + REP_RET
>> + align 16
>> + .check_loop_a:
>> + cmp xd, [coeffsq + Coeffs.len]
>> + jae .loop_end
>> + align 16
>> + .loop_a:
>> + movaps m7, [coeffs_valq + 4 * xq]
>> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>> + add xd, mmsize/4
>> + cmp xd, [coeffsq + Coeffs.len]
>> + jb .loop_a
>> + jmp .loop_end
>> +%else
>> +cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
>> +%define fft_lend r4m
>> + align 16
>> + .loop_k:
>> + mov xd, [coeffsq + Coeffs.len]
>> + xorps m0, m0
>> + movaps m1, m0
>> + movaps m2, m0
>> + movaps m3, m0
>> + test xd, xd
>> + jz .store
>> + mov coeffs_valq, [coeffsq + Coeffs.val]
>> + xor xd, xd
>> + align 16
>> + .loop_x:
>> + movaps m7, [coeffs_valq + 4 * xq]
>> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>> + add xd, mmsize/4
>> + cmp xd, [coeffsq + Coeffs.len]
>> + jb .loop_x
>> + CQT_SEPARATE 0, 1, 2, 3, 4, 5
>> + mulps xmm0, xmm0
>> + EMULATE_HADDPS xmm0, xmm0, xmm1
>> + .store:
>> + movlps [dstq], xmm0
>> + sub lend, 1
>> + lea dstq, [dstq + 8]
>> + lea coeffsq, [coeffsq + Coeffs.sizeof]
>
> Same as above for both of these leas.
>
Same answer.
>> + jnz .loop_k
>> + REP_RET
>> +%endif ; ARCH_X86_64
>> +%endmacro ; DECLARE_CQT_CALC
>> +
>> +INIT_XMM sse
>> +DECLARE_CQT_CALC
>> +INIT_XMM sse3
>> +DECLARE_CQT_CALC
>> +INIT_YMM avx
>> +DECLARE_CQT_CALC
>> +INIT_YMM fma3
>> +DECLARE_CQT_CALC
>> +INIT_YMM fma4
>
> All CPUs supporting FMA4 underperform in functions using ymm registers.
> Make it xmm instead.
>
OK, updated.
>> +DECLARE_CQT_CALC
>> diff --git a/libavfilter/x86/avf_showcqt_init.c b/libavfilter/x86/avf_showcqt_init.c
>> new file mode 100644
>> index 0000000..664c6ac
>> --- /dev/null
>> +++ b/libavfilter/x86/avf_showcqt_init.c
>> @@ -0,0 +1,63 @@
>> +/*
>> + * Copyright (c) 2016 Muhammad Faiz <mfcc64 at gmail.com>
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/attributes.h"
>> +#include "libavutil/cpu.h"
>> +#include "libavutil/x86/cpu.h"
>> +#include "libavfilter/avf_showcqt.h"
>> +
>> +#define DECLARE_CQT_CALC(type) \
>> +void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
>> + const Coeffs *coeffs, int len, int fft_len)
>> +
>> +DECLARE_CQT_CALC(sse);
>> +DECLARE_CQT_CALC(sse3);
>> +DECLARE_CQT_CALC(avx);
>> +DECLARE_CQT_CALC(fma3);
>> +DECLARE_CQT_CALC(fma4);
>> +
>> +#define permute_coeffs_0 NULL
>> +
>> +static void permute_coeffs_01452367(float *v, int len)
>> +{
>> + int k;
>> + for (k = 0; k < len; k += 8) {
>> + FFSWAP(float, v[k+2], v[k+4]);
>> + FFSWAP(float, v[k+3], v[k+5]);
>> + }
>> +}
>> +
>> +av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
>> +{
>> + int cpuflags = av_get_cpu_flags();
>> +
>> +#define SELECT_CQT_CALC(type, TYPE, align, perm) \
>> +if (EXTERNAL_##TYPE(cpuflags)) { \
>> + s->cqt_calc = ff_showcqt_cqt_calc_##type; \
>> + s->cqt_align = align; \
>> + s->permute_coeffs = permute_coeffs_##perm; \
>> +}
>> +
>> + SELECT_CQT_CALC(sse, SSE, 4, 0);
>> + SELECT_CQT_CALC(sse3, SSE3, 4, 0);
>> + SELECT_CQT_CALC(avx, AVX, 8, 01452367);
>
> Use AVX_FAST, so this function will not be used on CPUs that set the
> AV_CPU_FLAG_AVXSLOW flag.
>
>> + SELECT_CQT_CALC(fma3, FMA3, 8, 01452367);
>
> Same, use FMA3_FAST. The result will then be the FMA3 version used by
> Intel CPUs and hopefully AMD Zen, and the FMA4 one by Bulldozer based
> CPUs.
>
>> + SELECT_CQT_CALC(fma4, FMA4, 8, 01452367);
>> +}
>>
>
OK, I also reordered the selections (FMA4 before AVX, because AVX/ymm without
FMA4 is faster than FMA4/xmm).
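For reference, the reordered selection could look like this (a sketch;
the align/permute values for the xmm FMA4 version are my assumption, the
rest follows the review comments above):

    SELECT_CQT_CALC(sse,  SSE,       4, 0);
    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
    /* FMA4 now uses xmm, so it keeps the SSE-style alignment and needs no
     * coefficient permutation; putting it before AVX lets the faster
     * AVX/ymm and FMA3/ymm versions override it when they are available. */
    SELECT_CQT_CALC(fma4, FMA4,      4, 0);
    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);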
Modified patch attached.
Thanks.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-avfilter-avf_showcqt-cqt_calc-optimization-on-x86.patch
Type: text/x-patch
Size: 12745 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160607/ba93921d/attachment.bin>