[FFmpeg-devel] [PATCH 2/2] avfilter/x86/vf_gblur: add postscale SIMD
James Almer
jamrial at gmail.com
Sun Feb 14 03:36:02 EET 2021
On 2/13/2021 8:10 AM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
> libavfilter/x86/vf_gblur.asm | 46 +++++++++++++++++++++++++++++++++
> libavfilter/x86/vf_gblur_init.c | 11 ++++++--
> 2 files changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
> index a25b1659f5..8fea6d2a61 100644
> --- a/libavfilter/x86/vf_gblur.asm
> +++ b/libavfilter/x86/vf_gblur.asm
> @@ -183,3 +183,49 @@ HORIZ_SLICE
> INIT_XMM avx2
> HORIZ_SLICE
> %endif
> +
> +%macro POSTSCALE_SLICE 0
> +%if UNIX64
> +cglobal postscale_slice, 2, 6, 4, ptr, length, postscale, min, max, x
cglobal postscale_slice, 2, 3, 4, ptr, length, x
> +%else
> +cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
> +%endif
> + shl lengthd, 2
> +%if WIN64
> + SWAP 0, 2
> + SWAP 1, 3
> + SWAP 2, 4
> +%endif
> + shufps xm0, xm0, 0
> + shufps xm1, xm1, 0
> + shufps xm2, xm2, 0
> +%if cpuflag(avx2)
> + vinsertf128 m0, m0, xm0, 1
> + vinsertf128 m1, m1, xm1, 1
> + vinsertf128 m2, m2, xm2, 1
You can use vbroadcastss ymm, xmm with AVX2, which combines both the
shufps and vinsertf128 into one instruction.
As is, this function is base AVX. So if you can't measure any
performance gain with vbroadcastss, then just mark the function as AVX.
> +%endif
> + xor xq, xq
> +
> + .loop:
> + movu m3, [ptrq + xq]
> + mulps m3, m0
AVX can use unaligned memory operands, so just do
mulps m3, m0, [ptrq + xq]
But keep the explicit movu + mulps for the SSE version, otherwise x86inc
will expand it into a mova.
> + maxps m3, m1
> + minps m3, m2
> + movu [ptrq+xq], m3
> +
> + add xq, mmsize
> + cmp xd, lengthd
Can't you use the neg trick? It should let you reuse length instead of x.
> + jl .loop
> +
> + RET
> +%endmacro
> +
> +%if ARCH_X86_64
Nothing in this function seems to require x86_64.
> +INIT_XMM sse4
No instruction is SSE4 here. It's all base SSE.
> +POSTSCALE_SLICE
> +
> +%if HAVE_AVX_EXTERNAL
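Wrong check. The function instantiated below is declared with INIT_YMM avx2, so the guard should be HAVE_AVX2_EXTERNAL; HAVE_AVX_EXTERNAL is only correct if the function is re-marked as AVX, as suggested above.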
> +INIT_YMM avx2
> +POSTSCALE_SLICE
> +%endif
> +%endif
> diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
> index e63e59fe23..7a9b40b0ad 100644
> --- a/libavfilter/x86/vf_gblur_init.c
> +++ b/libavfilter/x86/vf_gblur_init.c
> @@ -27,14 +27,21 @@
> void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
> void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
>
> +void ff_postscale_slice_sse4(float *ptr, int length, float postscale, float min, float max);
> +void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
> +
> av_cold void ff_gblur_init_x86(GBlurContext *s)
> {
> #if ARCH_X86_64
> int cpu_flags = av_get_cpu_flags();
>
> - if (EXTERNAL_SSE4(cpu_flags))
> + if (EXTERNAL_SSE4(cpu_flags)) {
> s->horiz_slice = ff_horiz_slice_sse4;
> - if (EXTERNAL_AVX2(cpu_flags))
> + s->postscale_slice = ff_postscale_slice_sse4;
> + }
> + if (EXTERNAL_AVX2(cpu_flags)) {
> s->horiz_slice = ff_horiz_slice_avx2;
> + s->postscale_slice = ff_postscale_slice_avx2;
Needs to be EXTERNAL_AVX2_FAST. You're using ymm regs, unlike in
ff_horiz_slice_avx2.
> + }
> #endif
> }
>
More information about the ffmpeg-devel mailing list