[FFmpeg-devel] [PATCH 2/2] avfilter/x86/vf_gblur: add postscale SIMD
James Almer
jamrial at gmail.com
Mon Feb 15 19:44:39 EET 2021
On 2/14/2021 11:32 AM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
> libavfilter/x86/vf_gblur.asm | 49 +++++++++++++++++++++++++++++++++
> libavfilter/x86/vf_gblur_init.c | 17 ++++++++++--
> 2 files changed, 63 insertions(+), 3 deletions(-)
>
> diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
> index a25b1659f5..8ccfbdc56b 100644
> --- a/libavfilter/x86/vf_gblur.asm
> +++ b/libavfilter/x86/vf_gblur.asm
> @@ -183,3 +183,52 @@ HORIZ_SLICE
> INIT_XMM avx2
> HORIZ_SLICE
> %endif
> +
> +%macro POSTSCALE_SLICE 0
> +%if UNIX64
> +cglobal postscale_slice, 2, 3, 4, ptr, length, x
This should be "cglobal postscale_slice, 2, 2, 4, ptr, length"; the extra x register is not needed (see below).
> +%else
> +cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
Likewise here: "cglobal postscale_slice, 5, 5, 4, ptr, length, postscale, min, max".
> +%endif
> + shl lengthd, 2
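Follow the shift with an add and a neg, so that ptrq points at the end of the buffer and lengthq becomes a negative byte offset that counts up to zero:
shl lengthd, 2
add ptrq, lengthq
neg lengthq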
> +%if WIN64
> + SWAP 0, 2
> + SWAP 1, 3
> + SWAP 2, 4
> +%endif
> +%if cpuflag(avx2)
> + vbroadcastss m0, xm0
> + vbroadcastss m1, xm1
> + vbroadcastss m2, xm2
> +%else
> + shufps xm0, xm0, 0
> + shufps xm1, xm1, 0
> + shufps xm2, xm2, 0
> +%endif
> + xor xq, xq
Remove this instruction; the x counter is no longer needed once lengthq is used as the offset.
> +
> + .loop:
> +%if cpuflag(avx2)
> + mulps m3, m0, [ptrq + xq]
Replace xq with lengthq here and everywhere else.
> +%else
> + movu m3, [ptrq + xq]
> + mulps m3, m0
> +%endif
> + maxps m3, m1
> + minps m3, m2
> + movu [ptrq+xq], m3
> +
> + add xq, mmsize
> + cmp xd, lengthd
Remove this cmp. With lengthq counting up from a negative offset, the add already sets the flags that jl tests.
> + jl .loop
> +
> + RET
> +%endmacro
> +
> +INIT_XMM sse
> +POSTSCALE_SLICE
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +POSTSCALE_SLICE
> +%endif
> diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
> index e63e59fe23..9223cb797d 100644
> --- a/libavfilter/x86/vf_gblur_init.c
> +++ b/libavfilter/x86/vf_gblur_init.c
> @@ -27,14 +27,25 @@
> void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
> void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
>
> +void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
> +void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
> +
> av_cold void ff_gblur_init_x86(GBlurContext *s)
> {
> -#if ARCH_X86_64
> int cpu_flags = av_get_cpu_flags();
>
> - if (EXTERNAL_SSE4(cpu_flags))
> + if (EXTERNAL_SSE(cpu_flags)) {
> + s->postscale_slice = ff_postscale_slice_sse;
> + }
> + if (EXTERNAL_AVX2(cpu_flags)) {
Use EXTERNAL_AVX2_FAST here instead of EXTERNAL_AVX2, since this version uses ymm registers.
> + s->postscale_slice = ff_postscale_slice_avx2;
> + }
> +#if ARCH_X86_64
> + if (EXTERNAL_SSE4(cpu_flags)) {
> s->horiz_slice = ff_horiz_slice_sse4;
> - if (EXTERNAL_AVX2(cpu_flags))
> + }
> + if (EXTERNAL_AVX2(cpu_flags)) {
> s->horiz_slice = ff_horiz_slice_avx2;
> + }
> #endif
> }
LGTM with the above.
More information about the ffmpeg-devel mailing list