[FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide
James Almer
jamrial at gmail.com
Sun Feb 14 02:26:58 CET 2016
On 2/13/2016 9:27 PM, Timothy Gu wrote:
> ---
>
> The reason this function uses SSE4.1 is the roundps instruction. I would
> love to find a way to truncate a float to an integer in SSE2.
>
> ---
> libavfilter/x86/vf_blend.asm | 32 ++++++++++++++++++++++++++++++++
> libavfilter/x86/vf_blend_init.c | 6 ++++++
> 2 files changed, 38 insertions(+)
>
> diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
> index a5ea74c..dac04d7 100644
> --- a/libavfilter/x86/vf_blend.asm
> +++ b/libavfilter/x86/vf_blend.asm
> @@ -24,6 +24,7 @@
>
> SECTION_RODATA
>
> +ps_255: times 4 dd 255.0
> pw_1: times 8 dw 1
> pw_128: times 8 dw 128
> pw_255: times 8 dw 255
> @@ -285,3 +286,34 @@ INIT_XMM sse2
> BLEND_ABS
> INIT_XMM ssse3
> BLEND_ABS
> +
> +INIT_XMM sse4
> +BLEND_INIT divide, 4
> + pxor m2, m2
> + mova m3, [ps_255]
> +.nextrow:
> + mov xq, widthq
> +
> + .loop:
> + movd m0, [topq + xq] ; 000000xx
> + movd m1, [bottomq + xq]
> + punpcklbw m0, m2 ; 00000x0x
> + punpcklbw m1, m2
Assuming you keep using SSE4, you could replace the movd loads and both zero-register unpack steps (punpcklbw and punpcklwd) with:
pmovzxbd m0, [topq + xq]
pmovzxbd m1, [bottomq + xq]
> + punpcklwd m0, m2 ; 000x000x
> + punpcklwd m1, m2
> +
> + cvtdq2ps m0, m0
> + cvtdq2ps m1, m1
> + divps m0, m1 ; a / b
> + mulps m0, m3 ; a / b * 255
> + roundps m0, m0, 3 ; truncate
> + minps m0, m3
Are these two instructions (roundps and minps) really needed? At a quick glance, GCC seems to
generate more or less the same code you're using here, minus these two (convert to float,
div, mul, convert to int, saturate to uint8_t).
> + cvtps2dq m0, m0
> +
> + packusdw m0, m0 ; 00000x0x
> + packuswb m0, m0 ; 000000xx
> + movd [dstq + xq], m0
> + add xq, mmsize / 4
> +
> + jl .loop
> +BLEND_END
> diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
> index a6baf94..f542870 100644
> --- a/libavfilter/x86/vf_blend_init.c
> +++ b/libavfilter/x86/vf_blend_init.c
> @@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2)
> BLEND_FUNC(difference, ssse3)
> BLEND_FUNC(negation, sse2)
> BLEND_FUNC(negation, ssse3)
> +BLEND_FUNC(divide, sse4)
>
> av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
> {
> @@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
> case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break;
> }
> }
> + if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) {
> + switch (param->mode) {
> + case BLEND_DIVIDE: param->blend = ff_blend_divide_sse4; break;
> + }
> + }
> }
>
More information about the ffmpeg-devel
mailing list