[FFmpeg-devel] [PATCH] x86/vf_blend: Add SSE4.1 optimization for divide

Sun Feb 14 02:26:58 CET 2016

On 2/13/2016 9:27 PM, Timothy Gu wrote:
> ---
> 
> The reason this function requires SSE4.1 is the roundps instruction. I would
> love to find a way to truncate a float to an integer using only SSE2.
> 
> ---
>  libavfilter/x86/vf_blend.asm    | 32 ++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_blend_init.c |  6 ++++++
>  2 files changed, 38 insertions(+)
> 
> diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
> index a5ea74c..dac04d7 100644
> --- a/libavfilter/x86/vf_blend.asm
> +++ b/libavfilter/x86/vf_blend.asm
> @@ -24,6 +24,7 @@
>  
>  SECTION_RODATA
>  
> +ps_255: times 4 dd 255.0
>  pw_1:   times 8 dw 1
>  pw_128: times 8 dw 128
>  pw_255: times 8 dw 255
> @@ -285,3 +286,34 @@ INIT_XMM sse2
>  BLEND_ABS
>  INIT_XMM ssse3
>  BLEND_ABS
> +
> +INIT_XMM sse4
> +BLEND_INIT divide, 4
> +    pxor       m2, m2
> +    mova       m3, [ps_255]
> +.nextrow:
> +    mov        xq, widthq
> +
> +    .loop:
> +        movd            m0, [topq + xq]      ; 000000xx
> +        movd            m1, [bottomq + xq]
> +        punpcklbw       m0, m2               ; 00000x0x
> +        punpcklbw       m1, m2

Assuming you keep using SSE4, you could replace the movd loads and both unpack
steps (punpcklbw above and punpcklwd below) with

pmovzxbd m0, [topq + xq]
pmovzxbd m1, [bottomq + xq]

> +        punpcklwd       m0, m2               ; 000x000x
> +        punpcklwd       m1, m2
> +
> +        cvtdq2ps        m0, m0
> +        cvtdq2ps        m1, m1
> +        divps           m0, m1               ; a / b
> +        mulps           m0, m3               ; a / b * 255
> +        roundps         m0, m0, 3            ; truncate
> +        minps           m0, m3

Are these two (roundps and minps) really needed? After a quick glance, GCC seems
to simply generate more or less the same code you're using here, sans these two
(convert to float, div, mul, convert to int, saturate to uint8_t).

> +        cvtps2dq        m0, m0
> +
> +        packusdw        m0, m0               ; 00000x0x
> +        packuswb        m0, m0               ; 000000xx
> +        movd   [dstq + xq], m0
> +        add             xq, mmsize / 4
> +
> +    jl .loop
> +BLEND_END
> diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
> index a6baf94..f542870 100644
> --- a/libavfilter/x86/vf_blend_init.c
> +++ b/libavfilter/x86/vf_blend_init.c
> @@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2)
>  BLEND_FUNC(difference, ssse3)
>  BLEND_FUNC(negation, sse2)
>  BLEND_FUNC(negation, ssse3)
> +BLEND_FUNC(divide, sse4)
>  
>  av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
>  {
> @@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
>          case BLEND_NEGATION:   param->blend = ff_blend_negation_ssse3;   break;
>          }
>      }
> +    if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) {
> +        switch (param->mode) {
> +        case BLEND_DIVIDE:   param->blend = ff_blend_divide_sse4;   break;
> +        }
> +    }
>  }
>