[FFmpeg-devel] [PATCH] x86/me_cmp: port mmxext and sse2 sad functions to yasm

Mon Sep 15 00:12:06 CEST 2014

On Sat, Sep 13, 2014 at 10:12:12PM -0300, James Almer wrote:
> Also add a missing c->pix_abs[0][0] initialization, and sse2 versions of
> sad16_x2, sad16_y2 and sad16_xy2.
> Since the _xy2 versions are not bitexact, they are accordingly marked as
> approximate.
> 
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---

> Not benched.

If the author of some code doesn't benchmark it, how can he know
which way is faster,
or what effect each difference has? ...

> 
>  libavcodec/x86/me_cmp.asm    | 229 +++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/me_cmp_init.c | 203 +++++++++-----------------------------
>  2 files changed, 278 insertions(+), 154 deletions(-)
> 
> diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
> index b0741f3..68dc701 100644
> --- a/libavcodec/x86/me_cmp.asm
> +++ b/libavcodec/x86/me_cmp.asm
> @@ -23,6 +23,10 @@
>  
>  %include "libavutil/x86/x86util.asm"
>  
> +SECTION_RODATA
> +
> +cextern pb_1
> +
>  SECTION .text
>  
>  %macro DIFF_PIXELS_1 4
> @@ -465,3 +469,228 @@ cglobal hf_noise%1, 3,3,0, pix1, lsize, h
>  INIT_MMX mmx
>  HF_NOISE 8
>  HF_NOISE 16
> +
> +;---------------------------------------------------------------------------------------
> +;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
> +;---------------------------------------------------------------------------------------
> +%macro SAD 1
> +cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
> +%if %1 == mmsize
> +    shr hd, 1
> +%define STRIDE strideq
> +%else
> +%define STRIDE 8
> +%endif
> +    pxor      m2, m2
> +
> +align 16
> +.loop
> +    movu      m0, [pix2q]
> +    movu      m1, [pix2q+STRIDE]
> +    psadbw    m0, [pix1q]
> +    psadbw    m1, [pix1q+STRIDE]
> +    paddw     m2, m0
> +    paddw     m2, m1
> +%if %1 == mmsize
> +    lea    pix1q, [pix1q+strideq*2]
> +    lea    pix2q, [pix2q+strideq*2]
> +%else
> +    add    pix1q, strideq
> +    add    pix2q, strideq
> +%endif

> +    dec       hd
> +    jg .loop

The other loops use jnz; why the difference?

> +%if mmsize == 16
> +    movhlps   m0, m2
> +    paddw     m2, m0
> +%endif
> +    movd     eax, m2
> +    RET
> +%endmacro
> +
> +INIT_MMX mmxext
> +SAD 8
> +SAD 16
> +INIT_XMM sse2
> +SAD 16
> +
> +;------------------------------------------------------------------------------------------
> +;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
> +;------------------------------------------------------------------------------------------
> +%macro SAD_X2 1
> +cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
> +%if %1 == mmsize
> +    shr       hd, 1
> +%define STRIDE strideq
> +%else
> +%define STRIDE 8
> +%endif
> +    pxor      m0, m0
> +

> +align 16

Do these "align 16" directives improve or reduce the speed?

> +.loop:
> +    movu      m1, [pix2q]
> +    movu      m2, [pix2q+STRIDE]
> +%if cpuflag(sse2)
> +    movu      m3, [pix2q+1]
> +    movu      m4, [pix2q+STRIDE+1]
> +    pavgb     m1, m3
> +    pavgb     m2, m4
> +%else
> +    pavgb     m1, [pix2q+1]
> +    pavgb     m2, [pix2q+STRIDE+1]
> +%endif
> +    psadbw    m1, [pix1q]
> +    psadbw    m2, [pix1q+STRIDE]
> +    paddw     m0, m1
> +    paddw     m0, m2
> +%if %1 == mmsize
> +    lea    pix1q, [pix1q+2*strideq]
> +    lea    pix2q, [pix2q+2*strideq]
> +%else
> +    add    pix1q, strideq
> +    add    pix2q, strideq
> +%endif

> +    dec       hd

dec/inc have speed penalties on some CPUs;
see section 16.2 in http://www.agner.org/optimize/optimizing_assembly.pdf

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

If you think the mosad wants you dead since a long time then you are either
wrong or dead since a long time.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140915/11d6ec28/attachment.asc>