[FFmpeg-devel] [PATCH] x86/me_cmp: port mmxext and sse2 sad functions to yasm
Michael Niedermayer
michaelni at gmx.at
Mon Sep 15 00:12:06 CEST 2014
On Sat, Sep 13, 2014 at 10:12:12PM -0300, James Almer wrote:
> Also add a missing c->pix_abs[0][0] initialization, and sse2 versions of
> sad16_x2, sad16_y2 and sad16_xy2.
> Since the _xy2 versions are not bitexact, they are accordingly marked as
> approximate.
>
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
> Not benched.
If the author of some code doesn't benchmark it, how can he know
which way is faster?
What effect does each difference have? ...
>
> libavcodec/x86/me_cmp.asm | 229 +++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/me_cmp_init.c | 203 +++++++++-----------------------------
> 2 files changed, 278 insertions(+), 154 deletions(-)
>
> diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
> index b0741f3..68dc701 100644
> --- a/libavcodec/x86/me_cmp.asm
> +++ b/libavcodec/x86/me_cmp.asm
> @@ -23,6 +23,10 @@
>
> %include "libavutil/x86/x86util.asm"
>
> +SECTION_RODATA
> +
> +cextern pb_1
> +
> SECTION .text
>
> %macro DIFF_PIXELS_1 4
> @@ -465,3 +469,228 @@ cglobal hf_noise%1, 3,3,0, pix1, lsize, h
> INIT_MMX mmx
> HF_NOISE 8
> HF_NOISE 16
> +
> +;---------------------------------------------------------------------------------------
> +;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
> +;---------------------------------------------------------------------------------------
> +%macro SAD 1
> +cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
> +%if %1 == mmsize
> + shr hd, 1
> +%define STRIDE strideq
> +%else
> +%define STRIDE 8
> +%endif
> + pxor m2, m2
> +
> +align 16
> +.loop
> + movu m0, [pix2q]
> + movu m1, [pix2q+STRIDE]
> + psadbw m0, [pix1q]
> + psadbw m1, [pix1q+STRIDE]
> + paddw m2, m0
> + paddw m2, m1
> +%if %1 == mmsize
> + lea pix1q, [pix1q+strideq*2]
> + lea pix2q, [pix2q+strideq*2]
> +%else
> + add pix1q, strideq
> + add pix2q, strideq
> +%endif
> + dec hd
> + jg .loop
The other loops use jnz; why the difference?
> +%if mmsize == 16
> + movhlps m0, m2
> + paddw m2, m0
> +%endif
> + movd eax, m2
> + RET
> +%endmacro
> +
> +INIT_MMX mmxext
> +SAD 8
> +SAD 16
> +INIT_XMM sse2
> +SAD 16
> +
> +;------------------------------------------------------------------------------------------
> +;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
> +;------------------------------------------------------------------------------------------
> +%macro SAD_X2 1
> +cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
> +%if %1 == mmsize
> + shr hd, 1
> +%define STRIDE strideq
> +%else
> +%define STRIDE 8
> +%endif
> + pxor m0, m0
> +
> +align 16
Do these "align 16" directives improve or reduce the speed?
> +.loop:
> + movu m1, [pix2q]
> + movu m2, [pix2q+STRIDE]
> +%if cpuflag(sse2)
> + movu m3, [pix2q+1]
> + movu m4, [pix2q+STRIDE+1]
> + pavgb m1, m3
> + pavgb m2, m4
> +%else
> + pavgb m1, [pix2q+1]
> + pavgb m2, [pix2q+STRIDE+1]
> +%endif
> + psadbw m1, [pix1q]
> + psadbw m2, [pix1q+STRIDE]
> + paddw m0, m1
> + paddw m0, m2
> +%if %1 == mmsize
> + lea pix1q, [pix1q+2*strideq]
> + lea pix2q, [pix2q+2*strideq]
> +%else
> + add pix1q, strideq
> + add pix2q, strideq
> +%endif
> + dec hd
dec/inc have speed penalties on some CPUs;
see section 16.2 in http://www.agner.org/optimize/optimizing_assembly.pdf
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
If you think the mosad wants you dead since a long time then you are either
wrong or dead since a long time.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140915/11d6ec28/attachment.asc>
More information about the ffmpeg-devel
mailing list