[FFmpeg-devel] [WIP] [PATCH 4/4] x86: dsputilenc: convert hf_noise*_mmx to yasm

Mon Jun 2 03:36:43 CEST 2014

On Thu, May 29, 2014 at 08:56:04PM -0700, Timothy Gu wrote:
> Signed-off-by: Timothy Gu <timothygu99 at gmail.com>
> ---
> hf_noise16 segfaults with make fate-vsynth1-mpeg4-nsse, but I don't know
> how to fix it. Also, did I get the `call` right?
> ---
>  libavcodec/x86/dsputilenc.asm   |  91 ++++++++++++++
>  libavcodec/x86/dsputilenc_mmx.c | 265 ++--------------------------------------
>  2 files changed, 101 insertions(+), 255 deletions(-)
> 
> diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
> index 7064024..09bbe95 100644
> --- a/libavcodec/x86/dsputilenc.asm
> +++ b/libavcodec/x86/dsputilenc.asm
> @@ -608,3 +608,94 @@ INIT_XMM sse2
>  SUM_ABS_DCTELEM 7, 2
>  INIT_XMM ssse3
>  SUM_ABS_DCTELEM 6, 2
> +
> +;------------------------------------------------------------------------------
> +; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h)
> +;------------------------------------------------------------------------------
> +; %1 = 8/16. %2-5=m#
> +%macro HF_NOISE_PART1 5
> +    mova      m%2, [pix1q]
> +%if %1 == 8
> +    mova      m%3, m%2
> +    psllq     m%2, 8
> +    psrlq     m%3, 8
> +    psrlq     m%2, 8
> +%else
> +    mova      m%3, [pix1q+1]
> +%endif
> +    mova      m%4, m%2
> +    mova      m%5, m%3
> +    punpcklbw m%2, m7
> +    punpcklbw m%3, m7
> +    punpckhbw m%4, m7
> +    punpckhbw m%5, m7
> +    psubw     m%2, m%3
> +    psubw     m%4, m%5
> +%endmacro
> +
> +; %1-2 = m#
> +%macro HF_NOISE_PART2 2
> +    psubw      m0, m4
> +    psubw      m2, m5
> +    pxor       m3, m3
> +    pxor       m1, m1
> +    pcmpgtw    m3, m%1
> +    pcmpgtw    m1, m%2
> +    pxor      m%1, m3
> +    pxor      m%2, m1
> +    psubw     m%1, m3
> +    psubw     m%2, m1
> +    paddw     m%2, m%1
> +    paddw      m6, m%2
> +%endmacro
> +
> +; %1 = 8/16
> +%macro HF_NOISE 1
> +cglobal hf_noise%1, 3,3,0, pix1, lsize, h
> +    movsxdifnidn lsizeq, lsized
> +%if %1 == 16
> +    push pix1q
> +    push hq
> +%endif

Don't use push/pop; they can mess up the yasm magic macros.
You can use PUSH/POP, but it is better not to use them either; there should
be enough registers.


> +    sub        hd, 2
> +    pxor       m7, m7
> +    pxor       m6, m6
> +    HF_NOISE_PART1 %1, 0, 1, 2, 3
> +    add     pix1q, lsizeq
> +    HF_NOISE_PART1 %1, 4, 1, 5, 3
> +    HF_NOISE_PART2     0, 2
> +    add     pix1q, lsizeq
> +.loop:
> +    HF_NOISE_PART1 %1, 0, 1, 2, 3
> +    HF_NOISE_PART2     4, 5
> +    add     pix1q, lsizeq
> +    HF_NOISE_PART1 %1, 4, 1, 5, 3
> +    HF_NOISE_PART2     0, 2
> +    add     pix1q, lsizeq
> +    sub        hd, 2
> +        jne .loop
> +
> +    mova       m0, m6
> +    punpcklwd  m0, m7
> +    punpckhwd  m6, m7
> +    paddd      m6, m0
> +    mova       m0, m6
> +    psrlq      m6, 32
> +    paddd      m0, m6
> +%if %1 == 16

> +    movd      ebx, m0   ; ebx = result of hf_noise16;

You can't just write into a random register.
Declare a local variable in the cglobal macro above and use it instead.


> +    pop        hq       ; restore h and pix1
> +    pop     pix1q
> +    ; lsize is unchanged (except movsxd, which hf_noise8 is going to do anyway)
> +    add     pix1q, 8    ; pix1 = pix1 + 8;

> +    call    hf_noise8   ; eax = hf_noise8_mmx(pix1, lsize, h);

Don't call cglobal functions; if you do, you would have to emulate the
calling conventions of all ABIs. x86_32 would pass arguments on the
stack, for example.

Also, looking at the disassembly of the function with gdb and the
register values when it crashes (if it does), or single-stepping through
the code with gdb, should help you understand what the problem is, or the
difference between what you want and what the computer actually does.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Rewriting code that is poorly written but fully understood is good.
Rewriting code that one doesn't understand is a sign that one is less smart
than the original author; trying to rewrite it will not make it better.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140602/214633bc/attachment.asc>