[FFmpeg-devel] [WIP] [PATCH 4/4] x86: dsputilenc: convert hf_noise*_mmx to yasm

Mon Jun 2 16:53:22 CEST 2014

2014-06-02 15:38 GMT+02:00 Christophe Gisquet <christophe.gisquet at gmail.com>:
> but unless I'm
> mistaken, the solution I proposed earlier is much simpler.

See attached patch on top of Timothy's.

> Also, don't
> hesitate to run objdump -d on the object file just to make sure you
> missed nothing.

Doing so (running objdump -d) helps notice issues with HF_NOISE_PART2.

-- 
Christophe
-------------- next part --------------

diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 3fdc006..84cb7b3 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -634,9 +634,9 @@ SUM_ABS_DCTELEM 6, 2
 %endmacro
 
 ; %1-2 = m#
-%macro HF_NOISE_PART2 2
-    psubw      m0, m4
-    psubw      m2, m5
+%macro HF_NOISE_PART2 4
+    psubw     m%1, m%3
+    psubw     m%2, m%4
     pxor       m3, m3
     pxor       m1, m1
     pcmpgtw    m3, m%1
@@ -653,24 +653,20 @@ SUM_ABS_DCTELEM 6, 2
 %macro HF_NOISE 1
 cglobal hf_noise%1, 3,3,0, pix1, lsize, h
     movsxdifnidn lsizeq, lsized
-%if %1 == 16
-    push pix1q
-    push hq
-%endif
     sub        hd, 2
     pxor       m7, m7
     pxor       m6, m6
     HF_NOISE_PART1 %1, 0, 1, 2, 3
     add     pix1q, lsizeq
     HF_NOISE_PART1 %1, 4, 1, 5, 3
-    HF_NOISE_PART2     0, 2
+    HF_NOISE_PART2     0, 2, 4, 5
     add     pix1q, lsizeq
 .loop:
     HF_NOISE_PART1 %1, 0, 1, 2, 3
-    HF_NOISE_PART2     4, 5
+    HF_NOISE_PART2     4, 5, 0, 2
     add     pix1q, lsizeq
     HF_NOISE_PART1 %1, 4, 1, 5, 3
-    HF_NOISE_PART2     0, 2
+    HF_NOISE_PART2     0, 2, 4, 5
     add     pix1q, lsizeq
     sub        hd, 2
         jne .loop
@@ -682,18 +678,8 @@ cglobal hf_noise%1, 3,3,0, pix1, lsize, h
     mova       m0, m6
     psrlq      m6, 32
     paddd      m0, m6
-%if %1 == 16
-    movd      ebx, m0   ; ebx = result of hf_noise16;
-    pop        hq       ; restore h and pix1
-    pop     pix1q
-    ; lsize is unchanged (except movsxd, which hf_noise8 is going to do anyway)
-    add     pix1q, 8    ; pix1 = pix1 + 8;
-    call    hf_noise8   ; eax = hf_noise8_mmx(pix1, lsize, h);
-    add       eax, ebx  ; eax = eax + ebx;
-%else
     movd      eax, m0   ; eax = result of hf_noise8;
-%endif
-    RET                 ; return eax;
+    REP_RET                 ; return eax;
 %endmacro
 
 INIT_MMX mmx
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index f215347..e180486 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -77,8 +77,8 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
         score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
     else
         score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
-    score2 = ff_hf_noise16_mmx(pix1, line_size, h) -
-             ff_hf_noise16_mmx(pix2, line_size, h);
+    score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h)
+           - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;