[FFmpeg-devel] [PATCH 2/2] all: do standards compliant absdiff computation
Clément Bœsch
u at pkh.me
Sun Aug 23 23:28:20 CEST 2015
On Sun, Aug 23, 2015 at 11:23:54PM +0200, Clément Bœsch wrote:
> On Sun, Aug 23, 2015 at 11:58:23AM -0400, Ganesh Ajjanagadde wrote:
> [...]
> > diff --git a/libavfilter/vf_hqx.c b/libavfilter/vf_hqx.c
> > index fa15d9c..0178793 100644
> > --- a/libavfilter/vf_hqx.c
> > +++ b/libavfilter/vf_hqx.c
> > @@ -65,9 +65,9 @@ static av_always_inline int yuv_diff(uint32_t yuv1, uint32_t yuv2)
> > #define YMASK 0xff0000
> > #define UMASK 0x00ff00
> > #define VMASK 0x0000ff
> > - return abs((yuv1 & YMASK) - (yuv2 & YMASK)) > (48 << 16) ||
> > - abs((yuv1 & UMASK) - (yuv2 & UMASK)) > ( 7 << 8) ||
> > - abs((yuv1 & VMASK) - (yuv2 & VMASK)) > ( 6 << 0);
> > + return FFUABSDIFF(yuv1 & YMASK, yuv2 & YMASK) > (48 << 16) ||
> > + FFUABSDIFF(yuv1 & UMASK, yuv2 & UMASK) > ( 7 << 8) ||
> > + FFUABSDIFF(yuv1 & VMASK, yuv2 & VMASK) > ( 6 << 0);
>
> This is one of the bottleneck function of the filter. How does it affect
> speed? Can you compare the generated ASM?
>
To answer my second question:
[/tmp]☭ cat a.c
#include <stdlib.h>
#include <stdint.h>
#define YMASK 0xff0000
#define UMASK 0x00ff00
#define VMASK 0x0000ff
#define FFUABSDIFF(a,b) (((a) > (b)) ? ((a)-(b)) : ((b)-(a)))
int yuv_diff_0(uint32_t yuv1, uint32_t yuv2)
{
return abs((yuv1 & YMASK) - (yuv2 & YMASK)) > (48 << 16) ||
abs((yuv1 & UMASK) - (yuv2 & UMASK)) > ( 7 << 8) ||
abs((yuv1 & VMASK) - (yuv2 & VMASK)) > ( 6 << 0);
}
int yuv_diff_1(uint32_t yuv1, uint32_t yuv2)
{
return FFUABSDIFF(yuv1 & YMASK, yuv2 & YMASK) > (48 << 16) ||
FFUABSDIFF(yuv1 & UMASK, yuv2 & UMASK) > ( 7 << 8) ||
FFUABSDIFF(yuv1 & VMASK, yuv2 & VMASK) > ( 6 << 0);
}
[/tmp]☭ gcc -Wall -O2 -c a.c && objdump -d -Mintel a.o
a.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <yuv_diff_0>:
0: 89 f8 mov eax,edi
2: 89 f2 mov edx,esi
4: 81 e2 00 00 ff 00 and edx,0xff0000
a: 25 00 00 ff 00 and eax,0xff0000
f: 29 d0 sub eax,edx
11: 99 cdq
12: 31 d0 xor eax,edx
14: 29 d0 sub eax,edx
16: 89 c2 mov edx,eax
18: b8 01 00 00 00 mov eax,0x1
1d: 81 fa 00 00 30 00 cmp edx,0x300000
23: 7f 3e jg 63 <yuv_diff_0+0x63>
25: 89 fa mov edx,edi
27: 89 f1 mov ecx,esi
29: 81 e1 00 ff 00 00 and ecx,0xff00
2f: 81 e2 00 ff 00 00 and edx,0xff00
35: 29 ca sub edx,ecx
37: 89 d1 mov ecx,edx
39: c1 f9 1f sar ecx,0x1f
3c: 31 ca xor edx,ecx
3e: 29 ca sub edx,ecx
40: 81 fa 00 07 00 00 cmp edx,0x700
46: 7f 1b jg 63 <yuv_diff_0+0x63>
48: 40 0f b6 ff movzx edi,dil
4c: 40 0f b6 f6 movzx esi,sil
50: 29 f7 sub edi,esi
52: 89 f8 mov eax,edi
54: c1 f8 1f sar eax,0x1f
57: 31 c7 xor edi,eax
59: 29 c7 sub edi,eax
5b: 31 c0 xor eax,eax
5d: 83 ff 06 cmp edi,0x6
60: 0f 9f c0 setg al
63: f3 c3 repz ret
65: 90 nop
66: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
6d: 00 00 00
0000000000000070 <yuv_diff_1>:
70: 89 fa mov edx,edi
72: 89 f0 mov eax,esi
74: 81 e2 00 00 ff 00 and edx,0xff0000
7a: 25 00 00 ff 00 and eax,0xff0000
7f: 39 c2 cmp edx,eax
81: 76 4d jbe d0 <yuv_diff_1+0x60>
83: 29 c2 sub edx,eax
85: b8 01 00 00 00 mov eax,0x1
8a: 81 fa 00 00 30 00 cmp edx,0x300000
90: 77 7e ja 110 <yuv_diff_1+0xa0>
92: 89 fa mov edx,edi
94: 89 f0 mov eax,esi
96: 81 e2 00 ff 00 00 and edx,0xff00
9c: 25 00 ff 00 00 and eax,0xff00
a1: 39 c2 cmp edx,eax
a3: 76 43 jbe e8 <yuv_diff_1+0x78>
a5: 29 c2 sub edx,eax
a7: b8 01 00 00 00 mov eax,0x1
ac: 81 fa 00 07 00 00 cmp edx,0x700
b2: 77 74 ja 128 <yuv_diff_1+0xb8>
b4: 40 0f b6 ff movzx edi,dil
b8: 40 0f b6 f6 movzx esi,sil
bc: 39 f7 cmp edi,esi
be: 76 58 jbe 118 <yuv_diff_1+0xa8>
c0: 29 f7 sub edi,esi
c2: 31 c0 xor eax,eax
c4: 83 ff 06 cmp edi,0x6
c7: 0f 97 c0 seta al
ca: c3 ret
cb: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
d0: 29 d0 sub eax,edx
d2: 89 c2 mov edx,eax
d4: b8 01 00 00 00 mov eax,0x1
d9: 81 fa 00 00 30 00 cmp edx,0x300000
df: 76 b1 jbe 92 <yuv_diff_1+0x22>
e1: f3 c3 repz ret
e3: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
e8: 29 d0 sub eax,edx
ea: 89 c2 mov edx,eax
ec: b8 01 00 00 00 mov eax,0x1
f1: 81 fa 00 07 00 00 cmp edx,0x700
f7: 77 e8 ja e1 <yuv_diff_1+0x71>
f9: 40 0f b6 ff movzx edi,dil
fd: 40 0f b6 f6 movzx esi,sil
101: 39 f7 cmp edi,esi
103: 76 13 jbe 118 <yuv_diff_1+0xa8>
105: eb b9 jmp c0 <yuv_diff_1+0x50>
107: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
10e: 00 00
110: f3 c3 repz ret
112: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
118: 29 fe sub esi,edi
11a: 31 c0 xor eax,eax
11c: 83 fe 06 cmp esi,0x6
11f: 0f 97 c0 seta al
122: c3 ret
123: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
128: f3 c3 repz ret
[/tmp]☭
I must say I'm slightly uncomfortable with that change.
--
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150823/ee0e0291/attachment.sig>
More information about the ffmpeg-devel
mailing list