[Ffmpeg-devel] gcc4 support & MMX fixups (from Debian)
Paweł Sikora
pluto
Tue Jan 31 23:37:04 CET 2006
Dnia Tuesday, 31 of January 2006 21:25, matthieu castet napisał:
> Hi Paweł,
>
> Paweł Sikora wrote:
> > Hi all,
> >
> > I have an implementation of transpose4x4 in C which uses gcc's vector
> > extensions. It puts less pressure on the register allocator and allows
> > optimal code scheduling.
> >
> > Instantiation of attached patch e.g. in foo(dst, src, 4, 4)
> > gives a nice piece of code:
> >
> > [ x86-64 example ]
> >
> > foo: movd 4(%rsi), %mm0
> > movd (%rsi), %mm1
> > movd 8(%rsi), %mm2
> > movd 12(%rsi), %mm3
> > punpcklbw %mm0, %mm1
> > punpcklbw %mm3, %mm2
> > movq %mm1, %mm0
> > punpckhwd %mm2, %mm1
> > punpcklwd %mm2, %mm0
> > movd %mm1, 8(%rdi)
> > punpckhdq %mm1, %mm1
> > movd %mm0, (%rdi)
> > punpckhdq %mm0, %mm0
> > movd %mm1, 12(%rdi)
> > movd %mm0, 4(%rdi)
> > ret
> >
> > actually gcc-4.1 has a good optimizer and happy asm. hardcoding
> > doesn't introduce incredible performance boost but only degradation
> > of code scheduling.
>
> Could you post a benchmark comparing the 2 versions?
I did a simple benchmark with transpose4x4 marked with attribute noinline.
results:
orig: iters = 1000000000, dt = 7.92 [avg]
fixed: iters = 1000000000, dt = 7.35 [avg]
we gain: ~7.2%
hardware:
cpu: athlon64-3000+
ram: 2x512MB geil.
mb: gigabyte K8U-939 ULi socket 939
moreover, for the x86-64 version we can speed up transpose4x4.
a simple change of the `stride` parameters from `int` to `long` changes the code from:
orig_transpose4x4:
leal (%rdx,%rdx), %r9d
leal (%rcx,%rcx), %eax
movslq %edx,%r11
movslq %ecx,%r8
movslq %r9d,%r10
addl %edx, %r9d
movslq %eax,%rdx
addl %ecx, %eax
movslq %r9d,%r9
cltq
#APP
movd (%rsi), %mm0
movd (%rsi,%r8), %mm1
movd (%rsi,%rdx), %mm2
movd (%rsi,%rax), %mm3
punpcklbw %mm1, %mm0
punpcklbw %mm3, %mm2
movq %mm0, %mm1
punpcklwd %mm2, %mm0
punpckhwd %mm2, %mm1
movd %mm0, (%rdi)
punpckhdq %mm0, %mm0
movd %mm0, (%rdi,%r11)
movd %mm1, (%rdi,%r10)
punpckhdq %mm1, %mm1
movd %mm1, (%rdi,%r9)
#NO_APP
ret
fixed_transpose4x4:
movslq %ecx,%rax
movd (%rsi), %mm1
movd (%rsi,%rax), %mm3
leal (%rcx,%rcx), %eax
movslq %eax,%r8
addl %ecx, %eax
punpcklbw %mm3, %mm1
cltq
movd (%rsi,%r8), %mm2
movd (%rsi,%rax), %mm0
movslq %edx,%rax
punpcklbw %mm0, %mm2
movq %mm1, %mm0
punpcklwd %mm2, %mm0
punpckhwd %mm2, %mm1
movd %mm0, (%rdi)
punpckhdq %mm0, %mm0
movd %mm0, (%rdi,%rax)
leal (%rdx,%rdx), %eax
movslq %eax,%rcx
addl %edx, %eax
movd %mm1, (%rdi,%rcx)
punpckhdq %mm1, %mm1
cltq
movd %mm1, (%rdi,%rax)
ret
to:
orig_transpose4x4:
leaq (%rdx,%rdx,2), %r8
leaq (%rcx,%rcx,2), %rax
#APP
movd (%rsi), %mm0
movd (%rsi,%rcx), %mm1
movd (%rsi,%rcx,2), %mm2
movd (%rax,%rsi), %mm3
punpcklbw %mm1, %mm0
punpcklbw %mm3, %mm2
movq %mm0, %mm1
punpcklwd %mm2, %mm0
punpckhwd %mm2, %mm1
movd %mm0, (%rdi)
punpckhdq %mm0, %mm0
movd %mm0, (%rdi,%rdx)
movd %mm1, (%rdi,%rdx,2)
punpckhdq %mm1, %mm1
movd %mm1, (%r8,%rdi)
#NO_APP
ret
fixed_transpose4x4:
movd (%rsi,%rcx), %mm3
movd (%rsi,%rcx,2), %mm2
leaq (%rcx,%rcx,2), %rcx
movd (%rsi), %mm1
movd (%rcx,%rsi), %mm0
punpcklbw %mm3, %mm1
punpcklbw %mm0, %mm2
movq %mm1, %mm0
punpckhwd %mm2, %mm1
punpcklwd %mm2, %mm0
movd %mm0, (%rdi)
punpckhdq %mm0, %mm0
movd %mm0, (%rdi,%rdx)
movd %mm1, (%rdi,%rdx,2)
punpckhdq %mm1, %mm1
leaq (%rdx,%rdx,2), %rdx
movd %mm1, (%rdx,%rdi)
ret
as you can see, the function is smaller and faster.
this change reduces the total time from 7.35s to 6.24s,
and the difference between the two implementations becomes unmeasurable.
--
to_be || !to_be == 1, to_be | ~to_be == -1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: test.c
Type: text/x-csrc
Size: 2766 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20060131/580c3c80/attachment.c>
More information about the ffmpeg-devel
mailing list