[MPlayer-dev-eng] Fwd: [xine-devel] memcpy.c speedup patch
Arpi
arpi at thot.banki.hu
Thu Jan 9 16:09:31 CET 2003
Maybe we should port these changes to fastmemcpy?
--------- Forwarded message ---------
From: Jonathan Brown <jbrown at emergence.uk.net>
To: xine-devel at lists.sourceforge.net
Subject: [xine-devel] memcpy.c speedup patch
I found the same mistake in both sse_memcpy and mmx2_memcpy: both
assume that prefetchnta prefetches 64 bytes. In fact, the P3
prefetches 32 bytes and the P4 prefetches 128 bytes. The patch fixes this
for the P3 by issuing a prefetchnta every 32 bytes. If you want to optimize
for the P4, you should really use movdqa/movdqu.
Please apply to the tree.
CC any replies to me as I am not on the list.
Here are the results on my machine:
------------------------------------------------------
WITHOUT PATCH
------------------------------------------------------
Run 1 glibc memcpy() : 886602411
linux kernel memcpy() : 853727718
MMX optimized memcpy() : 813381025
MMXEXT optimized memcpy() : 611290441
SSE optimized memcpy() : 622632865
Run 2 glibc memcpy() : 853194421
linux kernel memcpy() : 853906195
MMX optimized memcpy() : 811439122
MMXEXT optimized memcpy() : 615564847
SSE optimized memcpy() : 622085572
Run 3 glibc memcpy() : 874273009
linux kernel memcpy() : 853243982
MMX optimized memcpy() : 811123010
MMXEXT optimized memcpy() : 612914732
SSE optimized memcpy() : 623385322
Run 4 glibc memcpy() : 872962086
linux kernel memcpy() : 852637064
MMX optimized memcpy() : 812423490
MMXEXT optimized memcpy() : 616541095
SSE optimized memcpy() : 622359380
Run 5 glibc memcpy() : 871590819
linux kernel memcpy() : 854843246
MMX optimized memcpy() : 812416964
MMXEXT optimized memcpy() : 613631746
SSE optimized memcpy() : 620818979
------------------------------------------------------
WITH PATCH
------------------------------------------------------
Run 1 glibc memcpy() : 875260935
linux kernel memcpy() : 855195229
MMX optimized memcpy() : 815879691
MMXEXT optimized memcpy() : 510607166
SSE optimized memcpy() : 515002950
Run 2 glibc memcpy() : 853993138
linux kernel memcpy() : 855955195
MMX optimized memcpy() : 811581836
MMXEXT optimized memcpy() : 527623830
SSE optimized memcpy() : 512657609
Run 3 glibc memcpy() : 872485301
linux kernel memcpy() : 851106279
MMX optimized memcpy() : 811177856
MMXEXT optimized memcpy() : 509514430
SSE optimized memcpy() : 517702228
Run 4 glibc memcpy() : 879030747
linux kernel memcpy() : 856619835
MMX optimized memcpy() : 817141239
MMXEXT optimized memcpy() : 512585798
SSE optimized memcpy() : 515385082
Run 5 glibc memcpy() : 874519760
linux kernel memcpy() : 853971343
MMX optimized memcpy() : 813392177
MMXEXT optimized memcpy() : 508638682
SSE optimized memcpy() : 514397110
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 8
model name : Pentium III (Coppermine)
stepping : 6
cpu MHz : 800.067
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 2
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
pat pse36 mmx fxsr sse
bogomips : 1585.15
diff -ur xine-lib-1-beta2/src/xine-utils/memcpy.c my-xine-lib-1-beta2/src/xine-utils/memcpy.c
--- xine-lib-1-beta2/src/xine-utils/memcpy.c 2002-12-22 20:48:28.000000000 +0000
+++ my-xine-lib-1-beta2/src/xine-utils/memcpy.c 2003-01-09 06:11:36.000000000 +0000
@@ -168,10 +168,15 @@
/* PREFETCH has effect even for MOVSB instruction ;) */
__asm__ __volatile__ (
" prefetchnta (%0)\n"
+ " prefetchnta 32(%0)\n"
" prefetchnta 64(%0)\n"
+ " prefetchnta 96(%0)\n"
" prefetchnta 128(%0)\n"
+ " prefetchnta 160(%0)\n"
" prefetchnta 192(%0)\n"
+ " prefetchnta 224(%0)\n"
" prefetchnta 256(%0)\n"
+ " prefetchnta 288(%0)\n"
: : "r" (from) );
if(len >= MIN_LEN)
@@ -193,6 +198,7 @@
{
__asm__ __volatile__ (
"prefetchnta 320(%0)\n"
+ "prefetchnta 352(%0)\n"
"movups (%0), %%xmm0\n"
"movups 16(%0), %%xmm1\n"
"movups 32(%0), %%xmm2\n"
@@ -215,6 +221,7 @@
{
__asm__ __volatile__ (
"prefetchnta 320(%0)\n"
+ "prefetchnta 352(%0)\n"
"movaps (%0), %%xmm0\n"
"movaps 16(%0), %%xmm1\n"
"movaps 32(%0), %%xmm2\n"
@@ -300,10 +307,15 @@
/* PREFETCH has effect even for MOVSB instruction ;) */
__asm__ __volatile__ (
" prefetchnta (%0)\n"
+ " prefetchnta 32(%0)\n"
" prefetchnta 64(%0)\n"
+ " prefetchnta 96(%0)\n"
" prefetchnta 128(%0)\n"
+ " prefetchnta 160(%0)\n"
" prefetchnta 192(%0)\n"
+ " prefetchnta 224(%0)\n"
" prefetchnta 256(%0)\n"
+ " prefetchnta 288(%0)\n"
: : "r" (from) );
if(len >= MIN_LEN)
@@ -323,6 +335,7 @@
{
__asm__ __volatile__ (
"prefetchnta 320(%0)\n"
+ "prefetchnta 352(%0)\n"
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
-------------------------------------------------------
This SF.NET email is sponsored by:
SourceForge Enterprise Edition + IBM + LinuxWorld = Something 2 See!
http://www.vasoftware.com
_______________________________________________
xine-devel mailing list
xine-devel at lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/xine-devel
More information about the MPlayer-dev-eng mailing list