[Mplayer-users] [mplayer PATCH] fastmemcpy SSE support

Nick Kurshev nickols_k at mail.ru
Sat Apr 14 22:48:23 CEST 2001


Hello!

I tried to implement SSE support (through 128-bits registers) into fastmemcpy.
This code is completely untested by me because my hardware doesn't allow me to test it.
But I hope this code may be faster on processors with SSE.
(But it may not be, and it may not even work.) Anyway, it would be better to test it
on your Celeron-2 before applying.

Also I have one question: on my hardware, sometimes (but frequently enough)
while playing MPEG4-DivX ;) files, at about the halfway point of the file the audio and video
streams go out of sync. But after pressing the left and right arrow keys the streams sync again.
Where are problems? In codecs or in mplayer?

And last: Please look at libvo/x11-common.c. There are

#ifdef X11_FULLSCREEN
...
static int dpms_disabled=0;
static int timeout_save=0;
...
#endif

void saver_on(Display *mDisplay) {

    int nothing;
    if (dpms_disabled)

I.e. when X11_FULLSCREEN is undefined the source cannot be compiled.
My friend has a video card with video out but has no X11-devel package installed
(i.e. the *.h files are missing).
Is there any way to compile mplayer in such a situation?

Best regards! Nick

========== PATCH ==================

--- fastmemcpy.old	Thu Apr 12 14:40:10 2001
+++ fastmemcpy.h	Sat Apr 14 20:00:07 2001
@@ -27,60 +27,57 @@
 	  len&=63;
 	  
 	__asm__ __volatile__ (
-		"1: prefetchnta (%0)\n"		/* This set is 28 bytes */
-		"   prefetchnta 64(%0)\n"
-		"   prefetchnta 128(%0)\n"
-		"   prefetchnta 192(%0)\n"
-		"   prefetchnta 256(%0)\n"
-#if 0		
-		"2:  \n"
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
-#endif		
+		"prefetchnta (%0)\n"
+		"prefetchnta 64(%0)\n"
+		"prefetchnta 128(%0)\n"
+		"prefetchnta 192(%0)\n"
+		"prefetchnta 256(%0)\n"
 		: : "r" (from) );
-		
-	
+        /*
+           This algorithm is top effective when the code consequently
+           reads and writes blocks which have size of cache line.
+           Size of cache line is processor-dependent.
+           It will, however, be a minimum of 32 bytes on any processors.
+           It would be better to have a number of instructions which
+           perform reading and writing to be multiple to a number of
+           processor's decoders, but it's not always possible.
+        */
 	for(; i>0; i--)
 	{
 		__asm__ __volatile__ (
-		"1:  prefetchnta 320(%0)\n"
-		"2:  movq (%0), %%mm0\n"
-		"  movq 8(%0), %%mm1\n"
-		"  movq 16(%0), %%mm2\n"
-		"  movq 24(%0), %%mm3\n"
-		"  movntq %%mm0, (%1)\n"
-		"  movntq %%mm1, 8(%1)\n"
-		"  movntq %%mm2, 16(%1)\n"
-		"  movntq %%mm3, 24(%1)\n"
-		"  movq 32(%0), %%mm0\n"
-		"  movq 40(%0), %%mm1\n"
-		"  movq 48(%0), %%mm2\n"
-		"  movq 56(%0), %%mm3\n"
-		"  movntq %%mm0, 32(%1)\n"
-		"  movntq %%mm1, 40(%1)\n"
-		"  movntq %%mm2, 48(%1)\n"
-		"  movntq %%mm3, 56(%1)\n"
-#if 0		
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 1b, 3b\n"
-		".previous"
-#endif		
-		: : "r" (from), "r" (to) : "memory");
+		"prefetchnta 320(%0)\n"
+#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
+		"movups (%0), %%xmm0\n"
+		"movups 16(%0), %%xmm1\n"
+		"movntps %%xmm0, (%1)\n"
+		"movntps %%xmm1, 16(%1)\n"
+		"movups 32(%0), %%xmm0\n"
+		"movups 48(%0), %%xmm1\n"
+		"movntps %%xmm0, 32(%1)\n"
+		"movntps %%xmm1, 48(%1)\n"
+#else /* Only K7 (may be other) */ 
+		"movq (%0), %%mm0\n"
+		"movq 8(%0), %%mm1\n"
+		"movq 16(%0), %%mm2\n"
+		"movq 24(%0), %%mm3\n"
+		"movntq %%mm0, (%1)\n"
+		"movntq %%mm1, 8(%1)\n"
+		"movntq %%mm2, 16(%1)\n"
+		"movntq %%mm3, 24(%1)\n"
+		"movq 32(%0), %%mm0\n"
+		"movq 40(%0), %%mm1\n"
+		"movq 48(%0), %%mm2\n"
+		"movq 56(%0), %%mm3\n"
+		"movntq %%mm0, 32(%1)\n"
+		"movntq %%mm1, 40(%1)\n"
+		"movntq %%mm2, 48(%1)\n"
+		"movntq %%mm3, 56(%1)\n"
+#endif
+		:: "r" (from), "r" (to) : "memory");
 		from+=64;
 		to+=64;
 	}
-	        __asm__ __volatile__ ("emms":::"memory");
+        __asm__ __volatile__ ("emms":::"memory");
 	}
 	/*
 	 *	Now do the tail of the block

====== EOF =========


_______________________________________________
Mplayer-users mailing list
Mplayer-users at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-users



More information about the MPlayer-users mailing list