[Mplayer-users] [PATCH]: mplayer-20010408 MMX2 support

Nick Kurshev nickols_k at mail.ru
Wed Apr 11 15:28:50 CEST 2001


Hello!

I did some experiments and found that mplayer can be sped up slightly by using
my version of memcpy.
I have a Duron processor, and it speeds up video stream encoding by 2-3%
when the -vo sdl option is used.
Unfortunately, this version is effective, and can be used, only on K7, P3 and newer processors.
I took this code from Linux-2.4.3 and slightly modified it.
Because mplayer uses weakly ordered data, the Linux memcpy3d without modifications
only loads the CPU.
In general, all memcpy calls in the project should be replaced with the new version.
This can be done by simply including "mmx.h" in each C source file, but it requires
changing the structure of the entire project.
Please tell me if you want to apply these changes to your project.


Best regards! Nick

Below is the patch against mplayer as of 08 Apr 2001:

diff -u -N -r ./main/configure ./main.new/configure
--- ./main/configure	Fri Apr  6 00:03:26 2001
+++ ./main.new/configure	Wed Apr 11 12:59:46 2001
@@ -70,6 +70,7 @@
 params:
         --cc                    use this C compiler to build MPlayer [gcc]
         --enable-mmx            build with mmx support [autodetect]
+        --enable-mmx2           build with mmx2 support (PIII, Athlon) [autodetect]
         --enable-3dnow          build with 3dnow! support [autodetect]
         --enable-sse            build with sse support [autodetect]
         --enable-gl             build with OpenGL render support [autodetect]
@@ -153,6 +154,7 @@
 pstepping=`cat /proc/cpuinfo | grep 'stepping' | cut -d ':' -f 2 | cut -d ' ' -f 2`
 
 _mmx=no
+_mmx2=no
 _3dnow=no
 _mtrr=no
 _sse=no
@@ -192,6 +194,9 @@
   mmx)
         _mmx=yes
         ;;
+  mmxext)
+        _mmx2=yes
+        ;;
   mtrr)
         _mtrr=yes
         ;;
@@ -442,6 +447,9 @@
   --enable-mmx)
         _mmx=yes
         ;;
+  --enable-mmx2)
+        _mmx2=yes
+        ;;
   --enable-mtrr)
   	_mtrr=yes
 	;;
@@ -504,6 +512,7 @@
         ;;
   --disable-mmx)
         _mmx=no
+	_mmx2=no
         ;;
   --disable-mtrr)
   	_mtrr=no
@@ -571,6 +580,7 @@
 echo "Checking for cpu type ... $pname"
 echo "Optimizing to ... $proc"
 echo "Checking for mmx support ... $_mmx"
+echo "Checking for mmx2 support ... $_mmx2"
 echo "Checking for 3dnow support ... $_3dnow"
 echo "Checking for sse support ... $_sse"
 echo "Checking for mtrr support ... $_mtrr"
@@ -669,6 +679,12 @@
  _mmx='#undef HAVE_MMX'
 fi
 
+if [ "$_mmx2" = "yes" ]; then
+ _mmx2='#define HAVE_MMX2'
+else
+ _mmx2='#undef HAVE_MMX2'
+fi
+
 if [ $_3dnow = yes ]; then
  _3dnowm='#define HAVE_3DNOW'
 else
@@ -845,6 +861,7 @@
 $_mlib     // available only on solaris
 $_3dnowm   // only define if you have 3DNOW (AMD k6-2, AMD Athlon, iDT WinChip, etc.)
 $_mmx      // only define if you have MMX
+$_mmx2     // only define if you have MMX2
 $_ssem     // only define if you have SSE (Intel Pentium III or Celeron II)
 
 /* libvo options */
diff -u -N -r ./main/libvo/mmx.h ./main.new/libvo/mmx.h
--- ./main/libvo/mmx.h	Sat Feb 24 23:29:56 2001
+++ ./main.new/libvo/mmx.h	Wed Apr 11 13:08:25 2001
@@ -27,6 +27,108 @@
 #ifndef _MMX_H
 #define _MMX_H
 
+/*
+ This part of the code was taken from Linux-2.4.3 and slightly modified
+for the MMX2 instruction set. I did this because Linux uses page-aligned
+blocks but mplayer uses weakly ordered data, and the original sources cannot
+speed such copies up. Only using prefetch and movntq together has an effect!
+If you have questions, please contact me: Nick Kurshev: nickols_k at mail.ru.
+*/
+static inline void * __memcpy(void * to, const void * from, unsigned n) /* copy n bytes from 'from' to 'to' with x86 string moves */
+{ /* returns 'to'; ECX, EDI and ESI carry the dword count, destination and source into the asm */
+int d0, d1, d2; /* dummy outputs: tell the compiler that ECX, EDI and ESI are modified by the asm */
+__asm__ __volatile__(
+	"rep ; movsl\n\t" /* copy n/4 32-bit words (the count is preloaded into ECX) */
+	"testb $2,%b4\n\t" /* is bit 1 of n set? */
+	"je 1f\n\t"
+	"movsw\n" /* yes: copy two more bytes */
+	"1:\ttestb $1,%b4\n\t" /* is bit 0 of n set? */
+	"je 2f\n\t"
+	"movsb\n" /* yes: copy the final byte */
+	"2:"
+	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
+	:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) /* operand 4 is n, held in a byte-addressable register for %b4 */
+	: "memory"); /* the asm reads and writes memory the compiler cannot see */
+return (to);
+}
+
+inline static void *__memcpy_mmx2(void *to, const void *from, unsigned len)
+{
+	/*
+	 * Copy len bytes from 'from' to 'to' and return 'to'.  The regions
+	 * must not overlap.  Blocks of at least 512 bytes are moved in 64-byte
+	 * chunks with MMX loads and non-temporal movntq stores; whatever is
+	 * left (or the whole of a shorter block) is copied by __memcpy().
+	 *
+	 * No .fixup/__ex_table entries are emitted: only the Linux kernel's
+	 * fault handler reads them, so they are dead weight in user space.
+	 * prefetchnta is used rather than the 3DNow! "prefetch" because it is
+	 * part of the same MMX2/SSE extension set as movntq and sfence.
+	 */
+	unsigned char *dst = to;
+	const unsigned char *src = from;
+
+	if(len >= 0x200) /* 512-byte blocks */
+	{
+		unsigned i = len >> 6; /* number of whole 64-byte chunks */
+
+		/* Only the remainder is left for the tail copy below. */
+		len &= 63;
+
+		/* Start fetching the first 320 bytes of the source. */
+		__asm__ __volatile__ (
+		"prefetchnta (%0)\n"
+		"prefetchnta 64(%0)\n"
+		"prefetchnta 128(%0)\n"
+		"prefetchnta 192(%0)\n"
+		"prefetchnta 256(%0)\n"
+		: : "r" (src) );
+
+		for(; i>0; i--)
+		{
+			__asm__ __volatile__ (
+			"prefetchnta 320(%0)\n"
+			"movq (%0), %%mm0\n"
+			"movq 8(%0), %%mm1\n"
+			"movq 16(%0), %%mm2\n"
+			"movq 24(%0), %%mm3\n"
+			"movntq %%mm0, (%1)\n"
+			"movntq %%mm1, 8(%1)\n"
+			"movntq %%mm2, 16(%1)\n"
+			"movntq %%mm3, 24(%1)\n"
+			"movq 32(%0), %%mm0\n"
+			"movq 40(%0), %%mm1\n"
+			"movq 48(%0), %%mm2\n"
+			"movq 56(%0), %%mm3\n"
+			"movntq %%mm0, 32(%1)\n"
+			"movntq %%mm1, 40(%1)\n"
+			"movntq %%mm2, 48(%1)\n"
+			"movntq %%mm3, 56(%1)\n"
+			: : "r" (src), "r" (dst) : "memory");
+			src+=64;
+			dst+=64;
+		}
+		/* movntq stores are weakly ordered: fence them before returning. */
+		__asm__ __volatile__ ("sfence":::"memory");
+		/* Hand the register file back to the x87 FPU. */
+		__asm__ __volatile__ ("emms":::"memory");
+	}
+	/*
+	 *	Now do the tail of the block
+	 */
+	__memcpy(dst, src, len);
+	return to;
+}
+
+inline static void * memcpy(void * to, const void * from, unsigned n)
+{	/* Compile-time dispatch: the MMX2 copy when HAVE_MMX2 is defined, the plain string-move copy otherwise. */
+#ifndef HAVE_MMX2
+	return __memcpy(to, from, n);
+#else
+	return __memcpy_mmx2(to, from, n);
+#endif /* HAVE_MMX2 */
+}	/* NOTE(review): shadows libc memcpy and takes unsigned, not size_t -- confirm includers tolerate this alongside <string.h> */
+
 
 /*	Warning:  at this writing, the version of GAS packaged
 	with most Linux distributions does not handle the
diff -u -N -r ./main/libvo/vo_3dfx.c ./main.new/libvo/vo_3dfx.c
--- ./main/libvo/vo_3dfx.c	Sun Mar  4 00:46:39 2001
+++ ./main.new/libvo/vo_3dfx.c	Wed Apr 11 12:45:55 2001
@@ -49,6 +49,8 @@
 
 #include "drivers/3dfx.h"
 
+#include "mmx.h"
+
 static vo_info_t vo_info = 
 {
 	"3dfx (/dev/3dfx)",
diff -u -N -r ./main/libvo/vo_fbdev.c ./main.new/libvo/vo_fbdev.c
--- ./main/libvo/vo_fbdev.c	Sun Apr  8 03:05:14 2001
+++ ./main.new/libvo/vo_fbdev.c	Wed Apr 11 12:45:43 2001
@@ -23,6 +23,8 @@
 
 #include "yuv2rgb.h"
 
+#include "mmx.h"
+
 LIBVO_EXTERN(fbdev)
 
 //#include "yuv2rgb.h"
diff -u -N -r ./main/libvo/vo_odivx.c ./main.new/libvo/vo_odivx.c
--- ./main/libvo/vo_odivx.c	Sun Mar 11 05:43:24 2001
+++ ./main.new/libvo/vo_odivx.c	Wed Apr 11 12:45:33 2001
@@ -19,6 +19,8 @@
 
 #include "../encore/encore.h"
 
+#include "mmx.h"
+
 static vo_info_t vo_info = 
 {
 	"OpenDivX AVI File writer",
diff -u -N -r ./main/libvo/vo_sdl.c ./main.new/libvo/vo_sdl.c
--- ./main/libvo/vo_sdl.c	Sun Apr  1 04:08:26 2001
+++ ./main.new/libvo/vo_sdl.c	Wed Apr 11 11:30:07 2001
@@ -61,6 +61,8 @@
 #include "video_out.h"
 #include "video_out_internal.h"
 
+
+#include "mmx.h"
 LIBVO_EXTERN(sdl)
 
 //#include "log.h"
diff -u -N -r ./main/libvo/vo_syncfb.c ./main.new/libvo/vo_syncfb.c
--- ./main/libvo/vo_syncfb.c	Sun Mar  4 00:46:39 2001
+++ ./main.new/libvo/vo_syncfb.c	Wed Apr 11 12:45:17 2001
@@ -43,6 +43,8 @@
 
 #include "drivers/syncfb/syncfb.h"
 
+#include "mmx.h"
+
 static vo_info_t vo_info =
 {
 	"Matrox G200/G400 Synchronous framebuffer (/dev/syncfb)",
diff -u -N -r ./main/libvo/vo_x11.c ./main.new/libvo/vo_x11.c
--- ./main/libvo/vo_x11.c	Sun Apr  8 01:27:57 2001
+++ ./main.new/libvo/vo_x11.c	Wed Apr 11 12:45:03 2001
@@ -36,6 +36,8 @@
 
 #include "x11_common.h"
 
+#include "mmx.h"
+
 static vo_info_t vo_info =
 {
         "X11 ( XImage/Shm )",
diff -u -N -r ./main/libvo/vo_xv.c ./main.new/libvo/vo_xv.c
--- ./main/libvo/vo_xv.c	Thu Mar 29 20:06:36 2001
+++ ./main.new/libvo/vo_xv.c	Wed Apr 11 11:20:33 2001
@@ -28,6 +28,8 @@
 
 #include "x11_common.h"
 
+#include "mmx.h"
+
 static vo_info_t vo_info =
 {
         "X11/Xv",



_______________________________________________
Mplayer-users mailing list
Mplayer-users at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-users



More information about the MPlayer-users mailing list