[MPlayer-dev-eng] [PATCH] SSE2-optimized libmpeg2 motion compensation
jserv at linux2.cc.ntu.edu.tw
Wed Jun 14 13:39:07 CEST 2006
Hello list,
Recently, I implemented SSE2-optimized motion compensation for libmpeg2,
and I think it might be useful to MPlayer. I have attached the patch to
this mail.
The performance gain over the original MMXext (MMX2)-based motion
compensation implementation in libmpeg2 is as follows:
Sample
======
MPEG-PS file format detected.
VIDEO: MPEG2 720x480 (aspect 2) 29.970 fps 2376.0 kbps (297.0 kbyte/s)
MMXext-based
============
% cumulative self self total
time seconds seconds calls ms/call ms/call name
15.58 12.83 12.83 3013243 0.00 0.00 mmxext_idct
14.70 24.94 12.11 515616 0.02 0.02 MC_put_xy_16_mmxext
12.40 35.15 10.21 56139 0.18 0.18 fast_memcpy
6.69 40.66 5.51 1826892 0.00 0.00 slice_intra_DCT
6.34 45.88 5.22 1500758 0.00 0.00 MC_put_o_8_mmxext
5.44 50.36 4.48 1758006 0.00 0.00 get_non_intra_block
3.95 53.61 3.25 30480 0.11 2.16 mpeg2_slice
3.28 56.31 2.70 1069690 0.00 0.03 motion_fr_frame_420
3.21 58.95 2.64 112816 0.02 0.02 MC_avg_xy_16_mmxext
3.14 61.54 2.59 1826892 0.00 0.01 mpeg2_idct_copy_mmxext
SSE2-based
==========
% cumulative self self total
time seconds seconds calls ms/call ms/call name
16.32 13.24 13.24 3013243 0.00 0.00 mmxext_idct
12.43 23.33 10.09 56139 0.18 0.18 fast_memcpy
10.67 31.99 8.66 515616 0.02 0.02 MC_put_xy_16_sse2
7.16 37.80 5.81 1826892 0.00 0.00 slice_intra_DCT
6.85 43.36 5.56 1500758 0.00 0.00 MC_put_o_8_sse2
5.73 48.01 4.65 1758006 0.00 0.00 get_non_intra_block
4.97 52.04 4.03 30480 0.13 2.10 mpeg2_slice
3.27 54.69 2.65 1069690 0.00 0.03 motion_fr_frame_420
3.24 57.32 2.63 205987 0.01 0.01 MC_put_x_16_sse2
3.11 59.84 2.52 1826892 0.00 0.01 mpeg2_idct_copy_mmxext
2.93 62.22 2.38 112816 0.02 0.02 MC_avg_xy_16_sse2
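For this clip the biggest single win is the xy (half-pel in both axes)
case: MC_put_xy_16 self time drops from 12.11 s with MMXext to 8.66 s
with SSE2 for the same 515616 calls (about 28% less), and MC_avg_xy_16
goes from 2.64 s to 2.38 s, while MC_put_o_8 is marginally slower here
(5.22 s vs 5.56 s).

For background, the half-pel cases map directly onto pavgb
(_mm_avg_epu8), which computes (a + b + 1) >> 1 in each byte lane, i.e.
the rounding MPEG-2 uses for half-pel prediction, for 16 pixels at once.
A minimal self-contained sketch of that equivalence (an illustration,
not part of the patch; compile with e.g. gcc -msse2):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t a[16], b[16], out[16];
    int i;
    for (i = 0; i < 16; i++) {
        a[i] = (uint8_t) (i * 17);
        b[i] = (uint8_t) (255 - i * 9);
    }
    /* one pavgb averages 16 pixel pairs with (a + b + 1) >> 1 rounding */
    _mm_storeu_si128((__m128i *) out,
                     _mm_avg_epu8(_mm_loadu_si128((const __m128i *) a),
                                  _mm_loadu_si128((const __m128i *) b)));
    for (i = 0; i < 16; i++)
        if (out[i] != (uint8_t) ((a[i] + b[i] + 1) >> 1)) {
            printf("mismatch in lane %d\n", i);
            return 1;
        }
    printf("pavgb == (a + b + 1) >> 1 in all 16 lanes\n");
    return 0;
}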
Best Regards,
Jim Huang (jserv)
http://jserv.sayya.org/
-------------- next part --------------
Index: libmpeg2/mpeg2_internal.h
===================================================================
--- libmpeg2/mpeg2_internal.h (revision 18701)
+++ libmpeg2/mpeg2_internal.h (working copy)
@@ -309,6 +309,7 @@ extern mpeg2_mc_t mpeg2_mc_c;
extern mpeg2_mc_t mpeg2_mc_mmx;
extern mpeg2_mc_t mpeg2_mc_mmxext;
extern mpeg2_mc_t mpeg2_mc_3dnow;
+extern mpeg2_mc_t mpeg2_mc_sse2;
extern mpeg2_mc_t mpeg2_mc_altivec;
extern mpeg2_mc_t mpeg2_mc_alpha;
extern mpeg2_mc_t mpeg2_mc_vis;
Index: libmpeg2/cpu_accel.c
===================================================================
--- libmpeg2/cpu_accel.c (revision 18701)
+++ libmpeg2/cpu_accel.c (working copy)
@@ -26,6 +26,7 @@
*/
#include "config.h"
+#include "cpudetect.h"
#include <inttypes.h>
@@ -37,78 +38,22 @@
#if defined(ARCH_X86) || defined(ARCH_X86_64)
static inline uint32_t arch_accel (void)
{
- uint32_t eax, ebx, ecx, edx;
- int AMD;
- uint32_t caps;
-
-#if !defined(PIC) && !defined(__PIC__)
-#define cpuid(op,eax,ebx,ecx,edx) \
- __asm__ ("cpuid" \
- : "=a" (eax), \
- "=b" (ebx), \
- "=c" (ecx), \
- "=d" (edx) \
- : "a" (op) \
- : "cc")
-#else /* PIC version : save ebx */
-#define cpuid(op,eax,ebx,ecx,edx) \
- __asm__ ("push %%ebx\n\t" \
- "cpuid\n\t" \
- "movl %%ebx,%1\n\t" \
- "pop %%ebx" \
- : "=a" (eax), \
- "=r" (ebx), \
- "=c" (ecx), \
- "=d" (edx) \
- : "a" (op) \
- : "cc")
-#endif
-
- __asm__ ("pushf\n\t"
- "pushf\n\t"
- "pop %0\n\t"
- "movl %0,%1\n\t"
- "xorl $0x200000,%0\n\t"
- "push %0\n\t"
- "popf\n\t"
- "pushf\n\t"
- "pop %0\n\t"
- "popf"
- : "=r" (eax),
- "=r" (ebx)
- :
- : "cc");
-
- if (eax == ebx) /* no cpuid */
- return 0;
-
- cpuid (0x00000000, eax, ebx, ecx, edx);
- if (!eax) /* vendor string only */
- return 0;
-
- AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
-
- cpuid (0x00000001, eax, ebx, ecx, edx);
- if (! (edx & 0x00800000)) /* no MMX */
- return 0;
-
- caps = MPEG2_ACCEL_X86_MMX;
- if (edx & 0x02000000) /* SSE - identical to AMD MMX extensions */
- caps = MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
-
- cpuid (0x80000000, eax, ebx, ecx, edx);
- if (eax < 0x80000001) /* no extended capabilities */
- return caps;
-
- cpuid (0x80000001, eax, ebx, ecx, edx);
-
- if (edx & 0x80000000)
- caps |= MPEG2_ACCEL_X86_3DNOW;
-
- if (AMD && (edx & 0x00400000)) /* AMD MMX extensions */
- caps |= MPEG2_ACCEL_X86_MMXEXT;
-
- return caps;
+#if defined(HAVE_SSE2)
+ if (gCpuCaps.hasSSE2) {
+ return MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT | MPEG2_ACCEL_X86_SSE2;
+ }
+#endif
+#if defined(HAVE_MMX2) || defined(HAVE_SSE)
+ if (gCpuCaps.hasSSE || gCpuCaps.hasMMX2) {
+ return MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
+ }
+#endif
+#if defined(HAVE_MMX)
+ if (gCpuCaps.hasMMX) {
+ return MPEG2_ACCEL_X86_MMX | (gCpuCaps.has3DNow ? MPEG2_ACCEL_X86_3DNOW : 0);
+ }
+#endif
+ return 0;
}
#endif /* ARCH_X86 || ARCH_X86_64 */
Index: libmpeg2/motion_comp_sse2.c
===================================================================
--- libmpeg2/motion_comp_sse2.c (revision 0)
+++ libmpeg2/motion_comp_sse2.c (revision 0)
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2006
+ * Jim Huang <jserv.tw at gmail.com>
+ *
+ * This Program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This Program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * Based on Intel's AP-942
+ */
+
+#include "config.h"
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#ifdef HAVE_SSE2
+
+#include <inttypes.h>
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+
+#ifdef __GNUC__
+ #ifndef __forceinline
+ #define __forceinline __attribute__((__always_inline__)) inline
+ #endif
+#endif
+
+#ifdef __GNUC__
+ #define __inline __forceinline /* force GCC to inline the intrinsic functions */
+#endif
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#ifdef __GNUC__
+ #undef __inline
+#endif
+
+#ifdef __GNUC__
+ #define __align8(t,v) t v __attribute__ ((aligned (8)))
+ #define __align16(t,v) t v __attribute__ ((aligned (16)))
+#else
+ #define __align8(t,v) __declspec(align(8)) t v
+ #define __align16(t,v) __declspec(align(16)) t v
+#endif
+
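+/* A 1 in each of the 16 bytes; subtracted from the middle average in the
+ * xy cases to reduce the upward bias of rounding up in two chained pavgb
+ * steps (cf. Intel AP-942). */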
+static __m128i const_1_16_bytes;
+static void __attribute__((constructor)) mpeg2_MC_sse_ctor()
+{
+ const_1_16_bytes = _mm_set1_epi8(1);
+}
+
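+/* Parameter names follow x86 register conventions: ecx = dest, edx = ref,
+ * eax = stride, esi = height; edi holds stride * 2. */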
+static void MC_put_o_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_put_o_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128d xmm0;
+ xmm0 = _mm_loadl_pd(xmm0, (double*) edx);
+ xmm0 = _mm_loadh_pd(xmm0, (double*) (edx + eax));
+ _mm_storel_pd((double*) ecx, xmm0);
+ _mm_storeh_pd((double*) (ecx + eax), xmm0);
+ }
+}
+
+static void MC_put_x_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi= eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_put_x_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ __m128i xmm0,xmm1;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_put_y_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi= eax + eax;
+ __m128i xmm0;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
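+ /* the bottom row stays in xmm0, so each source row is loaded only once */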
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm1,xmm2;
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm1 = _mm_avg_epu8(xmm1, xmm2);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm2;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_put_y_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ __m128i xmm0;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm1;
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ }
+}
+
+static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int eax = stride;
+ int esi = height;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm1, xmm4, xmm5, xmm2, xmm3;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm1 = xmm5;
+ xmm5 = _mm_avg_epu8(xmm5, xmm4);
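+ /* decrement the middle-row average so the chained pavgb does not round up twice */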
+ xmm2 = _mm_subs_epu8(xmm2, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm4;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int eax = stride;
+ int esi = height;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm2, xmm1, xmm3;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+ xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm0 = _mm_subs_epu8(xmm0, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ xmm2 = xmm3;
+ }
+}
+
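+/* The MC_avg_* variants additionally average the prediction with the pixels
+ * already in dest, as used for bidirectional prediction. */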
+static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_load_si128((__m128i*) ecx);
+ xmm3 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm1 = _mm_avg_epu8(xmm1, xmm3);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) ecx));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm4 = _mm_load_si128((__m128i*) ecx);
+ xmm5 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm1 = _mm_avg_epu8(xmm1, xmm2);
+ xmm3 = _mm_load_si128((__m128i*) ecx);
+ xmm4 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm3);
+ xmm1 = _mm_avg_epu8(xmm1, xmm4);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm2;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ }
+}
+
+static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm1 = xmm5;
+ xmm5 = _mm_avg_epu8(xmm5, xmm4);
+ xmm2 = _mm_subs_epu8(xmm2, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ xmm5 = _mm_load_si128((__m128i*) ecx);
+ xmm6 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm5);
+ xmm2 = _mm_avg_epu8(xmm2, xmm6);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm4;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm2, xmm1, xmm3, xmm4;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+ xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm0 = _mm_subs_epu8(xmm0, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm4 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm4), (double*) ecx));
+ xmm4 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm4), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ xmm2 = xmm3;
+ }
+}
+
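+/* Expands to the mpeg2_mc_sse2 function table declared in mpeg2_internal.h. */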
+MPEG2_MC_EXTERN (sse2)
+
+#endif
+#endif
Index: libmpeg2/mpeg2.h
===================================================================
--- libmpeg2/mpeg2.h (revision 18701)
+++ libmpeg2/mpeg2.h (working copy)
@@ -159,6 +159,7 @@ void mpeg2_custom_fbuf (mpeg2dec_t * mpe
#define MPEG2_ACCEL_X86_MMX 1
#define MPEG2_ACCEL_X86_3DNOW 2
#define MPEG2_ACCEL_X86_MMXEXT 4
+#define MPEG2_ACCEL_X86_SSE2 8
#define MPEG2_ACCEL_PPC_ALTIVEC 1
#define MPEG2_ACCEL_ALPHA 1
#define MPEG2_ACCEL_ALPHA_MVI 2
Index: libmpeg2/motion_comp.c
===================================================================
--- libmpeg2/motion_comp.c (revision 18701)
+++ libmpeg2/motion_comp.c (working copy)
@@ -38,6 +38,13 @@ mpeg2_mc_t mpeg2_mc;
void mpeg2_mc_init (uint32_t accel)
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#if defined(HAVE_SSE2)
+ if (accel & MPEG2_ACCEL_X86_SSE2) {
+ mpeg2_mc = mpeg2_mc_sse2;
+ return;
+ }
+ else
+#endif
if (accel & MPEG2_ACCEL_X86_MMXEXT)
mpeg2_mc = mpeg2_mc_mmxext;
else if (accel & MPEG2_ACCEL_X86_3DNOW)
Index: libmpeg2/Makefile
===================================================================
--- libmpeg2/Makefile (revision 18701)
+++ libmpeg2/Makefile (working copy)
@@ -24,6 +24,10 @@ ifeq ($(TARGET_ARCH_X86_64),yes)
SRCS += idct_mmx.c motion_comp_mmx.c
endif
+ifeq ($(TARGET_SSE),yes)
+SRCS += motion_comp_sse2.c
+endif
+
ifeq ($(TARGET_ALTIVEC),yes)
SRCS += motion_comp_altivec.c idct_altivec.c
endif