[MPlayer-dev-eng] [PATCH] SSE2-optimized libmpeg2 motion compensation
jserv at linux2.cc.ntu.edu.tw
Wed Jun 14 13:39:07 CEST 2006
Hello list,
Recently, I implemented SSE2-optimized motion compensation for libmpeg2,
and I think it might be useful to MPlayer. I have attached the patch to
this mail.
The performance gain over the original MMXext (MMX2)-based motion
compensation implementation in libmpeg2 is as follows:
Sample
======
MPEG-PS file format detected.
VIDEO: MPEG2 720x480 (aspect 2) 29.970 fps 2376.0 kbps (297.0 kbyte/s)
MMXext-based
============
% cumulative self self total
time seconds seconds calls ms/call ms/call name
15.58 12.83 12.83 3013243 0.00 0.00 mmxext_idct
14.70 24.94 12.11 515616 0.02 0.02 MC_put_xy_16_mmxext
12.40 35.15 10.21 56139 0.18 0.18 fast_memcpy
6.69 40.66 5.51 1826892 0.00 0.00 slice_intra_DCT
6.34 45.88 5.22 1500758 0.00 0.00 MC_put_o_8_mmxext
5.44 50.36 4.48 1758006 0.00 0.00 get_non_intra_block
3.95 53.61 3.25 30480 0.11 2.16 mpeg2_slice
3.28 56.31 2.70 1069690 0.00 0.03 motion_fr_frame_420
3.21 58.95 2.64 112816 0.02 0.02 MC_avg_xy_16_mmxext
3.14 61.54 2.59 1826892 0.00 0.01 mpeg2_idct_copy_mmxext
SSE2-based
==========
% cumulative self self total
time seconds seconds calls ms/call ms/call name
16.32 13.24 13.24 3013243 0.00 0.00 mmxext_idct
12.43 23.33 10.09 56139 0.18 0.18 fast_memcpy
10.67 31.99 8.66 515616 0.02 0.02 MC_put_xy_16_sse2
7.16 37.80 5.81 1826892 0.00 0.00 slice_intra_DCT
6.85 43.36 5.56 1500758 0.00 0.00 MC_put_o_8_sse2
5.73 48.01 4.65 1758006 0.00 0.00 get_non_intra_block
4.97 52.04 4.03 30480 0.13 2.10 mpeg2_slice
3.27 54.69 2.65 1069690 0.00 0.03 motion_fr_frame_420
3.24 57.32 2.63 205987 0.01 0.01 MC_put_x_16_sse2
3.11 59.84 2.52 1826892 0.00 0.01 mpeg2_idct_copy_mmxext
2.93 62.22 2.38 112816 0.02 0.02 MC_avg_xy_16_sse2
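For this clip the biggest single win is the xy (half-pel in both axes)
case: MC_put_xy_16 self time drops from 12.11 s with MMXext to 8.66 s
with SSE2 for the same 515616 calls (about 28% less), and MC_avg_xy_16
goes from 2.64 s to 2.38 s, while MC_put_o_8 is marginally slower here
(5.22 s vs 5.56 s).

For background, the half-pel cases map directly onto pavgb
(_mm_avg_epu8), which computes (a + b + 1) >> 1 in each byte lane, i.e.
the rounding MPEG-2 uses for half-pel prediction, for 16 pixels at once.
A minimal self-contained sketch of that equivalence (an illustration,
not part of the patch; compile with e.g. gcc -msse2):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t a[16], b[16], out[16];
    int i;
    for (i = 0; i < 16; i++) {
        a[i] = (uint8_t) (i * 17);
        b[i] = (uint8_t) (255 - i * 9);
    }
    /* one pavgb averages 16 pixel pairs with (a + b + 1) >> 1 rounding */
    _mm_storeu_si128((__m128i *) out,
                     _mm_avg_epu8(_mm_loadu_si128((const __m128i *) a),
                                  _mm_loadu_si128((const __m128i *) b)));
    for (i = 0; i < 16; i++)
        if (out[i] != (uint8_t) ((a[i] + b[i] + 1) >> 1)) {
            printf("mismatch in lane %d\n", i);
            return 1;
        }
    printf("pavgb == (a + b + 1) >> 1 in all 16 lanes\n");
    return 0;
}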
Best Regards,
Jim Huang (jserv)
http://jserv.sayya.org/
-------------- next part --------------
Index: libmpeg2/mpeg2_internal.h
===================================================================
--- libmpeg2/mpeg2_internal.h (revision 18701)
+++ libmpeg2/mpeg2_internal.h (working copy)
@@ -309,6 +309,7 @@ extern mpeg2_mc_t mpeg2_mc_c;
extern mpeg2_mc_t mpeg2_mc_mmx;
extern mpeg2_mc_t mpeg2_mc_mmxext;
extern mpeg2_mc_t mpeg2_mc_3dnow;
+extern mpeg2_mc_t mpeg2_mc_sse2;
extern mpeg2_mc_t mpeg2_mc_altivec;
extern mpeg2_mc_t mpeg2_mc_alpha;
extern mpeg2_mc_t mpeg2_mc_vis;
Index: libmpeg2/cpu_accel.c
===================================================================
--- libmpeg2/cpu_accel.c (revision 18701)
+++ libmpeg2/cpu_accel.c (working copy)
@@ -26,6 +26,7 @@
*/
#include "config.h"
+#include "cpudetect.h"
#include <inttypes.h>
@@ -37,78 +38,22 @@
#if defined(ARCH_X86) || defined(ARCH_X86_64)
static inline uint32_t arch_accel (void)
{
- uint32_t eax, ebx, ecx, edx;
- int AMD;
- uint32_t caps;
-
-#if !defined(PIC) && !defined(__PIC__)
-#define cpuid(op,eax,ebx,ecx,edx) \
- __asm__ ("cpuid" \
- : "=a" (eax), \
- "=b" (ebx), \
- "=c" (ecx), \
- "=d" (edx) \
- : "a" (op) \
- : "cc")
-#else /* PIC version : save ebx */
-#define cpuid(op,eax,ebx,ecx,edx) \
- __asm__ ("push %%ebx\n\t" \
- "cpuid\n\t" \
- "movl %%ebx,%1\n\t" \
- "pop %%ebx" \
- : "=a" (eax), \
- "=r" (ebx), \
- "=c" (ecx), \
- "=d" (edx) \
- : "a" (op) \
- : "cc")
-#endif
-
- __asm__ ("pushf\n\t"
- "pushf\n\t"
- "pop %0\n\t"
- "movl %0,%1\n\t"
- "xorl $0x200000,%0\n\t"
- "push %0\n\t"
- "popf\n\t"
- "pushf\n\t"
- "pop %0\n\t"
- "popf"
- : "=r" (eax),
- "=r" (ebx)
- :
- : "cc");
-
- if (eax == ebx) /* no cpuid */
- return 0;
-
- cpuid (0x00000000, eax, ebx, ecx, edx);
- if (!eax) /* vendor string only */
- return 0;
-
- AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
-
- cpuid (0x00000001, eax, ebx, ecx, edx);
- if (! (edx & 0x00800000)) /* no MMX */
- return 0;
-
- caps = MPEG2_ACCEL_X86_MMX;
- if (edx & 0x02000000) /* SSE - identical to AMD MMX extensions */
- caps = MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
-
- cpuid (0x80000000, eax, ebx, ecx, edx);
- if (eax < 0x80000001) /* no extended capabilities */
- return caps;
-
- cpuid (0x80000001, eax, ebx, ecx, edx);
-
- if (edx & 0x80000000)
- caps |= MPEG2_ACCEL_X86_3DNOW;
-
- if (AMD && (edx & 0x00400000)) /* AMD MMX extensions */
- caps |= MPEG2_ACCEL_X86_MMXEXT;
-
- return caps;
+#if defined(HAVE_SSE2)
+ if (gCpuCaps.hasSSE2) {
+ return MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT | MPEG2_ACCEL_X86_SSE2;
+ }
+#endif
+#if defined(HAVE_MMX2) || defined(HAVE_SSE)
+ if (gCpuCaps.hasSSE || gCpuCaps.hasMMX2) {
+ return MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
+ }
+#endif
+#if defined(HAVE_MMX)
+ if (gCpuCaps.hasMMX) {
+ return MPEG2_ACCEL_X86_MMX | (gCpuCaps.has3DNow ? MPEG2_ACCEL_X86_3DNOW : 0);
+ }
+#endif
+ return 0;
}
#endif /* ARCH_X86 || ARCH_X86_64 */
Index: libmpeg2/motion_comp_sse2.c
===================================================================
--- libmpeg2/motion_comp_sse2.c (revision 0)
+++ libmpeg2/motion_comp_sse2.c (revision 0)
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2006
+ * Jim Huang <jserv.tw at gmail.com>
+ *
+ * This Program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This Program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * Based on Intel's AP-942
+ */
+
+#include "config.h"
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#ifdef HAVE_SSE2
+
+#include <inttypes.h>
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+
+#ifdef __GNUC__
+ #ifndef __forceinline
+ #define __forceinline __attribute__((__always_inline__)) inline
+ #endif
+#endif
+
+#ifdef __GNUC__
+ #define __inline __forceinline /* force GCC to inline the intrinsic functions */
+#endif
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#ifdef __GNUC__
+ #undef __inline
+#endif
+
+#ifdef __GNUC__
+ #define __align8(t,v) t v __attribute__ ((aligned (8)))
+ #define __align16(t,v) t v __attribute__ ((aligned (16)))
+#else
+ #define __align8(t,v) __declspec(align(8)) t v
+ #define __align16(t,v) __declspec(align(16)) t v
+#endif
+
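+/* A 1 in each of the 16 bytes; subtracted from the middle average in the
+ * xy cases to reduce the upward bias of rounding up in two chained pavgb
+ * steps (cf. Intel AP-942). */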
+static __m128i const_1_16_bytes;
+static void __attribute__((constructor)) mpeg2_MC_sse_ctor()
+{
+ const_1_16_bytes = _mm_set1_epi8(1);
+}
+
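+/* Parameter names follow x86 register conventions: ecx = dest, edx = ref,
+ * eax = stride, esi = height; edi holds stride * 2. */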
+static void MC_put_o_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_put_o_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128d xmm0;
+ xmm0 = _mm_loadl_pd(xmm0, (double*) edx);
+ xmm0 = _mm_loadh_pd(xmm0, (double*) (edx + eax));
+ _mm_storel_pd((double*) ecx, xmm0);
+ _mm_storeh_pd((double*) (ecx + eax), xmm0);
+ }
+}
+
+static void MC_put_x_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi= eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_put_x_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ __m128i xmm0,xmm1;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_put_y_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi= eax + eax;
+ __m128i xmm0;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
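+ /* the bottom row stays in xmm0, so each source row is loaded only once */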
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm1,xmm2;
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm1 = _mm_avg_epu8(xmm1, xmm2);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm2;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_put_y_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ __m128i xmm0;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm1;
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ }
+}
+
+static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int eax = stride;
+ int esi = height;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm1, xmm4, xmm5, xmm2, xmm3;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm1 = xmm5;
+ xmm5 = _mm_avg_epu8(xmm5, xmm4);
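+ /* decrement the middle-row average so the chained pavgb does not round up twice */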
+ xmm2 = _mm_subs_epu8(xmm2, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm4;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int eax = stride;
+ int esi = height;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm2, xmm1, xmm3;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+ xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm0 = _mm_subs_epu8(xmm0, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ xmm2 = xmm3;
+ }
+}
+
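+/* The MC_avg_* variants additionally average the prediction with the pixels
+ * already in dest, as used for bidirectional prediction. */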
+static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_load_si128((__m128i*) ecx);
+ xmm3 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm1 = _mm_avg_epu8(xmm1, xmm3);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) ecx));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm4 = _mm_load_si128((__m128i*) ecx);
+ xmm5 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm1 = _mm_avg_epu8(xmm1, xmm2);
+ xmm3 = _mm_load_si128((__m128i*) ecx);
+ xmm4 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm3);
+ xmm1 = _mm_avg_epu8(xmm1, xmm4);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm2;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ }
+}
+
+static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm1 = xmm5;
+ xmm5 = _mm_avg_epu8(xmm5, xmm4);
+ xmm2 = _mm_subs_epu8(xmm2, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ xmm5 = _mm_load_si128((__m128i*) ecx);
+ xmm6 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm5);
+ xmm2 = _mm_avg_epu8(xmm2, xmm6);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm4;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm2, xmm1, xmm3, xmm4;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+ xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm0 = _mm_subs_epu8(xmm0, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm4 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm4), (double*) ecx));
+ xmm4 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm4), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ xmm2 = xmm3;
+ }
+}
+
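+/* Expands to the mpeg2_mc_sse2 function table declared in mpeg2_internal.h. */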
+MPEG2_MC_EXTERN (sse2)
+
+#endif
+#endif
Index: libmpeg2/mpeg2.h
===================================================================
--- libmpeg2/mpeg2.h (revision 18701)
+++ libmpeg2/mpeg2.h (working copy)
@@ -159,6 +159,7 @@ void mpeg2_custom_fbuf (mpeg2dec_t * mpe
#define MPEG2_ACCEL_X86_MMX 1
#define MPEG2_ACCEL_X86_3DNOW 2
#define MPEG2_ACCEL_X86_MMXEXT 4
+#define MPEG2_ACCEL_X86_SSE2 8
#define MPEG2_ACCEL_PPC_ALTIVEC 1
#define MPEG2_ACCEL_ALPHA 1
#define MPEG2_ACCEL_ALPHA_MVI 2
Index: libmpeg2/motion_comp.c
===================================================================
--- libmpeg2/motion_comp.c (revision 18701)
+++ libmpeg2/motion_comp.c (working copy)
@@ -38,6 +38,13 @@ mpeg2_mc_t mpeg2_mc;
void mpeg2_mc_init (uint32_t accel)
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#if defined(HAVE_SSE2)
+ if (accel & MPEG2_ACCEL_X86_SSE2) {
+ mpeg2_mc = mpeg2_mc_sse2;
+ return;
+ }
+ else
+#endif
if (accel & MPEG2_ACCEL_X86_MMXEXT)
mpeg2_mc = mpeg2_mc_mmxext;
else if (accel & MPEG2_ACCEL_X86_3DNOW)
Index: libmpeg2/Makefile
===================================================================
--- libmpeg2/Makefile (revision 18701)
+++ libmpeg2/Makefile (working copy)
@@ -24,6 +24,10 @@ ifeq ($(TARGET_ARCH_X86_64),yes)
SRCS += idct_mmx.c motion_comp_mmx.c
endif
+ifeq ($(TARGET_SSE),yes)
+SRCS += motion_comp_sse2.c
+endif
+
ifeq ($(TARGET_ALTIVEC),yes)
SRCS += motion_comp_altivec.c idct_altivec.c
endif