[MPlayer-dev-eng] [PATCH] SSE2-optimized libmpeg2 motion compensation
jserv at linux2.cc.ntu.edu.tw
Wed Jun 14 15:57:18 CEST 2006
On Wed, Jun 14, 2006 at 01:48:30PM +0200, Guillaume POIRIER wrote:
> Hi,
Hi Guillaume,
> On 6/14/06, jserv at linux2.cc.ntu.edu.tw <jserv at linux2.cc.ntu.edu.tw> wrote:
> > Recently, I implemented SSE2-optimized libmpeg2 motion compensation, and
> > I think that it might be useful to MPlayer. I have attached the patch to
> > this mail.
>
> Quick feedback: since you are using intrinsics, you should make your
> code depend on the availability of mmintrin.h, xmmintrin.h, and
> emmintrin.h
I have updated my patch (attached); it now checks for the availability of
the MMX intrinsics via the compile-time definition HAVE_BUILTIN_VECTOR,
which is generated by the configure script.
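
For reference, the new motion_comp_sse2.c wraps all of the intrinsics code
in guards roughly like this (just a sketch of the structure; the real file
is in the attached patch, and the macros come from config.h):

    #include "config.h"
    #if defined(HAVE_BUILTIN_VECTOR)        /* intrinsic headers usable */
    #if defined(ARCH_X86) || defined(ARCH_X86_64)
    #ifdef HAVE_SSE2
    #include <mmintrin.h>
    #include <xmmintrin.h>
    #include <emmintrin.h>
    /* ... SSE2 motion compensation functions ... */
    #endif
    #endif
    #endif
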
> What CPU did you run your tests on? What compiler did you use?
> Did you try to see if your code was compiling ok with icc (intel's
> compiler)?
I forgot to mention my hardware:
$ cat /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 9
model name : Intel(R) Pentium(R) M processor 1300MHz
stepping : 5
cpu MHz : 1293.867
cache size : 1024 KB
I am using GCC 4.1.1, and the patch is also known to build with the Intel C++ compiler.
cheers,
Jim Huang (jserv)
http://jserv.sayya.org/
-------------- next part --------------
Index: libmpeg2/mpeg2_internal.h
===================================================================
--- libmpeg2/mpeg2_internal.h (revision 18701)
+++ libmpeg2/mpeg2_internal.h (working copy)
@@ -309,6 +309,7 @@ extern mpeg2_mc_t mpeg2_mc_c;
extern mpeg2_mc_t mpeg2_mc_mmx;
extern mpeg2_mc_t mpeg2_mc_mmxext;
extern mpeg2_mc_t mpeg2_mc_3dnow;
+extern mpeg2_mc_t mpeg2_mc_sse2;
extern mpeg2_mc_t mpeg2_mc_altivec;
extern mpeg2_mc_t mpeg2_mc_alpha;
extern mpeg2_mc_t mpeg2_mc_vis;
Index: libmpeg2/cpu_accel.c
===================================================================
--- libmpeg2/cpu_accel.c (revision 18701)
+++ libmpeg2/cpu_accel.c (working copy)
@@ -26,6 +26,7 @@
*/
#include "config.h"
+#include "cpudetect.h"
#include <inttypes.h>
@@ -37,78 +38,22 @@
#if defined(ARCH_X86) || defined(ARCH_X86_64)
static inline uint32_t arch_accel (void)
{
- uint32_t eax, ebx, ecx, edx;
- int AMD;
- uint32_t caps;
-
-#if !defined(PIC) && !defined(__PIC__)
-#define cpuid(op,eax,ebx,ecx,edx) \
- __asm__ ("cpuid" \
- : "=a" (eax), \
- "=b" (ebx), \
- "=c" (ecx), \
- "=d" (edx) \
- : "a" (op) \
- : "cc")
-#else /* PIC version : save ebx */
-#define cpuid(op,eax,ebx,ecx,edx) \
- __asm__ ("push %%ebx\n\t" \
- "cpuid\n\t" \
- "movl %%ebx,%1\n\t" \
- "pop %%ebx" \
- : "=a" (eax), \
- "=r" (ebx), \
- "=c" (ecx), \
- "=d" (edx) \
- : "a" (op) \
- : "cc")
-#endif
-
- __asm__ ("pushf\n\t"
- "pushf\n\t"
- "pop %0\n\t"
- "movl %0,%1\n\t"
- "xorl $0x200000,%0\n\t"
- "push %0\n\t"
- "popf\n\t"
- "pushf\n\t"
- "pop %0\n\t"
- "popf"
- : "=r" (eax),
- "=r" (ebx)
- :
- : "cc");
-
- if (eax == ebx) /* no cpuid */
- return 0;
-
- cpuid (0x00000000, eax, ebx, ecx, edx);
- if (!eax) /* vendor string only */
- return 0;
-
- AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
-
- cpuid (0x00000001, eax, ebx, ecx, edx);
- if (! (edx & 0x00800000)) /* no MMX */
- return 0;
-
- caps = MPEG2_ACCEL_X86_MMX;
- if (edx & 0x02000000) /* SSE - identical to AMD MMX extensions */
- caps = MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
-
- cpuid (0x80000000, eax, ebx, ecx, edx);
- if (eax < 0x80000001) /* no extended capabilities */
- return caps;
-
- cpuid (0x80000001, eax, ebx, ecx, edx);
-
- if (edx & 0x80000000)
- caps |= MPEG2_ACCEL_X86_3DNOW;
-
- if (AMD && (edx & 0x00400000)) /* AMD MMX extensions */
- caps |= MPEG2_ACCEL_X86_MMXEXT;
-
- return caps;
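+    /* Use MPlayer's runtime CPU detection (gCpuCaps from cpudetect.h)
+     * instead of duplicating the cpuid logic here. */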
+#if defined(HAVE_SSE2)
+ if (gCpuCaps.hasSSE2) {
+ return MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT | MPEG2_ACCEL_X86_SSE2;
+ }
+#endif
+#if defined(HAVE_MMX2) || defined(HAVE_SSE)
+    if (gCpuCaps.hasSSE || gCpuCaps.hasMMX2) {
+ return MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
+ }
+#endif
+#if defined(HAVE_MMX)
+    if (gCpuCaps.hasMMX || gCpuCaps.has3DNow || gCpuCaps.has3DNowExt) {
+ return MPEG2_ACCEL_X86_MMX;
+ }
+#endif
+ return 0;
}
#endif /* ARCH_X86 || ARCH_X86_64 */
Index: libmpeg2/motion_comp_sse2.c
===================================================================
--- libmpeg2/motion_comp_sse2.c (revision 0)
+++ libmpeg2/motion_comp_sse2.c (revision 0)
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2006
+ * Jim Huang <jserv.tw at gmail.com>
+ *
+ * This Program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This Program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * Based on Intel's AP-942
+ */
+
+#include "config.h"
+
+#if defined(HAVE_BUILTIN_VECTOR)
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#ifdef HAVE_SSE2
+
+#include <inttypes.h>
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+
+#ifdef __GNUC__
+ #ifndef __forceinline
+ #define __forceinline __attribute__((__always_inline__)) inline
+ #endif
+#endif
+
+#ifdef __GNUC__
+    #define __inline __forceinline /* force GCC to inline the intrinsic wrapper functions */
+#endif
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#ifdef __GNUC__
+ #undef __inline
+#endif
+
+#ifdef __GNUC__
+ #define __align8(t,v) t v __attribute__ ((aligned (8)))
+ #define __align16(t,v) t v __attribute__ ((aligned (16)))
+#else
+ #define __align8(t,v) __declspec(align(8)) t v
+ #define __align16(t,v) __declspec(align(16)) t v
+#endif
+
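+/* One in every byte; subtracted with _mm_subs_epu8 before the second
+ * _mm_avg_epu8 in the half-pel cases to compensate for the upward
+ * rounding bias of cascaded pavgb averages. */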
+static __m128i const_1_16_bytes;
+static void __attribute__((constructor)) mpeg2_MC_sse_ctor()
+{
+    const_1_16_bytes = _mm_set1_epi8(1);
+}
+
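+/* Naming follows the other libmpeg2 MC implementations: put/avg prediction,
+ * o = full-pel, x = horizontal half-pel, y = vertical half-pel, xy = both;
+ * 16/8 is the block width.  Each function processes two rows per loop
+ * iteration. */
+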
+static void MC_put_o_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
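+/* For 8-wide blocks two rows are packed into one 128-bit register with
+ * movlpd/movhpd (_mm_loadl_pd/_mm_loadh_pd), so the same pavgb path is
+ * used as in the 16-wide case. */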
+static void MC_put_o_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi-= 2) {
+ __m128d xmm0;
+ xmm0 = _mm_loadl_pd(xmm0, (double*) edx);
+ xmm0 = _mm_loadh_pd(xmm0, (double*) (edx + eax));
+ _mm_storel_pd((double*) ecx, xmm0);
+ _mm_storeh_pd((double*) (ecx + eax), xmm0);
+ }
+}
+
+static void MC_put_x_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi= eax + eax;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_put_x_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ __m128i xmm0,xmm1;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_put_y_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi= eax + eax;
+ __m128i xmm0;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm1,xmm2;
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm1 = _mm_avg_epu8(xmm1, xmm2);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm2;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_put_y_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+ const int edi = eax + eax;
+ __m128i xmm0;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm1;
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ }
+}
+
+static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int eax = stride;
+ int esi = height;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm1, xmm4, xmm5, xmm2, xmm3;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm1 = xmm5;
+ xmm5 = _mm_avg_epu8(xmm5, xmm4);
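+	/* correct the rounding of the cascaded averages (see const_1_16_bytes) */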
+ xmm2 = _mm_subs_epu8(xmm2, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm4;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int eax = stride;
+ int esi = height;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm2, xmm1, xmm3;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+ xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm0 = _mm_subs_epu8(xmm0, xmm7);
+	xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ xmm2 = xmm3;
+ }
+}
+
+static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ for (; esi; edx += edi, ecx += edi,esi -= 2) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_load_si128((__m128i*) ecx);
+ xmm3 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm1 = _mm_avg_epu8(xmm1, xmm3);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) ecx));
+	xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm4 = _mm_load_si128((__m128i*) ecx);
+ xmm5 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm1 = _mm_avg_epu8(xmm1, xmm2);
+ xmm3 = _mm_load_si128((__m128i*) ecx);
+ xmm4 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm3);
+ xmm1 = _mm_avg_epu8(xmm1, xmm4);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm2;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ }
+}
+
+static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ for (; esi; edx += edi, ecx += edi, esi-= 2) {
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm1 = xmm5;
+ xmm5 = _mm_avg_epu8(xmm5, xmm4);
+ xmm2 = _mm_subs_epu8(xmm2, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ xmm5 = _mm_load_si128((__m128i*) ecx);
+ xmm6 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm5);
+ xmm2 = _mm_avg_epu8(xmm2, xmm6);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm4;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm2, xmm1, xmm3, xmm4;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+ for (;esi;edx+=edi,ecx+=edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+ xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm0 = _mm_subs_epu8(xmm0, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm4 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm4), (double*) ecx));
+ xmm4 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm4), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ xmm2 = xmm3;
+ }
+}
+
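+/* MPEG2_MC_EXTERN builds the mpeg2_mc_sse2 function-pointer table
+ * (mpeg2_mc_t) from the MC_{avg,put}_{o,x,y,xy}_{16,8}_sse2 functions above. */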
+MPEG2_MC_EXTERN (sse2)
+
+#endif
+#endif
+#endif
Index: libmpeg2/mpeg2.h
===================================================================
--- libmpeg2/mpeg2.h (revision 18701)
+++ libmpeg2/mpeg2.h (working copy)
@@ -159,6 +159,7 @@ void mpeg2_custom_fbuf (mpeg2dec_t * mpe
#define MPEG2_ACCEL_X86_MMX 1
#define MPEG2_ACCEL_X86_3DNOW 2
#define MPEG2_ACCEL_X86_MMXEXT 4
+#define MPEG2_ACCEL_X86_SSE2 8
#define MPEG2_ACCEL_PPC_ALTIVEC 1
#define MPEG2_ACCEL_ALPHA 1
#define MPEG2_ACCEL_ALPHA_MVI 2
Index: libmpeg2/motion_comp.c
===================================================================
--- libmpeg2/motion_comp.c (revision 18701)
+++ libmpeg2/motion_comp.c (working copy)
@@ -26,6 +26,7 @@
*/
#include "config.h"
+#include "cpudetect.h"
#include <inttypes.h>
@@ -38,6 +39,14 @@ mpeg2_mc_t mpeg2_mc;
void mpeg2_mc_init (uint32_t accel)
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#if defined(HAVE_SSE2) && defined(HAVE_BUILTIN_VECTOR)
+ /* SSE2-optimized MC depends on MMX intrinsics. */
+ if (gCpuCaps.hasSSE2 || (accel & MPEG2_ACCEL_X86_SSE2)) {
+ mpeg2_mc = mpeg2_mc_sse2;
+ return;
+ }
+ else
+#endif
if (accel & MPEG2_ACCEL_X86_MMXEXT)
mpeg2_mc = mpeg2_mc_mmxext;
else if (accel & MPEG2_ACCEL_X86_3DNOW)
Index: libmpeg2/Makefile
===================================================================
--- libmpeg2/Makefile (revision 18701)
+++ libmpeg2/Makefile (working copy)
@@ -24,6 +24,10 @@ ifeq ($(TARGET_ARCH_X86_64),yes)
SRCS += idct_mmx.c motion_comp_mmx.c
endif
+ifeq ($(TARGET_SSE),yes)
+SRCS += motion_comp_sse2.c
+endif
+
ifeq ($(TARGET_ALTIVEC),yes)
SRCS += motion_comp_altivec.c idct_altivec.c
endif