[FFmpeg-devel] Anybody has a Core 2? [PATCH] Small SSSE3 optimization
Zuxy Meng
zuxy.meng
Wed May 9 06:27:45 CEST 2007
Hi,
2007/5/8, Zuxy Meng <zuxy.meng at gmail.com>:
> Hi,
>
> Attached patch makes use of SSSE3 instruction pabsw to calculate the
> absolute value of packed words. Just for fun. And I don't have a SSSE3
> capable CPU so hopefully someone with a Core 2 can help test it to
> ensure it doesn't break anything (better with benchmarks of course:-)
> ).
Updated patch against current SVN HEAD. Full test passed on MMX2. Of
course it still needs testing under Core 2.
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c (revision 8946)
+++ libavcodec/i386/dsputil_mmx.c (working copy)
@@ -1538,6 +1538,13 @@
MMABS_MMX2(a,z)\
"paddusw " #a ", " #sum " \n\t"
+#define MMABS_SSSE3(a)\
+ "pabsw " #a ", " #a " \n\t"
+
+#define MMABS_SUM_SSSE3(a, sum)\
+ "pabsw " #a ", " #a " \n\t"\
+ "paddusw " #a ", " #sum " \n\t"
+
#define LOAD4(o, a, b, c, d)\
"movq "#o"(%1), " #a " \n\t"\
"movq "#o"+16(%1), " #b " \n\t"\
@@ -1609,7 +1616,7 @@
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
HADAMARD48\
- "movq %%mm7, 64(%1) \n\t"\
+ SAVE_MM7(64)\
MMABS(%%mm0, %%mm7)\
MMABS_SUM(%%mm1, %%mm7, %%mm0)\
MMABS_SUM(%%mm2, %%mm7, %%mm0)\
@@ -1617,15 +1624,14 @@
MMABS_SUM(%%mm4, %%mm7, %%mm0)\
MMABS_SUM(%%mm5, %%mm7, %%mm0)\
MMABS_SUM(%%mm6, %%mm7, %%mm0)\
- "movq 64(%1), %%mm1 \n\t"\
- MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+ RESTORE_MM7(64)\
"movq %%mm0, 64(%1) \n\t"\
\
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)\
\
HADAMARD48\
- "movq %%mm7, (%1) \n\t"\
+ SAVE_MM7(0)\
MMABS(%%mm0, %%mm7)\
MMABS_SUM(%%mm1, %%mm7, %%mm0)\
MMABS_SUM(%%mm2, %%mm7, %%mm0)\
@@ -1633,10 +1639,8 @@
MMABS_SUM(%%mm4, %%mm7, %%mm0)\
MMABS_SUM(%%mm5, %%mm7, %%mm0)\
MMABS_SUM(%%mm6, %%mm7, %%mm0)\
- "movq (%1), %%mm1 \n\t"\
- MMABS_SUM(%%mm1, %%mm7, %%mm0)\
- "movq 64(%1), %%mm1 \n\t"\
- MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+ RESTORE_MM7(0)\
+ "paddusw 64(%1), %%mm0 \n\t"\
\
HSUM(%%mm0, %%mm1, %0)\
\
@@ -1646,6 +1650,11 @@
return sum&0xFFFF;\
}
+#define SAVE_MM7(x)\
+ "movq %%mm7, "#x"(%1) \n\t"
+#define RESTORE_MM7(x)\
+ "movq "#x"(%1), %%mm1 \n\t"\
+ MMABS_SUM(%%mm1, %%mm7, %%mm0)
#define MMABS(a,z) MMABS_MMX(a,z)
#define MMABS_SUM(a,z,sum) MMABS_SUM_MMX(a,z,sum)
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
@@ -1661,9 +1670,24 @@
#undef MMABS
#undef MMABS_SUM
#undef HSUM
+#undef SAVE_MM7
+#undef RESTORE_MM7
+#define SAVE_MM7(x)
+#define RESTORE_MM7(x) MMABS_SUM_SSSE3(%%mm7, %%mm0)
+#define MMABS(a,z) MMABS_SSSE3(a)
+#define MMABS_SUM(a,z,sum) MMABS_SUM_SSSE3(a,sum)
+#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
+HADAMARD8_DIFF_MMX(ssse3)
+#undef MMABS
+#undef MMABS_SUM
+#undef HSUM
+#undef SAVE_MM7
+#undef RESTORE_MM7
+
WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
+WARPER8_16_SQ(hadamard8_diff_ssse3, hadamard8_diff16_ssse3)
static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
int sum;
@@ -3460,6 +3484,12 @@
}
if(mm_flags & MM_3DNOW)
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+#ifdef CONFIG_ENCODERS
+ if(mm_flags & MM_SSSE3){
+ c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
+ c->hadamard8_diff[1]= hadamard8_diff_ssse3;
+ }
+#endif //CONFIG_ENCODERS
}
#ifdef CONFIG_ENCODERS
More information about the ffmpeg-devel
mailing list