[FFmpeg-devel] Anybody has a Core 2? [PATCH] Small SSSE3 optimization
Zuxy Meng
zuxy.meng
Tue May 8 16:27:22 CEST 2007
Hi,
The attached patch uses the SSSE3 instruction pabsw to calculate the
absolute value of packed words. Just for fun. I don't have an
SSSE3-capable CPU, so hopefully someone with a Core 2 can help test it to
ensure it doesn't break anything (benchmarks would be even better, of
course :-) ).
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c (revision 8932)
+++ libavcodec/i386/dsputil_mmx.c (working copy)
@@ -1537,6 +1537,13 @@
"pmaxsw " #z ", " #a " \n\t"\
"paddusw " #a ", " #sum " \n\t"
+#define MMABS_SSSE3(a) /* asm fragment: a = |a| for each packed 16-bit word, using the SSSE3 pabsw instruction */\
+ "pabsw " #a ", " #a " \n\t"
+
+#define MMABS_SUM_SSSE3(a, sum) /* asm fragment: a = |a| per packed word (a is overwritten), then sum += a with unsigned saturation (paddusw) */\
+ "pabsw " #a ", " #a " \n\t"\
+ "paddusw " #a ", " #sum " \n\t"
+
#define TRANSPOSE4(a,b,c,d,t)\
SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
@@ -1727,9 +1734,90 @@
return sum&0xFFFF;
}
+/**
+ * SSSE3 variant of the 8x8 Hadamard difference metric: the absolute values
+ * are taken with pabsw (MMABS_SSSE3 / MMABS_SUM_SSSE3) on MMX registers.
+ *
+ * @param s      not used by this function
+ * @param src1   first pixel block, passed to diff_pixels_mmx()
+ * @param src2   second pixel block, passed to diff_pixels_mmx()
+ * @param stride passed to diff_pixels_mmx() (presumably the line size of both blocks)
+ * @param h      block height; must be 8 (asserted)
+ * @return the low 16 bits of the sum of absolute transformed values
+ */
+static int hadamard8_diff_ssse3(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
+ DECLARE_ALIGNED_8(uint64_t, temp[16]); // 128 bytes of scratch, addressed as %1 below
+ int sum=0;
+ assert(h==8);
+
+ // fill temp from the two source blocks (presumably src1 - src2 per pixel)
+ diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
+
+ asm volatile(
+ // pass 1a: transform the data loaded from offsets 0 and 64
+ LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
+ LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
+
+ HADAMARD48
+
+ // spill mm7 so it can serve as TRANSPOSE4's scratch register
+ "movq %%mm7, 112(%1) \n\t"
+
+ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
+ STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
+
+ "movq 112(%1), %%mm7 \n\t"
+ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+ STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
+
+ // pass 1b: same sequence for the data at offsets 8 and 72
+ LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
+ LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
+
+ HADAMARD48
+
+ "movq %%mm7, 120(%1) \n\t"
+
+ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
+ STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
+
+ "movq 120(%1), %%mm7 \n\t"
+ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+ // instead of STORE4(72, ...) followed by LOAD4(72, ...), move the
+ // transposed values straight into the registers the load would fill
+ "movq %%mm7, %%mm5 \n\t"//FIXME remove
+ "movq %%mm6, %%mm7 \n\t"
+ "movq %%mm0, %%mm6 \n\t"
+// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
+
+ LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
+// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
+
+ // pass 2a: transform, then accumulate |mm0..mm7| into mm0
+ HADAMARD48
+ MMABS_SSSE3(%%mm0)
+ MMABS_SUM_SSSE3(%%mm1, %%mm0)
+ MMABS_SUM_SSSE3(%%mm2, %%mm0)
+ MMABS_SUM_SSSE3(%%mm3, %%mm0)
+ MMABS_SUM_SSSE3(%%mm4, %%mm0)
+ MMABS_SUM_SSSE3(%%mm5, %%mm0)
+ MMABS_SUM_SSSE3(%%mm6, %%mm0)
+ MMABS_SUM_SSSE3(%%mm7, %%mm0)
+ // park the partial sum in temp while the registers are reused
+ "movq %%mm0, 64(%1) \n\t"
+
+ // pass 2b: same for the data at offsets 0 and 8
+ LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
+ LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
+
+ HADAMARD48
+ MMABS_SSSE3(%%mm0)
+ MMABS_SUM_SSSE3(%%mm1, %%mm0)
+ MMABS_SUM_SSSE3(%%mm2, %%mm0)
+ MMABS_SUM_SSSE3(%%mm3, %%mm0)
+ MMABS_SUM_SSSE3(%%mm4, %%mm0)
+ MMABS_SUM_SSSE3(%%mm5, %%mm0)
+ MMABS_SUM_SSSE3(%%mm6, %%mm0)
+ MMABS_SUM_SSSE3(%%mm7, %%mm0)
+ // add the parked partial sum.
+ // NOTE(review): 64(%1) holds an unsigned-saturated sum (paddusw), while
+ // pabsw interprets words >= 0x8000 as negative; confirm this is intended.
+ "pabsw 64(%1), %%mm1 \n\t"
+ "paddusw %%mm1, %%mm0 \n\t" // MMABS_SUM_SSSE3(64(%1), %%mm0)
+
+ // horizontal add of the four words of mm0 into its low word
+ "pshufw $0x0E, %%mm0, %%mm1 \n\t"
+ "paddusw %%mm1, %%mm0 \n\t"
+ "pshufw $0x01, %%mm0, %%mm1 \n\t"
+ "paddusw %%mm1, %%mm0 \n\t"
+ "movd %%mm0, %0 \n\t"
+
+ : "=r" (sum)
+ : "r"(temp)
+ : "memory" // temp is read and written through %1, which the operand list alone does not tell the compiler
+ );
+ return sum&0xFFFF; // only the low word of mm0 holds the result
+}
+
WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
+WARPER8_16_SQ(hadamard8_diff_ssse3, hadamard8_diff16_ssse3) /* NOTE(review): WARPER8_16_SQ is defined outside this patch; presumably it builds hadamard8_diff16_ssse3 from the 8x8 function, as for the mmx/mmx2 variants */
static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
int sum;
@@ -3526,6 +3614,12 @@
}
if(mm_flags & MM_3DNOW)
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+#ifdef CONFIG_ENCODERS
+ /* Install the SSSE3 Hadamard routines only when the SSSE3 capability bit
+  * is set. This must be a bitwise test: "mm_flags && MM_SSSE3" is true
+  * whenever any flag is set, which would select pabsw code on CPUs that
+  * cannot execute it. */
+ if(mm_flags & MM_SSSE3){
+ c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
+ c->hadamard8_diff[1]= hadamard8_diff_ssse3;
+ }
+#endif //CONFIG_ENCODERS
}
#ifdef CONFIG_ENCODERS
More information about the ffmpeg-devel
mailing list