[FFmpeg-cvslog] avutil/pixelutils: faster pixelutils_sad_16x16
Clément Bœsch
git at videolan.org
Sat Aug 23 20:14:17 CEST 2014
ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Sat Aug 23 20:03:10 2014 +0200| [554d8190624f25cefe079bd7b9ad61a2ade8541a] | committer: Clément Bœsch
avutil/pixelutils: faster pixelutils_sad_16x16
Speeds up pixelutils_sad_16x16 from 501 to 439 decicycles.
See 45c7f3997ea11c3d1007b2126b1c0049a8c27105.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=554d8190624f25cefe079bd7b9ad61a2ade8541a
---
libavutil/x86/pixelutils.asm | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 15213d9..7522f24 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -109,18 +109,24 @@ cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2
;-------------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
- pxor m4, m4
-%rep 8
- movu m0, [src1q]
+ movu m4, [src1q]
+ movu m2, [src2q]
movu m1, [src1q + stride1q]
+ movu m3, [src2q + stride2q]
+ psadbw m4, m2
+ psadbw m1, m3
+ paddw m4, m1
+%rep 7
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+ movu m0, [src1q]
movu m2, [src2q]
+ movu m1, [src1q + stride1q]
movu m3, [src2q + stride2q]
psadbw m0, m2
psadbw m1, m3
paddw m4, m0
paddw m4, m1
- lea src1q, [src1q + 2*stride1q]
- lea src2q, [src2q + 2*stride2q]
%endrep
movhlps m0, m4
paddw m4, m0
More information about the ffmpeg-cvslog mailing list