[FFmpeg-devel] [PATCH 1/5] dsputil: modify scalarproduct_int16 to handle mod8 numbers of loops.
Christophe Gisquet
christophe.gisquet at gmail.com
Wed May 8 16:51:47 CEST 2013
It was reported that scalarproduct_int16 needs to handle vector lengths that are a multiple of 8 but not a multiple of 16.
---
libavcodec/dsputil.h | 2 +-
libavcodec/x86/dsputil.asm | 14 +++++++++++++-
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 07a95af..b4868ab 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -269,7 +269,7 @@ typedef struct DSPContext {
/**
* Calculate scalar product of two vectors.
- * @param len length of vectors, should be multiple of 16
+ * @param len length of vectors, should be multiple of 8
*/
int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len);
/* ape functions */
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 77069e2..3fe04dd 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -37,8 +37,14 @@ SECTION_TEXT
%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
-cglobal scalarproduct_int16, 3,3,3, v1, v2, order
+cglobal scalarproduct_int16, 3,3+mmsize/16,3, v1, v2, order
+%if mmsize == 16
+ sar orderq, 4
+ sbb r3, r3
+ shl orderq, 5
+%else
shl orderq, 1
+%endif
add v1q, orderq
add v2q, orderq
neg orderq
@@ -53,6 +59,12 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
add orderq, mmsize*2
jl .loop
%if mmsize == 16
+ cmp r3, 0
+ jz .end
+ movu m0, [v1q]
+ pmaddwd m0, [v2q]
+ paddd m2, m0
+.end:
movhlps m0, m2
paddd m2, m0
pshuflw m0, m2, 0x4e
--
1.8.0.msysgit.0
More information about the ffmpeg-devel
mailing list