[FFmpeg-devel] [PATCH 1/5] dsputil: make scalarproduct_int16 handle lengths that are a multiple of 8.

Wed May 8 16:51:47 CEST 2013

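It was reported that scalarproduct_int16 needs to accept vector lengths that
are a multiple of 8 but not of 16. The mmsize == 16 version consumes 16
elements per loop iteration, so it now records whether a block of 8 elements
is left over and processes that block once after the main loop.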
---
 libavcodec/dsputil.h       |  2 +-
 libavcodec/x86/dsputil.asm | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 07a95af..b4868ab 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -269,7 +269,7 @@ typedef struct DSPContext {
 
     /**
      * Calculate scalar product of two vectors.
-     * @param len length of vectors, should be multiple of 16
+     * @param len length of vectors, should be multiple of 8
      */
     int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len);
     /* ape functions */
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 77069e2..3fe04dd 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -37,8 +37,14 @@ SECTION_TEXT
 
 %macro SCALARPRODUCT 0
 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
-cglobal scalarproduct_int16, 3,3,3, v1, v2, order
+cglobal scalarproduct_int16, 3,3+mmsize/16,3, v1, v2, order
+%if mmsize == 16
+    sar orderq, 4
+    sbb r3, r3
+    shl orderq, 5
+%else
     shl orderq, 1
+%endif
     add v1q, orderq
     add v2q, orderq
     neg orderq
@@ -53,6 +59,12 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     add     orderq, mmsize*2
     jl .loop
 %if mmsize == 16
+    cmp     r3, 0
+    jz    .end
+    movu    m0, [v1q]
+    pmaddwd m0, [v2q]
+    paddd   m2, m0
+.end:
     movhlps m0, m2
     paddd   m2, m0
     pshuflw m0, m2, 0x4e
-- 
1.8.0.msysgit.0