[FFmpeg-devel] [PATCH 3/5] dsputil: allow scalarproduct_and_madd_int16 to handle mod8 number of loops

Christophe Gisquet christophe.gisquet at gmail.com
Wed May 8 16:51:49 CEST 2013


Some callers may pass a length that is a multiple of 8 but not of 16. The x86
SSE2/SSSE3 versions are modified to add a tail that checks whether a last
batch of 8 elements remains to be processed.
---
 libavcodec/dsputil.h       |  2 +-
 libavcodec/x86/dsputil.asm | 32 +++++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index b4868ab..83a4e48 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -276,7 +276,7 @@ typedef struct DSPContext {
     /**
      * Calculate scalar product of v1 and v2,
      * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
+     * @param len length of vectors, should be multiple of 8
      */
     int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
 
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 3fe04dd..bc46986 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -76,13 +76,16 @@ cglobal scalarproduct_int16, 3,3+mmsize/16,3, v1, v2, order
     RET
 
 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
-    shl orderq, 1
+cglobal scalarproduct_and_madd_int16, 4,4+mmsize/16,8, v1, v2, v3, order, mul
     movd    m7, mulm
 %if mmsize == 16
+    sar orderq, 3
     pshuflw m7, m7, 0
+    sbb     r4, r4
     punpcklqdq m7, m7
+    shl orderq, 4
 %else
+    shl orderq, 1
     pshufw  m7, m7, 0
 %endif
     pxor    m6, m6
@@ -110,6 +113,17 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
     add     orderq, mmsize*2
     jl .loop
 %if mmsize == 16
+    cmp     r4, 0
+    jz    .end
+    movu    m0, [v2q + orderq]
+    mova    m4, [v1q + orderq]
+    movu    m2, [v3q + orderq]
+    pmaddwd m0, m4
+    pmullw  m2, m7
+    paddd   m6, m0
+    paddw   m2, m4
+    mova    [v1q + orderq], m2
+.end:
     movhlps m0, m6
     paddd   m6, m0
     pshuflw m0, m6, 0x4e
@@ -167,7 +181,7 @@ align 16
     mova    [v1q + orderq + mmsize], m3
     jg .loop%1
 %if %1
-    jmp .end
+    jmp .tail
 %endif
 %endmacro
 
@@ -208,6 +222,18 @@ SCALARPRODUCT_LOOP 6
 SCALARPRODUCT_LOOP 4
 SCALARPRODUCT_LOOP 2
 SCALARPRODUCT_LOOP 0
+.tail
+    jz      .end
+    add     orderq, mmsize
+    movu    m0, [v2q + orderq]
+    mova    m1, [v1q + orderq]
+    movu    m2, [v3q + orderq]
+
+    pmaddwd m0, m1
+    pmullw  m2, m7
+    paddd   m6, m0
+    paddw   m2, m1
+    mova    [v1q + orderq], m2
 .end:
     movhlps m0, m6
     paddd   m6, m0
-- 
1.8.0.msysgit.0



More information about the ffmpeg-devel mailing list