[FFmpeg-devel] [PATCH 3/5] dsputil: allow scalarproduct_and_madd_int16 to handle mod8 number of loops
Christophe Gisquet
christophe.gisquet at gmail.com
Wed May 8 16:51:49 CEST 2013
Some callers may pass a length that is a multiple of 8 but not of 16. The x86
SSE2/SSSE3 versions are modified to add a tail that checks whether a final
batch of 8 elements still needs to be processed.
---
libavcodec/dsputil.h | 2 +-
libavcodec/x86/dsputil.asm | 32 +++++++++++++++++++++++++++++---
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index b4868ab..83a4e48 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -276,7 +276,7 @@ typedef struct DSPContext {
/**
* Calculate scalar product of v1 and v2,
* and v1[i] += v3[i] * mul
- * @param len length of vectors, should be multiple of 16
+ * @param len length of vectors, should be multiple of 8
*/
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 3fe04dd..bc46986 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -76,13 +76,16 @@ cglobal scalarproduct_int16, 3,3+mmsize/16,3, v1, v2, order
RET
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
- shl orderq, 1
+cglobal scalarproduct_and_madd_int16, 4,4+mmsize/16,8, v1, v2, v3, order, mul
movd m7, mulm
%if mmsize == 16
+ sar orderq, 3
pshuflw m7, m7, 0
+ sbb r4, r4
punpcklqdq m7, m7
+ shl orderq, 4
%else
+ shl orderq, 1
pshufw m7, m7, 0
%endif
pxor m6, m6
@@ -110,6 +113,17 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
add orderq, mmsize*2
jl .loop
%if mmsize == 16
+ cmp r4, 0
+ jz .end
+ movu m0, [v2q + orderq]
+ mova m4, [v1q + orderq]
+ movu m2, [v3q + orderq]
+ pmaddwd m0, m4
+ pmullw m2, m7
+ paddd m6, m0
+ paddw m2, m4
+ mova [v1q + orderq], m2
+.end:
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
@@ -167,7 +181,7 @@ align 16
mova [v1q + orderq + mmsize], m3
jg .loop%1
%if %1
- jmp .end
+ jmp .tail
%endif
%endmacro
@@ -208,6 +222,18 @@ SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
+.tail
+ jz .end
+ add orderq, mmsize
+ movu m0, [v2q + orderq]
+ mova m1, [v1q + orderq]
+ movu m2, [v3q + orderq]
+
+ pmaddwd m0, m1
+ pmullw m2, m7
+ paddd m6, m0
+ paddw m2, m1
+ mova [v1q + orderq], m2
.end:
movhlps m0, m6
paddd m6, m0
--
1.8.0.msysgit.0
More information about the ffmpeg-devel mailing list