[FFmpeg-cvslog] x86/dsputilenc: implement SSE2 versions of pix_{sum16, norm1}
James Almer
git at videolan.org
Wed May 28 23:44:14 CEST 2014
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue May 27 13:57:15 2014 -0300| [561bfc85eb26921b27aacb6360b2fa3ebfa85058] | committer: Michael Niedermayer
x86/dsputilenc: implement SSE2 versions of pix_{sum16, norm1}
Signed-off-by: James Almer <jamrial at gmail.com>
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=561bfc85eb26921b27aacb6360b2fa3ebfa85058
---
libavcodec/x86/dsputilenc.asm | 103 ++++++++++++++++++++++++---------------
libavcodec/x86/dsputilenc_mmx.c | 4 ++
libavutil/x86/x86util.asm | 5 ++
3 files changed, 72 insertions(+), 40 deletions(-)
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 6269532..0ff0e60 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -23,6 +23,10 @@
%include "libavutil/x86/x86util.asm"
+SECTION_RODATA
+
+cextern pw_1
+
SECTION .text
%macro DIFF_PIXELS_1 4
@@ -439,73 +443,92 @@ cglobal diff_pixels, 4, 5, 5
jne .loop
RET
-INIT_MMX mmx
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_SUM16 2
+cglobal pix_sum16, 2, 3, %1
movsxdifnidn r1, r1d
- mov r2, r1
- neg r2
- shl r2, 4
- sub r0, r2
- pxor m7, m7
- pxor m6, m6
+ mov r2, %2
+ pxor m5, m5
+ pxor m4, m4
.loop:
- mova m0, [r0+r2+0]
- mova m1, [r0+r2+0]
- mova m2, [r0+r2+8]
- mova m3, [r0+r2+8]
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
+ mova m0, [r0]
+%if mmsize == 8
+ mova m1, [r0+8]
+%else
+ mova m1, [r0+r1]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
paddw m1, m0
paddw m3, m2
paddw m3, m1
- paddw m6, m3
- add r2, r1
- js .loop
- mova m5, m6
- psrlq m6, 32
- paddw m6, m5
- mova m5, m6
- psrlq m6, 16
- paddw m6, m5
- movd eax, m6
+ paddw m4, m3
+%if mmsize == 8
+ add r0, r1
+%else
+ lea r0, [r0+r1*2]
+%endif
+ dec r2
+ jne .loop
+ HADDW m4, m5
+ movd eax, m4
and eax, 0xffff
RET
+%endmacro
INIT_MMX mmx
+PIX_SUM16 0, 16
+INIT_XMM sse2
+PIX_SUM16 6, 8
+
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
movsxdifnidn r1, r1d
- mov r2, 16
+ mov r2, %2
pxor m0, m0
- pxor m7, m7
+ pxor m5, m5
.loop:
mova m2, [r0+0]
+%if mmsize == 8
mova m3, [r0+8]
- mova m1, m2
- punpckhbw m1, m0
+%else
+ mova m3, [r0+r1]
+%endif
+ punpckhbw m1, m2, m0
punpcklbw m2, m0
- mova m4, m3
- punpckhbw m3, m0
- punpcklbw m4, m0
+ punpckhbw m4, m3, m0
+ punpcklbw m3, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
- paddd m7, m2
+ paddd m5, m2
+ paddd m5, m4
+%if mmsize == 8
add r0, r1
- paddd m7, m4
+%else
+ lea r0, [r0+r1*2]
+%endif
dec r2
jne .loop
- mova m1, m7
- psrlq m7, 32
- paddd m1, m7
- movd eax, m1
+ HADDD m5, m1
+ movd eax, m5
RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 5aae147..efe835f 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -38,7 +38,9 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
@@ -906,6 +908,8 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->sse[0] = ff_sse16_sse2;
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
c->diff_pixels = ff_diff_pixels_sse2;
+ c->pix_sum = ff_pix_sum16_sse2;
+ c->pix_norm1 = ff_pix_norm1_sse2;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 67d7905..807e87e 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -288,7 +288,12 @@
paddd %1, %2
%endif
%if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(mmxext)
PSHUFLW %2, %1, q0032
+%else ; mmx
+ mova %2, %1
+ psrlq %2, 32
+%endif
paddd %1, %2
%endif
%undef %1
More information about the ffmpeg-cvslog mailing list