[FFmpeg-devel] [PATCH 3/4] avcodec/x86: avg_pixels16_y2_sse2
Michael Niedermayer
michaelni at gmx.at
Sun Feb 3 16:31:08 CET 2013
About 1% faster bidirectional motion compensation on matrixbench,
measured on an i7.
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
---
libavcodec/x86/dsputil_mmx.c | 3 +++
libavcodec/x86/hpeldsp.asm | 22 ++++++++++++----------
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 047504e..1c796ae 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1529,6 +1529,8 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
@@ -2043,6 +2045,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
}
}
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 088f811..a151834 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -388,31 +388,31 @@ AVG_PIXELS8_X2
; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
%macro AVG_PIXELS8_Y2 0
-cglobal avg_pixels8_y2, 4,5
+cglobal avg_pixels %+ mmsize %+ _y2, 4,5
movsxdifnidn r2, r2d
lea r4, [r2*2]
- mova m0, [r1]
+ movu m0, [r1]
sub r0, r2
.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
+ movu m1, [r1+r2]
+ movu m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
- mova m3, [r0+r2]
- mova m4, [r0+r4]
+ movu m3, [r0+r2]
+ movu m4, [r0+r4]
PAVGB m0, m3
PAVGB m1, m4
mova [r0+r2], m0
mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
+ movu m1, [r1+r2]
+ movu m0, [r1+r4]
PAVGB m2, m1
PAVGB m1, m0
add r0, r4
add r1, r4
- mova m3, [r0+r2]
- mova m4, [r0+r4]
+ movu m3, [r0+r2]
+ movu m4, [r0+r4]
PAVGB m2, m3
PAVGB m1, m4
mova [r0+r2], m2
@@ -427,6 +427,8 @@ INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
--
1.7.9.5
More information about the ffmpeg-devel
mailing list