[FFmpeg-devel] [PATCH 2/4] avcodec/x86: put_pixels16_y2_sse2
Michael Niedermayer
michaelni at gmx.at
Sun Feb 3 16:31:07 CET 2013
About 1% faster P-frame motion compensation for matrixbench on an i7.
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
---
libavcodec/x86/dsputil_mmx.c | 3 +++
libavcodec/x86/hpeldsp.asm | 14 ++++++++------
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 29d87a1..047504e 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1525,6 +1525,8 @@ void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
@@ -2037,6 +2039,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
if (!high_bit_depth) {
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 81b6901..088f811 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -202,21 +202,21 @@ PUT_NO_RND_PIXELS8_X2_EXACT
; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
%macro PUT_PIXELS8_Y2 0
-cglobal put_pixels8_y2, 4,5
+cglobal put_pixels %+ mmsize %+ _y2, 4, 5, 3
movsxdifnidn r2, r2d
lea r4, [r2*2]
- mova m0, [r1]
+ movu m0, [r1]
sub r0, r2
.loop:
- mova m1, [r1+r2]
- mova m2, [r1+r4]
+ movu m1, [r1+r2]
+ movu m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
mova [r0+r2], m0
mova [r0+r4], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
+ movu m1, [r1+r2]
+ movu m0, [r1+r4]
add r0, r4
add r1, r4
PAVGB m2, m1
@@ -233,6 +233,8 @@ INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
--
1.7.9.5
More information about the ffmpeg-devel mailing list