[FFmpeg-devel] [WIP] [PATCH] Convert ff_put_pixels_clamped_mmx() to yasm
Timothy Gu
timothygu99 at gmail.com
Thu May 22 03:25:20 CEST 2014
Also makes the following changes:
- use macros
- pass the +64 block offset as a macro argument instead of using "add blockq, 64"
- save some registers by giving packuswb a memory operand
- add an SSE2 version
Tricks borrowed from ff_put_signed_pixels_clamped_*().
Signed-off-by: Timothy Gu <timothygu99 at gmail.com>
---
I actually had ff_put_signed_pixels_clamped_mmx() and similar functions
converted to yasm long before James submitted his patch, but I forgot
about it. When I saw the patch, I was like "Dammit." So here it is: my
yasm ff_put_pixels_clamped_mmx(), written about 2 months ago.
I am not happy with the changes to all the files that directly
reference ff_put_pixels_clamped_mmx(), but I cannot find another way
to keep --disable-yasm builds working. Any ideas?
---
libavcodec/dct-test.c | 6 ++---
libavcodec/x86/cavsdsp.c | 7 ++---
libavcodec/x86/dsputil.asm | 45 ++++++++++++++++++++++++++++++++
libavcodec/x86/dsputil_init.c | 3 ++-
libavcodec/x86/dsputil_mmx.c | 58 -----------------------------------------
libavcodec/x86/dsputil_x86.h | 2 ++
libavcodec/x86/idct_mmx_xvid.c | 2 ++
libavcodec/x86/idct_sse2_xvid.c | 4 ++-
libavcodec/x86/simple_idct.c | 5 ++--
9 files changed, 64 insertions(+), 68 deletions(-)
diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index 2f4c3f7..670e009 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -139,14 +139,14 @@ static const struct algo idct_tab[] = {
{ "SIMPLE-C", ff_simple_idct_8, NO_PERM },
{ "PR-C", ff_prores_idct_wrap, NO_PERM, 0, 1 },
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_INLINE && HAVE_MMX_EXTERNAL
{ "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
{ "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
#endif
-#if HAVE_MMXEXT_INLINE
+#if HAVE_MMXEXT_INLINE && HAVE_MMX_EXTERNAL
{ "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
#endif
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_INLINE && HAVE_SSE2_EXTERNAL
{ "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
#if ARCH_X86_64 && HAVE_YASM
{ "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index 78d4689..970221f 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -33,7 +33,8 @@
#include "fpel.h"
#include "config.h"
-#if HAVE_MMX_INLINE
+// yasm needed for ff_add_pixels_clamped_mmx()
+#if HAVE_MMX_INLINE && HAVE_MMX_EXTERNAL
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
@@ -200,7 +201,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
ff_add_pixels_clamped_mmx(b2, dst, stride);
}
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_MMX_INLINE && HAVE_MMX_EXTERNAL */
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
@@ -495,12 +496,12 @@ static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
-#endif
#if HAVE_MMX_INLINE
c->cavs_idct8_add = cavs_idct8_add_mmx;
c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_MMX_EXTERNAL */
}
#define DSPFUNC(PFX, IDX, NUM, EXT) \
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 747c645..527a4fb 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -3,6 +3,7 @@
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
+;* Copyright (c) 2014 Tiancheng "Timothy" Gu
;*
;* This file is part of FFmpeg.
;*
@@ -576,6 +577,50 @@ INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1
+;------------------------------------------------------------------------------
+; void ff_put_pixels_clamped_*(const int16_t *block, uint8_t *pixels,
+; int line_size);
+;------------------------------------------------------------------------------
+; %1 = block offset
+%macro PUT_PIXELS_CLAMPED_HALF 1
+ mova m0, [blockq+mmsize*0+%1]
+ mova m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+ mova m2, [blockq+mmsize*4+%1]
+ mova m3, [blockq+mmsize*6+%1]
+%endif
+ packuswb m0, [blockq+mmsize*1+%1]
+ packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+ packuswb m2, [blockq+mmsize*5+%1]
+ packuswb m3, [blockq+mmsize*7+%1]
+ movq [pixelsq], m0
+ movq [lsizeq+pixelsq], m1
+ movq [2*lsizeq+pixelsq], m2
+ movq [lsize3q+pixelsq], m3
+%else
+ movq [pixelsq], m0
+ movhps [lsizeq+pixelsq], m0
+ movq [2*lsizeq+pixelsq], m1
+ movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+; %1 = # of xmm used
+%macro PUT_PIXELS_CLAMPED 1
+cglobal put_pixels_clamped, 3,4,%1, block, pixels, lsize, lsize3
+ lea lsize3q, [lsizeq*3]
+ PUT_PIXELS_CLAMPED_HALF 0
+ lea pixelsq, [pixelsq+lsizeq*4]
+ PUT_PIXELS_CLAMPED_HALF 64
+ RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED 2
+
;--------------------------------------------------------------------------
;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
; int line_size)
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index e274e67..205e033 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -529,7 +529,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
int cpu_flags, unsigned high_bit_depth)
{
#if HAVE_MMX_INLINE
- c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth) {
@@ -549,6 +548,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->clear_blocks = ff_clear_blocks_mmx;
}
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
+ c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
@@ -627,6 +627,7 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
c->bswap_buf = ff_bswap32_buf_sse2;
+ c->put_pixels_clamped = ff_put_pixels_clamped_sse2;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index fa77a5c..9423d76 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -36,64 +36,6 @@
#if HAVE_INLINE_ASM
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size)
-{
- const int16_t *p;
- uint8_t *pix;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- /* unrolled loop */
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
- "r" (p)
- : "memory");
- pix += line_size * 4;
- p += 32;
-
- // if here would be an exact copy of the code above
- // compiler would generate some very strange code
- // thus using "r"
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
- "r" (p)
- : "memory");
-}
-
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 1f4711d..a63e412 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -35,6 +35,8 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+ int line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size);
void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index 4cd6de1..027016f 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -47,6 +47,7 @@
#include "dsputil_x86.h"
#include "idct_xvid.h"
+#if HAVE_MMX_EXTERNAL // needed for ff_put_pixels_clamped_mmx
#if HAVE_MMX_INLINE
//=============================================================================
@@ -560,3 +561,4 @@ void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
}
#endif /* HAVE_MMXEXT_INLINE */
+#endif /* HAVE_MMX_EXTERNAL */
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index a181099..99f60d0 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -43,6 +43,7 @@
#include "idct_xvid.h"
#include "dsputil_x86.h"
+#if HAVE_SSE2_EXTERNAL // needed for ff_put_pixels_clamped_sse2
#if HAVE_SSE2_INLINE
/**
@@ -395,7 +396,7 @@ av_extern_inline void ff_idct_xvid_sse2(short *block)
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
{
ff_idct_xvid_sse2(block);
- ff_put_pixels_clamped_mmx(block, dest, line_size);
+ ff_put_pixels_clamped_sse2(block, dest, line_size);
}
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
@@ -405,3 +406,4 @@ void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
}
#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_SSE2_EXTERNAL */
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index 3ae30f3..ac29023 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -24,7 +24,8 @@
#include "libavutil/x86/asm.h"
#include "dsputil_x86.h"
-#if HAVE_INLINE_ASM
+// yasm needed for ff_put_pixels_clamped_mmx()
+#if HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
/*
23170.475006
@@ -1166,4 +1167,4 @@ void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
--
1.9.1
More information about the ffmpeg-devel
mailing list