[FFmpeg-devel] [WIP] [PATCH] Convert ff_put_pixels_clamped_mmx() to yasm

Thu May 22 03:25:20 CEST 2014

This patch also makes the following changes:
- use macros
- pass the +64 block offset as a macro argument instead of using "add blockq, 64"
- save some registers by passing a memory operand directly to packuswb
- add an SSE2 version

Tricks stolen from ff_put_signed_pixels_clamped_*().

Signed-off-by: Timothy Gu <timothygu99 at gmail.com>
---
I actually had ff_put_signed_pixels_clamped_mmx() and similar functions
converted to yasm long before James submitted his patch, but I forgot
about it. When I saw the patch, I was like "Dammit." So here it is, my
yasm ff_put_pixels_clamped_mmx(), written about 2 months ago.

I am not happy with the changes to all the files that directly
reference ff_put_pixels_clamped_mmx(), but I cannot find another way
to fix --disable-yasm. Any ideas?
---
 libavcodec/dct-test.c           |  6 ++---
 libavcodec/x86/cavsdsp.c        |  7 ++---
 libavcodec/x86/dsputil.asm      | 45 ++++++++++++++++++++++++++++++++
 libavcodec/x86/dsputil_init.c   |  3 ++-
 libavcodec/x86/dsputil_mmx.c    | 58 -----------------------------------------
 libavcodec/x86/dsputil_x86.h    |  2 ++
 libavcodec/x86/idct_mmx_xvid.c  |  2 ++
 libavcodec/x86/idct_sse2_xvid.c |  4 ++-
 libavcodec/x86/simple_idct.c    |  5 ++--
 9 files changed, 64 insertions(+), 68 deletions(-)

diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index 2f4c3f7..670e009 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -139,14 +139,14 @@ static const struct algo idct_tab[] = {
     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
     { "PR-C",           ff_prores_idct_wrap,   NO_PERM, 0, 1 },
 
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_INLINE && HAVE_MMX_EXTERNAL
     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 #endif
-#if HAVE_MMXEXT_INLINE
+#if HAVE_MMXEXT_INLINE && HAVE_MMX_EXTERNAL
     { "XVID-MMXEXT",    ff_idct_xvid_mmxext,   NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
 #endif
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_INLINE && HAVE_SSE2_EXTERNAL
     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 #if ARCH_X86_64 && HAVE_YASM
     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index 78d4689..970221f 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -33,7 +33,8 @@
 #include "fpel.h"
 #include "config.h"
 
-#if HAVE_MMX_INLINE
+// yasm needed for ff_add_pixels_clamped_mmx()
+#if HAVE_MMX_INLINE && HAVE_MMX_EXTERNAL
 
 /* in/out: mma=mma+mmb, mmb=mmb-mma */
 #define SUMSUB_BA( a, b ) \
@@ -200,7 +201,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
     ff_add_pixels_clamped_mmx(b2, dst, stride);
 }
 
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_MMX_INLINE && HAVE_MMX_EXTERNAL */
 
 #if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
 
@@ -495,12 +496,12 @@ static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
     c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
-#endif
 
 #if HAVE_MMX_INLINE
     c->cavs_idct8_add = cavs_idct8_add_mmx;
     c->idct_perm      = FF_TRANSPOSE_IDCT_PERM;
 #endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_MMX_EXTERNAL */
 }
 
 #define DSPFUNC(PFX, IDX, NUM, EXT)                                                       \
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 747c645..527a4fb 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -3,6 +3,7 @@
 ;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
+;* Copyright (c) 2014 Tiancheng "Timothy" Gu
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -576,6 +577,50 @@ INIT_XMM sse
 %define ZERO xorps
 CLEAR_BLOCKS 1
 
+;------------------------------------------------------------------------------
+; void ff_put_pixels_clamped_*(const int16_t *block, uint8_t *pixels,
+;                              int line_size);
+;------------------------------------------------------------------------------
+; %1 = block offset
+%macro PUT_PIXELS_CLAMPED_HALF 1
+    mova     m0, [blockq+mmsize*0+%1]
+    mova     m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m2, [blockq+mmsize*4+%1]
+    mova     m3, [blockq+mmsize*6+%1]
+%endif
+    packuswb m0, [blockq+mmsize*1+%1]
+    packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packuswb m2, [blockq+mmsize*5+%1]
+    packuswb m3, [blockq+mmsize*7+%1]
+    movq           [pixelsq], m0
+    movq    [lsizeq+pixelsq], m1
+    movq  [2*lsizeq+pixelsq], m2
+    movq   [lsize3q+pixelsq], m3
+%else
+    movq           [pixelsq], m0
+    movhps  [lsizeq+pixelsq], m0
+    movq  [2*lsizeq+pixelsq], m1
+    movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+; %1 = number of XMM registers used
+%macro PUT_PIXELS_CLAMPED 1
+cglobal put_pixels_clamped, 3,4,%1, block, pixels, lsize, lsize3
+    lea lsize3q, [lsizeq*3]
+    PUT_PIXELS_CLAMPED_HALF 0
+    lea pixelsq, [pixelsq+lsizeq*4]
+    PUT_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED 2
+
 ;--------------------------------------------------------------------------
 ;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
 ;                                  int line_size)
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index e274e67..205e033 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -529,7 +529,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                      int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_MMX_INLINE
-    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
     c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
 
     if (!high_bit_depth) {
@@ -549,6 +548,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
         c->clear_blocks = ff_clear_blocks_mmx;
     }
     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
+    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
 #endif /* HAVE_MMX_EXTERNAL */
 }
@@ -627,6 +627,7 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
     }
     c->bswap_buf = ff_bswap32_buf_sse2;
+    c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index fa77a5c..9423d76 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -36,64 +36,6 @@
 
 #if HAVE_INLINE_ASM
 
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    /* unrolled loop */
-    __asm__ volatile (
-        "movq      (%3), %%mm0          \n\t"
-        "movq     8(%3), %%mm1          \n\t"
-        "movq    16(%3), %%mm2          \n\t"
-        "movq    24(%3), %%mm3          \n\t"
-        "movq    32(%3), %%mm4          \n\t"
-        "movq    40(%3), %%mm5          \n\t"
-        "movq    48(%3), %%mm6          \n\t"
-        "movq    56(%3), %%mm7          \n\t"
-        "packuswb %%mm1, %%mm0          \n\t"
-        "packuswb %%mm3, %%mm2          \n\t"
-        "packuswb %%mm5, %%mm4          \n\t"
-        "packuswb %%mm7, %%mm6          \n\t"
-        "movq     %%mm0, (%0)           \n\t"
-        "movq     %%mm2, (%0, %1)       \n\t"
-        "movq     %%mm4, (%0, %1, 2)    \n\t"
-        "movq     %%mm6, (%0, %2)       \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-    pix += line_size * 4;
-    p   += 32;
-
-    // if here would be an exact copy of the code above
-    // compiler would generate some very strange code
-    // thus using "r"
-    __asm__ volatile (
-        "movq       (%3), %%mm0         \n\t"
-        "movq      8(%3), %%mm1         \n\t"
-        "movq     16(%3), %%mm2         \n\t"
-        "movq     24(%3), %%mm3         \n\t"
-        "movq     32(%3), %%mm4         \n\t"
-        "movq     40(%3), %%mm5         \n\t"
-        "movq     48(%3), %%mm6         \n\t"
-        "movq     56(%3), %%mm7         \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "packuswb  %%mm3, %%mm2         \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "packuswb  %%mm7, %%mm6         \n\t"
-        "movq      %%mm0, (%0)          \n\t"
-        "movq      %%mm2, (%0, %1)      \n\t"
-        "movq      %%mm4, (%0, %1, 2)   \n\t"
-        "movq      %%mm6, (%0, %2)      \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-}
-
 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                int line_size)
 {
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 1f4711d..a63e412 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -35,6 +35,8 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                int line_size);
 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                int line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                int line_size);
 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                       int line_size);
 void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index 4cd6de1..027016f 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -47,6 +47,7 @@
 #include "dsputil_x86.h"
 #include "idct_xvid.h"
 
+#if HAVE_MMX_EXTERNAL // needed for ff_put_pixels_clamped_mmx
 #if HAVE_MMX_INLINE
 
 //=============================================================================
@@ -560,3 +561,4 @@ void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
 }
 
 #endif /* HAVE_MMXEXT_INLINE */
+#endif /* HAVE_MMX_EXTERNAL */
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index a181099..99f60d0 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -43,6 +43,7 @@
 #include "idct_xvid.h"
 #include "dsputil_x86.h"
 
+#if HAVE_SSE2_EXTERNAL // needed for ff_put_pixels_clamped_sse2
 #if HAVE_SSE2_INLINE
 
 /**
@@ -395,7 +396,7 @@ av_extern_inline void ff_idct_xvid_sse2(short *block)
 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
 {
     ff_idct_xvid_sse2(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
+    ff_put_pixels_clamped_sse2(block, dest, line_size);
 }
 
 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
@@ -405,3 +406,4 @@ void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
 }
 
 #endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_SSE2_EXTERNAL */
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index 3ae30f3..ac29023 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -24,7 +24,8 @@
 #include "libavutil/x86/asm.h"
 #include "dsputil_x86.h"
 
-#if HAVE_INLINE_ASM
+// yasm needed for ff_put_pixels_clamped_mmx()
+#if HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
 
 /*
 23170.475006
@@ -1166,4 +1167,4 @@ void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
     ff_add_pixels_clamped_mmx(block, dest, line_size);
 }
 
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
-- 
1.9.1