[FFmpeg-devel] Patch: Inline asm fixes for Intel compiler on Windows

Mon Mar 17 04:29:27 CET 2014

On Mon, Mar 17, 2014 at 12:08:50PM +1100, Matt Oliver wrote:
> >
> > some of the patches seem to not apply cleanly
> 
> 
> Had to do a rebase again. This time patches 2 and 3 needed updating due to
> changes in 1 and some upstream modifications.

as you mention rebases with some touch of "ohh shit i hate this work"
(which i can understand of course ...)

If this patchset is applied it will need someone to maintain it
otherwise it will probably be more broken than working most of the
time.

I mean changes to asm will likely break the Intel special cases
and would need someone regularly testing with affected compilers
and updating things ...

> 
> I created a clean repo and applied all the patches without error so should
> be good now.

>  configure               |    6 ++++++
>  libavcodec/x86/mlpdsp.c |    4 ++--
>  2 files changed, 8 insertions(+), 2 deletions(-)
> fab12562f034a123084e2f5d626ee66d5fbd1271  0002-2-6-Check-for-nonlocal-inline-asm-labels.patch
> From afa726326ece927f4959a3b575b181c7bba42f93 Mon Sep 17 00:00:00 2001
> From: Matt Oliver <protogonoi at gmail.com>
> Date: Sun, 16 Mar 2014 14:53:39 +1100
> Subject: [PATCH 2/6] [2/6] Check for nonlocal inline asm labels.
> 
> ---
>  configure               | 6 ++++++
>  libavcodec/x86/mlpdsp.c | 4 ++--
>  2 files changed, 8 insertions(+), 2 deletions(-)
>  mode change 100755 => 100644 configure
> 
> diff --git a/configure b/configure
> index 34200a3..14475ae
> --- a/configure
> +++ b/configure
> @@ -1689,6 +1689,7 @@ TOOLCHAIN_FEATURES="
>      gnu_windres
>      ibm_asm
>      inline_asm_labels
> +    inline_asm_nonlocal_labels
>      pragma_deprecated
>      rsync_contimeout
>      symver_asm_label

> @@ -1917,6 +1918,9 @@ fast_unaligned_if_any="aarch64 ppc x86"
>  
>  need_memalign="altivec neon sse"
>  
> +inline_asm_labels_deps="inline_asm"

unrelated

> +inline_asm_nonlocal_labels_deps="inline_asm"
> +
>  # system capabilities
>  log2_deps="!libc_msvcrt"
>  

is this actually needed ?

[...]
[...]
> index af4790c..e1878fa 100644
> --- a/libavcodec/x86/idct_sse2_xvid.c
> +++ b/libavcodec/x86/idct_sse2_xvid.c
> @@ -147,7 +147,7 @@ DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
>  
>  #endif
>  
> -#define ROUND(x) "paddd   "MANGLE(x)
> +#define ROUND(x) "paddd   "x
>  
>  #define JZ(reg, to)                         \
>      "testl     "reg","reg"            \n\t" \
> @@ -347,13 +347,13 @@ inline void ff_idct_xvid_sse2(short *block)
>  {
>      __asm__ volatile(
>      "movq     "MANGLE(m127)", %%mm0                              \n\t"
> -    iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),      PUT_EVEN(ROW0))
> -    iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
> -    iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
> +    iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(MANGLE(walkenIdctRounders)),      PUT_EVEN(ROW0))
> +    iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND("1*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW1))
> +    iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND("2*16+"MANGLE(walkenIdctRounders)), PUT_EVEN(ROW2))
>  
>      TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
>      JZ("%%eax", "1f")
> -    iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
> +    iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND("3*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW3))
>  
>      TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
>      TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
> @@ -368,20 +368,20 @@ inline void ff_idct_xvid_sse2(short *block)
>      "2:                                                          \n\t"
>      iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
>      "3:                                                          \n\t"
> -    iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
> +    iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND("4*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW5))
>      JZ("%%edx", "1f")
>      "4:                                                          \n\t"
> -    iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
> +    iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND("5*16+"MANGLE(walkenIdctRounders)), PUT_EVEN(ROW6))
>      JZ("%%esi", "1f")
>      "5:                                                          \n\t"
> -    iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
> +    iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND("5*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW7))
>  #if ARCH_X86_32
>      iLLM_HEAD
>  #endif
>      iLLM_PASS("%0")

applied hunks above

[...]

> diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
> index c7a1bb4..6ea3f6d 100644
> --- a/libswscale/x86/swscale_template.c
> +++ b/libswscale/x86/swscale_template.c
> @@ -171,7 +171,8 @@ static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
>  #define YSCALEYUV2PACKEDX_END                     \
>          :: "r" (&c->redDither),                   \
>              "m" (dummy), "m" (dummy), "m" (dummy),\
> -            "r" (dest), "m" (dstW_reg), "m"(uv_off) \
> +            "r" (dest), "m" (dstW_reg), "m"(uv_off)
> +#define YSCALEYUV2PACKEDX_END2                    \
>          : "%"REG_a, "%"REG_d, "%"REG_S            \
>      );
>  
> @@ -360,12 +361,14 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
>          "packuswb                  %%mm7, %%mm1         \n\t"
>          WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
>          YSCALEYUV2PACKEDX_END
> +        YSCALEYUV2PACKEDX_END2
>      } else {
>          YSCALEYUV2PACKEDX_ACCURATE
>          YSCALEYUV2RGBX
>          "pcmpeqd %%mm7, %%mm7 \n\t"
>          WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
>          YSCALEYUV2PACKEDX_END
> +        YSCALEYUV2PACKEDX_END2
>      }
>  }

this looks a bit more complex than needed
you should be able to just unconditionally add the
NAMED_CONSTRAINTS_ADD() to the existing macro

this would also decrease the likelihood of future changes breaking
this code

[...]
> diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
> index c879102..0e39a7b 100644
> --- a/libswscale/x86/yuv2rgb_template.c
> +++ b/libswscale/x86/yuv2rgb_template.c
> @@ -138,6 +138,8 @@
>          : "+r" (index), "+r" (image)                              \
>          : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
>            "r" (py - 2*index)                                      \
> +
> +#define YUV2RGB_OPERANDS2                                         \
>          : "memory"                                                \
>          );                                                        \
>      }                                                             \
> @@ -146,9 +148,6 @@
>          : "+r" (index), "+r" (image)                              \
>          : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
>            "r" (py - 2*index), "r" (pa - 2*index)                  \
> -        : "memory"                                                \
> -        );                                                        \
> -    }                                                             \
>  
>  #define YUV2RGB_ENDFUNC                          \
>      __asm__ volatile (SFENCE"\n\t"               \
> @@ -207,6 +206,8 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
>  
>      YUV2RGB_ENDLOOP(2)
>      YUV2RGB_OPERANDS
> +    NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,mmx_redmask,pb_e0)
> +    YUV2RGB_OPERANDS2
>      YUV2RGB_ENDFUNC

same issue here

>  }
>  
> @@ -235,6 +236,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
>  
>      YUV2RGB_ENDLOOP(2)
>      YUV2RGB_OPERANDS
> +    NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_07,mmx_redmask,pb_e0)
> +    YUV2RGB_OPERANDS2
>      YUV2RGB_ENDFUNC
>  }
>  #endif /* !COMPILE_TEMPLATE_MMXEXT */
> @@ -260,6 +263,8 @@ DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
>  DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
>  DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
>  DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
> +#undef RGB_PACK24_B_OPERANDS
> +#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ADD(mask1101,mask0110,mask0100,mask0010,mask1001)
>  #undef RGB_PACK24_B
>  #define RGB_PACK24_B\
>      "pshufw    $0xc6,  %%mm2, %%mm1 \n"\
> @@ -283,6 +288,8 @@ DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};

and here

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

In fact, the RIAA has been known to suggest that students drop out
of college or go to community college in order to be able to afford
settlements. -- The RIAA
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140317/a8462827/attachment.asc>