[FFmpeg-devel] Patch: Inline asm fixes for Intel compiler on Windows
Michael Niedermayer
michaelni at gmx.at
Mon Mar 17 04:29:27 CET 2014
On Mon, Mar 17, 2014 at 12:08:50PM +1100, Matt Oliver wrote:
> >
> > some of the patches seem to not apply cleanly
>
>
> Had to do a rebase again. This time patch 2 and 3 needed updating due to a
> changes in 1 and some upstream modifications.
as you mention rebases with some touch of "ohh shit i hate this work"
(which i can understand of course ...)
If this patchset is applied it will need someone to maintain it
otherwise it will probably be more broken than working most of the
time.
I mean changes to asm will likely break the intel specia cases
and would need someone regularly testing with affected compilers
and updating things ...
>
> I created a clean repo and applied all the patches without error so should
> be good now.
> configure | 6 ++++++
> libavcodec/x86/mlpdsp.c | 4 ++--
> 2 files changed, 8 insertions(+), 2 deletions(-)
> fab12562f034a123084e2f5d626ee66d5fbd1271 0002-2-6-Check-for-nonlocal-inline-asm-labels.patch
> From afa726326ece927f4959a3b575b181c7bba42f93 Mon Sep 17 00:00:00 2001
> From: Matt Oliver <protogonoi at gmail.com>
> Date: Sun, 16 Mar 2014 14:53:39 +1100
> Subject: [PATCH 2/6] [2/6] Check for nonlocal inline asm labels.
>
> ---
> configure | 6 ++++++
> libavcodec/x86/mlpdsp.c | 4 ++--
> 2 files changed, 8 insertions(+), 2 deletions(-)
> mode change 100755 => 100644 configure
>
> diff --git a/configure b/configure
> index 34200a3..14475ae
> --- a/configure
> +++ b/configure
> @@ -1689,6 +1689,7 @@ TOOLCHAIN_FEATURES="
> gnu_windres
> ibm_asm
> inline_asm_labels
> + inline_asm_nonlocal_labels
> pragma_deprecated
> rsync_contimeout
> symver_asm_label
> @@ -1917,6 +1918,9 @@ fast_unaligned_if_any="aarch64 ppc x86"
>
> need_memalign="altivec neon sse"
>
> +inline_asm_labels_deps="inline_asm"
unrelated
> +inline_asm_nonlocal_labels_deps="inline_asm"
> +
> # system capabilities
> log2_deps="!libc_msvcrt"
>
is this actually needed ?
[...]
[...]
> index af4790c..e1878fa 100644
> --- a/libavcodec/x86/idct_sse2_xvid.c
> +++ b/libavcodec/x86/idct_sse2_xvid.c
> @@ -147,7 +147,7 @@ DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
>
> #endif
>
> -#define ROUND(x) "paddd "MANGLE(x)
> +#define ROUND(x) "paddd "x
>
> #define JZ(reg, to) \
> "testl "reg","reg" \n\t" \
> @@ -347,13 +347,13 @@ inline void ff_idct_xvid_sse2(short *block)
> {
> __asm__ volatile(
> "movq "MANGLE(m127)", %%mm0 \n\t"
> - iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
> - iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
> - iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
> + iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(MANGLE(walkenIdctRounders)), PUT_EVEN(ROW0))
> + iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND("1*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW1))
> + iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND("2*16+"MANGLE(walkenIdctRounders)), PUT_EVEN(ROW2))
>
> TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
> JZ("%%eax", "1f")
> - iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
> + iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND("3*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW3))
>
> TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
> TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
> @@ -368,20 +368,20 @@ inline void ff_idct_xvid_sse2(short *block)
> "2: \n\t"
> iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
> "3: \n\t"
> - iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
> + iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND("4*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW5))
> JZ("%%edx", "1f")
> "4: \n\t"
> - iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
> + iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND("5*16+"MANGLE(walkenIdctRounders)), PUT_EVEN(ROW6))
> JZ("%%esi", "1f")
> "5: \n\t"
> - iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
> + iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND("5*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW7))
> #if ARCH_X86_32
> iLLM_HEAD
> #endif
> iLLM_PASS("%0")
applied hunks above
[...]
> diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
> index c7a1bb4..6ea3f6d 100644
> --- a/libswscale/x86/swscale_template.c
> +++ b/libswscale/x86/swscale_template.c
> @@ -171,7 +171,8 @@ static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
> #define YSCALEYUV2PACKEDX_END \
> :: "r" (&c->redDither), \
> "m" (dummy), "m" (dummy), "m" (dummy),\
> - "r" (dest), "m" (dstW_reg), "m"(uv_off) \
> + "r" (dest), "m" (dstW_reg), "m"(uv_off)
> +#define YSCALEYUV2PACKEDX_END2 \
> : "%"REG_a, "%"REG_d, "%"REG_S \
> );
>
> @@ -360,12 +361,14 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
> "packuswb %%mm7, %%mm1 \n\t"
> WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
> YSCALEYUV2PACKEDX_END
> + YSCALEYUV2PACKEDX_END2
> } else {
> YSCALEYUV2PACKEDX_ACCURATE
> YSCALEYUV2RGBX
> "pcmpeqd %%mm7, %%mm7 \n\t"
> WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
> YSCALEYUV2PACKEDX_END
> + YSCALEYUV2PACKEDX_END2
> }
> }
this looks a bit more complex than needed
you should be able to just unconditionally add the
NAMED_CONSTRAINTS_ADD() to the existing macro
this would also decrease the likelhood of future changes breaking
this code
[...]
> diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
> index c879102..0e39a7b 100644
> --- a/libswscale/x86/yuv2rgb_template.c
> +++ b/libswscale/x86/yuv2rgb_template.c
> @@ -138,6 +138,8 @@
> : "+r" (index), "+r" (image) \
> : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
> "r" (py - 2*index) \
> +
> +#define YUV2RGB_OPERANDS2 \
> : "memory" \
> ); \
> } \
> @@ -146,9 +148,6 @@
> : "+r" (index), "+r" (image) \
> : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
> "r" (py - 2*index), "r" (pa - 2*index) \
> - : "memory" \
> - ); \
> - } \
>
> #define YUV2RGB_ENDFUNC \
> __asm__ volatile (SFENCE"\n\t" \
> @@ -207,6 +206,8 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
>
> YUV2RGB_ENDLOOP(2)
> YUV2RGB_OPERANDS
> + NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,mmx_redmask,pb_e0)
> + YUV2RGB_OPERANDS2
> YUV2RGB_ENDFUNC
same issue here
> }
>
> @@ -235,6 +236,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
>
> YUV2RGB_ENDLOOP(2)
> YUV2RGB_OPERANDS
> + NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_07,mmx_redmask,pb_e0)
> + YUV2RGB_OPERANDS2
> YUV2RGB_ENDFUNC
> }
> #endif /* !COMPILE_TEMPLATE_MMXEXT */
> @@ -260,6 +263,8 @@ DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
> DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
> DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
> DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
> +#undef RGB_PACK24_B_OPERANDS
> +#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ADD(mask1101,mask0110,mask0100,mask0010,mask1001)
> #undef RGB_PACK24_B
> #define RGB_PACK24_B\
> "pshufw $0xc6, %%mm2, %%mm1 \n"\
> @@ -283,6 +288,8 @@ DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
and here
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
In fact, the RIAA has been known to suggest that students drop out
of college or go to community college in order to be able to afford
settlements. -- The RIAA
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140317/a8462827/attachment.asc>
More information about the ffmpeg-devel
mailing list