[FFmpeg-devel] [PATCH] some SIMD write-combining for h264

Sun Jan 17 03:59:22 CET 2010

Alexander Strange <astrange at ithinksw.com> writes:

> On Jan 16, 2010, at 12:35 AM, Michael Niedermayer wrote:
>
>> On Fri, Jan 15, 2010 at 11:11:23PM -0500, Alexander Strange wrote:
>>> This adds intreadwrite macros for 64/128-bit memory operations and uses them in h264.
>>> 
>>> Unlike the other macros, these assume correct alignment, and the patch only defines the ones there was an immediate use for.
>>> This only has x86 versions, but others should be easy. The 64-bit operations can be done with double copies on most systems, I guess.
>>> 
>>> Decoding a 30s file on Core 2 Merom with --cpu=core2 (minimum of 5 runs):
>>> x86-32: 12.72s before, 12.51s after (1.7%)
>>> x86-64: 10.24s before, 10.20s after (.4%)
>>> 
>>> Tested on x86-32, x86-64, x86-32 with --arch=c.
>> 
>> as your code uses MMX you need to at least mention EMMS/float issue in the
>> dox and probably a  emms_c(); call before draw_horiz_band()
>> dunno if these are all
>
> Added in the comment.
>
>> also what sets __MMX__ ? we have our own defines for that
>
> It's a gcc builtin define, set based on ./configure --cpu=x adding
> -march.  HAVE_MMX is for the build and not the host cpu family, and
> this is inlined asm, so it can't use it.

Huh?  Host... build???

>> and last
>> 1.7% makes 50->48.3 % left to CoreAVC if we assume we are 50% behind
>
> Well, it turned out to work unusually well on that file (I had the
> idea when profiling it in the first place), closer to 1% on
> avchd-test-1.ts posted here a while ago.
>
>
> From c1d931e9846ca862935254726010e2d21737f5c5 Mon Sep 17 00:00:00 2001
> From: Alexander Strange <astrange at ithinksw.com>
> Date: Fri, 15 Jan 2010 01:47:47 -0500
> Subject: [PATCH 1/2] Add macros for write-combining optimization.
>
> ---
>  libavutil/intreadwrite.h     |   36 ++++++++++++++++-
>  libavutil/x86/intreadwrite.h |   92 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 127 insertions(+), 1 deletions(-)
>  create mode 100644 libavutil/x86/intreadwrite.h
>
> diff --git a/libavutil/intreadwrite.h b/libavutil/intreadwrite.h
> index 933732c..73d6000 100644
> --- a/libavutil/intreadwrite.h
> +++ b/libavutil/intreadwrite.h
> @@ -25,7 +25,7 @@
>  
>  /*
>   * Arch-specific headers can provide any combination of
> - * AV_[RW][BLN](16|24|32|64) macros.  Preprocessor symbols must be
> + * AV_([RW]|COPY|SWAP)[BLN](16|24|32|64) macros.  Preprocessor symbols must be

That's both ugly and wrong.  Write  "AV_[RW][BLN](16|24|32|64) or
AV_(COPY|SWAP)(32|64|128)" instead, or whatever sizes make sense for
the second one.

>   * defined, even if these are implemented as inline functions.
>   */
>  
> @@ -37,6 +37,8 @@
>  #   include "mips/intreadwrite.h"
>  #elif ARCH_PPC
>  #   include "ppc/intreadwrite.h"
> +#elif ARCH_X86
> +#   include "x86/intreadwrite.h"
>  #endif
>  
>  /*
> @@ -397,4 +399,36 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
>      } while(0)
>  #endif
>  
> +/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be
> + * naturally aligned. They may be implemented using MMX,
> + * so emms_c() must be called before using any float code
> + * afterwards.
> + */
> +
> +#define AV_COPY(n, d, s) *(uint##n##_t*)(d) = *(const uint##n##_t*)(s)

Please put () around the entire expansion.  You never know...

> +#ifndef AV_COPY64
> +#   define AV_COPY64(d, s) AV_COPY(64, d, s)
> +#endif
> +
> +#ifndef AV_COPY128
> +#   define AV_COPY128(d, s) do {AV_COPY64(d, s); AV_COPY64((char*)(d)+8, (char*)(s)+8);} while(0)
> +#endif

A few line breaks would make that much more readable.

> +#define AV_SWAP(n, a, b) FFSWAP(uint##n##_t, *(uint##n##_t*)(a), *(uint##n##_t*)(b))
> +
> +#ifndef AV_SWAP64
> +#   define AV_SWAP64(a, b) AV_SWAP(64, a, b)
> +#endif
> +
> +#define AV_ZERO(n, d) *(uint##n##_t*)(d) = 0

Once again, () around the whole thing.

> +#ifndef AV_ZERO64
> +#   define AV_ZERO64(d) AV_ZERO(64, d)
> +#endif
> +
> +#ifndef AV_ZERO128
> +#   define AV_ZERO128(d) do {AV_ZERO64(d); AV_ZERO64((char*)(d)+8);} while(0)
> +#endif

Some newlines wouldn't hurt.

>  #endif /* AVUTIL_INTREADWRITE_H */
> diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
> new file mode 100644
> index 0000000..bdb1e53
> --- /dev/null
> +++ b/libavutil/x86/intreadwrite.h
> @@ -0,0 +1,92 @@
> +/*
> + * Copyright (c) 2010 Alexander Strange <astrange at ithinksw.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_X86_INTREADWRITE_H
> +#define AVUTIL_X86_INTREADWRITE_H
> +
> +#include <stdint.h>
> +#include "config.h"
> +
> +#if !HAVE_FAST_64BIT && __MMX__

If you insist that __MMX__ is the right thing to test, you must use
defined(__MMX__).  It's not a 0/1 thing like ours.

> +#define AV_COPY64 AV_COPY64
> +static inline void AV_COPY64(void *d, const void *s)
> +{
> +    __asm__("movq   %1, %%mm0  \n\t"
> +            "movq   %%mm0, %0  \n\t"
> +            : "=m"(*(uint64_t*)d)
> +            : "m" (*(const uint64_t*)s)
> +            : "mm0");
> +}
> +
> +#define AV_SWAP64 AV_SWAP64
> +static inline void AV_SWAP64(void *a, void *b)
> +{
> +    __asm__("movq   %1, %%mm0  \n\t"
> +            "movq   %0, %%mm1  \n\t"
> +            "movq   %%mm0, %0  \n\t"
> +            "movq   %%mm1, %1  \n\t"
> +            : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
> +            ::"mm0", "mm1");
> +}
> +
> +#define AV_ZERO64 AV_ZERO64
> +static inline void AV_ZERO64(void *d)
> +{
> +    __asm__("pxor %%mm0, %%mm0  \n\t"
> +            "movq %%mm0, %0     \n\t"
> +            : "=m"(*(uint64_t*)d)
> +            :: "mm0");
> +}
> +
> +#endif /* __MMX__ && !HAVE_FAST_64BIT */
> +
> +#if __SSE__

Use #ifdef.

> +#define AV_COPY128 AV_COPY128
> +static inline void AV_COPY128(void *d, const void *s)
> +{
> +    typedef struct {uint64_t i[2];} v;
> +
> +    __asm__("movaps   %1, %%xmm0  \n\t"
> +            "movaps   %%xmm0, %0  \n\t"
> +            : "=m"(*(v*)d)
> +            : "m" (*(const v*)s)
> +            : "xmm0");
> +}

I would have done something like this instead of the typedef:

    struct { uint64_t i[2]; } *vd = d, *vs = s;

Or maybe like this:

    uint64_t (*vd)[2] = d;
    uint64_t (*vs)[2] = s;

Examining asm is probably the best way of choosing.

> +#endif /* __SSE__ */
> +
> +#if __SSE2__

Use #ifdef.

> +#define AV_ZERO128 AV_ZERO128
> +static inline void AV_ZERO128(void *d)
> +{
> +    typedef struct {uint64_t i[2];} v;
> +
> +    __asm__("pxor %%xmm0, %%xmm0  \n\t"
> +            "movdqa   %%xmm0, %0  \n\t"
> +            : "=m"(*(v*)d)
> +            :: "xmm0");
> +}

Same as above about the typedef.

> +#endif /* __SSE2__ */
> +
> +#endif /* AVUTIL_X86_INTREADWRITE_H */
> -- 
> 1.6.5.2
>
>

-- 
M?ns Rullg?rd
mans at mansr.com