[FFmpeg-devel] [PATCH] some SIMD write-combining for h264
Måns Rullgård
mans
Sun Jan 17 03:59:22 CET 2010
Alexander Strange <astrange at ithinksw.com> writes:
> On Jan 16, 2010, at 12:35 AM, Michael Niedermayer wrote:
>
>> On Fri, Jan 15, 2010 at 11:11:23PM -0500, Alexander Strange wrote:
>>> This adds intreadwrite macros for 64/128-bit memory operations and uses them in h264.
>>>
>>> Unlike the other macros, these assume correct alignment, and the patch only defines the ones there was an immediate use for.
>>> This only has x86 versions, but others should be easy. The 64-bit operations can be done with double copies on most systems, I guess.
>>>
>>> Decoding a 30s file on Core 2 Merom with --cpu=core2 (minimum of 5 runs):
>>> x86-32: 12.72s before, 12.51s after (1.7%)
>>> x86-64: 10.24s before, 10.20s after (.4%)
>>>
>>> Tested on x86-32, x86-64, x86-32 with --arch=c.
>>
>> as your code uses MMX you need to at least mention EMMS/float issue in the
>> dox and probably a emms_c(); call before draw_horiz_band()
>> dunno if these are all
>
> Added in the comment.
>
>> also what sets __MMX__ ? we have our own defines for that
>
> It's a gcc builtin define, set based on ./configure --cpu=x adding
> -march. HAVE_MMX is for the build and not the host cpu family, and
> this is inlined asm, so it can't use it.
Huh? Host... build???
>> and last
>> 1.7% makes 50->48.3 % left to CoreAVC if we assume we are 50% behind
>
> Well, it turned out to work unusually well on that file (I had the
> idea when profiling it in the first place), closer to 1% on
> avchd-test-1.ts posted here a while ago.
>
>
> From c1d931e9846ca862935254726010e2d21737f5c5 Mon Sep 17 00:00:00 2001
> From: Alexander Strange <astrange at ithinksw.com>
> Date: Fri, 15 Jan 2010 01:47:47 -0500
> Subject: [PATCH 1/2] Add macros for write-combining optimization.
>
> ---
> libavutil/intreadwrite.h | 36 ++++++++++++++++-
> libavutil/x86/intreadwrite.h | 92 ++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 127 insertions(+), 1 deletions(-)
> create mode 100644 libavutil/x86/intreadwrite.h
>
> diff --git a/libavutil/intreadwrite.h b/libavutil/intreadwrite.h
> index 933732c..73d6000 100644
> --- a/libavutil/intreadwrite.h
> +++ b/libavutil/intreadwrite.h
> @@ -25,7 +25,7 @@
>
> /*
> * Arch-specific headers can provide any combination of
> - * AV_[RW][BLN](16|24|32|64) macros. Preprocessor symbols must be
> + * AV_([RW]|COPY|SWAP)[BLN](16|24|32|64) macros. Preprocessor symbols must be
That's both ugly and wrong. Write "AV_[RW][BLN](16|24|32|64) or
AV_(COPY|SWAP)(32|64|128)" instead, or whatever sizes make sense for
the second one.
> * defined, even if these are implemented as inline functions.
> */
>
> @@ -37,6 +37,8 @@
> # include "mips/intreadwrite.h"
> #elif ARCH_PPC
> # include "ppc/intreadwrite.h"
> +#elif ARCH_X86
> +# include "x86/intreadwrite.h"
> #endif
>
> /*
> @@ -397,4 +399,36 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
> } while(0)
> #endif
>
> +/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be
> + * naturally aligned. They may be implemented using MMX,
> + * so emms_c() must be called before using any float code
> + * afterwards.
> + */
> +
> +#define AV_COPY(n, d, s) *(uint##n##_t*)(d) = *(const uint##n##_t*)(s)
Please put () around the entire expansion. You never know...
> +#ifndef AV_COPY64
> +# define AV_COPY64(d, s) AV_COPY(64, d, s)
> +#endif
> +
> +#ifndef AV_COPY128
> +# define AV_COPY128(d, s) do {AV_COPY64(d, s); AV_COPY64((char*)(d)+8, (char*)(s)+8);} while(0)
> +#endif
A few line breaks would make that much more readable.
> +#define AV_SWAP(n, a, b) FFSWAP(uint##n##_t, *(uint##n##_t*)(a), *(uint##n##_t*)(b))
> +
> +#ifndef AV_SWAP64
> +# define AV_SWAP64(a, b) AV_SWAP(64, a, b)
> +#endif
> +
> +#define AV_ZERO(n, d) *(uint##n##_t*)(d) = 0
Once again, () around the whole thing.
> +#ifndef AV_ZERO64
> +# define AV_ZERO64(d) AV_ZERO(64, d)
> +#endif
> +
> +#ifndef AV_ZERO128
> +# define AV_ZERO128(d) do {AV_ZERO64(d); AV_ZERO64((char*)(d)+8);} while(0)
> +#endif
Some newlines wouldn't hurt.
> #endif /* AVUTIL_INTREADWRITE_H */
> diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
> new file mode 100644
> index 0000000..bdb1e53
> --- /dev/null
> +++ b/libavutil/x86/intreadwrite.h
> @@ -0,0 +1,92 @@
> +/*
> + * Copyright (c) 2010 Alexander Strange <astrange at ithinksw.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_X86_INTREADWRITE_H
> +#define AVUTIL_X86_INTREADWRITE_H
> +
> +#include <stdint.h>
> +#include "config.h"
> +
> +#if !HAVE_FAST_64BIT && __MMX__
If you insist that __MMX__ is the right thing to test, you must use
defined(__MMX__). It's not a 0/1 thing like ours.
> +#define AV_COPY64 AV_COPY64
> +static inline void AV_COPY64(void *d, const void *s)
> +{
> + __asm__("movq %1, %%mm0 \n\t"
> + "movq %%mm0, %0 \n\t"
> + : "=m"(*(uint64_t*)d)
> + : "m" (*(const uint64_t*)s)
> + : "mm0");
> +}
> +
> +#define AV_SWAP64 AV_SWAP64
> +static inline void AV_SWAP64(void *a, void *b)
> +{
> + __asm__("movq %1, %%mm0 \n\t"
> + "movq %0, %%mm1 \n\t"
> + "movq %%mm0, %0 \n\t"
> + "movq %%mm1, %1 \n\t"
> + : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
> + ::"mm0", "mm1");
> +}
> +
> +#define AV_ZERO64 AV_ZERO64
> +static inline void AV_ZERO64(void *d)
> +{
> + __asm__("pxor %%mm0, %%mm0 \n\t"
> + "movq %%mm0, %0 \n\t"
> + : "=m"(*(uint64_t*)d)
> + :: "mm0");
> +}
> +
> +#endif /* __MMX__ && !HAVE_FAST_64BIT */
> +
> +#if __SSE__
Use #ifdef.
> +#define AV_COPY128 AV_COPY128
> +static inline void AV_COPY128(void *d, const void *s)
> +{
> + typedef struct {uint64_t i[2];} v;
> +
> + __asm__("movaps %1, %%xmm0 \n\t"
> + "movaps %%xmm0, %0 \n\t"
> + : "=m"(*(v*)d)
> + : "m" (*(const v*)s)
> + : "xmm0");
> +}
I would have done something like this instead of the typedef:
struct { uint64_t i[2]; } *vd = d, *vs = s;
Or maybe like this:
uint64_t (*vd)[2] = d;
uint64_t (*vs)[2] = s;
Examining asm is probably the best way of choosing.
> +#endif /* __SSE__ */
> +
> +#if __SSE2__
Use #ifdef.
> +#define AV_ZERO128 AV_ZERO128
> +static inline void AV_ZERO128(void *d)
> +{
> + typedef struct {uint64_t i[2];} v;
> +
> + __asm__("pxor %%xmm0, %%xmm0 \n\t"
> + "movdqa %%xmm0, %0 \n\t"
> + : "=m"(*(v*)d)
> + :: "xmm0");
> +}
Same as above about the typedef.
> +#endif /* __SSE2__ */
> +
> +#endif /* AVUTIL_X86_INTREADWRITE_H */
> --
> 1.6.5.2
>
>
--
M?ns Rullg?rd
mans at mansr.com
More information about the ffmpeg-devel
mailing list