[FFmpeg-devel] [PATCH] Make HAVE_FAST_UNALIGNED allow unaligned memory accesses

Fri Jul 18 10:00:01 CEST 2008

Michael Niedermayer <michaelni at gmx.at> writes:

> On Fri, Jul 18, 2008 at 02:23:29AM +0100, Mans Rullgard wrote:
>> If HAVE_FAST_UNALIGNED is defined, potentially unaligned data is
>> accessed through normal pointers.  Otherwise, compiler-specific
>> code is used to perform unaligned accesses, falling back to
>> byte-wise access if no compiler support is available.
>> ---
>>  libavutil/intreadwrite.h |   34 +++++++++++++++++++++++++++++-----
>>  1 files changed, 29 insertions(+), 5 deletions(-)
>> 
>> diff --git a/libavutil/intreadwrite.h b/libavutil/intreadwrite.h
>> index 72ad5b3..12dec54 100644
>> --- a/libavutil/intreadwrite.h
>> +++ b/libavutil/intreadwrite.h
>> @@ -23,6 +23,8 @@
>>  #include "config.h"
>>  #include "bswap.h"
>>  
>> +#if !defined(HAVE_FAST_UNALIGNED)
>> +
>>  #ifdef __GNUC__
>>  
>>  struct unaligned_64 { uint64_t l; } __attribute__((packed));
>> @@ -47,7 +49,9 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
>>  #define AV_WN32(a, b) *((__unaligned uint32_t*)(a)) = (b)
>>  #define AV_WN64(a, b) *((__unaligned uint64_t*)(a)) = (b)
>>  
>> -#else
>> +#endif
>> +
>> +#else /* !HAVE_FAST_UNALIGNED */
>>  
>>  #define AV_RN16(a) (*((const uint16_t*)(a)))
>>  #define AV_RN32(a) (*((const uint32_t*)(a)))
>> @@ -57,7 +61,7 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
>>  #define AV_WN32(a, b) *((uint32_t*)(a)) = (b)
>>  #define AV_WN64(a, b) *((uint64_t*)(a)) = (b)
>>  
>> -#endif /* !__GNUC__ */
>> +#endif /* !HAVE_FAST_UNALIGNED */
>>  
>>  /* endian macros */
>>  #define AV_RB8(x)     (((const uint8_t*)(x))[0])
>
> ok
>
>> @@ -66,7 +70,8 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
>>  #define AV_RL8(x)     AV_RB8(x)
>>  #define AV_WL8(p, d)  AV_WB8(p, d)
>>  
>> -#ifdef HAVE_FAST_UNALIGNED
>> +#ifdef AV_RN16
>> +
>>  # ifdef WORDS_BIGENDIAN
>>  #  define AV_RB16(x)    AV_RN16(x)
>>  #  define AV_WB16(p, d) AV_WN16(p, d)
>
> Platforms where HAVE_FAST_UNALIGNED is not set would generally have AV_RN16
> defined, and would thus use things like:
> #  define AV_RL16(x)    bswap_16(AV_RN16(x))
> #  define AV_WL16(p, d) AV_WN16(p, bswap_16(d))
> #  define AV_RL32(x)    bswap_32(AV_RN32(x))
> #  define AV_WL32(p, d) AV_WN32(p, bswap_32(d))
>
> These may be slower than the naive bytewise reading macros on such
> platforms.

For the reverse endian version, this may be the case.  It depends on
the CPU and compiler.  Some CPUs (e.g. Alpha) have special
instructions for unaligned load/store that are faster than byte-wise
access (especially on the early Alphas without byte-access), but
slower than aligned operations.  I'll have to do some tests with the
cross-compilers I have available and see what happens.

For native endian, the compiler (including gcc) will either use
CPU-specific instructions or generate the equivalent of our naive
versions.  I would not, however, count on the compiler optimising the
latter using dedicated instructions.

-- 
Måns Rullgård
mans at mansr.com