[FFmpeg-devel] [PATCH v2] avcodec/ppc/h264dsp: Fix unaligned stores

Thu Mar 14 21:23:56 EET 2024

On 3/14/2024 4:13 PM, Sean McGovern wrote:
> Andreas:
> 
> On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt
> <andreas.rheinhardt at outlook.com> wrote:
>>
>> Also fix an effective-type violation.
>> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt at outlook.com>
>> ---
>>   libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>>   1 file changed, 17 insertions(+), 18 deletions(-)
>>
>> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
>> index c02733dda2..f50f2553a2 100644
>> --- a/libavcodec/ppc/h264dsp.c
>> +++ b/libavcodec/ppc/h264dsp.c
>> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>>                                register vec_u8 r0, register vec_u8 r1,
>>                                register vec_u8 r2, register vec_u8 r3) {
>>       DECLARE_ALIGNED(16, unsigned char, result)[64];
>> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
>> -    int int_dst_stride = dst_stride/4;
>> +    uint32_t *src_int = (uint32_t *)result;
>>
>>       vec_st(r0, 0, result);
>>       vec_st(r1, 16, result);
>>       vec_st(r2, 32, result);
>>       vec_st(r3, 48, result);
>>       /* FIXME: there has to be a better way!!!! */
>> -    *dst_int = *src_int;
>> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
>> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
>> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
>> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
>> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
>> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
>> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
>> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
>> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
>> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
>> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
>> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
>> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
>> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
>> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
>> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
>> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
>> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
>> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
>> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
>> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
>> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
>> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
>> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
>> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
>> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
>> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
>> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
>> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
>> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
>> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
>>   }
>>
>>   /** @brief performs a 6x16 transpose of data in src, and stores it to dst
>> --
>> 2.40.1
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel at ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
> 
> First of all, thank you for looking into this.
> 
> Second, do we feel that this change covers the FIXME immediately above
> it that exclaims "there has to be a better way!!!!"?
> If so, we can remove the comment.

Doubt it. Even after Andreas' change it's essentially the same as before
(load four bytes, write four bytes), but without the undefined behavior. The
FIXME probably refers to finding a way to do this with vector intrinsics.

> 
> I did not perform a full FATE run, as it is expensive on my QEMU setup,
> but I can confirm that this fixes the checkasm-h264dsp test under GCC
> UBSan there, as well as on a POWER7 (ppc64) and a POWER9 (ppc64le).