[FFmpeg-devel] [PATCH] avcodec/x86/audiodsp: add scalarproduct avx2

Mon Sep 12 22:00:15 EEST 2022

On 9/12/2022 3:39 PM, James Almer wrote:
>> From 55eb5a18b4bf029f52f9d9108a750c576ba780ee Mon Sep 17 00:00:00 2001
>> From: Paul B Mahol <onemda at gmail.com>
>> Date: Mon, 12 Sep 2022 18:53:31 +0200
>> Subject: [PATCH] avcodec/x86/audiodsp: add scalarproduct avx2
>>
>> Signed-off-by: Paul B Mahol <onemda at gmail.com>
>> ---
>>  libavcodec/x86/audiodsp.asm    | 24 ++++++++++++++++++++++++
>>  libavcodec/x86/audiodsp_init.c |  6 ++++++
>>  2 files changed, 30 insertions(+)
>>
>> diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
>> index b604b0443c..55051f6aa7 100644
>> --- a/libavcodec/x86/audiodsp.asm
>> +++ b/libavcodec/x86/audiodsp.asm
>> @@ -44,6 +44,30 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
>>      movd   eax, m2
>>      RET
>>
>> +INIT_YMM avx2
>> +cglobal scalarproduct_int16, 3,4,3, v1, v2, order, offset
>> +    xor offsetq, offsetq
>> +    add orderd, orderd
>> +    pxor    m1, m1
>> +    cmp orderd, 32
> 
> This parameter needs to be multiple of 16. What will happen below if 
> it's for example 48? Are both buffers padded enough to handle 16 bytes 
> of overread?

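Never mind, these are int16_t* buffers: order counts 16-bit elements, so a
multiple of 16 is a whole number of 32-byte ymm loads and there is no overread.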

You can simplify this as:

;-----------------------------------------------------------------------------
; int32_t ff_scalarproduct_int16_avx2(const int16_t *v1, const int16_t *v2,
;                                     int order)
; Sum of v1[i] * v2[i] over `order` 16-bit elements, mmsize bytes per
; iteration.
; The loop body executes at least once and always consumes a full mmsize
; block, so order is expected to be non-zero and a multiple of 16 elements.
; NOTE(review): cglobal requests 3 vector registers but only m0 and m1 are
; used below - confirm whether the count can be lowered.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     ; order *= 2: element count -> byte count (int16_t elements)
     add orderd, orderd
     ; point v1/v2 at the end of their buffers ...
     add v1q, orderq
     add v2q, orderq
     ; ... and index them with a negative byte offset that counts up to 0,
     ; so the loop needs no separate cmp
     neg orderq
     ; m1 = accumulator of 32-bit partial sums, start at zero
     pxor    m1, m1
.loop:
     ; unaligned load of mmsize bytes from v1
     movu    m0, [v1q + orderq]
     ; multiply signed 16-bit pairs with v2, add adjacent products into
     ; 32-bit lanes
     pmaddwd m0, [v2q + orderq]
     paddd   m1, m0
     ; advance the offset; the flags set here drive the branch below
     add     orderq, mmsize
     ; keep looping while the offset is still negative
     jl .loop
     ; horizontal sum of the dwords in m1 (m0 is used as scratch by the macro)
     HADDD   m1, m0
     ; return the low dword of the reduced sum
     movd   eax, xm1
     RET

> 
>> +    jl   .l16
>> +.loop:
>> +    movu    m0, [v1q + offsetq]
>> +    pmaddwd m0, [v2q + offsetq]
>> +    paddd   m1, m0
>> +    add     offsetq, mmsize
>> +    cmp     offsetq, orderq
> 
> You should use the neg trick from the sse2 version so you can remove the 
> cmp from this loop.
> 
>> +    jl .loop
>> +    HADDD   m1, m0
>> +    movd   eax, xm1
>> +    RET
>> +.l16:
>> +    movu    xm0, [v1q + offsetq]
>> +    pmaddwd xm0, [v2q + offsetq]
>> +    paddd   xm1, xm0
>> +    HADDD  xm1, xm0
>> +    movd   eax, xm1
>> +    RET
>>
>>  ;----------------------------------------------------------------------------- 
>>
>>  ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t 
>> min,
>> diff --git a/libavcodec/x86/audiodsp_init.c 
>> b/libavcodec/x86/audiodsp_init.c
>> index aa5e43e570..77d5948442 100644
>> --- a/libavcodec/x86/audiodsp_init.c
>> +++ b/libavcodec/x86/audiodsp_init.c
>> @@ -24,6 +24,9 @@
>>  #include "libavutil/x86/cpu.h"
>>  #include "libavcodec/audiodsp.h"
>>
>> +int32_t ff_scalarproduct_int16_avx2(const int16_t *v1, const int16_t 
>> *v2,
>> +                                    int order);
>> +
>>  int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t 
>> *v2,
>>                                      int order);
>>
>> @@ -53,4 +56,7 @@ av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
>>
>>      if (EXTERNAL_SSE4(cpu_flags))
>>          c->vector_clip_int32 = ff_vector_clip_int32_sse4;
>> +
>> +    if (EXTERNAL_AVX2(cpu_flags))
>> +        c->scalarproduct_int16 = ff_scalarproduct_int16_avx2;
>>  }
>> -- 
>> 2.37.2
>>