[FFmpeg-devel] [PATCH 3/6] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function

Thu Jul 19 18:26:48 EEST 2018

On 19 July 2018 at 15:52, James Darnley <jdarnley at obe.tv> wrote:

> Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
> relevant transform.
> C:     84fps
> SSE2: 111fps
> AVX2: 115fps
> ---
>  libavcodec/x86/dirac_dwt_10bit.asm    | 38 +++++++++++++++++++++++++++
>  libavcodec/x86/dirac_dwt_init_10bit.c | 16 +++++++++++
>  2 files changed, 54 insertions(+)
>
> diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
> index c00de32bfe..681de5e1df 100644
> --- a/libavcodec/x86/dirac_dwt_10bit.asm
> +++ b/libavcodec/x86/dirac_dwt_10bit.asm
> @@ -25,6 +25,7 @@ SECTION_RODATA
>
>  cextern pd_1
>  pd_2: times 4 dd 2
> +pd_8: times 4 dd 8
>
>  SECTION .text
>
> @@ -153,7 +154,44 @@ RET
>
>  %endmacro
>
> +%macro DD97_VERTICAL_HI 0
> +
> +cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w
> +    mova m7, [pd_8]
> +    shl wd, 2
> +    add b0q, wq
> +    add b1q, wq
> +    add b2q, wq
> +    add b3q, wq
> +    add b4q, wq
> +    neg wq
> +
> +    ALIGN 16
> +    .loop:
> +        mova m0, [b0q + wq]
> +        mova m1, [b1q + wq]
> +        mova m2, [b2q + wq]
> +        mova m3, [b3q + wq]
> +        mova m4, [b4q + wq]
> +        pslld m5, m1, 3
> +        pslld m6, m3, 3
> +        paddd m5, m1
> +        paddd m6, m3
> +        psubd m5, m0
> +        psubd m6, m4
> +        paddd m5, m7
> +        paddd m5, m6
> +        psrad m5, 4
> +        paddd m2, m5
> +        mova [b2q + wq], m2
> +        add wq, mmsize
> +    jl .loop
> +RET
> +
> +%endmacro
> +
>  INIT_XMM sse2
> +DD97_VERTICAL_HI
>  HAAR_HORIZONTAL
>  HAAR_VERTICAL
>  LEGALL53_VERTICAL_HI
> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
> index 88cf267d14..e7e7534050 100644
> --- a/libavcodec/x86/dirac_dwt_init_10bit.c
> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
> @@ -23,6 +23,8 @@
>  #include "libavutil/x86/cpu.h"
>  #include "libavcodec/dirac_dwt.h"
>
> +void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width);
> +
>  void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
>  void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
>
> @@ -110,6 +112,16 @@ static void legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int
>          b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
>  }
>
> +static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
> +                                  int32_t *b3, int32_t *b4, int width)
> +{
> +    int i = width & ~3;
> +    ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
> +    for(; i<width; i++)
> +        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
> +
> +}
>

This applies to the rest of the patchset as well: what's up with the hybrid
implementations? Couldn't you put the second part (the scalar loop that handles
the leftover width & 3 elements) in the asm code as well?
Now there are 2 function calls instead of 1.