[FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions

Fri Jul 27 17:22:39 EEST 2018

On 27 July 2018 at 12:47, James Darnley <jdarnley at obe.tv> wrote:

> On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
> > On 26 July 2018 at 12:28, James Darnley <jdarnley at obe.tv> wrote:
> > +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
> >> +    DECLARE_REG_TMP 4,5
> >> +
> >> +    mova  m2, [pd_1]
> >> +    mov  r3d, wd
> >> +    and   wd, ~(mmsize/4 - 1)
> >> +    shl   wd, 2
> >> +    add  b0q, wq
> >> +    add  b1q, wq
> >> +    neg   wq
> >> +
> >> +    ALIGN 16
> >> +    .loop_simd:
> >> +        mova m0, [b0q + wq]
> >> +        mova m1, [b1q + wq]
> >> +        paddd m3, m1, m2
> >> +        psrad m3, 1
> >> +        psubd m0, m3
> >> +        paddd m1, m0
> >> +        mova [b0q + wq], m0
> >> +        mova [b1q + wq], m1
> >> +        add wq, mmsize
> >> +    jl .loop_simd
> >> +
> >> +    and  r3d, mmsize/4 - 1
> >> +    jz .end
> >> +    .loop_scalar:
> >> +        mov t0d, [b0q]
> >> +        mov t1d, [b1q]
> >> +        mov r2d, t1d
> >> +        add r2d, 1
> >> +        sar r2d, 1
> >> +        sub t0d, r2d
> >> +        add t1d, t0d
> >> +        mov [b0q], t0d
> >> +        mov [b1q], t1d
> >> +
> >> +        add b0q, 4
> >> +        add b1q, 4
> >> +        sub r3d, 1
> >> +    jg .loop_scalar
> >> +
> >> +    .end:
> >> +RET
> >> +
> >> +%endmacro
> >> +
> >> +%macro HAAR_HORIZONTAL 0
> >>
> > +
> >>
> >
> > Could you remove this newline from every patch? All asm I've written and
> > seen keep them without a newline. It made me think there's something in
> the
> > asm which checked the value of the macro, not that the entire function is
> > macro'd.
>
> What?  I don't understand what you mean.  Do you think I have too many
> blank lines between things?
>

Just remove the newline between the macro definition and the cglobal.

> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
> >> x, b2
> >> +    DECLARE_REG_TMP 2,5
> >> +    %if ARCH_X86_64
> >> +        %define tail r6d
> >> +    %else
> >> +        %define tail dword wm
> >> +    %endif
> >> +
> >> +    mova m2, [pd_1]
> >> +    xor xd, xd
> >> +    shr wd, 1
> >> +    mov tail, wd
> >> +    lea b2q, [bq + 4*wq]
> >> +
> >> +    ALIGN 16
> >> +    .loop_lo:
> >> +        mova m0, [bq  + 4*xq]
> >> +        movu m1, [b2q + 4*xq]
> >> +        paddd m1, m2
> >> +        psrad m1, 1
> >> +        psubd m0, m1
> >> +        mova [temp_q + 4*xq], m0
> >> +        add xd, mmsize/4
> >> +        cmp xd, wd
> >> +    jl .loop_lo
> >> +
> >> +    xor xd, xd
> >> +    and wd, ~(mmsize/4 - 1)
> >> +
> >> +    ALIGN 16
> >> +    .loop_hi:
> >> +        mova m0, [temp_q + 4*xq]
> >> +        movu m1, [b2q    + 4*xq]
> >> +        paddd m1, m0
> >> +        paddd m0, m2
> >> +        paddd m1, m2
> >> +        psrad m0, 1
> >> +        psrad m1, 1
> >> +        SBUTTERFLY dq, 0,1,3
> >> +        %if cpuflag(avx2)
> >> +            SBUTTERFLY dqqq, 0,1,3
> >> +        %endif
> >> +        mova [bq + 8*xq], m0
> >> +        mova [bq + 8*xq + mmsize], m1
> >> +        add xd, mmsize/4
> >> +        cmp xd, wd
> >> +    jl .loop_hi
> >> +
> >> +    and tail, mmsize/4 - 1
> >> +    jz .end
> >> +    .loop_scalar:
> >> +        mov t0d, [temp_q + 4*xq]
> >> +        mov t1d, [b2q    + 4*xq]
> >> +        add t1d, t0d
> >> +        add t0d, 1
> >> +        add t1d, 1
> >> +        sar t0d, 1
> >> +        sar t1d, 1
> >> +        mov [bq + 8*xq], t0d
> >> +        mov [bq + 8*xq + 4], t1d
> >> +        add  xq, 1
> >> +        sub tail, 1
> >> +    jg .loop_scalar
> >> +
> >> +    .end:
> >> +REP_RET
> >> +
> >> +%endmacro
> >> +
> >> +INIT_XMM sse2
> >> +HAAR_HORIZONTAL
> >> +HAAR_VERTICAL
> >> +
> >> +INIT_XMM avx
> >> +HAAR_HORIZONTAL
> >> +HAAR_VERTICAL
> >>
> >
> > You're not using any avx functions in that version, not unless a macro'd
> > instruction inserts one for you. I think you should remove the avx
> version
> > then.
> > Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
> > version you can just make a single macro to do both versions at the same
> > time.
>
> Now that I think about it there will be only one 3-operand instruction
> in the SBUTTERFLY and the vertical function also only has 1.  I will
> remove it.
>
> I can merge the two macros but I will look back at what I've done
> previously.  I think it is usually 1 macro per function.
>
> > +
> >> +INIT_YMM avx2
> >> +HAAR_HORIZONTAL
> >> +HAAR_VERTICAL
> >> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
> >> b/libavcodec/x86/dirac_dwt_init_10bit.c
> >> new file mode 100644
> >> index 0000000000..289862d728
> >> --- /dev/null
> >> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
> >> @@ -0,0 +1,76 @@
> >> +/*
> >> + * x86 optimized discrete wavelet transform
> >> + * Copyright (c) 2018 James Darnley
> >> + *
> >> + * This file is part of FFmpeg.
> >> + *
> >> + * FFmpeg is free software; you can redistribute it and/or
> >> + * modify it under the terms of the GNU Lesser General Public
> >> + * License as published by the Free Software Foundation; either
> >> + * version 2.1 of the License, or (at your option) any later version.
> >> + *
> >> + * FFmpeg is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> + * Lesser General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU Lesser General Public
> >> + * License along with FFmpeg; if not, write to the Free Software
> >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> >> 02110-1301 USA
> >> + */
> >> +
> >> +#include "libavutil/x86/asm.h"
> >> +#include "libavutil/x86/cpu.h"
> >> +#include "libavcodec/dirac_dwt.h"
> >> +
> >> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1,
> int
> >> width_align);
> >> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1,
> int
> >> width_align);
> >> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1,
> int
> >> width_align);
> >> +
> >> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
> >> width_align);
> >> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
> >> width_align);
> >> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
> >> width_align);
> >> +
> >> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum
> dwt_type
> >> type)
> >> +{
> >> +#if HAVE_X86ASM
> >> +    int cpu_flags = av_get_cpu_flags();
> >> +
> >> +    if (EXTERNAL_SSE2(cpu_flags)) {
> >> +        switch (type) {
> >> +            case DWT_DIRAC_HAAR0:
> >> +                d->vertical_compose = (void*)ff_vertical_compose_
> >> haar_10bit_sse2;
> >> +                break;
> >> +            case DWT_DIRAC_HAAR1:
> >> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> >> haar_10bit_sse2;
> >> +                d->vertical_compose = (void*)ff_vertical_compose_
> >> haar_10bit_sse2;
> >> +                break;
> >> +        }
> >> +    }
> >> +
> >> +    if (EXTERNAL_AVX(cpu_flags)) {
> >> +        switch (type) {
> >> +            case DWT_DIRAC_HAAR0:
> >> +                d->vertical_compose = (void*)ff_vertical_compose_
> >> haar_10bit_avx;
> >> +                break;
> >> +            case DWT_DIRAC_HAAR1:
> >> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> >> haar_10bit_avx;
> >> +                d->vertical_compose = (void*)ff_vertical_compose_
> >> haar_10bit_avx;
> >> +                break;
> >> +        }
> >>
> >
> > We keep switches and cases on the same line.
>
> I had a look and it is the most common style even though not everywhere
> holds to it.  Also `indent` does it that way.  So I will change it and
> keep it in mind for the future.
>
> >> +
> >> +%macro HAAR_HORIZONTAL 0
> >> +
> >> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_,
> w,
> >> x, b2
> >> +    DECLARE_REG_TMP 2,5
> >> +    %if ARCH_X86_64
> >> +        %define tail r6d
> >> +    %else
> >> +        %define tail dword wm
> >> +    %endif
> >> +
> >>
> >
> > You can remove this whole bit, the init function only gets called if
> > ARCH_X86_64 is true.
>
> Where did you get that from?  I don't require 64-bit for this.
>

Oh, right, I misread

> -    if (ARCH_X86 && bit_depth == 8)
> +#if ARCH_X86
> +    if (bit_depth == 8)
>          ff_spatial_idwt_init_x86(d, type);
> +    else if (bit_depth == 10)
> +        ff_spatial_idwt_init_10bit_x86(d, type);
> +#endif
> +

as ARCH_X86_64, nevermind.