[FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions
James Darnley
jdarnley at obe.tv
Fri Jul 27 14:47:57 EEST 2018
On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
> On 26 July 2018 at 12:28, James Darnley <jdarnley at obe.tv> wrote:
> +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
>> + DECLARE_REG_TMP 4,5
>> +
>> + mova m2, [pd_1]
>> + mov r3d, wd
>> + and wd, ~(mmsize/4 - 1)
>> + shl wd, 2
>> + add b0q, wq
>> + add b1q, wq
>> + neg wq
>> +
>> + ALIGN 16
>> + .loop_simd:
>> + mova m0, [b0q + wq]
>> + mova m1, [b1q + wq]
>> + paddd m3, m1, m2
>> + psrad m3, 1
>> + psubd m0, m3
>> + paddd m1, m0
>> + mova [b0q + wq], m0
>> + mova [b1q + wq], m1
>> + add wq, mmsize
>> + jl .loop_simd
>> +
>> + and r3d, mmsize/4 - 1
>> + jz .end
>> + .loop_scalar:
>> + mov t0d, [b0q]
>> + mov t1d, [b1q]
>> + mov r2d, t1d
>> + add r2d, 1
>> + sar r2d, 1
>> + sub t0d, r2d
>> + add t1d, t0d
>> + mov [b0q], t0d
>> + mov [b1q], t1d
>> +
>> + add b0q, 4
>> + add b1q, 4
>> + sub r3d, 1
>> + jg .loop_scalar
>> +
>> + .end:
>> +RET
>> +
>> +%endmacro
>> +
>> +%macro HAAR_HORIZONTAL 0
>>
> +
>>
>
> Could you remove this newline from every patch? All asm I've written and
> seen keep them without a newline. It made me think there's something in the
> asm which checked the value of the macro, not that the entire function is
> macro'd.
What? I don't understand what you mean. Do you think I have too many
blank lines between things?
> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>> x, b2
>> + DECLARE_REG_TMP 2,5
>> + %if ARCH_X86_64
>> + %define tail r6d
>> + %else
>> + %define tail dword wm
>> + %endif
>> +
>> + mova m2, [pd_1]
>> + xor xd, xd
>> + shr wd, 1
>> + mov tail, wd
>> + lea b2q, [bq + 4*wq]
>> +
>> + ALIGN 16
>> + .loop_lo:
>> + mova m0, [bq + 4*xq]
>> + movu m1, [b2q + 4*xq]
>> + paddd m1, m2
>> + psrad m1, 1
>> + psubd m0, m1
>> + mova [temp_q + 4*xq], m0
>> + add xd, mmsize/4
>> + cmp xd, wd
>> + jl .loop_lo
>> +
>> + xor xd, xd
>> + and wd, ~(mmsize/4 - 1)
>> +
>> + ALIGN 16
>> + .loop_hi:
>> + mova m0, [temp_q + 4*xq]
>> + movu m1, [b2q + 4*xq]
>> + paddd m1, m0
>> + paddd m0, m2
>> + paddd m1, m2
>> + psrad m0, 1
>> + psrad m1, 1
>> + SBUTTERFLY dq, 0,1,3
>> + %if cpuflag(avx2)
>> + SBUTTERFLY dqqq, 0,1,3
>> + %endif
>> + mova [bq + 8*xq], m0
>> + mova [bq + 8*xq + mmsize], m1
>> + add xd, mmsize/4
>> + cmp xd, wd
>> + jl .loop_hi
>> +
>> + and tail, mmsize/4 - 1
>> + jz .end
>> + .loop_scalar:
>> + mov t0d, [temp_q + 4*xq]
>> + mov t1d, [b2q + 4*xq]
>> + add t1d, t0d
>> + add t0d, 1
>> + add t1d, 1
>> + sar t0d, 1
>> + sar t1d, 1
>> + mov [bq + 8*xq], t0d
>> + mov [bq + 8*xq + 4], t1d
>> + add xq, 1
>> + sub tail, 1
>> + jg .loop_scalar
>> +
>> + .end:
>> +REP_RET
>> +
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>> +
>> +INIT_XMM avx
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>>
>
> You're not using any avx functions in that version, not unless a macro'd
> instruction inserts one for you. I think you should remove the avx version
> then.
> Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
> version you can just make a single macro to do both versions at the same
> time.
Now that I think about it there will be only one 3-operand instruction
in the SBUTTERFLY and the vertical function also only has 1. I will
remove it.
I can merge the two macros but I will look back at what I've done
previously. I think it is usually 1 macro per function.
> +
>> +INIT_YMM avx2
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
>> b/libavcodec/x86/dirac_dwt_init_10bit.c
>> new file mode 100644
>> index 0000000000..289862d728
>> --- /dev/null
>> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
>> @@ -0,0 +1,76 @@
>> +/*
>> + * x86 optimized discrete wavelet transform
>> + * Copyright (c) 2018 James Darnley
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/x86/asm.h"
>> +#include "libavutil/x86/cpu.h"
>> +#include "libavcodec/dirac_dwt.h"
>> +
>> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +
>> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +
>> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type
>> type)
>> +{
>> +#if HAVE_X86ASM
>> + int cpu_flags = av_get_cpu_flags();
>> +
>> + if (EXTERNAL_SSE2(cpu_flags)) {
>> + switch (type) {
>> + case DWT_DIRAC_HAAR0:
>> + d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_sse2;
>> + break;
>> + case DWT_DIRAC_HAAR1:
>> + d->horizontal_compose = (void*)ff_horizontal_compose_
>> haar_10bit_sse2;
>> + d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_sse2;
>> + break;
>> + }
>> + }
>> +
>> + if (EXTERNAL_AVX(cpu_flags)) {
>> + switch (type) {
>> + case DWT_DIRAC_HAAR0:
>> + d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_avx;
>> + break;
>> + case DWT_DIRAC_HAAR1:
>> + d->horizontal_compose = (void*)ff_horizontal_compose_
>> haar_10bit_avx;
>> + d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_avx;
>> + break;
>> + }
>>
>
> We keep switches and cases on the same line.
I had a look and it is the most common style even though not everywhere
holds to it. Also `indent` does it that way. So I will change it and
keep it in mind for the future.
>> +
>> +%macro HAAR_HORIZONTAL 0
>> +
>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>> x, b2
>> + DECLARE_REG_TMP 2,5
>> + %if ARCH_X86_64
>> + %define tail r6d
>> + %else
>> + %define tail dword wm
>> + %endif
>> +
>>
>
> You can remove this whole bit, the init function only gets called if
> ARCH_X86_64 is true.
Where did you get that from? I don't require 64-bit for this.
More information about the ffmpeg-devel
mailing list