[FFmpeg-devel] [PATCH] avcodec/cfhd: add x86 SIMD

Paul B Mahol onemda at gmail.com
Sun Aug 16 18:01:40 EEST 2020


On 8/16/20, James Almer <jamrial at gmail.com> wrote:
> On 8/16/2020 11:09 AM, Paul B Mahol wrote:
>> On 8/16/20, Paul B Mahol <onemda at gmail.com> wrote:
>>> Hi,
>>>
>>> patch attached.
>>>
>>> Please help port this to Linux and the 64-bit calling convention.
>>>
>>
>> New patch attached. I could build it on x64; please report any build failures.
>
> [...]
>
>> diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm
>> new file mode 100644
>> index 0000000000..80371e65c9
>> --- /dev/null
>> +++ b/libavcodec/x86/cfhddsp.asm
>> @@ -0,0 +1,626 @@
>> +;******************************************************************************
>> +;* x86-optimized functions for the CFHD decoder
>> +;* Copyright (c) 2020 Paul B Mahol
>> +;*
>> +;* This file is part of FFmpeg.
>> +;*
>> +;* FFmpeg is free software; you can redistribute it and/or
>> +;* modify it under the terms of the GNU Lesser General Public
>> +;* License as published by the Free Software Foundation; either
>> +;* version 2.1 of the License, or (at your option) any later version.
>> +;*
>> +;* FFmpeg is distributed in the hope that it will be useful,
>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +;* Lesser General Public License for more details.
>> +;*
>> +;* You should have received a copy of the GNU Lesser General Public
>> +;* License along with FFmpeg; if not, write to the Free Software
>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> +;******************************************************************************
>> +
>> +%include "libavutil/x86/x86util.asm"
>> +
>> +SECTION_RODATA
>> +
>> +factor_p1_p1: dw 1,  1, 1,  1, 1,  1, 1,  1,
>> +factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,
>> +factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1,
>> +factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4,
>> +factor_p5_p4: dw 5, 4, 5, 4, 5, 4, 5, 4,
>> +pd_4: times 4 dd 4
>> +pw_0: times 8 dw 0
>> +pw_1023: times 8 dw 1023
>> +pw_4095: times 8 dw 4095
>> +
>> +SECTION .text
>> +
>> +%macro CFHD_HORIZ_FILTER 1
>> +%if %1 == 1023
>> +cglobal cfhd_horiz_filter_clip10, 5, 6, 8, output, low, high, width, bpc
>> +    DEFINE_ARGS    output, low, high, width, x, temp
>> +    shl        widthd, 1
>> +%define ostrideq widthq
>> +%define lwidthq  widthq
>> +%define hwidthq  widthq
>> +%elif %1 == 4095
>> +cglobal cfhd_horiz_filter_clip12, 5, 6, 8, output, low, high, width, bpc
>> +    DEFINE_ARGS    output, low, high, width, x, temp
>> +    shl        widthd, 1
>> +%define ostrideq widthq
>> +%define lwidthq  widthq
>> +%define hwidthq  widthq
>> +%else
>> +%if ARCH_X86_64
>> +cglobal cfhd_horiz_filter, 11, 11, 8, output, ostride, low, lwidth, high,
>> hwidth, width, height
>> +DEFINE_ARGS    output, ostride, low, lwidth, high, hwidth, width, height,
>> x, y, temp
>> +    shl  ostrided, 1
>> +    shl   lwidthd, 1
>> +    shl   hwidthd, 1
>> +    shl    widthd, 1
>> +
>> +    mov        yq, heightq
>> +    neg        yq
>> +%else
>> +cglobal cfhd_horiz_filter, 6, 6, 8, 64, output, x, low, y, high, temp,
>> width, height
>> +    shl        xd, 1
>> +    shl        yd, 1
>> +    shl     tempd, 1
>> +
>> +    mov dword [rsp +  0], xq
>> +    mov dword [rsp +  8], yq
>> +    mov dword [rsp + 16], tempq
>
> These are four bytes each on x86_32, not eight. Also, since all arguments
> come from the stack, you can simply store them back into their own slots by doing
>
> mov xmp, xq
> mov ymp, yq
> mov tempmp, tempq
> %define ostrideq xm
> %define lwidthq ym
> %define hwidthq tempm
>
> This saves you the need to reserve stack space.
>
>> +
>> +    mov        yd, r6m
>
> Just load r6/width normally in cglobal, you can use up to seven regs on
> x86_32.

But I need 11 registers, so I use the stack.

>
>> +    shl        yd, 1
>> +    mov dword [rsp + 24], yq
>> +
>> +    mov        yd, r7m
>> +    neg        yq
>> +
>> +%define ostrideq [rsp +  0]
>> +%define lwidthq  [rsp +  8]
>> +%define hwidthq  [rsp + 16]
>> +%define widthq   [rsp + 24]
>
> If you're going to define widthq here like this, then you shouldn't
> define width in cglobal. But as I said above, you have a reg free to
> store it.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".


More information about the ffmpeg-devel mailing list