[FFmpeg-devel] [PATCH] avcodec/cfhd: add x86 SIMD
Anton Khirnov
anton at khirnov.net
Mon Aug 17 16:30:38 EEST 2020
Quoting Paul B Mahol (2020-08-14 14:24:25)
>From 874fd9e604a6dcd55cca77c7256a633e5739da77 Mon Sep 17 00:00:00 2001
>From: Paul B Mahol <onemda at gmail.com>
>Date: Sun, 9 Aug 2020 17:47:34 +0200
>Subject: [PATCH] avcodec/cfhd: add x86 SIMD
>
>---
> libavcodec/Makefile | 2 +-
> libavcodec/cfhd.c | 271 ++++++++--------------------------
> libavcodec/cfhd.h | 3 +
> libavcodec/cfhddsp.c | 110 ++++++++++++++
> libavcodec/cfhddsp.h | 42 ++++++
> libavcodec/x86/Makefile | 2 +
> libavcodec/x86/cfhddsp.asm | 227 ++++++++++++++++++++++++++++
> libavcodec/x86/cfhddsp_init.c | 44 ++++++
> 8 files changed, 488 insertions(+), 213 deletions(-)
> create mode 100644 libavcodec/cfhddsp.c
> create mode 100644 libavcodec/cfhddsp.h
> create mode 100644 libavcodec/x86/cfhddsp.asm
> create mode 100644 libavcodec/x86/cfhddsp_init.c
It would be way easier to review if the rearrangement of the C code was
separate from adding the x86 SIMD.
Also, checkasm tests would make it easier to test and benchmark.
>
>--- /dev/null
>+++ b/libavcodec/x86/cfhddsp.asm
>@@ -0,0 +1,227 @@
>+;******************************************************************************
>+;* x86-optimized functions for the CFHD decoder
>+;* Copyright (c) 2020 Paul B Mahol
>+;*
>+;* This file is part of FFmpeg.
>+;*
>+;* FFmpeg is free software; you can redistribute it and/or
>+;* modify it under the terms of the GNU Lesser General Public
>+;* License as published by the Free Software Foundation; either
>+;* version 2.1 of the License, or (at your option) any later version.
>+;*
>+;* FFmpeg is distributed in the hope that it will be useful,
>+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+;* Lesser General Public License for more details.
>+;*
>+;* You should have received a copy of the GNU Lesser General Public
>+;* License along with FFmpeg; if not, write to the Free Software
>+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+;******************************************************************************
>+
>+%include "libavutil/x86/x86util.asm"
>+
>+SECTION_RODATA
>+
>+factor_p1_p1: dw 1, 1, 1, 1, 1, 1, 1, 1,
>+factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,
>+factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1,
>+pd_4: times 4 dd 4
>+pw_0: times 8 dw 0
>+pw_1023: times 8 dw 1023
>+pw_4095: times 8 dw 4095
>+
>+SECTION .text
>+
>+%macro CFHD_HORIZ_FILTER 1
>+%if %1 == 1023
>+cglobal cfhd_horiz_filter_clip10, 5, 6, 8, output, low, high, width, bpc
>+ DEFINE_ARGS output, low, high, width, x, temp
>+%elif %1 == 4095
>+cglobal cfhd_horiz_filter_clip12, 5, 6, 8, output, low, high, width, bpc
>+ DEFINE_ARGS output, low, high, width, x, temp
>+%else
>+cglobal cfhd_horiz_filter, 4, 6, 8, output, low, high, width, x
>+ DEFINE_ARGS output, low, high, width, x, temp
>+%endif
I don't think you need DEFINE_ARGS, temp can be added directly to
cglobal invocation.
Also, bpc seems unused.
Then you could call the macro with the bit depth as a parameter and
%define pixel_max ((1 << bpc) - 1)
for extra readability
>+ shl widthd, 1
>+
>+ movsx xq, word [lowq]
>+ imul xq, 11
>+
>+ movsx tempq, word [lowq + 2]
>+ imul tempq, -4
>+ add tempq, xq
>+
>+ movsx xq, word [lowq + 4]
>+ add tempq, xq
>+ add tempq, 4
>+ sar tempq, 3
>+
>+ movsx xq, word [highq]
>+ add tempq, xq
>+ sar tempq, 1
>+
>+%if %1
>+ movd xm0, tempd
>+ CLIPW m0, [pw_0], [pw_%1]
>+ pextrw [outputq], xm0, 0
>+%else
>+ mov word [outputq], tempw
>+%endif
>+
>+ movsx xq, word [lowq]
>+ imul xq, 5
>+
>+ movsx tempq, word [lowq + 2]
>+ imul tempq, 4
>+ add tempq, xq
>+
>+ movsx xq, word [lowq + 4]
>+ sub tempq, xq
>+ add tempq, 4
>+ sar tempq, 3
>+
>+ movsx xq, word [highq]
>+ sub tempq, xq
>+ sar tempq, 1
>+
>+%if %1
>+ movd xm0, tempd
>+ CLIPW m0, [pw_0], [pw_%1]
>+ pextrw [outputq + 2], xm0, 0
>+%else
>+ mov word [outputq + 2], tempw
>+%endif
>+
>+ mov xq, 0
>+
>+.loop:
>+ movu m4, [lowq + xq]
>+ movu m1, [lowq + xq + 4]
>+
>+ mova m5, m4
>+ punpcklwd m4, m1
>+ punpckhwd m5, m1
>+
>+ mova m6, m4
>+ mova m7, m5
>+
>+ pmaddwd m4, [factor_p1_n1]
>+ pmaddwd m5, [factor_p1_n1]
>+ pmaddwd m6, [factor_n1_p1]
>+ pmaddwd m7, [factor_n1_p1]
>+
>+ paddd m4, [pd_4]
>+ paddd m5, [pd_4]
>+ paddd m6, [pd_4]
>+ paddd m7, [pd_4]
I'd drop 32bit compatibility and load those constants into registers.
>+
>+ psrad m4, 3
>+ psrad m5, 3
>+ psrad m6, 3
>+ psrad m7, 3
>+
>+ movu m2, [lowq + xq + 2]
>+ movu m3, [highq + xq + 2]
>+
>+ mova m0, m2
>+ punpcklwd m2, m3
>+ punpckhwd m0, m3
>+
>+ mova m1, m2
>+ mova m3, m0
>+
>+ pmaddwd m2, [factor_p1_p1]
>+ pmaddwd m0, [factor_p1_p1]
>+ pmaddwd m1, [factor_p1_n1]
>+ pmaddwd m3, [factor_p1_n1]
>+
>+ paddd m2, m4
>+ paddd m0, m5
>+ paddd m1, m6
>+ paddd m3, m7
>+
>+ psrad m2, 1
>+ psrad m0, 1
>+ psrad m1, 1
>+ psrad m3, 1
>+
>+ packssdw m2, m0
>+ packssdw m1, m3
>+
>+ mova m0, m2
>+ punpcklwd m2, m1
>+ punpckhwd m0, m1
>+
>+%if %1
>+ CLIPW m2, [pw_0], [pw_%1]
>+ CLIPW m0, [pw_0], [pw_%1]
>+%endif
>+
>+ movu [outputq + xq * 2 + 4], m2
>+ movu [outputq + xq * 2 + mmsize + 4], m0
Is it guaranteed that this does not write out of bounds? Seems you
assume output to be at least 18 pixels, but I don't see anything in
cfhd.c enforcing that.
--
Anton Khirnov
More information about the ffmpeg-devel
mailing list