[FFmpeg-devel] [PATCH] avcodec/x86: add cfhdenc SIMD
Paul B Mahol
onemda at gmail.com
Wed Feb 24 18:41:45 EET 2021
Signed-off-by: Paul B Mahol <onemda at gmail.com>
---
libavcodec/cfhdencdsp.c | 3 +
libavcodec/x86/Makefile | 2 +
libavcodec/x86/cfhdencdsp.asm | 429 +++++++++++++++++++++++++++++++
libavcodec/x86/cfhdencdsp_init.c | 48 ++++
4 files changed, 482 insertions(+)
create mode 100644 libavcodec/x86/cfhdencdsp.asm
create mode 100644 libavcodec/x86/cfhdencdsp_init.c
diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c
index 0becb76d1d..b979e9e09a 100644
--- a/libavcodec/cfhdencdsp.c
+++ b/libavcodec/cfhdencdsp.c
@@ -73,4 +73,7 @@ av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c)
{
c->horiz_filter = horiz_filter;
c->vert_filter = vert_filter;
+
+ /* On x86 builds, give the arch-specific init a chance to replace the
+  * filter pointers assigned above. */
+ if (ARCH_X86)
+ ff_cfhdencdsp_init_x86(c);
}
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 884dc0c759..6361161180 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -51,6 +51,7 @@ OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp_init.o
+OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp_init.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
@@ -154,6 +155,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
X86ASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_CFHD_DECODER) += x86/cfhddsp.o
+X86ASM-OBJS-$(CONFIG_CFHD_ENCODER) += x86/cfhdencdsp.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
new file mode 100644
index 0000000000..1753b7829a
--- /dev/null
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -0,0 +1,429 @@
+;******************************************************************************
+;* x86-optimized functions for the CFHD encoder
+;* Copyright (c) 2021 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; Word-pair multipliers for pmaddwd: every 32-bit result lane is
+; w[2k] * c[2k] + w[2k+1] * c[2k+1], so each constant below encodes the two
+; filter taps applied to a pair of adjacent (or interleaved) samples.
+; The mirrored variants (e.g. pw_n1_p1 vs pw_p1_n1) are used in the vertical
+; filter on registers whose two source rows were interleaved in swapped order.
+pw_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1
+pw_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1
+pw_p5_n11: dw 5, -11, 5, -11, 5, -11, 5, -11
+pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11
+pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
+pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
+; Rounding term added to the 32-bit sums before the arithmetic >> 3.
+pd_4: times 4 dd 4
+; Uniform word multipliers (same tap for both samples of a pair).
+pw_p4: times 8 dw 4
+pw_n4: times 8 dw -4
+pw_p1: times 8 dw 1
+pw_n1: times 8 dw -1
+
+SECTION .text
+
+%if ARCH_X86_64
+;------------------------------------------------------------------------------
+; void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+;                                   ptrdiff_t in_stride, ptrdiff_t low_stride,
+;                                   ptrdiff_t high_stride,
+;                                   int width, int height)
+;
+; Horizontal analysis filter. For each of `height` rows, `width` input samples
+; produce width / 2 low-pass and width / 2 high-pass coefficients
+; (sat16 = signed saturation to 16 bits, i = even input index, w = width):
+;
+;   low[i/2]    = sat16(in[i] + in[i+1])
+;   high[0]     = sat16((5*in[0] - 11*in[1] + 4*in[2] + 4*in[3]
+;                        - in[4] - in[5] + 4) >> 3)
+;   high[i/2]   = sat16(((-in[i-2] - in[i-1] + in[i+2] + in[i+3] + 4) >> 3)
+;                       + in[i] - in[i+1])              (interior outputs)
+;   high[w/2-1] = sat16((11*in[w-2] - 5*in[w-1] - 4*in[w-3] - 4*in[w-4]
+;                        + in[w-5] + in[w-6] + 4) >> 3)
+;
+; The first and last output of every row are computed with scalar code, the
+; interior with SSE2, eight outputs per iteration.
+;
+; NOTE(review): the vector loop is not clamped to the row end, so its last
+; iteration can read and write beyond `width`; presumably the caller pads its
+; buffers -- confirm against the encoder's allocations.
+;------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
+    ; The strides are ptrdiff_t, so scale them to bytes with 64-bit shifts;
+    ; a 32-bit shift would discard the upper half of the value.
+    shl istrideq, 1
+    shl lwidthq, 1
+    shl hwidthq, 1
+    mova m7, [pd_4]
+    mova m8, [pw_p1]
+    mova m9, [pw_n1]
+    mova m10,[pw_p1_n1]
+    ; width and height are C ints: the upper 32 bits of their registers are
+    ; undefined on entry, so sign-extend before using them as 64-bit values.
+    movsxdifnidn yq, yd
+    movsxdifnidn widthq, widthd
+    ; y counts from -height up to 0.
+    neg yq
+.looph:
+    ; ---- first output of the row (scalar) ----
+    ; low[0] = sat16(in[0] + in[1])
+    movsx xq, word [inputq]
+
+    movsx tempq, word [inputq + 2]
+    add tempq, xq
+
+    ; packssdw saturates the 32-bit sum to int16 before it is stored.
+    movd xm0, tempd
+    packssdw m0, m0
+    pextrw tempd, xm0, 0
+    mov word [lowq], tempw
+
+    ; high[0] = sat16((5*in[0] - 11*in[1] + 4*in[2] + 4*in[3]
+    ;                  - in[4] - in[5] + 4) >> 3)
+    movsx xq, word [inputq]
+    imul xq, 5
+    movsx tempq, word [inputq + 2]
+    imul tempq, -11
+    add tempq, xq
+
+    movsx xq, word [inputq + 4]
+    imul xq, 4
+    add tempq, xq
+
+    movsx xq, word [inputq + 6]
+    imul xq, 4
+    add tempq, xq
+
+    movsx xq, word [inputq + 8]
+    imul xq, -1
+    add tempq, xq
+
+    movsx xq, word [inputq + 10]
+    imul xq, -1
+    add tempq, xq
+
+    add tempq, 4
+    sar tempq, 3
+
+    movd xm0, tempd
+    packssdw m0, m0
+    pextrw tempd, xm0, 0
+    mov word [highq], tempw
+
+    ; ---- interior outputs (SSE2) ----
+    ; x is a byte offset into the low/high row, starting at output 1. The
+    ; matching input pair lives at byte offset 2 * x.
+    mov xq, 2
+
+.loopw:
+    ; low = sat16(in[i] + in[i+1]): pmaddwd by (1, 1) sums each pair.
+    movu m0, [inputq + xq * 2]
+    movu m1, [inputq + xq * 2 + mmsize]
+
+    pmaddwd m0, m8
+    pmaddwd m1, m8
+
+    packssdw m0, m1
+    movu [lowq+xq], m0
+
+    ; m2/m3 = -in[i-2] - in[i-1]
+    movu m2, [inputq + xq * 2 - 4]
+    movu m3, [inputq + xq * 2 - 4 + mmsize]
+
+    pmaddwd m2, m9
+    pmaddwd m3, m9
+
+    ; m0/m1 = in[i+2] + in[i+3]
+    movu m0, [inputq + xq * 2 + 4]
+    movu m1, [inputq + xq * 2 + 4 + mmsize]
+
+    pmaddwd m0, m8
+    pmaddwd m1, m8
+
+    paddd m0, m2
+    paddd m1, m3
+
+    ; round and scale the outer taps: (sum + 4) >> 3
+    paddd m0, m7
+    paddd m1, m7
+
+    psrad m0, 3
+    psrad m1, 3
+
+    ; m5/m6 = in[i] - in[i+1], added after the shift
+    movu m5, [inputq + xq * 2 + 0]
+    movu m6, [inputq + xq * 2 + mmsize]
+
+    pmaddwd m5, m10
+    pmaddwd m6, m10
+
+    paddd m0, m5
+    paddd m1, m6
+
+    packssdw m0, m1
+    movu [highq+xq], m0
+
+    ; `width` input samples give `width` bytes of output per row, so the
+    ; unscaled width is the byte limit for x.
+    add xq, mmsize
+    cmp xq, widthq
+    jl .loopw
+
+    ; ---- last output of the row (scalar) ----
+    ; Move all three pointers to the end of the current row so the tail can
+    ; be addressed with negative displacements.
+    add lowq, widthq
+    add highq, widthq
+    add inputq, widthq
+    add inputq, widthq
+
+    ; low[w/2-1] = sat16(in[w-2] + in[w-1])
+    movsx xq, word [inputq - 4]
+    movsx tempq, word [inputq - 2]
+    add tempq, xq
+
+    movd xm0, tempd
+    packssdw m0, m0
+    pextrw tempd, xm0, 0
+    mov word [lowq-2], tempw
+
+    ; high[w/2-1] = sat16((11*in[w-2] - 5*in[w-1] - 4*in[w-3] - 4*in[w-4]
+    ;                      + in[w-5] + in[w-6] + 4) >> 3)
+    movsx tempq, word [inputq - 4]
+    imul tempq, 11
+    movsx xq, word [inputq - 2]
+    imul xq, -5
+    add tempq, xq
+
+    movsx xq, word [inputq - 6]
+    imul xq, -4
+    add tempq, xq
+
+    movsx xq, word [inputq - 8]
+    imul xq, -4
+    add tempq, xq
+
+    movsx xq, word [inputq - 10]
+    add tempq, xq
+
+    movsx xq, word [inputq - 12]
+    add tempq, xq
+
+    add tempq, 4
+    sar tempq, 3
+
+    movd xm0, tempd
+    packssdw m0, m0
+    pextrw tempd, xm0, 0
+    mov word [highq-2], tempw
+
+    ; Rewind to the start of the row, then step every pointer by its stride.
+    sub inputq, widthq
+    sub inputq, widthq
+    sub highq, widthq
+    sub lowq, widthq
+
+    add lowq, lwidthq
+    add highq, hwidthq
+    add inputq, istrideq
+    add yq, 1
+    jl .looph
+
+    RET
+%endif
+
+%if ARCH_X86_64
+;------------------------------------------------------------------------------
+; void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+;                                  ptrdiff_t in_stride, ptrdiff_t low_stride,
+;                                  ptrdiff_t high_stride,
+;                                  int width, int height)
+;
+; Vertical analysis filter. For each group of eight columns, `height` input
+; rows produce height / 2 low-pass and height / 2 high-pass rows
+; (r[k] = input row k, y = even row index, h = height, sat16 = signed
+; saturation to 16 bits):
+;
+;   low[y/2]    = sat16(r[y] + r[y+1])
+;   high[0]     = sat16((5*r[0] - 11*r[1] + 4*r[2] + 4*r[3]
+;                        - r[4] - r[5] + 4) >> 3)
+;   high[y/2]   = sat16(((-r[y-2] - r[y-1] + r[y+2] + r[y+3] + 4) >> 3)
+;                       + r[y] - r[y+1])                (interior rows)
+;   high[h/2-1] = sat16((11*r[h-2] - 5*r[h-1] - 4*r[h-3] - 4*r[h-4]
+;                        + r[h-5] + r[h-6] + 4) >> 3)
+;
+; x is a byte offset within a row, y an input row index. Output row y / 2 is
+; addressed as y * lwidth (or y * hwidth) bytes, which is why the two output
+; strides are left unscaled here (presumably they are passed as int16_t
+; counts, like in_stride).
+;
+; NOTE(review): columns are processed eight at a time without clamping to
+; `width`, so the last column group can read and write beyond the row;
+; presumably the caller pads its buffers -- confirm.
+;------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cfhdenc_vert_filter, 8, 11, 12, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
+    ; in_stride is a ptrdiff_t: scale it to bytes with a 64-bit shift so the
+    ; upper half of the value is not discarded.
+    shl istrideq, 1
+    ; width is a C int; the 32-bit shift also zero-extends into widthq, which
+    ; clears the undefined upper half of the register.
+    shl widthd, 1
+
+    ; height is a C int as well: sign-extend before the 64-bit arithmetic.
+    movsxdifnidn heightq, heightd
+    ; heightq now holds the row index of the last input pair (height - 2).
+    sub heightq, 2
+
+    xor xq, xq
+
+    mova m7, [pd_4]
+    mova m8, [pw_p1]
+    mova m9, [pw_n1]
+    mova m10,[pw_p1_n1]
+    mova m11,[pw_n1_p1]
+.loopw:
+    mov yq, 2
+
+    ; ---- first output row ----
+    ; low[0] = sat16(r[0] + r[1])
+    mov posq, xq
+    movu m0, [inputq + posq]
+    add posq, istrideq
+    movu m1, [inputq + posq]
+
+    paddsw m0, m1
+
+    movu [lowq + xq], m0
+
+    ; Load rows 0..5 into m0..m5.
+    mov posq, xq
+
+    movu m0, [inputq + posq]
+    add posq, istrideq
+    movu m1, [inputq + posq]
+    add posq, istrideq
+    movu m2, [inputq + posq]
+    add posq, istrideq
+    movu m3, [inputq + posq]
+    add posq, istrideq
+    movu m4, [inputq + posq]
+    add posq, istrideq
+    movu m5, [inputq + posq]
+
+    ; Interleave each row pair so pmaddwd can apply two taps at once. The
+    ; low half ends up as (even row, odd row) pairs, the high half as
+    ; (odd row, even row) pairs, hence the mirrored constants below.
+    mova m6, m0
+    punpcklwd m0, m1
+    punpckhwd m1, m6
+
+    mova m6, m2
+    punpcklwd m2, m3
+    punpckhwd m3, m6
+
+    mova m6, m4
+    punpcklwd m4, m5
+    punpckhwd m5, m6
+
+    ; 5*r[0] - 11*r[1], 4*r[2] + 4*r[3], -r[4] - r[5]
+    pmaddwd m0, [pw_p5_n11]
+    pmaddwd m1, [pw_n11_p5]
+    pmaddwd m2, [pw_p4]
+    pmaddwd m3, [pw_p4]
+    pmaddwd m4, m9
+    pmaddwd m5, m9
+
+    paddd m0, m2
+    paddd m1, m3
+    paddd m0, m4
+    paddd m1, m5
+
+    paddd m0, m7
+    paddd m1, m7
+
+    psrad m0, 3
+    psrad m1, 3
+    packssdw m0, m1
+
+    movu [highq + xq], m0
+
+    ; ---- interior output rows ----
+.looph:
+
+    ; low[y/2] = sat16(r[y] + r[y+1])
+    mov posq, istrideq
+    imul posq, yq
+    add posq, xq
+
+    movu m0, [inputq + posq]
+
+    add posq, istrideq
+    movu m1, [inputq + posq]
+
+    paddsw m0, m1
+
+    mov posq, lwidthq
+    imul posq, yq
+    add posq, xq
+
+    movu [lowq + posq], m0
+
+    ; Load rows y-2 .. y+3 into m0..m5.
+    add yq, -2
+
+    mov posq, istrideq
+    imul posq, yq
+    add posq, xq
+
+    movu m0, [inputq + posq]
+    add posq, istrideq
+    movu m1, [inputq + posq]
+    add posq, istrideq
+    movu m2, [inputq + posq]
+    add posq, istrideq
+    movu m3, [inputq + posq]
+    add posq, istrideq
+    movu m4, [inputq + posq]
+    add posq, istrideq
+    movu m5, [inputq + posq]
+
+    add yq, 2
+
+    mova m6, m0
+    punpcklwd m0, m1
+    punpckhwd m1, m6
+
+    mova m6, m2
+    punpcklwd m2, m3
+    punpckhwd m3, m6
+
+    mova m6, m4
+    punpcklwd m4, m5
+    punpckhwd m5, m6
+
+    ; -r[y-2] - r[y-1], r[y] - r[y+1], r[y+2] + r[y+3]
+    pmaddwd m0, m9
+    pmaddwd m1, m9
+    pmaddwd m2, m10
+    pmaddwd m3, m11
+    pmaddwd m4, m8
+    pmaddwd m5, m8
+
+    paddd m0, m4
+    paddd m1, m5
+
+    paddd m0, m7
+    paddd m1, m7
+
+    ; Only the outer taps are rounded and shifted; r[y] - r[y+1] is added
+    ; afterwards.
+    psrad m0, 3
+    psrad m1, 3
+    paddd m0, m2
+    paddd m1, m3
+    packssdw m0, m1
+
+    mov posq, hwidthq
+    imul posq, yq
+    add posq, xq
+
+    movu [highq + posq], m0
+
+    add yq, 2
+    cmp yq, heightq
+    jl .looph
+
+    ; ---- last output row (y is the index of the last input pair here) ----
+    ; low[h/2-1] = sat16(r[h-2] + r[h-1])
+    mov posq, istrideq
+    imul posq, yq
+    add posq, xq
+
+    movu m0, [inputq + posq]
+    add posq, istrideq
+    movu m1, [inputq + posq]
+
+    paddsw m0, m1
+
+    mov posq, lwidthq
+    imul posq, yq
+    add posq, xq
+
+    movu [lowq + posq], m0
+
+    ; Load rows h-6 .. h-1 into m0..m5.
+    sub yq, 4
+
+    mov posq, istrideq
+    imul posq, yq
+    add posq, xq
+
+    movu m0, [inputq + posq]
+    add posq, istrideq
+    movu m1, [inputq + posq]
+    add posq, istrideq
+    movu m2, [inputq + posq]
+    add posq, istrideq
+    movu m3, [inputq + posq]
+    add posq, istrideq
+    movu m4, [inputq + posq]
+    add posq, istrideq
+    movu m5, [inputq + posq]
+
+    add yq, 4
+
+    mova m6, m0
+    punpcklwd m0, m1
+    punpckhwd m1, m6
+
+    mova m6, m2
+    punpcklwd m2, m3
+    punpckhwd m3, m6
+
+    mova m6, m4
+    punpcklwd m4, m5
+    punpckhwd m5, m6
+
+    ; r[h-6] + r[h-5], -4*r[h-4] - 4*r[h-3], 11*r[h-2] - 5*r[h-1]
+    pmaddwd m0, m8
+    pmaddwd m1, m8
+    pmaddwd m2, [pw_n4]
+    pmaddwd m3, [pw_n4]
+    pmaddwd m4, [pw_p11_n5]
+    pmaddwd m5, [pw_n5_p11]
+
+    paddd m4, m2
+    paddd m5, m3
+
+    paddd m4, m0
+    paddd m5, m1
+
+    paddd m4, m7
+    paddd m5, m7
+
+    psrad m4, 3
+    psrad m5, 3
+    packssdw m4, m5
+
+    mov posq, hwidthq
+    imul posq, yq
+    add posq, xq
+
+    movu [highq + posq], m4
+
+    ; Next group of eight columns; widthq is the row width in bytes.
+    add xq, mmsize
+    cmp xq, widthq
+    jl .loopw
+    RET
+%endif
diff --git a/libavcodec/x86/cfhdencdsp_init.c b/libavcodec/x86/cfhdencdsp_init.c
new file mode 100644
index 0000000000..28f1dd504d
--- /dev/null
+++ b/libavcodec/x86/cfhdencdsp_init.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/cfhdencdsp.h"
+
+/* SSE2 filters implemented in cfhdencdsp.asm. That file only assembles them
+ * when ARCH_X86_64 is set, so they must not be referenced on 32-bit builds. */
+void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+ ptrdiff_t in_stride, ptrdiff_t low_stride,
+ ptrdiff_t high_stride,
+ int width, int height);
+void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+ ptrdiff_t in_stride, ptrdiff_t low_stride,
+ ptrdiff_t high_stride,
+ int width, int height);
+
+/**
+ * Install the x86 SIMD versions of the CFHD encoder filters.
+ *
+ * The SSE2 routines only exist on x86-64, so on 32-bit x86 this function
+ * leaves the context untouched.
+ *
+ * @param c DSP context whose horiz_filter / vert_filter pointers are
+ *          replaced when the running CPU supports external SSE2 code
+ */
+av_cold void ff_cfhdencdsp_init_x86(CFHDEncDSPContext *c)
+{
+    /* Only read inside the ARCH_X86_64 block below; av_unused avoids an
+     * unused-variable warning on 32-bit x86 builds. */
+    av_unused int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_64
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->horiz_filter = ff_cfhdenc_horiz_filter_sse2;
+        c->vert_filter = ff_cfhdenc_vert_filter_sse2;
+    }
+#endif
+}
--
2.17.1
More information about the ffmpeg-devel
mailing list