[FFmpeg-devel] [PATCH v2] avfilter/vf_bwdif: add x86 SIMD
Thomas Mundt
loudmax at yahoo.de
Sat Mar 5 21:20:48 CET 2016
This new version of the patch adds x86 SIMD (MMXEXT, SSE2 and SSSE3) implementations of filter_line for bit depths up to 12 bits.
Please comment.
Signed-off-by: Thomas Mundt <loudmax at yahoo.de>
---
libavfilter/bwdif.h | 72 +++++++++++
libavfilter/vf_bwdif.c | 69 +++--------
libavfilter/x86/Makefile | 2 +
libavfilter/x86/vf_bwdif.asm | 266 ++++++++++++++++++++++++++++++++++++++++
libavfilter/x86/vf_bwdif_init.c | 78 ++++++++++++
5 files changed, 432 insertions(+), 55 deletions(-)
create mode 100644 libavfilter/bwdif.h
create mode 100644 libavfilter/x86/vf_bwdif.asm
create mode 100644 libavfilter/x86/vf_bwdif_init.c
diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
new file mode 100644
index 0000000..8b42c76
--- /dev/null
+++ b/libavfilter/bwdif.h
@@ -0,0 +1,72 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_BWDIF_H
+#define AVFILTER_BWDIF_H
+
+#include "libavutil/pixdesc.h"
+#include "avfilter.h"
+
+enum BWDIFMode { /* values of BWDIFContext.mode */
+ BWDIF_MODE_SEND_FRAME = 0, ///< send 1 frame for each frame
+ BWDIF_MODE_SEND_FIELD = 1, ///< send 1 frame for each field
+};
+
+enum BWDIFParity { /* values of BWDIFContext.parity */
+ BWDIF_PARITY_TFF = 0, ///< top field first
+ BWDIF_PARITY_BFF = 1, ///< bottom field first
+ BWDIF_PARITY_AUTO = -1, ///< auto detection
+};
+
+enum BWDIFDeint { /* values of BWDIFContext.deint */
+ BWDIF_DEINT_ALL = 0, ///< deinterlace all frames
+ BWDIF_DEINT_INTERLACED = 1, ///< only deinterlace frames marked as interlaced
+};
+
+typedef struct BWDIFContext { /* filter state, shared between vf_bwdif.c and the x86 init code */
+ const AVClass *class;
+
+ int mode; ///< BWDIFMode
+ int parity; ///< BWDIFParity
+ int deint; ///< BWDIFDeint
+
+ int frame_pending;
+
+ AVFrame *cur;
+ AVFrame *next;
+ AVFrame *prev;
+ AVFrame *out;
+
+ void (*filter_intra)(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max); ///< set in config_props() to the 8-bit or 16-bit C version
+ void (*filter_line)(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max); ///< C version set in config_props(); ff_bwdif_init_x86() may replace it with SIMD
+ void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat); ///< set in config_props() to the 8-bit or 16-bit C version
+
+ const AVPixFmtDescriptor *csp; ///< descriptor of the link's pixel format, set in config_props()
+ int inter_field;
+ int eof;
+} BWDIFContext;
+
+void ff_bwdif_init_x86(BWDIFContext *bwdif); /* installs an x86 SIMD filter_line when the CPU and bit depth allow it */
+
+#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 7985054..d402aa4 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -37,6 +37,7 @@
#include "formats.h"
#include "internal.h"
#include "video.h"
+#include "bwdif.h"
/*
* Filter coefficients coef_lf and coef_hf taken from BBC PH-2071 (Weston 3 Field Deinterlacer).
@@ -48,51 +49,6 @@ static const uint16_t coef_lf[2] = { 4309, 213 };
static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
static const uint16_t coef_sp[2] = { 5077, 981 };
-enum BWDIFMode {
- BWDIF_MODE_SEND_FRAME = 0, ///< send 1 frame for each frame
- BWDIF_MODE_SEND_FIELD = 1, ///< send 1 frame for each field
-};
-
-enum BWDIFParity {
- BWDIF_PARITY_TFF = 0, ///< top field first
- BWDIF_PARITY_BFF = 1, ///< bottom field first
- BWDIF_PARITY_AUTO = -1, ///< auto detection
-};
-
-enum BWDIFDeint {
- BWDIF_DEINT_ALL = 0, ///< deinterlace all frames
- BWDIF_DEINT_INTERLACED = 1, ///< only deinterlace frames marked as interlaced
-};
-
-typedef struct BWDIFContext {
- const AVClass *class;
-
- int mode; ///< BWDIFMode
- int parity; ///< BWDIFParity
- int deint; ///< BWDIFDeint
-
- int frame_pending;
-
- AVFrame *cur;
- AVFrame *next;
- AVFrame *prev;
- AVFrame *out;
-
- void (*filter_intra)(void *dst1, void *cur1, int w, int prefs, int mrefs,
- int prefs3, int mrefs3, int parity, int clip_max);
- void (*filter_line)(void *dst, void *prev, void *cur, void *next,
- int w, int prefs, int mrefs, int prefs2, int mrefs2,
- int prefs3, int mrefs3, int prefs4, int mrefs4,
- int parity, int clip_max);
- void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
- int w, int prefs, int mrefs, int prefs2, int mrefs2,
- int parity, int clip_max, int spat);
-
- const AVPixFmtDescriptor *csp;
- int inter_field;
- int eof;
-} BWDIFContext;
-
typedef struct ThreadData {
AVFrame *frame;
int plane;
@@ -177,10 +133,10 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
FILTER_INTRA()
}
-static void filter_line(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int prefs2, int mrefs2,
- int prefs3, int mrefs3, int prefs4, int mrefs4,
- int parity, int clip_max)
+static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
{
uint8_t *dst = dst1;
uint8_t *prev = prev1;
@@ -222,10 +178,10 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre
FILTER_INTRA()
}
-static void filter_line_16bit(void *dst1, void *prev1, void *cur1, void *next1,
- int w, int prefs, int mrefs, int prefs2, int mrefs2,
- int prefs3, int mrefs3, int prefs4, int mrefs4,
- int parity, int clip_max)
+static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
{
uint16_t *dst = dst1;
uint16_t *prev = prev1;
@@ -557,14 +513,17 @@ static int config_props(AVFilterLink *link)
s->csp = av_pix_fmt_desc_get(link->format);
if (s->csp->comp[0].depth > 8) {
s->filter_intra = filter_intra_16bit;
- s->filter_line = filter_line_16bit;
+ s->filter_line = filter_line_c_16bit;
s->filter_edge = filter_edge_16bit;
} else {
s->filter_intra = filter_intra;
- s->filter_line = filter_line;
+ s->filter_line = filter_line_c;
s->filter_edge = filter_edge;
}
+ if (ARCH_X86)
+ ff_bwdif_init_x86(s);
+
return 0;
}
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 33de380..ed294e0 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,4 +1,5 @@
OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o
+OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o
OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o
@@ -21,6 +22,7 @@ OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o
OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
YASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o
+YASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
new file mode 100644
index 0000000..11aa025
--- /dev/null
+++ b/libavfilter/x86/vf_bwdif.asm
@@ -0,0 +1,266 @@
+;*****************************************************************************
+;* x86-optimized functions for bwdif filter
+;*
+;* Copyright (C) 2016 Thomas Mundt <loudmax at yahoo.de>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_coefhf: times 4 dw 1016, 5570 ; coef_hf[2], coef_hf[0] interleaved as pmaddwd operand pairs
+pw_coefhf1: times 8 dw -3801 ; -coef_hf[1]
+pw_coefsp: times 4 dw 5077, -981 ; coef_sp[0], -coef_sp[1]
+pw_splfdif: times 4 dw -768, 768 ; added to pw_coefsp this gives 4309, -213 = coef_lf[0], -coef_lf[1]
+
+SECTION .text
+
+%macro LOAD8 2 ; args: destination register, memory source (8-bit samples)
+ movh %1, %2 ; half-register load
+ punpcklbw %1, m7 ; widen bytes to words; FILTER keeps m7 zeroed
+%endmacro
+
+%macro LOAD12 2 ; args: destination register, memory source (samples stored as words)
+ movu %1, %2 ; unaligned full-register load, no widening needed
+%endmacro
+
+%macro DISP8 0 ; store the result words in m2 as 8-bit samples
+ packuswb m2, m2 ; words to bytes with unsigned saturation
+ movh [dstq], m2 ; half-register store
+%endmacro
+
+%macro DISP12 0 ; store the result words in m2 as 16-bit samples
+ CLIPW m2, m7, m12 ; clamp to [0, clip_max]: m7 is zero, m12 holds clip_max in every word
+ movu [dstq], m2 ; unaligned full-register store
+%endmacro
+
+%macro FILTER 5 ; args: loop label suffix, field A pointer, field B pointer, LOAD/DISP suffix (8 or 12), bytes per sample
+ pxor m7, m7 ; m7 = 0 (zero for LOAD8 widening and lower clip bound)
+.loop%1:
+ LOAD%4 m0, [curq+t0*%5] ; m0 = cur[t0]
+ LOAD%4 m1, [curq+t1*%5] ; m1 = cur[t1]
+ LOAD%4 m2, [%2] ; m2 = A[0]
+ LOAD%4 m3, [%3] ; m3 = B[0]
+ mova m4, m3
+ paddw m3, m2 ; m3 = A[0] + B[0]
+ psubw m2, m4
+ ABS1 m2, m4 ; m2 = |A[0] - B[0]|
+ mova m8, m3 ; m8 = A[0] + B[0], kept for later
+ mova m9, m2 ; m9 = |A[0] - B[0]|, kept for later
+ LOAD%4 m3, [prevq+t0*%5]
+ LOAD%4 m4, [prevq+t1*%5]
+ psubw m3, m0
+ psubw m4, m1
+ ABS2 m3, m4, m5, m6
+ paddw m3, m4 ; m3 = |prev[t0] - cur[t0]| + |prev[t1] - cur[t1]|
+ psrlw m2, 1
+ psrlw m3, 1
+ pmaxsw m2, m3 ; m2 = max of the two halved differences so far
+ LOAD%4 m3, [nextq+t0*%5]
+ LOAD%4 m4, [nextq+t1*%5]
+ psubw m3, m0
+ psubw m4, m1
+ ABS2 m3, m4, m5, m6
+ paddw m3, m4 ; same measure against next
+ psrlw m3, 1
+ pmaxsw m2, m3 ; m2 = diff: maximum of the three halved differences
+
+ LOAD%4 m3, [%2+t0*2*%5] ; offsets 2*t0 and 2*t1 are computed here instead of using prefs2/mrefs2
+ LOAD%4 m4, [%3+t0*2*%5]
+ LOAD%4 m5, [%2+t1*2*%5]
+ LOAD%4 m6, [%3+t1*2*%5]
+ paddw m3, m4 ; m3 = A[2*t0] + B[2*t0]
+ paddw m5, m6 ; m5 = A[2*t1] + B[2*t1]
+ mova m6, m3
+ paddw m6, m5
+ mova m10, m6 ; m10 = sum of all four samples, kept for the -3801 term
+ psrlw m3, 1
+ psrlw m5, 1
+ psubw m3, m0 ; m3 = (A[2*t0] + B[2*t0])/2 - cur[t0]
+ psubw m5, m1 ; m5 = (A[2*t1] + B[2*t1])/2 - cur[t1]
+ mova m6, m3
+ pminsw m3, m5 ; m3 = min of the two
+ pmaxsw m5, m6 ; m5 = max of the two
+ mova m4, m8
+ psraw m4, 1 ; m4 = (A[0] + B[0]) / 2
+ mova m6, m4
+ psubw m6, m0 ; m6 = average - cur[t0]
+ psubw m4, m1 ; m4 = average - cur[t1]
+ pmaxsw m3, m6
+ pminsw m5, m6
+ pmaxsw m3, m4 ; m3 = max of (m3, m6, m4)
+ pminsw m5, m4 ; m5 = min of (m5, m6, m4)
+ mova m6, m7
+ psubw m6, m3 ; m6 = -m3
+ pmaxsw m6, m5 ; m6 = max(-m3, m5)
+ mova m3, m2
+ pcmpgtw m3, m7 ; mask of lanes where diff > 0
+ pand m6, m3 ; lanes with diff == 0 must stay at 0
+ pmaxsw m2, m6
+ mova m11, m2 ; m11 = final diff, used as clamp range at the end
+
+ LOAD%4 m2, [%2+t0*4*%5] ; offsets 4*t0 and 4*t1 are computed here instead of using prefs4/mrefs4
+ LOAD%4 m3, [%3+t0*4*%5]
+ LOAD%4 m4, [%2+t1*4*%5]
+ LOAD%4 m5, [%3+t1*4*%5]
+ paddw m2, m3
+ paddw m4, m5
+ paddw m2, m4 ; m2 = sum of the four samples at 4*t0 and 4*t1
+ mova m3, m2
+ punpcklwd m2, m8 ; interleave that sum with A[0] + B[0]
+ punpckhwd m3, m8
+ pmaddwd m2, [pw_coefhf] ; dwords: 1016 * m2 sum + 5570 * (A[0] + B[0])
+ pmaddwd m3, [pw_coefhf]
+ mova m4, m10
+ mova m6, m4
+ pmullw m4, [pw_coefhf1] ; low 16 bits of -3801 * m10
+ pmulhw m6, [pw_coefhf1] ; high 16 bits of -3801 * m10
+ mova m5, m4
+ punpcklwd m4, m6 ; combine the halves into 32-bit products
+ punpckhwd m5, m6
+ paddd m2, m4
+ paddd m3, m5
+ psrad m2, 2 ; m2/m3 = high-frequency term >> 2
+ psrad m3, 2
+
+ mova m4, m0
+ paddw m0, m1 ; m0 = cur[t0] + cur[t1]
+%if ARCH_X86_64
+ LOAD%4 m5, [curq+t2*%5]
+ LOAD%4 m6, [curq+t3*%5]
+%else ; x86_32: t0/t1 are r4/r5, borrow them for prefs3/mrefs3
+ mov r4, prefs3mp
+ mov r5, mrefs3mp
+ LOAD%4 m5, [curq+t0*%5]
+ LOAD%4 m6, [curq+t1*%5]
+ mov r4, prefsmp ; restore prefs/mrefs in t0/t1
+ mov r5, mrefsmp
+%endif
+ paddw m6, m5 ; m6 = cur[prefs3] + cur[mrefs3]
+ psubw m1, m4
+ ABS1 m1, m4 ; m1 = |cur[t1] - cur[t0]|
+ pcmpgtw m1, m9 ; mask: m1 > |A[0] - B[0]|
+ mova m4, m1
+ punpcklwd m1, m4 ; widen the word mask to dwords
+ punpckhwd m4, m4
+ pand m2, m1 ; keep the high-frequency term only where the mask is set
+ pand m3, m4
+ mova m5, [pw_splfdif]
+ mova m7, m5 ; m7 is used as scratch here and re-zeroed below
+ pand m5, m1
+ pand m7, m4
+ paddw m5, [pw_coefsp] ; coefficients: pw_coefsp, plus pw_splfdif where the mask is set
+ paddw m7, [pw_coefsp]
+ mova m4, m0
+ punpcklwd m0, m6 ; interleave (cur[t0] + cur[t1]) with (cur[prefs3] + cur[mrefs3])
+ punpckhwd m4, m6
+ pmaddwd m0, m5
+ pmaddwd m4, m7
+ paddd m2, m0
+ paddd m3, m4
+ psrad m2, 13
+ psrad m3, 13
+ packssdw m2, m3 ; m2 = interpolated value, back to words
+
+ mova m4, m8
+ psraw m4, 1 ; m4 = (A[0] + B[0]) / 2
+ mova m0, m11
+ mova m3, m4
+ psubw m4, m0 ; lower bound = average - diff
+ paddw m3, m0 ; upper bound = average + diff
+ CLIPW m2, m4, m3
+ pxor m7, m7 ; restore m7 = 0 for DISP and the next iteration
+ DISP%4
+
+ add dstq, STEP ; STEP = bytes consumed per iteration
+ add prevq, STEP
+ add curq, STEP
+ add nextq, STEP
+ sub DWORD wm, mmsize/2 ; mmsize/2 samples are produced per iteration
+ jg .loop%1
+%endmacro
+
+%macro PROC 2 ; args: LOAD/DISP suffix (8 or 12), bytes per sample; emits the function body
+%if ARCH_X86_64
+ movsxd r5, DWORD prefsm ; sign-extend the 32-bit int offsets to 64 bits
+ movsxd r6, DWORD mrefsm
+ movsxd r7, DWORD prefs3m
+ movsxd r8, DWORD mrefs3m
+ DECLARE_REG_TMP 5, 6, 7, 8 ; t0 = prefs, t1 = mrefs, t2 = prefs3, t3 = mrefs3
+%else ; x86_32: m8-m11 are 16-byte slots in the stack area requested by cglobal
+ %define m8 [rsp+ 0]
+ %define m9 [rsp+16]
+ %define m10 [rsp+32]
+ %define m11 [rsp+48]
+ mov r4, prefsmp
+ mov r5, mrefsmp
+ DECLARE_REG_TMP 4, 5 ; t0 = prefs, t1 = mrefs
+%endif
+ cmp DWORD paritym, 0 ; parity selects which pair of fields FILTER combines
+ je .parity0
+ FILTER 1, prevq, curq, %1, %2 ; parity != 0: prev and cur
+ jmp .ret
+.parity0:
+ FILTER 0, curq, nextq, %1, %2 ; parity == 0: cur and next
+.ret:
+ RET
+%endmacro
+
+%macro BWDIF 0 ; emits bwdif_filter_line and bwdif_filter_line_12bit for the active INIT_* instruction set
+%if ARCH_X86_64
+cglobal bwdif_filter_line, 4, 9, 12, 0, dst, prev, cur, next, w, prefs, \
+ mrefs, prefs2, mrefs2, prefs3, mrefs3, \
+ prefs4, mrefs4, parity, clip_max
+%else ; x86_32: 8 vector registers plus 64 bytes of stack for m8-m11
+cglobal bwdif_filter_line, 4, 6, 8, 64, dst, prev, cur, next, w, prefs, \
+ mrefs, prefs2, mrefs2, prefs3, mrefs3, \
+ prefs4, mrefs4, parity, clip_max
+%endif
+ %define STEP mmsize/2
+ PROC 8, 1 ; 8-bit loads and stores, 1 byte per sample
+
+%if ARCH_X86_64
+cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
+ prefs, mrefs, prefs2, mrefs2, \
+ prefs3, mrefs3, prefs4, \
+ mrefs4, parity, clip_max
+ movd m12, DWORD clip_maxm
+ SPLATW m12, m12, 0 ; m12 = clip_max in every word, upper bound for DISP12
+%else ; x86_32: 80 bytes of stack, the fifth 16-byte slot holds the clip_max vector
+cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, prefs2, mrefs2, \
+ prefs3, mrefs3, prefs4, \
+ mrefs4, parity, clip_max
+ %define m12 [rsp+64]
+ movd m0, DWORD clip_maxm
+ SPLATW m0, m0, 0
+ mova m12, m0 ; spill the clip_max vector to its stack slot
+%endif
+ %define STEP mmsize
+ PROC 12, 2 ; 16-bit loads and stores, 2 bytes per sample
+%endmacro
+
+INIT_XMM ssse3
+BWDIF ; emits the _ssse3 functions
+INIT_XMM sse2
+BWDIF ; emits the _sse2 functions
+%if ARCH_X86_32 ; the MMXEXT versions are only built for 32-bit x86
+INIT_MMX mmxext
+BWDIF
+%endif
diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
new file mode 100644
index 0000000..1cb8438
--- /dev/null
+++ b/libavfilter/x86/vf_bwdif_init.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2016 Thomas Mundt <loudmax at yahoo.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/bwdif.h"
+
+void ff_bwdif_filter_line_mmxext(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int prefs4,
+ int mrefs4, int parity, int clip_max); /* 8-bit; assembled only when ARCH_X86_32 */
+void ff_bwdif_filter_line_sse2(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int prefs4,
+ int mrefs4, int parity, int clip_max); /* 8-bit */
+void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int prefs4,
+ int mrefs4, int parity, int clip_max); /* 8-bit */
+
+void ff_bwdif_filter_line_12bit_mmxext(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int prefs4,
+ int mrefs4, int parity, int clip_max); /* 16-bit storage, up to 12 bits; assembled only when ARCH_X86_32 */
+void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int prefs4,
+ int mrefs4, int parity, int clip_max); /* 16-bit storage, up to 12 bits */
+void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2,
+ int mrefs2, int prefs3, int mrefs3, int prefs4,
+ int mrefs4, int parity, int clip_max); /* 16-bit storage, up to 12 bits */
+
+av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) /* replaces bwdif->filter_line with an x86 SIMD version when one applies */
+{
+ int cpu_flags = av_get_cpu_flags();
+ int bit_depth = (!bwdif->csp) ? 8 : bwdif->csp->comp[0].depth; /* treated as 8-bit when no descriptor is set */
+
+ if (bit_depth <= 8) { /* later assignments overwrite earlier ones, so SSSE3 wins over SSE2 over MMXEXT */
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ bwdif->filter_line = ff_bwdif_filter_line_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ bwdif->filter_line = ff_bwdif_filter_line_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ bwdif->filter_line = ff_bwdif_filter_line_ssse3;
+ } else if (bit_depth <= 12) { /* depths above 12 bits leave filter_line untouched */
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ bwdif->filter_line = ff_bwdif_filter_line_12bit_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
+ }
+}
--
1.9.2
More information about the ffmpeg-devel
mailing list