[FFmpeg-devel] [PATCH] avfilter/vf_overlay: add x86 SIMD for yuv444 format when main stream has no alpha
Paul B Mahol
onemda at gmail.com
Mon Apr 30 19:17:40 EEST 2018
Signed-off-by: Paul B Mahol <onemda at gmail.com>
---
libavfilter/vf_overlay.c | 76 ++++++++-----------------------
libavfilter/vf_overlay.h | 84 ++++++++++++++++++++++++++++++++++
libavfilter/x86/Makefile | 2 +
libavfilter/x86/vf_overlay.asm | 94 +++++++++++++++++++++++++++++++++++++++
libavfilter/x86/vf_overlay_init.c | 39 ++++++++++++++++
5 files changed, 238 insertions(+), 57 deletions(-)
create mode 100644 libavfilter/vf_overlay.h
create mode 100644 libavfilter/x86/vf_overlay.asm
create mode 100644 libavfilter/x86/vf_overlay_init.c
diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 8c1895cca4..81522d31a4 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -39,6 +39,7 @@
#include "drawutils.h"
#include "framesync.h"
#include "video.h"
+#include "vf_overlay.h"
typedef struct ThreadData {
AVFrame *dst, *src;
@@ -59,21 +60,6 @@ static const char *const var_names[] = {
NULL
};
-enum var_name {
- VAR_MAIN_W, VAR_MW,
- VAR_MAIN_H, VAR_MH,
- VAR_OVERLAY_W, VAR_OW,
- VAR_OVERLAY_H, VAR_OH,
- VAR_HSUB,
- VAR_VSUB,
- VAR_X,
- VAR_Y,
- VAR_N,
- VAR_POS,
- VAR_T,
- VAR_VARS_NB
-};
-
#define MAIN 0
#define OVERLAY 1
@@ -92,45 +78,6 @@ enum EvalMode {
EVAL_MODE_NB
};
-enum OverlayFormat {
- OVERLAY_FORMAT_YUV420,
- OVERLAY_FORMAT_YUV422,
- OVERLAY_FORMAT_YUV444,
- OVERLAY_FORMAT_RGB,
- OVERLAY_FORMAT_GBRP,
- OVERLAY_FORMAT_AUTO,
- OVERLAY_FORMAT_NB
-};
-
-typedef struct OverlayContext {
- const AVClass *class;
- int x, y; ///< position of overlaid picture
-
- uint8_t main_is_packed_rgb;
- uint8_t main_rgba_map[4];
- uint8_t main_has_alpha;
- uint8_t overlay_is_packed_rgb;
- uint8_t overlay_rgba_map[4];
- uint8_t overlay_has_alpha;
- int format; ///< OverlayFormat
- int alpha_format;
- int eval_mode; ///< EvalMode
-
- FFFrameSync fs;
-
- int main_pix_step[4]; ///< steps per pixel for each plane of the main output
- int overlay_pix_step[4]; ///< steps per pixel for each plane of the overlay
- int hsub, vsub; ///< chroma subsampling values
- const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
-
- double var_values[VAR_VARS_NB];
- char *x_expr, *y_expr;
-
- AVExpr *x_pexpr, *y_pexpr;
-
- int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
-} OverlayContext;
-
static av_cold void uninit(AVFilterContext *ctx)
{
OverlayContext *s = ctx->priv;
@@ -509,6 +456,7 @@ static av_always_inline void blend_plane(AVFilterContext *ctx,
int jobnr,
int nb_jobs)
{
+ OverlayContext *octx = ctx->priv;
int src_wp = AV_CEIL_RSHIFT(src_w, hsub);
int src_hp = AV_CEIL_RSHIFT(src_h, vsub);
int dst_wp = AV_CEIL_RSHIFT(dst_w, hsub);
@@ -538,8 +486,17 @@ static av_always_inline void blend_plane(AVFilterContext *ctx,
s = sp + k;
a = ap + (k<<hsub);
da = dap + ((xp+k) << hsub);
-
- for (kmax = FFMIN(-xp + dst_wp, src_wp); k < kmax; k++) {
+ kmax = FFMIN(-xp + dst_wp, src_wp);
+
+ if (octx->blend_row) {
+ octx->blend_row(d, da, s, a, kmax - k, k, j, src_wp, src_hp);
+ dp += dst->linesize[dst_plane];
+ sp += src->linesize[i];
+ ap += (1 << vsub) * src->linesize[3];
+ dap += (1 << vsub) * dst->linesize[3];
+ continue;
+ }
+ for (; k < kmax; k++) {
int alpha_v, alpha_h, alpha;
// average alpha for color components, improve quality
@@ -916,7 +873,7 @@ static int config_input_main(AVFilterLink *inlink)
}
if (!s->alpha_format)
- return 0;
+ goto end;
switch (s->format) {
case OVERLAY_FORMAT_YUV420:
@@ -960,6 +917,11 @@ static int config_input_main(AVFilterLink *inlink)
}
break;
}
+
+end:
+ if (ARCH_X86)
+ ff_overlay_init_x86(s, s->format, s->alpha_format, s->main_has_alpha);
+
return 0;
}
diff --git a/libavfilter/vf_overlay.h b/libavfilter/vf_overlay.h
new file mode 100644
index 0000000000..8eb91d9a34
--- /dev/null
+++ b/libavfilter/vf_overlay.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_OVERLAY_H
+#define AVFILTER_OVERLAY_H
+
+#include "libavutil/eval.h"
+#include "libavutil/pixdesc.h"
+#include "framesync.h"
+#include "avfilter.h"
+
+enum var_name { // NOTE(review): presumably parallel to var_names[] in vf_overlay.c -- confirm ordering
+ VAR_MAIN_W, VAR_MW, // each name is its own enumerator, so the two take consecutive values
+ VAR_MAIN_H, VAR_MH,
+ VAR_OVERLAY_W, VAR_OW,
+ VAR_OVERLAY_H, VAR_OH,
+ VAR_HSUB,
+ VAR_VSUB,
+ VAR_X,
+ VAR_Y,
+ VAR_N,
+ VAR_POS,
+ VAR_T,
+ VAR_VARS_NB // count of the enumerators above; sizes OverlayContext.var_values[]
+};
+
+enum OverlayFormat { // values held in OverlayContext.format
+ OVERLAY_FORMAT_YUV420,
+ OVERLAY_FORMAT_YUV422,
+ OVERLAY_FORMAT_YUV444, // one of the two formats ff_overlay_init_x86() accepts for the SSE4 row blender
+ OVERLAY_FORMAT_RGB,
+ OVERLAY_FORMAT_GBRP, // the other format ff_overlay_init_x86() accepts
+ OVERLAY_FORMAT_AUTO,
+ OVERLAY_FORMAT_NB // last enumerator; equals the number of formats listed above
+};
+
+typedef struct OverlayContext { // filter private data; vf_overlay.c reads it through ctx->priv
+ const AVClass *class;
+ int x, y; ///< position of overlaid picture
+
+ uint8_t main_is_packed_rgb;
+ uint8_t main_rgba_map[4];
+ uint8_t main_has_alpha; // forwarded to ff_overlay_init_x86(); the SSE4 row blender requires it to be 0
+ uint8_t overlay_is_packed_rgb;
+ uint8_t overlay_rgba_map[4];
+ uint8_t overlay_has_alpha;
+ int format; ///< OverlayFormat
+ int alpha_format; // forwarded to ff_overlay_init_x86(); the SSE4 row blender requires it to be 0
+ int eval_mode; ///< EvalMode
+
+ FFFrameSync fs;
+
+ int main_pix_step[4]; ///< steps per pixel for each plane of the main output
+ int overlay_pix_step[4]; ///< steps per pixel for each plane of the overlay
+ int hsub, vsub; ///< chroma subsampling values
+ const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
+
+ double var_values[VAR_VARS_NB]; // one slot per enum var_name entry
+ char *x_expr, *y_expr;
+
+ AVExpr *x_pexpr, *y_pexpr;
+
+ void (*blend_row)(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, int w, int x, int y, int src_w, int src_h); ///< optional row blender; when non-NULL, blend_plane() calls it per row instead of running its per-pixel loop
+ int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
+} OverlayContext;
+
+void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha); // x86 hook called at the end of config_input_main() when ARCH_X86; may set s->blend_row
+
+#endif /* AVFILTER_OVERLAY_H */
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index f60de3b73b..b484c8bd1c 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o
OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
+OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay_init.o
OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
@@ -41,6 +42,7 @@ X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o
X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
+X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o
X86ASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
X86ASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o
X86ASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm
new file mode 100644
index 0000000000..41f74fe946
--- /dev/null
+++ b/libavfilter/x86/vf_overlay.asm
@@ -0,0 +1,94 @@
+;*****************************************************************************
+;* x86-optimized functions for overlay filter
+;*
+;* Copyright (C) 2018 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_128: times 4 dd 128 ; per-dword rounding term added before the multiply by 257
+pd_255: times 4 dd 255 ; per-dword 255, used to form (255 - alpha)
+pd_257: times 4 dd 257 ; multiplier that, followed by the >> 16 in the row routine, stands in for a division by 255
+pb_b2dw: db 0,-1,-1,-1, 1,-1,-1,-1, 2,-1,-1,-1, 3,-1,-1,-1 ; pshufb mask: source bytes 0..3 become the low byte of dwords 0..3, other bytes zeroed
+pb_dw2b: db 0, 4, 8,12,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ; pshufb mask: low byte of each dword is packed into bytes 0..3, other bytes zeroed
+
+SECTION .text
+
+INIT_XMM sse4
+cglobal overlay_row_yuv444, 9, 14, 5, 0, d, da, s, a, w, x, y, src_w, src_h, r, x, t, u, v ; blends w bytes of s into d using alpha a; da, y, src_w, src_h are unused. NOTE(review): `x` is named twice (6th argument and 11th register) -- confirm xq resolves as intended
+ xor xq, xq ; x = 0: running byte offset applied to d, s and a (the incoming x argument is not used)
+ movsxdifnidn wq, wd
+ test wq, wq ; w <= 0 means an empty span: do nothing, like the C loop "for (; k < kmax; k++)" this routine replaces
+ jle .end
+ mov rq, wq
+ and rq, mmsize/4 - 1 ; r = w % 4, the pixels left over for the scalar tail
+ cmp wq, mmsize/4
+ jl .loop1 ; fewer than 4 pixels: scalar tail only
+ sub wq, rq ; w = number of pixels handled by the SIMD loop (multiple of 4)
+ .loop0: ; SIMD loop, 4 pixels per iteration
+ movd m1, [dq + xq] ; load exactly the 4 bytes that pb_b2dw selects, so nothing past the span is read
+ movd m2, [aq + xq]
+ movd m3, [sq + xq]
+ pshufb m1, [pb_b2dw] ; widen d bytes to dwords
+ pshufb m2, [pb_b2dw] ; widen alpha bytes to dwords
+ pshufb m3, [pb_b2dw] ; widen s bytes to dwords
+ mova m4, [pd_255]
+ psubd m4, m2 ; m4 = 255 - alpha
+ pmulld m1, m4 ; d * (255 - alpha)
+ pmulld m3, m2 ; s * alpha
+ paddd m1, m3
+ paddd m1, [pd_128] ; + 128 for rounding
+ pmulld m1, [pd_257]
+ psrad m1, 16 ; (v * 257) >> 16 stands in for v / 255
+ pshufb m1, [pb_dw2b] ; pack the four results back into bytes 0..3
+ movd [dq+xq], m1 ; store 4 blended pixels
+
+ add xq, mmsize / 4
+ cmp xq, wq
+ jl .loop0
+
+ cmp rq, 0
+ je .end ; width was a multiple of 4: no tail
+ add wq, rq ; restore the full width for the tail loop bound
+
+ .loop1: ; scalar tail, 1 pixel per iteration, same formula as above
+ xor tq, tq
+ xor uq, uq
+ xor vq, vq
+ mov rd, 255
+ mov tb, [aq + xq] ; t = alpha
+ neg tb
+ add rb, tb ; r = 255 - alpha (low byte; upper bits of rd stay 0)
+ mov ub, [sq + xq] ; u = s
+ neg tb ; t = alpha again
+ imul ud, td ; u = s * alpha
+ mov vb, [dq + xq] ; v = d
+ imul rd, vd ; r = d * (255 - alpha)
+ add rd, ud
+ add rd, 128
+ imul rd, 257
+ sar rd, 16
+ mov [dq + xq], rb ; store 1 blended pixel
+ add xq, 1
+ cmp xq, wq
+ jl .loop1
+ .end:
+ RET
diff --git a/libavfilter/x86/vf_overlay_init.c b/libavfilter/x86/vf_overlay_init.c
new file mode 100644
index 0000000000..f57c850a30
--- /dev/null
+++ b/libavfilter/x86/vf_overlay_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_overlay.h"
+
+void ff_overlay_row_yuv444_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, // same parameter list as OverlayContext.blend_row
+ int w, int x, int y, int src_w, int src_h); // presumably the `cglobal overlay_row_yuv444` routine built under INIT_XMM sse4 in x86/vf_overlay.asm -- confirm symbol naming
+
+av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha) // installs s->blend_row when every condition below holds; otherwise s is left untouched
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) && // 64-bit build only, gated on the EXTERNAL_SSE4() check of the CPU flags
+ (format == OVERLAY_FORMAT_YUV444 ||
+ format == OVERLAY_FORMAT_GBRP) && // both formats are served by the same yuv444 row routine
+ alpha_format == 0 && main_has_alpha == 0) { // only for alpha_format 0 and a main input without alpha
+ s->blend_row = ff_overlay_row_yuv444_sse4;
+ }
+}
--
2.11.0
More information about the ffmpeg-devel mailing list