[FFmpeg-devel] [PATCH] SSE2 version of vf_idet's filter_line()
Clément Bœsch
u at pkh.me
Wed Sep 3 18:19:33 CEST 2014
On Wed, Sep 03, 2014 at 05:50:32PM +0200, Pascal Massimino wrote:
[...]
> removed this step in both mmx and sse2 version.
>
> -> new patch attached.
>
> /skal
> From d2249b05b4a881ec3c9de8fc105b2a40c680a0ea Mon Sep 17 00:00:00 2001
> From: skal <pascal.massimino at gmail.com>
> Date: Wed, 3 Sep 2014 11:02:32 +0200
> Subject: [PATCH] MMX/MMXEXT/SSE2 implementation of idet's filter_line()
>
> integration by Neil Birkbeck, with help from Vitor Sessak.
> core SSE2 loop by Skal (pascal.massimino at gmail.com)
> ---
> MAINTAINERS | 1 +
> libavfilter/vf_idet.c | 38 ++----------
> libavfilter/vf_idet.h | 58 ++++++++++++++++++
> libavfilter/x86/Makefile | 2 +
> libavfilter/x86/vf_idet.asm | 134 +++++++++++++++++++++++++++++++++++++++++
> libavfilter/x86/vf_idet_init.c | 71 ++++++++++++++++++++++
> 6 files changed, 272 insertions(+), 32 deletions(-)
> create mode 100644 libavfilter/vf_idet.h
> create mode 100644 libavfilter/x86/vf_idet.asm
> create mode 100644 libavfilter/x86/vf_idet_init.c
>
[...]
> @@ -77,7 +48,7 @@ static const char *type2str(Type type)
> return NULL;
> }
>
> -static int filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w)
> +int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w)
why do you need to export it?
> {
> int x;
> int ret=0;
> @@ -271,7 +242,10 @@ static av_cold int init(AVFilterContext *ctx)
> idet->last_type = UNDETERMINED;
> memset(idet->history, UNDETERMINED, HIST_SIZE);
>
> - idet->filter_line = filter_line_c;
> + idet->filter_line = ff_idet_filter_line_c;
> +
> + if (ARCH_X86)
> + ff_idet_init_x86(idet);
wrong indent
[...]
> diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
> new file mode 100644
> index 0000000..3fb265c
> --- /dev/null
> +++ b/libavfilter/x86/vf_idet.asm
> @@ -0,0 +1,134 @@
> +;; *****************************************************************************
> +;; * x86-optimized functions for idet filter
> +;; *
> +;; * This file is part of FFmpeg.
> +;; *
> +;; * FFmpeg is free software; you can redistribute it and/or modify
> +;; * it under the terms of the GNU General Public License as published by
> +;; * the Free Software Foundation; either version 2 of the License, or
> +;; * (at your option) any later version.
> +;; *
> +;; * FFmpeg is distributed in the hope that it will be useful,
> +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +;; * GNU General Public License for more details.
> +;; *
> +;; * You should have received a copy of the GNU General Public License along
> +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> +;; ******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
SECTION_TEXT
> +
> +; Implementation that does 8-bytes at a time using single-word operations.
> +%macro IDET_FILTER_LINE 0
> +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index
> + xor indexq, indexq
> +%define m_zero m7
> +%define m_sum m6
> + pxor m_sum, m_sum
> + pxor m_zero, m_zero
> +
> +.loop:
> + movq m0, [aq+indexq*1]
movu
> + movq m1, m0
> + punpcklbw m0, m_zero
> + punpckhbw m1, m_zero
Here and later, you can do:
punpckhbw m1, m0, m_zero
punpcklbw m0, m_zero
> +
> + movq m3, [cq+indexq*1]
movu
> + movq m4, m3
mova
etc
> + punpcklbw m3, m_zero
> + punpckhbw m4, m_zero
> +
> + paddsw m0, m3
> + paddsw m1, m4
> +
> + movq m3, [bq+indexq*1]
> + movq m4, m3
> + punpcklbw m3, m_zero
> + punpckhbw m4, m_zero
> +
> + paddw m3, m3
> + paddw m4, m4
> + psubsw m0, m3
> + psubsw m1, m4
> +
> + ABS1 m0, m5
> + ABS1 m1, m5
ABS2?
> + paddw m0, m1
> + movq m1, m0
> + punpcklwd m0, m_zero
> + punpckhwd m1, m_zero
> + paddd m0, m1
> + paddd m_sum, m0
> +
> + add indexq, 0x8
> + CMP widthq, indexq
Someone needs to confirm this, but I think you'll need to make width a
ptrdiff_t and not an int
Also... stupid question but what's CMP?
> + jg .loop
> +
> + movq m0, m_sum
> + psrlq m_sum, 0x20
> + paddq m0, m_sum
> + movd eax, m0
> + RET
> +%endmacro
> +
> +%if ARCH_X86_32
> +INIT_MMX mmxext
> +IDET_FILTER_LINE
> +
> +INIT_MMX mmx
> +IDET_FILTER_LINE
> +%endif
> +
> +;; SSE2 8-bit implementation that does 16-bytes at a time:
> +;;
> +;; const int w2 = w >> 4
> +;; for (int i = 0; i < w2, ++i) {
> +;; const __m128i A = _mm_loadu_si128(&p_a[i])
> +;; const __m128i B = _mm_loadu_si128(&p_b[i])
> +;; const __m128i C = _mm_loadu_si128(&p_c[i])
> +;; const __m128i ab = _mm_subs_epu8(A, B)
> +;; const __m128i ba = _mm_subs_epu8(B, A)
> +;; const __m128i bc = _mm_subs_epu8(B, C)
> +;; const __m128i cb = _mm_subs_epu8(C, B)
> +;; const __m128i s1 = _mm_sad_epu8(ab, bc)
> +;; const __m128i s2 = _mm_sad_epu8(ba, cb)
> +;; result1 = _mm_add_epi64(result1, s1)
> +;; result2 = _mm_add_epi64(result2, s2)
> +;; }
please drop this
> +INIT_XMM sse2
> +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index, total
> + xor indexq, indexq
> + pxor m0, m0
> + pxor m1, m1
> +
> +.sse2_loop:
> + movdqu m2, [bq+indexq*1] ; B
> + movdqu m3, [aq+indexq*1] ; A
movu
> + movdqa m5, m2
> + movdqa m6, m2
> + movdqa m4, m3
mova
etc
> + psubusb m5, m3 ; ba
psubusb m5, m2, m3 and drop the previous mov
> +
> + movdqu m3, [cq+indexq*1] ; C
> + add indexq, 0x10
> + psubusb m4, m2 ; ab
> + CMP indexq, widthq
> +
> + psubusb m6, m3 ; bc
> + psubusb m3, m2 ; cb
> +
> + psadbw m4, m6 ; |ab - bc|
> + paddq m0, m4
> + psadbw m5, m3 ; |ba - cb|
> + paddq m1, m5
> + jl .sse2_loop
> +
> + paddq m0, m1
> + movhlps m1, m0
> + paddq m0, m1
> + movd eax, m0
> + RET
> diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c
> new file mode 100644
> index 0000000..98086ef
> --- /dev/null
> +++ b/libavfilter/x86/vf_idet_init.c
> @@ -0,0 +1,71 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/vf_idet.h"
> +
> +/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */
> +#define FUNC_MAIN_DECL(KIND, SPAN) \
> +int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
> + const uint8_t *c, int w); \
> +static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
> + const uint8_t *c, int w) { \
> + int sum = 0; \
> + const int left_over = w % SPAN; \
> + w -= left_over; \
> + if (w > 0) \
> + sum += ff_idet_filter_line_##KIND(a, b, c, w); \
> + if (left_over > 0) \
> + sum += ff_idet_filter_line_c(a + w, b + w, c + w, left_over); \
> + return sum; \
> +}
wrong indent, 4 spaces
[...]
--
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140903/cd6598fe/attachment.asc>
More information about the ffmpeg-devel
mailing list