[FFmpeg-cvslog] Merge commit '994c4bc10751e39c7ed9f67ffd0c0dea5223daf2'
James Almer
git at videolan.org
Sat Oct 21 18:26:47 EEST 2017
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Sat Oct 21 12:07:16 2017 -0300| [2904db90458a1253e4aea6844ba9a59ac11923b6] | committer: James Almer
Merge commit '994c4bc10751e39c7ed9f67ffd0c0dea5223daf2'
* commit '994c4bc10751e39c7ed9f67ffd0c0dea5223daf2':
x86util: Port all macros to cpuflags
See d5f8a642f6eb1c6e305c41dabddd0fd36ffb3f77
Merged-by: James Almer <jamrial at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2904db90458a1253e4aea6844ba9a59ac11923b6
---
libavcodec/x86/audiodsp.asm | 5 +----
libavfilter/x86/yadif-16.asm | 24 ------------------------
libavutil/x86/x86util.asm | 42 ++++++++++++++++++++++++++----------------
libswscale/x86/scale.asm | 10 +---------
4 files changed, 28 insertions(+), 53 deletions(-)
diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index 3973808ca5..de395e5fa8 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -62,7 +62,7 @@ SCALARPRODUCT
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
-; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
+; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
@@ -118,14 +118,11 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%endmacro
INIT_MMX mmx
-%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
-%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
-%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
diff --git a/libavfilter/x86/yadif-16.asm b/libavfilter/x86/yadif-16.asm
index 79d127dfaa..9053b378a5 100644
--- a/libavfilter/x86/yadif-16.asm
+++ b/libavfilter/x86/yadif-16.asm
@@ -54,30 +54,6 @@ SECTION .text
%endif
%endmacro
-%macro PMINSD 3
-%if cpuflag(sse4)
- pminsd %1, %2
-%else
- mova %3, %2
- pcmpgtd %3, %1
- pand %1, %3
- pandn %3, %2
- por %1, %3
-%endif
-%endmacro
-
-%macro PMAXSD 3
-%if cpuflag(sse4)
- pmaxsd %1, %2
-%else
- mova %3, %1
- pcmpgtd %3, %2
- pand %1, %3
- pandn %3, %2
- por %1, %3
-%endif
-%endmacro
-
%macro PMAXUW 2
%if cpuflag(sse4)
pmaxuw %1, %2
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index e1220dfc1a..21419125d5 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -357,7 +357,7 @@
%endif
%endmacro
-%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3)
+%macro ABSB 2 ; source mmreg, temp mmreg (unused for SSSE3)
%if cpuflag(ssse3)
pabsb %1, %1
%else
@@ -381,7 +381,7 @@
%endif
%endmacro
-%macro ABSD2_MMX 4
+%macro ABSD2 4
pxor %3, %3
pxor %4, %4
pcmpgtd %3, %1
@@ -475,7 +475,7 @@
%else
palignr %1, %2, %3
%endif
-%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
+%else ; [dst,] src1, src2, imm, tmp
%define %%dst %1
%if %0==5
%ifnidn %1, %2
@@ -799,37 +799,47 @@
pminsw %1, %3
%endmacro
-%macro PMINSD_MMX 3 ; dst, src, tmp
+%macro PMINSD 3 ; dst, src, tmp/unused
+%if cpuflag(sse4)
+ pminsd %1, %2
+%elif cpuflag(sse2)
+ cvtdq2ps %1, %1
+ minps %1, %2
+ cvtps2dq %1, %1
+%else
mova %3, %2
pcmpgtd %3, %1
pxor %1, %2
pand %1, %3
pxor %1, %2
+%endif
%endmacro
-%macro PMAXSD_MMX 3 ; dst, src, tmp
+%macro PMAXSD 3 ; dst, src, tmp/unused
+%if cpuflag(sse4)
+ pmaxsd %1, %2
+%else
mova %3, %1
pcmpgtd %3, %2
pand %1, %3
pandn %3, %2
por %1, %3
+%endif
%endmacro
-%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
- PMINSD_MMX %1, %3, %4
- PMAXSD_MMX %1, %2, %4
-%endmacro
-
-%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
+%macro CLIPD 3-4
+%if cpuflag(sse4); src/dst, min, max, unused
+ pminsd %1, %3
+ pmaxsd %1, %2
+%elif cpuflag(sse2) ; src/dst, min (float), max (float), unused
cvtdq2ps %1, %1
minps %1, %3
maxps %1, %2
cvtps2dq %1, %1
-%endmacro
-
-%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
- pminsd %1, %3
- pmaxsd %1, %2
+%else ; src/dst, min, max, tmp
+ PMINSD %1, %3, %4
+ PMAXSD %1, %2, %4
+%endif
%endmacro
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index f9781703a9..83cabff722 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -364,15 +364,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
-%if mmsize == 8
- PMINSD_MMX m0, m2, m4
-%elif cpuflag(sse4)
- pminsd m0, m2
-%else ; sse2/ssse3
- cvtdq2ps m0, m0
- minps m0, m2
- cvtps2dq m0, m0
-%endif ; mmx/sse2/ssse3/sse4
+ PMINSD m0, m2, m4
%ifnidn %3, X
mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
======================================================================
diff --cc libavfilter/x86/yadif-16.asm
index 79d127dfaa,0000000000..9053b378a5
mode 100644,000000..100644
--- a/libavfilter/x86/yadif-16.asm
+++ b/libavfilter/x86/yadif-16.asm
@@@ -1,317 -1,0 +1,293 @@@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni at gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang at gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+pw_8000: times 8 dw 0x8000
+pd_1: times 4 dd 1
+pd_8000: times 4 dd 0x8000
+
+SECTION .text
+
+%macro PABS 2
+%if cpuflag(ssse3)
+ pabsd %1, %1
+%else
+ pxor %2, %2
+ pcmpgtd %2, %1
+ pxor %1, %2
+ psubd %1, %2
+%endif
+%endmacro
+
+%macro PACK 1
+%if cpuflag(sse4)
+ packusdw %1, %1
+%else
+ psubd %1, [pd_8000]
+ packssdw %1, %1
+ paddw %1, [pw_8000]
+%endif
+%endmacro
+
- %macro PMINSD 3
- %if cpuflag(sse4)
- pminsd %1, %2
- %else
- mova %3, %2
- pcmpgtd %3, %1
- pand %1, %3
- pandn %3, %2
- por %1, %3
- %endif
- %endmacro
-
- %macro PMAXSD 3
- %if cpuflag(sse4)
- pmaxsd %1, %2
- %else
- mova %3, %1
- pcmpgtd %3, %2
- pand %1, %3
- pandn %3, %2
- por %1, %3
- %endif
- %endmacro
-
+%macro PMAXUW 2
+%if cpuflag(sse4)
+ pmaxuw %1, %2
+%else
+ psubusw %1, %2
+ paddusw %1, %2
+%endif
+%endmacro
+
+%macro CHECK 2
+ movu m2, [curq+t1+%1*2]
+ movu m3, [curq+t0+%2*2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgw m5, m3
+ pand m4, [pw_1]
+ psubusw m5, m4
+ RSHIFT m5, 2
+ punpcklwd m5, m7
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+ mova m3, m2
+ mova m4, m2
+ RSHIFT m3, 2
+ RSHIFT m4, 4
+ punpcklwd m2, m7
+ punpcklwd m3, m7
+ punpcklwd m4, m7
+ paddd m2, m3
+ paddd m2, m4
+%endmacro
+
+%macro CHECK1 0
+ mova m3, m0
+ pcmpgtd m3, m2
+ PMINSD m0, m2, m6
+ mova m6, m3
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+%macro CHECK2 0
+ paddd m6, [pd_1]
+ pslld m6, 30
+ paddd m2, m6
+ mova m3, m0
+ pcmpgtd m3, m2
+ PMINSD m0, m2, m4
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
+; am not sure whether it is any faster. A rewrite or refactor of the filter
+; code should make it possible to eliminate the move instruction at the end. It
+; exists to satisfy the expectation that the "score" values are in m1.
+
+; %macro CHECK2 0
+; mova m3, m0
+; pcmpgtd m0, m2
+; pand m0, m6
+; mova m6, m0
+; pand m5, m6
+; pand m2, m0
+; pandn m6, m1
+; pandn m0, m3
+; por m6, m5
+; por m0, m2
+; mova m1, m6
+; %endmacro
+
+%macro LOAD 2
+ movh %1, %2
+ punpcklwd %1, m7
+%endmacro
+
+%macro FILTER 3
+.loop%1:
+ pxor m7, m7
+ LOAD m0, [curq+t1]
+ LOAD m1, [curq+t0]
+ LOAD m2, [%2]
+ LOAD m3, [%3]
+ mova m4, m3
+ paddd m3, m2
+ psrad m3, 1
+ mova [rsp+ 0], m0
+ mova [rsp+16], m3
+ mova [rsp+32], m1
+ psubd m2, m4
+ PABS m2, m4
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
+ psubd m3, m0
+ psubd m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddd m3, m4
+ psrld m2, 1
+ psrld m3, 1
+ PMAXSD m2, m3, m6
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
+ psubd m3, m0
+ psubd m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddd m3, m4
+ psrld m3, 1
+ PMAXSD m2, m3, m6
+ mova [rsp+48], m2
+
+ paddd m1, m0
+ paddd m0, m0
+ psubd m0, m1
+ psrld m1, 1
+ PABS m0, m2
+
+ movu m2, [curq+t1-1*2]
+ movu m3, [curq+t0-1*2]
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+ mova m3, m2
+ RSHIFT m3, 4
+ punpcklwd m2, m7
+ punpcklwd m3, m7
+ paddd m0, m2
+ paddd m0, m3
+ psubd m0, [pd_1]
+
+ CHECK -2, 0
+ CHECK1
+ CHECK -3, 1
+ CHECK2
+ CHECK 0, -2
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48]
+ cmp DWORD r8m, 2
+ jge .end%1
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
+ paddd m2, m4
+ paddd m3, m5
+ psrld m2, 1
+ psrld m3, 1
+ mova m4, [rsp+ 0]
+ mova m5, [rsp+16]
+ mova m7, [rsp+32]
+ psubd m2, m4
+ psubd m3, m7
+ mova m0, m5
+ psubd m5, m4
+ psubd m0, m7
+ mova m4, m2
+ PMINSD m2, m3, m7
+ PMAXSD m3, m4, m7
+ PMAXSD m2, m5, m7
+ PMINSD m3, m5, m7
+ PMAXSD m2, m0, m7
+ PMINSD m3, m0, m7
+ pxor m4, m4
+ PMAXSD m6, m3, m7
+ psubd m4, m2
+ PMAXSD m6, m4, m7
+
+.end%1:
+ mova m2, [rsp+16]
+ mova m3, m2
+ psubd m2, m6
+ paddd m3, m6
+ PMAXSD m1, m2, m7
+ PMINSD m1, m3, m7
+ PACK m1
+
+ movh [dstq], m1
+ add dstq, mmsize/2
+ add prevq, mmsize/2
+ add curq, mmsize/2
+ add nextq, mmsize/2
+ sub DWORD r4m, mmsize/4
+ jg .loop%1
+%endmacro
+
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+ mov r4, r5mp
+ mov r5, r6mp
+ DECLARE_REG_TMP 4,5
+%else
+ movsxd r5, DWORD r5m
+ movsxd r6, DWORD r6m
+ DECLARE_REG_TMP 5,6
+%endif
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq
+
+.ret:
+ RET
+%endmacro
+
+INIT_XMM sse4
+YADIF
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif
diff --cc libavutil/x86/x86util.asm
index e1220dfc1a,81a3ada6e6..21419125d5
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@@ -825,32 -643,18 +836,31 @@@
minps %1, %3
maxps %1, %2
cvtps2dq %1, %1
- %endmacro
-
- %macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
- pminsd %1, %3
- pmaxsd %1, %2
+ %else ; src/dst, min, max, tmp
+ PMINSD %1, %3, %4
+ PMAXSD %1, %2, %4
+ %endif
%endmacro
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
- vbroadcastss %1, %2
-%else ; sse
- movss %1, %2
- shufps %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+ vbroadcastss %1, %2
+%elif cpuflag(avx)
+ %ifnum sizeof%2 ; avx1 register
+ shufps xmm%1, xmm%2, xmm%2, q0000
+ %if sizeof%1 >= 32 ; mmsize>=32
+ vinsertf128 %1, %1, xmm%1, 1
+ %endif
+ %else ; avx1 memory
+ vbroadcastss %1, %2
+ %endif
+%else
+ %ifnum sizeof%2 ; sse register
+ shufps %1, %2, %2, q0000
+ %else ; sse memory
+ movss %1, %2
+ shufps %1, %1, 0
+ %endif
%endif
%endmacro
More information about the ffmpeg-cvslog
mailing list