[FFmpeg-devel] [PATCH v2 5/5] avfilter/vf_yadif: Add x86_64 avx yadif asm
Chris Phlipot
cphlipot0 at gmail.com
Thu Jul 21 05:25:14 EEST 2022
Add a new version of yadif_filter_line implemented using packed bytes
instead of the packed words used by the current implementation. As a
result, this implementation runs almost 2x as fast as the current
fastest SSSE3 implementation.

This implementation is written from scratch based on the C code, with
the goal of keeping all intermediate values within 8 bits so that the
vectorized code can operate on packed bytes. The differences are as
follows:
- Use algorithms that compute the average and the absolute difference
  using only 8-bit intermediate values (see the first sketch after this
  list).
- Rework the mode 1 code by applying various mathematical identities
  to keep all intermediate values within 8 bits (second sketch).
- Attempt to compute the spatial score using only 8 bits. The actual
  spatial score fits within this range more than 97% of the time
  (content dependent) for the entire 128-bit xmm vector. When the
  spatial score needs more than 8 bits to be represented, we detect
  this and fall back to a slow path that recomputes the spatial score
  using 16-bit packed words instead (third sketch).
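
The two byte-wise building blocks behave like the following scalar C
(a minimal sketch for illustration only; the helper names are invented
here and do not appear in the patch):

    #include <stdint.h>

    /* (a + b) >> 1 rounded down, staying within 8 bits. pavgb computes
     * (a + b + 1) >> 1 (rounded up); subtracting the low bit of a ^ b,
     * which is set exactly when a + b is odd, corrects the rounding. */
    static uint8_t avg_down_u8(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1) - ((a ^ b) & 1);
    }

    /* |a - b| as two saturating subtractions (psubusb) OR'd together;
     * one of the two terms is always zero. */
    static uint8_t absdiff_u8(uint8_t a, uint8_t b)
    {
        uint8_t d0 = a > b ? a - b : 0;
        uint8_t d1 = b > a ? b - a : 0;
        return d0 | d1;
    }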
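The mode 1 rework corresponds to the following scalar sketch, using the
b/c/d/e/f naming from the C FILTER code. Because diff is already
non-negative, saturating subtractions that clamp negative intermediates
to zero never change the final result:

    static uint8_t usub_sat(uint8_t a, uint8_t b) { return a > b ? a - b : 0; }
    static uint8_t u8min(uint8_t a, uint8_t b)    { return a < b ? a : b; }
    static uint8_t u8max(uint8_t a, uint8_t b)    { return a > b ? a : b; }

    /* diff = FFMAX3(diff, min, -max), rewritten with
     *   -max = FFMIN(FFMIN(e, c) - d, FFMAX(c - b, e - f))
     *    min = FFMIN3(d - e, d - c, FFMAX(b - c, f - e))  */
    static uint8_t mode1_diff(uint8_t diff, uint8_t b, uint8_t c,
                              uint8_t d, uint8_t e, uint8_t f)
    {
        uint8_t neg_max = u8min(usub_sat(u8min(e, c), d),
                                u8max(usub_sat(c, b), usub_sat(e, f)));
        uint8_t min     = u8min(usub_sat(d, u8max(e, c)),
                                u8max(usub_sat(b, c), usub_sat(f, e)));
        return u8max(diff, u8max(min, neg_max));
    }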
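The 8-bit spatial score path is built on saturating byte adds; any lane
that may have saturated forces the 16-bit recompute. Roughly (again an
invented-name sketch, leaving out the pb_128 bias the asm applies for
signed comparisons):

    /* Saturating byte add (paddusb). */
    static uint8_t uadd_sat(uint8_t a, uint8_t b)
    {
        unsigned s = a + b;
        return s > 255 ? 255 : (uint8_t)s;
    }

    /* Sum three absolute differences into one 8-bit score. A result of
     * 255 may have saturated, in which case the whole vector must be
     * recomputed with 16-bit words to stay bit-exact with the C code. */
    static uint8_t score_u8(uint8_t ad0, uint8_t ad1, uint8_t ad2,
                            int *may_have_saturated)
    {
        uint8_t s = uadd_sat(uadd_sat(ad0, ad1), ad2);
        *may_have_saturated = (s == 255);
        return s;
    }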
This implementation is currently limited to x86_64 due to the number
of registers required. x86_32 is possible, but the performance benefit
over the existing SSSE3 implementation is not as great, due to all of
the stack spills that would result from having far fewer registers. ASM
was not generated for the 32-bit variant due to limited ROI, as most
AVX users are likely on a 64-bit OS at this point and 32-bit users
would lose out on most of the performance benefit.
Signed-off-by: Chris Phlipot <cphlipot0 at gmail.com>
---
libavfilter/x86/Makefile | 2 +-
libavfilter/x86/vf_yadif_init.c | 11 +
libavfilter/x86/vf_yadif_x64.asm | 492 +++++++++++++++++++++++++++++++
3 files changed, 504 insertions(+), 1 deletion(-)
create mode 100644 libavfilter/x86/vf_yadif_x64.asm
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index e87481bd7a..19161ffa23 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -80,4 +80,4 @@ X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose.o
X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o
X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o
-X86ASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
+X86ASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/vf_yadif_x64.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 9dd73f8e44..1369081690 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -29,6 +29,11 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
+#if ARCH_X86_64
+void ff_yadif_filter_line_avx(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+#endif
void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
@@ -71,5 +76,11 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
yadif->filter_line = ff_yadif_filter_line_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
yadif->filter_line = ff_yadif_filter_line_ssse3;
+#if ARCH_X86_64
+ if (EXTERNAL_AVX(cpu_flags)) {
+ yadif->filter_line = ff_yadif_filter_line_avx;
+ yadif->req_align = 16;
+ }
+#endif
}
}
diff --git a/libavfilter/x86/vf_yadif_x64.asm b/libavfilter/x86/vf_yadif_x64.asm
new file mode 100644
index 0000000000..c7e236ade7
--- /dev/null
+++ b/libavfilter/x86/vf_yadif_x64.asm
@@ -0,0 +1,492 @@
+;******************************************************************************
+;* Copyright (C) 2006-2011 Michael Niedermayer <michaelni at gmx.at>
+;* 2010 James Darnley <james.darnley at gmail.com>
+;* 2013-2022 Chris Phlipot <cphlipot0 at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%if ARCH_X86_64
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pb_127: times 16 db 127
+pb_128: times 16 db 128
+
+SECTION .text
+
+; Rename a register so that it can be used for a new purpose. The old name
+; becomes undefined, so any additional use of the old name will result in a
+; compiler/assembler error.
+%macro RENAME_REGISTER 2
+ %ifidni %1,%2
+ %error "Can't rename a register to itself."
+ %endif
+ %xdefine %1 %2
+ %undef %2
+%endmacro
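+; e.g. "RENAME_REGISTER prev2, curq" makes prev2 refer to the register
+; previously named curq; any later use of the name curq is an error.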
+
+; Usage: dst, arg1, arg2, temp1
+; Compute the absolute difference of arg1 and arg2 and place it in dst.
+; All operations are performed using packed bytes. Unlike ARM NEON, there is
+; no single instruction to do this, so instead we emulate it with multiple
+; instructions.
+; e.g. dst = abs(arg1 - arg2)
+%macro absdif_pb 4
+ %ifidni %1,%3
+ %error "arg1 and arg3 must be different"
+ %elifidni %1,%4
+ %error "arg1 and arg4 must be different"
+ %elifidni %3,%4
+ %error "arg3 and arg4 must be different"
+ %endif
+ psubusb %4, %3, %2
+ psubusb %1, %2, %3
+ por %1, %1, %4
+%endmacro
+
+; Usage: dst, arg1, arg2, pb_1, temp1
+; Compute the average of 2 unsigned values rounded down.
+; SSE provides pavgb, which rounds up. Unlike ARM NEON, SSE doesn't provide
+; an instruction that computes the avg of 2 unsigned bytes rounded down, so
+; instead we emulate it with this macro.
+; e.g. dst = (arg1 + arg2) >> 1
+%macro avg_truncate_pb 5
+ %ifidni %1,%3
+ %error "arg1 and arg3 must be different"
+ %elifidni %1,%4
+ %error "arg1 and arg4 must be different"
+ %elifidni %1,%5
+ %error "arg1 and arg5 must be different"
+ %endif
+ pxor %5, %2, %3
+ pavgb %1, %2, %3
+ pand %5, %5, %4
+ psubb %1, %1, %5
+%endmacro
+
+INIT_XMM avx
+
+cglobal yadif_filter_line, 5, 15, 8, 240, dst, prev, cur, next, width, prefs, \
+ mrefs, parity, mode
+%xdefine cur_plus_prefs r5
+%xdefine cur_plus_mrefs r6
+%xdefine prefs r7
+%xdefine next2 r8
+%xdefine prev2_2mrefs r9
+%xdefine mrefs r10
+%xdefine prev2_2prefs r11
+%xdefine next2_2mrefs r12
+%xdefine prev_plus_mrefs r13
+%xdefine next_plus_mrefs r14
+%xdefine prev2_2mrefs_stack_spill [rsp - 24]
+%xdefine pb_1_reg m15
+
+%xdefine old_absdif_ahead_stack [rsp - 128]
+%xdefine absdif_here [rsp - 80]
+%xdefine absdif_behind [rsp - 64]
+
+%xdefine spatial_predicate_stack [rsp - 112]
+%xdefine spatial_pred_check_minus_1 [rsp - 16]
+
+; Unaligned loads are slower than aligned loads. It is often beneficial to
+; store a value to an aligned location after the initial unaligned load so
+; that all future loads of that value will be aligned.
+%xdefine cur_plus_prefs_x_stack [rsp]
+%xdefine cur_plus_mrefs_x_stack [rsp + 16]
+%xdefine cur_plus_mrefs_x_2_stack [rsp + 96]
+%xdefine cur_plus_prefs_x_minus_2 [rsp + 80]
+
+; Absolute differences used for CHECK(-1)
+%xdefine chkneg1_ad2_stack [rsp - 96]
+%xdefine chkneg1_ad1_stack [rsp + 176]
+%xdefine chkneg1_ad0_stack [rsp - 48]
+
+; Absolute differences used for CHECK(-2)
+%xdefine chkneg2_ad2_stack [rsp + 160]
+%xdefine chkneg2_ad1_stack [rsp + 144]
+%xdefine chkneg2_ad0_stack [rsp + 208]
+
+; Absolute differences used for CHECK(1)
+%xdefine chkpos1_ad2_stack [rsp + 112]
+%xdefine chkpos1_ad1_stack [rsp + 128]
+; chkpos1_ad0 has no stack location since it is kept in a register.
+
+; Absolute differences used for CHECK(2)
+%xdefine chkpos2_ad2_stack [rsp + 64]
+%xdefine chkpos2_ad1_stack [rsp + 48]
+%xdefine chkpos2_ad0_stack [rsp + 32]
+
+ movsxd prefs, DWORD prefsm
+ movsxd mrefs, DWORD mrefsm
+; Bail out early if width is zero or negative.
+ test widthd, widthd
+ jle .return
+
+; Initialize all pointers. Unlike the C code, the pointers all point to the
+; location where x equals 0 and remain unchanged, rather than being
+; incremented on every loop iteration. Only x is incremented, and x86
+; memory addressing is used to add the current value of x on every memory
+; access at (most likely) zero cost.
+ lea cur_plus_prefs, [curq + prefs]
+ movu m0, [curq + prefs - 1]
+ lea cur_plus_mrefs, [curq + mrefs]
+ movu m1, [curq + mrefs - 1]
+ absdif_pb m0, m0, m1, m5
+ cmp dword paritym, 0
+ mov next2, curq
+RENAME_REGISTER prev2, curq
+ cmove next2, nextq
+ pslldq m8, m0, 14
+ mova old_absdif_ahead_stack, m8
+ cmovne prev2, prevq
+ lea prev_plus_mrefs, [prevq + mrefs]
+ add prevq, prefs
+RENAME_REGISTER prev_plus_prefs, prevq
+ lea next_plus_mrefs, [nextq + mrefs]
+ add nextq, prefs
+RENAME_REGISTER next_plus_prefs, nextq
+ lea prev2_2mrefs, [prev2 + 2*mrefs]
+ mov prev2_2mrefs_stack_spill, prev2_2mrefs
+ lea prev2_2prefs, [prev2 + 2*prefs]
+ lea next2_2mrefs, [next2 + 2*mrefs]
+RENAME_REGISTER next2_2prefs, mrefs
+ lea next2_2prefs, [next2 + 2*prefs]
+RENAME_REGISTER x, prefs
+ xor x, x
+ mova pb_1_reg, [pb_1]
+ mov prev2_2mrefs, prev2_2mrefs_stack_spill
+
+ jmp .loop_start
+.loop_tail:
+ paddusb m3, m2, m1
+ pminub m0, m9, m3
+ psubusb m2, m2, m1
+ pmaxub m0, m0, m2
+ movu [dstq + x], m0
+ add x, 16
+ cmp x, widthq
+ jge .return
+.loop_start:
+; Start by computing the spatial score.
+; We attempt to compute the spatial score using saturated adds. In
+; real-world content, the entire 16-byte spatial score xmm vector can be
+; accurately represented in 8 bits more than 97% of the time (content
+; dependent). Because of this, we first try computing the spatial score
+; with 8-bit values, since it is 2x as fast, and afterwards check whether
+; the computation saturated.
+; The original spatial score can potentially be in the range -1 to 765.
+; For this approach, we map the lower end of that onto the 8-bit range
+; -128 to 127.
+; If we detect that this assumption may have failed, we instead re-compute
+; the spatial score using the full 16-bit range needed to represent
+; -1 to 765.
+;
+; Before we compute the spatial score, we pre-compute most of the absolute
+; difference values used in the C code's CHECK() macros. These absolute
+; differences are then stored to the stack so that they can be re-used by
+; the slower 16-bit spatial score approach in case it is needed.
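+;
+; For reference, the C code computes the initial score as
+;   spatial_score = FFABS(cur[mrefs-1] - cur[prefs-1]) + FFABS(c - e)
+;                 + FFABS(cur[mrefs+1] - cur[prefs+1]) - 1
+; where c = cur[mrefs] and e = cur[prefs], and each CHECK(j) score as the
+; sum of three absolute differences.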
+ movu m6, [cur_plus_mrefs + x - 3]
+ movu m11, [cur_plus_mrefs + x - 2]
+ movu m2, [cur_plus_mrefs + x - 1]
+ movu m3, [cur_plus_mrefs + x]
+ movu m13, [cur_plus_prefs + x]
+ movu m0, [cur_plus_mrefs + x + 1]
+ movu m1, [cur_plus_prefs + x + 1]
+ absdif_pb m14, m0, m1, m5 ; abs(cur[mrefs+1]-cur[prefs+1])
+ avg_truncate_pb m10, m13, m3, pb_1_reg, m5 ; spatial_pred = (c + e) >> 1
+ mova spatial_predicate_stack, m10
+ movu m7, [cur_plus_prefs + x + 2]
+ absdif_pb m10, m11, m13, m5 ; abs(cur[mrefs-2]-cur[prefs])
+ mova chkneg1_ad2_stack, m10
+ absdif_pb m8, m2, m1, m5 ; abs(cur[mrefs-1]-cur[prefs+1])
+ absdif_pb m9, m3, m7, m5 ; abs(cur[mrefs]-cur[prefs+2])
+ absdif_pb m10, m6, m1, m5 ; abs(cur[mrefs-3]-cur[prefs+1])
+ mova chkneg2_ad2_stack, m10
+ absdif_pb m10, m11, m7, m5 ; abs(cur[mrefs-2]-cur[prefs+2])
+ mova chkneg2_ad1_stack, m10
+ movu m4, [cur_plus_prefs + x + 3]
+ absdif_pb m10, m2, m4, m5 ; abs(cur[mrefs-1]-cur[prefs+3])
+ mova chkneg2_ad0_stack, m10
+ movu m12, [cur_plus_mrefs + x + 2]
+ absdif_pb m10, m12, m13, m5 ; abs(cur[mrefs+2]-cur[prefs])
+ mova cur_plus_prefs_x_stack, m13
+ mova chkpos1_ad2_stack, m10
+ movu m6, [cur_plus_prefs + x - 1]
+ absdif_pb m10, m0, m6, m5 ; abs(cur[mrefs+1]-cur[prefs-1])
+ mova chkpos1_ad1_stack, m10
+ movu m10, [cur_plus_prefs + x - 2]
+ mova cur_plus_mrefs_x_stack, m3
+ absdif_pb m13, m10, m3, m5 ; abs(cur[mrefs]-cur[prefs-2])
+ movu m4, [cur_plus_mrefs + x + 3]
+ absdif_pb m4, m4, m6, m5 ; abs(cur[mrefs+3]-cur[prefs-1])
+ mova chkpos2_ad2_stack, m4
+ absdif_pb m3, m12, m10, m5 ; abs(cur[mrefs+2]-cur[prefs-2])
+ mova chkpos2_ad1_stack, m3
+ movu m4, [cur_plus_prefs + x - 3]
+ absdif_pb m3, m0, m4, m5 ; abs(cur[mrefs+1]-cur[prefs-3])
+ mova chkpos2_ad0_stack, m3
+ mova chkneg1_ad1_stack, m8
+ paddusb m5, m8, chkneg1_ad2_stack
+ mova chkneg1_ad0_stack, m9
+ paddusb m4, m9, pb_1_reg
+ paddusb m5, m5, m4
+ mova m3, old_absdif_ahead_stack
+ palignr m4, m14, m3, 15
+ palignr m3, m14, m3, 14
+ mova old_absdif_ahead_stack, m14
+ mova absdif_here, m4
+ paddusb m4, m14, m4
+ mova absdif_behind, m3
+ paddusb m4, m4, m3
+ pxor m4, m4, [pb_128]
+ pxor m5, m5, [pb_128]
+ pcmpgtb m8, m4, m5
+ pcmpeqb m14, m4, [pb_127]
+ por m8, m8, m14
+ pminsb m4, m4, m5
+ avg_truncate_pb m1, m1, m2, pb_1_reg, m5
+ mova spatial_pred_check_minus_1, m1
+ mova m1, chkneg2_ad1_stack
+ paddusb m2, m1, chkneg2_ad2_stack
+ paddusb m5, pb_1_reg, chkneg2_ad0_stack
+ paddusb m2, m2, m5
+ avg_truncate_pb m11, m11, m7, pb_1_reg, m5
+ mova m3, chkpos1_ad1_stack
+ paddusb m3, m3, chkpos1_ad2_stack
+ paddusb m5, m13, pb_1_reg
+ paddusb m7, m3, m5
+ avg_truncate_pb m3, m6, m0, pb_1_reg, m5
+ pxor m0, m2, [pb_128]
+ pcmpgtb m2, m4, m0
+ pand m5, m8, m2
+ pblendvb m6, m4, m0, m5
+ pcmpeqb m0, m4, [pb_127]
+ pand m2, m5, m0
+ pxor m7, m7, [pb_128]
+ pminsb m0, m6, m7
+ pcmpeqb m4, m0, [pb_127]
+ por m2, m4, m2
+ ptest m2, m2
+ jne .spatial_check_16_bit
+; At this point we know if we can continue on the fast path with saturating
+; spatial score computation while maintaining bit-accuracy, or if we need to
+; bail out and perform the spatial score computation using full 16-bit words
+; to store the score value. check_2_saturate is only executed here if we know
+; we don't need to go down the slow path.
+.check_2_saturate:
+ mova m2, spatial_predicate_stack
+ pblendvb m1, m2, spatial_pred_check_minus_1, m8
+ pblendvb m1, m11, m5
+ pcmpgtb m2, m6, m7
+ pcmpeqb m5, m6, [pb_127]
+ por m2, m5, m2
+ pblendvb m1, m3, m2
+ mova m3, chkpos2_ad1_stack
+ paddusb m3, m3, chkpos2_ad2_stack
+ paddusb m4, pb_1_reg, chkpos2_ad0_stack
+ paddusb m3, m3, m4
+ pxor m3, m3, [pb_128]
+ pcmpgtb m0, m0, m3
+ pand m0, m0, m2
+ avg_truncate_pb m2, m12, m10, pb_1_reg, m5
+ pblendvb m9, m1, m2, m0
+.temporal_check:
+ mova m0, cur_plus_mrefs_x_stack
+ mova m8, cur_plus_prefs_x_stack
+ movu m1, [prev2 + x]
+ movu m6, [next2 + x]
+ avg_truncate_pb m2, m6, m1, pb_1_reg, m5
+ absdif_pb m1, m1, m6, m5
+ movu m6, [prev_plus_mrefs + x]
+ movu m4, [prev_plus_prefs + x]
+ absdif_pb m6, m6, m0, m5
+ absdif_pb m4, m4, m8, m5
+ avg_truncate_pb m6, m6, m4, pb_1_reg, m5
+ movu m4, [next_plus_mrefs + x]
+ movu m3, [next_plus_prefs + x]
+ absdif_pb m4, m4, m0, m5
+ absdif_pb m3, m3, m8, m5
+ avg_truncate_pb m4, m4, m3, pb_1_reg, m5
+ pmaxub m6, m6, m4
+ psrlw m1, m1, 1
+ pand m1, m1, [pb_127]
+ pmaxub m1, m1, m6
+ cmp DWORD modem, 1
+ jg .loop_tail
+.handle_mode_1:
+; Handle the "if (!(mode&2))" section.
+; This section has undergone some complex transformations with respect to
+; the C implementation in order to ensure that all inputs, outputs, and
+; intermediate values can be stored as 8-bit unsigned values. The code is
+; transformed with various identities to avoid signed intermediate values,
+; which would require an extra 9th bit for the sign that we don't have.
+; Two main identities are applied:
+; 1. -MAX(a-b, c-d) = MIN(b-a, d-c)
+; 2. MIN(a-c, b-c) = MIN(a, b)-c
+; The following from the C code:
+;
+; int max = FFMAX3(d-e, d-c, FFMIN(b-c, f-e));
+; diff = FFMAX3(diff, min, -max);
+;
+; becomes:
+; int negative_max = FFMIN(FFMIN(e, c)-d, FFMAX(c-b, e-f));
+; diff = FFMAX3(diff, min, negative_max);
+;
+; Lastly, we know that diff must be non-negative in the end, so
+; intermediate negative values don't matter. To keep computations
+; within 8 bits, we use saturating subtraction, which replaces all
+; negative intermediate results with 0 but doesn't affect the
+; final value assigned to diff.
+ movu m6, [prev2_2mrefs + x]
+ movu m4, [next2_2mrefs + x]
+ avg_truncate_pb m6, m6, m4, pb_1_reg, m5
+ movu m4, [prev2_2prefs + x]
+ movu m3, [next2_2prefs + x]
+ avg_truncate_pb m4, m4, m3, pb_1_reg, m5
+ psubusb m3, m8, m2
+ psubusb m5, m0, m2
+ pminub m3, m3, m5
+ psubusb m5, m0, m6
+ psubusb m7, m8, m4
+ pmaxub m5, m5, m7
+ pminub m3, m3, m5
+ psubusb m5, m2, m8
+ psubusb m7, m2, m0
+ pminub m5, m5, m7
+ psubusb m6, m6, m0
+ psubusb m4, m4, m8
+ pmaxub m6, m6, m4
+ pminub m6, m5, m6
+ pmaxub m6, m6, m3
+ pmaxub m1, m1, m6
+ jmp .loop_tail
+.spatial_check_16_bit:
+; If the 8-bit fast path may have saturated, we compute the spatial score
+; using packed words to store the temporary values. Every input register
+; containing packed bytes is unpacked into 2 separate registers of packed
+; words, which are then processed identically. This path should generally
+; run < 3% of the time and is kept mainly to ensure the output is
+; bit-accurate compared to the C implementation.
+ mova cur_plus_mrefs_x_2_stack, m12
+ mova cur_plus_prefs_x_minus_2, m10
+ mova m5, old_absdif_ahead_stack
+ pmovzxbw m0, m5
+ mova m4, absdif_here
+ pmovzxbw m2, m4
+ paddw m0, m0, m2
+ pxor m12, m12, m12
+ punpckhbw m2, m4, m12
+ punpckhbw m5, m5, m12
+ paddw m2, m5, m2
+ mova m7, absdif_behind
+ pmovzxbw m5, m7
+ pcmpeqd m4, m4, m4
+ paddw m5, m5, m4
+ paddw m9, m0, m5
+ punpckhbw m5, m7, m12
+ paddw m5, m5, m4
+ paddw m7, m2, m5
+ mova m0, chkneg1_ad2_stack
+ pmovzxbw m2, m0
+ mova m4, chkneg1_ad1_stack
+ pmovzxbw m5, m4
+ paddw m2, m2, m5
+ punpckhbw m5, m0, m12
+ punpckhbw m4, m4, m12
+ paddw m4, m5, m4
+ mova m0, chkneg1_ad0_stack
+ pmovzxbw m5, m0
+ paddw m5, m2, m5
+ pminsw m6, m9, m5
+ punpckhbw m2, m0, m12
+ paddw m2, m4, m2
+ pminsw m14, m2, m7
+ pcmpgtw m4, m9, m5
+ pcmpgtw m10, m7, m2
+ packsswb m0, m4, m10
+ mova m2, spatial_predicate_stack
+ pblendvb m0, m2, spatial_pred_check_minus_1, m0
+ mova spatial_predicate_stack, m0
+ mova m0, chkneg2_ad2_stack
+ pmovzxbw m2, m0
+ mova m1, chkneg2_ad0_stack
+ pmovzxbw m7, m1
+ paddw m2, m7, m2
+ mova m9, chkneg2_ad1_stack
+ pmovzxbw m7, m9
+ paddw m2, m2, m7
+ punpckhbw m7, m0, m12
+ punpckhbw m5, m1, m12
+ paddw m5, m5, m7
+ punpckhbw m7, m9, m12
+ paddw m5, m5, m7
+ mova m0, chkpos1_ad2_stack
+ pmovzxbw m7, m0
+ mova m1, chkpos1_ad1_stack
+ pmovzxbw m8, m1
+ paddw m8, m8, m7
+ punpckhbw m7, m0, m12
+ punpckhbw m0, m1, m12
+ paddw m7, m0, m7
+ pmovzxbw m1, m13
+ paddw m9, m8, m1
+ punpckhbw m0, m13, m12
+ paddw m8, m7, m0
+ pcmpgtw m0, m6, m2
+ pand m0, m0, m4
+ pcmpgtw m4, m14, m5
+ pand m4, m10, m4
+ pblendvb m1, m6, m2, m0
+ pblendvb m14, m5, m4
+ packsswb m0, m0, m4
+ mova m5, spatial_predicate_stack
+ pblendvb m0, m5, m11, m0
+ pcmpgtw m5, m1, m9
+ pcmpgtw m4, m14, m8
+ packsswb m6, m5, m4
+ pblendvb m13, m0, m3, m6
+ mova m0, chkpos2_ad2_stack
+ pmovzxbw m3, m0
+ mova m7, chkpos2_ad1_stack
+ pmovzxbw m6, m7
+ paddw m3, m6, m3
+ punpckhbw m6, m0, m12
+ punpckhbw m0, m7, m12
+ paddw m0, m0, m6
+ mova m7, chkpos2_ad0_stack
+ pmovzxbw m6, m7
+ paddw m3, m3, m6
+ punpckhbw m6, m7, m12
+ paddw m0, m0, m6
+ pminsw m1, m9, m1
+ pcmpgtw m1, m1, m3
+ pminsw m14, m8, m14
+ pcmpgtw m14, m14, m0
+ pand m1, m1, m5
+ pand m14, m14, m4
+ packsswb m14, m1, m14
+ mova m0, cur_plus_mrefs_x_2_stack
+ mova m5, cur_plus_prefs_x_minus_2
+ pxor m1, m5, m0
+ pavgb m0, m0, m5
+ pand m1, m1, [pb_1]
+ psubb m1, m0, m1
+ pblendvb m9, m13, m1, m14
+ jmp .temporal_check
+.return:
+ RET
+
+%endif ; ARCH_X86_64
--
2.25.1