[FFmpeg-cvslog] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms
Henrik Gramner
git at videolan.org
Mon May 19 17:40:34 EEST 2025
ffmpeg | branch: master | Henrik Gramner <gramner at twoorioles.com> | Fri May 16 15:18:14 2025 +0200| [fd18ae88ae736b5aabff34e17394fcd103f9e5ad] | committer: Henrik Gramner
avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=fd18ae88ae736b5aabff34e17394fcd103f9e5ad
---
 libavcodec/x86/Makefile            |    1 +
 libavcodec/x86/vp9dsp_init.c       |   15 +
 libavcodec/x86/vp9itxfm_avx512.asm | 1629 ++++++++++++++++++++++++++++++++++++
 libavutil/mem_internal.h           |    2 +
 tests/checkasm/vp9dsp.c            |   14 +-
 5 files changed, 1654 insertions(+), 7 deletions(-)
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 821c410a0f..bf752f5da2 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -184,6 +184,7 @@ X86ASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
+ x86/vp9itxfm_avx512.o \
x86/vp9itxfm_16bpp.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 8d11dbc348..4373fa3f04 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -114,7 +114,9 @@ itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
itxfm_func(iwht, iwht, 4, mmx);
itxfm_funcs(16, avx2);
+itxfm_funcs(16, avx512icl);
itxfm_func(idct, idct, 32, avx2);
+itxfm_func(idct, idct, 32, avx512icl);
#undef itxfm_func
#undef itxfm_funcs
@@ -406,6 +408,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
init_ipred(32, avx2, tm, TM_VP8);
}
+#if ARCH_X86_64
+ if (EXTERNAL_AVX512ICL(cpu_flags)) {
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx512icl;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx512icl;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx512icl;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx512icl;
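+ // A 32x32 block in VP9 is always DCT_DCT, so all four tx-type entries
+ // can share a single function: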
+ dsp->itxfm_add[TX_32X32][ADST_ADST] =
+ dsp->itxfm_add[TX_32X32][ADST_DCT] =
+ dsp->itxfm_add[TX_32X32][DCT_ADST] =
+ dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx512icl;
+ }
+#endif
+
#undef init_fpel
#undef init_subpel1
#undef init_subpel2
diff --git a/libavcodec/x86/vp9itxfm_avx512.asm b/libavcodec/x86/vp9itxfm_avx512.asm
new file mode 100644
index 0000000000..d51c50756d
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_avx512.asm
@@ -0,0 +1,1629 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2025 Two Orioles, LLC
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL
+
+SECTION_RODATA 64
+
+dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
+ db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
+ db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
+itx_perm: dq 0x0000000820150440, 0x0000000231372604
+ dq 0x0000000ca8041551, 0x00000006b9263715
+ dq 0x00000001ec9d8c62, 0x0000000bfdbfae26
+ dq 0x00000005648c9d73, 0x0000000f75aebf37
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
+pw_512: times 4 dw 512
+pw_m512: times 4 dw -512
+pw_15137_6270x2x4: times 4 dw 15137*2
+ times 4 dw 6270*2
+pw_11585_m11585x2x4: times 4 dw 11585*2
+pw_m11585_11585x2x4: times 4 dw -11585*2
+pw_11585_11585x2: times 4 dw 11585*2
+int_mshift: db 142, 150, 0, 0, 174, 182, 0, 0
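+; vpmultishiftqb control bytes: the k7-masked words receive dword >> 14 as
+; a 16-bit field, fusing the shift and pack steps into a single instruction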
+pd_8192: dd 8192
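+; Coefficients stored pre-doubled: pmulhrsw against coef*2 computes
+; (x*coef + 8192) >> 14, matching the rounding of the vpdpwssd paths with
+; a single multiply, which is what the eob-based fast paths rely on.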
+pw_804x2: times 2 dw 804*2
+pw_1606x2: times 2 dw 1606*2
+pw_3196x2: times 2 dw 3196*2
+pw_3981x2: times 2 dw 3981*2
+pw_6270x2: times 2 dw 6270*2
+pw_7005x2: times 2 dw 7005*2
+pw_7723x2: times 2 dw 7723*2
+pw_9760x2: times 2 dw 9760*2
+pw_12140x2: times 2 dw 12140*2
+pw_12665x2: times 2 dw 12665*2
+pw_13160x2: times 2 dw 13160*2
+pw_13623x2: times 2 dw 13623*2
+pw_14053x2: times 2 dw 14053*2
+pw_14449x2: times 2 dw 14449*2
+pw_14811x2: times 2 dw 14811*2
+pw_15137x2: times 2 dw 15137*2
+pw_15426x2: times 2 dw 15426*2
+pw_15679x2: times 2 dw 15679*2
+pw_15893x2: times 2 dw 15893*2
+pw_16069x2: times 2 dw 16069*2
+pw_16207x2: times 2 dw 16207*2
+pw_16305x2: times 2 dw 16305*2
+pw_16364x2: times 2 dw 16364*2
+pw_m2404x2: times 2 dw -2404*2
+pw_m4756x2: times 2 dw -4756*2
+pw_m5520x2: times 2 dw -5520*2
+pw_m8423x2: times 2 dw -8423*2
+pw_m9102x2: times 2 dw -9102*2
+pw_m10394x2: times 2 dw -10394*2
+pw_m11003x2: times 2 dw -11003*2
+pw_804_16364x2: dw 804*2, 16364*2
+pw_1606_16305x2: dw 1606*2, 16305*2
+pw_3196_16069x2: dw 3196*2, 16069*2
+pw_3981_15893x2: dw 3981*2, 15893*2
+pw_7005_14811x2: dw 7005*2, 14811*2
+pw_7723_14449x2: dw 7723*2, 14449*2
+pw_9760_13160x2: dw 9760*2, 13160*2
+pw_m2404_16207x2: dw -2404*2, 16207*2
+pw_m4756_15679x2: dw -4756*2, 15679*2
+pw_m5520_15426x2: dw -5520*2, 15426*2
+pw_m8423_14053x2: dw -8423*2, 14053*2
+pw_m9102_13623x2: dw -9102*2, 13623*2
+pw_m10394_12665x2: dw -10394*2, 12665*2
+pw_m11003_12140x2: dw -11003*2, 12140*2
+
+%macro COEF_PAIR 2-3 0
+%if %3 & 4
+pw_%1_m%2: dw %1, -%2
+%else
+pw_%1_%2: dw %1, %2
+%if %3 & 2
+pw_m%1_%2: dw -%1, %2
+%else
+pw_m%2_%1: dw -%2, %1
+%endif
+%endif
+%if %3 & 1
+pw_m%1_m%2: dw -%1, -%2
+%endif
+%endmacro
+
+COEF_PAIR 804, 16364
+COEF_PAIR 1606, 16305
+COEF_PAIR 3196, 16069, 1
+COEF_PAIR 3981, 15893
+COEF_PAIR 6270, 15137, 1
+COEF_PAIR 7005, 14811
+COEF_PAIR 7723, 14449
+COEF_PAIR 9102, 13623
+COEF_PAIR 9760, 13160
+COEF_PAIR 11585, 11585, 1
+COEF_PAIR 12140, 11003
+COEF_PAIR 12665, 10394
+COEF_PAIR 13623, 9102, 1
+COEF_PAIR 14053, 8423
+COEF_PAIR 15137, 6270
+COEF_PAIR 15426, 5520
+COEF_PAIR 15679, 4756
+COEF_PAIR 16069, 3196
+COEF_PAIR 16207, 2404
+
+; ADST16-only:
+COEF_PAIR 2404, 9760, 2
+COEF_PAIR 5520, 7005, 2
+COEF_PAIR 8423, 3981, 2
+COEF_PAIR 11003, 804, 2
+COEF_PAIR 12140, 16364, 5
+COEF_PAIR 14053, 15893, 5
+COEF_PAIR 15426, 14811, 5
+COEF_PAIR 16207, 13160, 5
+pw_11585_m11585: dw 11585, -11585
+pw_16069_m3196: dw 16069, -3196
+pw_9102_m13623: dw 9102, -13623
+pw_15137_m6270: dw 15137, -6270
+pw_6270_m15137: dw 6270, -15137
+
+%define pw_11585x2 pw_11585_11585x2
+%define pw_m11585x2 pw_m11585_11585x2x4
+
+SECTION .text
+
+%define o_base pw_512 + 128
+%define o(x) (r6 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
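+; All RODATA constants are addressed relative to a base pointer (r6) that
+; is loaded once per function; biasing the base by +128 into the section
+; keeps more constants within a signed 8-bit displacement, which shrinks
+; the instruction encodings of most constant loads.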
+
+; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
+; 16 = special_mul1, 32 = special_mul2, 64 = dst_in_tmp1
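+;
+; In the default case (flags=0, coefficients from memory), each interleaved
+; word pair (x, y) of dst/src is rotated:
+; out1 = (x*coef1 + y*coef2 + 8192) >> 14
+; out2 = (y*coef1 - x*coef2 + 8192) >> 14
+; with both halves packed back into dst (rnd holds pd_8192 at every call
+; site in this file).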
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+ mova m%2, m%4
+%if %7 & 16
+ vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
+ mova m%3, m%4
+%if %7 & 32
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%else
+ vpdpwssd m%3, m%1, m%6
+%endif
+%elif %7 & 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%elif %6 < 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, m%6
+%elif %7 & 1
+ vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
+%else
+ vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
+%endif
+%if %7 & 2
+ psrld m%2, 14
+ pslld m%3, 2
+ vpshrdd m%1, m%3, m%2, 16
+%elif %7 & 4
+ ; compared to using shifts (as above) this has better throughput,
+ ; but worse latency and requires setting up the opmask/index
+ ; registers, so only use this method for the larger transforms
+%if %7 & 64
+ pslld m%2, 2
+ vpmultishiftqb m%2{k7}, m13, m%3
+%else
+ pslld m%1, m%2, 2
+ vpmultishiftqb m%1{k7}, m13, m%3
+%endif
+%else
+ psrad m%2, 14
+ psrad m%3, 14
+%if %7 & 8 == 0
+ packssdw m%1, m%3, m%2
+%endif
+%endif
+%endmacro
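+; Example (from the 16x16 .main below): ITX_MUL2X_PACK 8, 2, 4, 10, 1606,
+; 16305, 5 rotates the packed in15/in1 words in m8 by (1606, 16305) into
+; the interleaved t8a/t15a pair.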
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
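+; e.g. ITX_MULSUB_2W 0, 4, 8, 9, 10, 11585, 11585 (the DC butterfly in
+; the 32x32 .idct16 below) gives dst1 = (src1-src2)*11585 >> 14 and
+; dst2 = (src1+src2)*11585 >> 14, i.e. roughly (src1 +/- src2)/sqrt(2).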
+%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
+ punpcklwd m%3, m%2, m%1
+ punpckhwd m%2, m%1
+%if %7 < 32
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, m%7
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, m%7
+%else
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
+%endif
+ psrad m%1, 14
+ psrad m%4, 14
+ packssdw m%1, m%4
+ mova m%4, m%5
+%if %7 < 32
+ vpdpwssd m%4, m%2, m%6
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, m%6
+%else
+ vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
+%endif
+ psrad m%4, 14
+ psrad m%2, 14
+ packssdw m%2, m%4
+%endmacro
+
+; flags: 1 = swap, 2 = invert2, 4 = invert1
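+; With flags=0 this is the sum and difference of two word-pair rotations:
+; dst1 = (rot(src1, coef1, coef2) + rot(src2, coef3, coef4) + 8192) >> 14
+; dst2 = (rot(src1, coef1, coef2) - rot(src2, coef3, coef4) + 8192) >> 14
+; packed via the same vpmultishiftqb trick as in ITX_MUL2X_PACK.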
+%macro ADST_MULSUB_4W 10-11 0 ; dst1/src1, src2, dst2, tmp[1-2], rnd, coef[1-4], flags
+ mova m%3, m%6
+%if %11 & 1
+ vpdpwssd m%3, m%1, [o(pw_m%8_%7)] {bcstd}
+%else
+ vpdpwssd m%3, m%1, [o(pw_%7_%8)] {bcstd}
+%endif
+%if %11 & 4
+ vpbroadcastd m%4, [o(pw_m%9_%10)]
+%elif %11 & 2
+ vpbroadcastd m%4, [o(pw_%9_m%10)]
+%elif %11 & 1
+ vpbroadcastd m%4, [o(pw_%10_%9)]
+%else
+ vpbroadcastd m%4, [o(pw_%9_%10)]
+%endif
+ pmaddwd m%4, m%2
+ mova m%5, m%6
+%if %11 & 4
+ vpdpwssd m%5, m%1, [o(pw_%8_m%7)] {bcstd}
+%elif %11 & 1
+ vpdpwssd m%5, m%1, [o(pw_%7_%8)] {bcstd}
+%else
+ vpdpwssd m%5, m%1, [o(pw_m%8_%7)] {bcstd}
+%endif
+%if %11 & 2
+ vpbroadcastd m%1, [o(pw_%10_%9)]
+%elif %11 & 1
+ vpbroadcastd m%1, [o(pw_%9_m%10)]
+%else
+ vpbroadcastd m%1, [o(pw_m%10_%9)]
+%endif
+ pmaddwd m%2, m%1
+ paddd m%1, m%3, m%4
+ psubd m%3, m%4
+ paddd m%4, m%5, m%2
+ psubd m%5, m%2
+ pslld m%1, 2
+ pslld m%3, 2
+ vpmultishiftqb m%1{k7}, m13, m%4
+ vpmultishiftqb m%3{k7}, m13, m%5
+%endmacro
+
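+; Expands the wrapped code with ymm instead of zmm registers; used by the
+; fast paths, which only have a quarter of the coefficients to process.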
+%macro WRAP_YMM 1+
+ INIT_YMM cpuname
+ %1
+ INIT_ZMM cpuname
+%endmacro
+
+%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset
+cglobal vp9_i%1_i%2_%3_add, 4, 5, 0, dst, stride, c, eob, tx2
+ %undef cmp
+ %define %%p1 m(vp9_i%1_%3_internal)
+ lea r6, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(vp9_i%2_%3_internal).pass2]
+%ifidn %1_%2, dct_dct
+ cmp eobd, 1
+ jne %%p1
+%else
+%if %4
+ add eobd, %4
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
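+ ; (the repeat count is the sign bit of %%end - %%p1: one jmp is emitted
+ ; when %%p1 lies further down, none when it starts right here and we
+ ; can simply fall through)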
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, 16x16, %3
+%ifidn %1_%2, dct_dct
+ movd xmm0, [o(pw_11585x2)]
+ pmulhrsw xmm3, xmm0, [cq]
+ pxor ym2, ym2
+ pmulhrsw xmm3, xmm0
+ pmulhrsw xmm3, [o(pw_512)]
+ mova [cq], xm2
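+ ; eob == 1 on this path, so its register (r3d) is reused as the loop
+ ; counter: 1 + 7 = 8 iterations of two rows each. The two pmulhrsw by
+ ; 11585*2 above apply the row and column DC scaling, and pw_512
+ ; performs the final (x + 32) >> 6 rounding.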
+ add r3d, 7
+ vpbroadcastw ym3, xmm3
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ paddw ym0, ym3
+ paddw ym1, ym3
+ packuswb ym0, ym1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
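+; Shared 16-point IDCT core; with zmm registers and %1 == 0 it also lays
+; down the .main_fast/.main entry points used by the 16x16 function, and
+; the 32x32 fast path expands it inline with %1 == 1.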
+%macro IDCT16_MAIN 0-1 0 ; idct32
+%if mmsize == 64 && %1 == 0
+.main_fast:
+%endif
+ vpbroadcastd m2, [o(pw_1606_16305x2)]
+ vpbroadcastd m4, [o(pw_m10394_12665x2)]
+ vpbroadcastd m11, [o(pw_7723_14449x2)]
+ vpbroadcastd m12, [o(pw_m4756_15679x2)]
+ pmulhrsw m8, m2 ; t8a t15a
+ vpbroadcastd m2, [o(pw_3196_16069x2)]
+ pmulhrsw m0, m4 ; t9a t14a
+ vpbroadcastd m4, [o(pw_m9102_13623x2)]
+ pmulhrsw m5, m11 ; t10a t13a
+ vpbroadcastd m11, [o(pw_11585_11585x2)]
+ pmulhrsw m1, m12 ; t11a t12a
+ vbroadcasti32x4 m12, [o(pw_15137_6270x2x4)]
+ pmulhrsw m7, m2 ; t4a t7a
+ pmulhrsw m3, m4 ; t5a t6a
+ pmulhrsw m9, m11 ; t0 t1
+ pmulhrsw m6, m12 ; t3 t2
+%if mmsize == 64 && %1 == 0
+ jmp %%main2
+ALIGN function_align
+.main:
+ punpckhwd m8, m7, m0 ; dct16 in15 in1
+ punpcklwd m9, m4, m0 ; dct4 in2 in0
+ punpckhwd m0, m3, m4 ; dct16 in7 in9
+ punpcklwd m7, m1 ; dct8 in7 in1
+ punpckhwd m1, m6 ; dct16 in3 in13
+ punpcklwd m3, m5 ; dct8 in3 in5
+ punpckhwd m5, m2 ; dct16 in11 in5
+ punpcklwd m6, m2 ; dct4 in3 in1
+ ITX_MUL2X_PACK 8, 2, 4, 10, 1606, 16305, 5 ; t8a t15a
+ ITX_MUL2X_PACK 0, 2, 4, 10, 12665, 10394, 5 ; t9a t14a
+ ITX_MUL2X_PACK 5, 2, 4, 10, 7723, 14449, 5 ; t10a t13a
+ ITX_MUL2X_PACK 1, 2, 4, 10, 15679, 4756, 5 ; t11a t12a
+ ITX_MUL2X_PACK 7, 2, 4, 10, 3196, 16069, 5 ; t4a t7a
+ ITX_MUL2X_PACK 3, 2, 4, 10, 13623, 9102, 5 ; t5a t6a
+ ITX_MUL2X_PACK 9, 2, 4, 10, 11585, 11585 ; t0 t1
+ ITX_MUL2X_PACK 6, 2, 4, 10, 6270, 15137 ; t3 t2
+%%main2:
+%endif
+ psubw m2, m8, m0 ; t9 t14
+ paddw m8, m0 ; t8 t15
+ psubw m4, m1, m5 ; t10 t13
+ paddw m1, m5 ; t11 t12
+ ITX_MUL2X_PACK 2, 0, 5, 10, 6270, 15137, (1|%1*4) ; t9a t14a
+ ITX_MUL2X_PACK 4, 0, 5, 10, m15137, 6270, (1|%1*4) ; t10a t13a
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ psubw m0, m8, m1 ; t11a t12a
+ paddw m8, m1 ; t8a t15a
+ psubw m1, m7, m3 ; t5a t6a
+ paddw m7, m3 ; t4 t7
+ pshufb m8, m5
+ pshufb m7, m5
+ paddw m3, m2, m4 ; t9 t14
+ psubw m2, m4 ; t10 t13
+%if %1
+ vpbroadcastd m12, [o(pw_11585_11585)]
+ vpbroadcastd m11, [o(pw_m11585_11585)]
+ pshufb m3, m5
+ ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6
+ ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12
+ ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a
+ packssdw m5, m11 ; t12 t13a
+ packssdw m4, m0 ; t11 t10a
+%else
+ pshufb m0, m5
+ ITX_MUL2X_PACK 1, 4, 5, 10, 11585_11585, m11585_11585, 48 ; t5 t6
+ vpbroadcastd m11, [o(pw_11585x2)]
+ punpckhqdq m5, m0, m2 ; t12a t13
+ punpcklqdq m0, m2 ; t11a t10
+ psubw m4, m5, m0
+ paddw m5, m0
+ pmulhrsw m4, m11 ; t11 t10a
+ pmulhrsw m5, m11 ; t12 t13a
+%endif
+ punpckhqdq m2, m7, m1 ; t7 t6
+ punpcklqdq m7, m1 ; t4 t5
+ psubw m1, m9, m6 ; t3 t2
+ paddw m9, m6 ; t0 t1
+ punpckhqdq m0, m8, m3 ; t15a t14
+ punpcklqdq m8, m3 ; t8a t9
+ psubw m3, m9, m2 ; t7 t6
+ paddw m9, m2 ; t0 t1
+ psubw m2, m1, m7 ; t4 t5
+ paddw m1, m7 ; t3 t2
+ psubw m7, m9, m0 ; out15 out14
+ paddw m0, m9 ; out0 out1
+ psubw m6, m1, m5 ; out12 out13
+ paddw m1, m5 ; out3 out2
+ psubw m5, m2, m4 ; out11 out10
+ paddw m2, m4 ; out4 out5
+ psubw m4, m3, m8 ; out8 out9
+ paddw m3, m8 ; out7 out6
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst, 39-23
+
+cglobal vp9_idct_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2
+ mova m15, [o(itx_perm)]
+ vpbroadcastd m10, [o(pd_8192)]
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+ sub eobd, 39
+ jl .pass1_fast
+ vpermq m0, m15, [cq+64*0]
+ vpermq m1, m15, [cq+64*1]
+ vpermq m2, m15, [cq+64*2]
+ vpermq m3, m15, [cq+64*3]
+ vpermq m4, m15, [cq+64*4]
+ vpermq m5, m15, [cq+64*5]
+ vpermq m6, m15, [cq+64*6]
+ vpermq m7, m15, [cq+64*7]
+ call .main
+ vbroadcasti32x4 m12, [o(int_shuf1)]
+ vbroadcasti32x4 m11, [o(int_shuf2)]
+ pshufb m0, m12
+ pshufb m8, m1, m11
+ pshufb m2, m12
+ pshufb m9, m3, m11
+ pshufb m4, m12
+ pshufb m14, m5, m11
+ pshufb m6, m12
+ pshufb m11, m7, m11
+ punpckhdq m1, m0, m8
+ punpckldq m0, m8
+ punpckhdq m3, m2, m9
+ punpckldq m2, m9
+ punpckhdq m5, m4, m14
+ punpckldq m4, m14
+ punpckhdq m7, m6, m11
+ punpckldq m6, m11
+.pass1_end:
+ vshufi32x4 m8, m4, m6, q3232
+ vinserti32x8 m4, ym6, 1
+ vshufi32x4 m6, m0, m2, q3232
+ vinserti32x8 m0, ym2, 1
+ vshufi32x4 m9, m5, m7, q3232
+ vinserti32x8 m5, ym7, 1
+ vshufi32x4 m7, m1, m3, q3232
+ vinserti32x8 m1, ym3, 1
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+ jmp tx2q
+.pass1_fast:
+ mova ym3, [o(dup16_perm)]
+ vbroadcasti32x4 ym9, [cq+32*0]
+ vbroadcasti32x4 ym6, [cq+32*4]
+ vpermb ym8, ym3, [cq+32*1]
+ vpermb ym0, ym3, [cq+32*7]
+ vpermb ym5, ym3, [cq+32*5]
+ vpermb ym1, ym3, [cq+32*3]
+ vpermb ym7, ym3, [cq+32*2]
+ vpermb ym3, ym3, [cq+32*6]
+ shufpd ym9, ym9, 0x0c
+ shufpd ym6, ym6, 0x0c
+ WRAP_YMM IDCT16_MAIN
+ vbroadcasti32x4 m8, [o(int_shuf1)]
+ vbroadcasti32x4 m9, [o(int_shuf2)]
+ vinserti32x8 m0, ym2, 1 ; 0 1 | 4 5
+ vinserti32x8 m4, ym6, 1 ; 8 9 | 12 13
+ vinserti32x8 m1, ym3, 1 ; 3 2 | 7 6
+ vinserti32x8 m5, ym7, 1 ; 11 10 | 15 14
+ vshufi32x4 m2, m0, m4, q3131
+ vshufi32x4 m0, m4, q2020
+ vshufi32x4 m4, m1, m5, q2020
+ vshufi32x4 m1, m5, q3131
+ pshufb m2, m8
+ pshufb m0, m8
+ pshufb m4, m9
+ pshufb m1, m9
+ punpckhdq m3, m2, m1 ; 6-7
+ punpckldq m2, m1 ; 4-5
+ punpckhdq m1, m0, m4 ; 2-3
+ punpckldq m0, m4 ; 0-1
+ jmp tx2q
+.pass2:
+ test eobd, eobd
+ jl .pass2_fast
+ call .main
+ jmp .pass2_end
+.pass2_fast:
+ punpcklqdq m9, m0, m0
+ punpckhwd m8, m0, m0
+ punpcklwd m7, m1, m1
+ punpckhwd m1, m1
+ punpcklqdq m6, m2, m2
+ punpckhwd m5, m2, m2
+ punpckhwd m0, m3, m3
+ punpcklwd m3, m3
+ call .main_fast
+.pass2_end:
+ psrldq m8, m15, 1
+ psrlq m12, m15, 12
+ psrldq m9, m15, 2
+ psrlq m13, m15, 20
+ mova m10, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ mova m11, m9
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m13, m3
+ vpbroadcastd m2, [o(pw_512)]
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m12, m6
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+ REPX {pmulhrsw x, m2}, m0, m1, m4, m5, m8, m9, m10, m11
+.pass2_end2:
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ lea r5, [dstq+strideq*8]
+ lea r6, [r4 +strideq*8]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r3 ], 1
+ vinserti32x4 m3, [r4+strideq*0], 2
+ vinserti32x4 m6, [r4+strideq*2], 2
+ vinserti32x4 m3, [r4+strideq*1], 3
+ vinserti32x4 m6, [r4+r3 ], 3
+ mova xm12, [r5+strideq*0]
+ mova xm13, [r5+strideq*2]
+ vinserti32x4 ym12, [r5+strideq*1], 1
+ vinserti32x4 ym13, [r5+r3 ], 1
+ vinserti32x4 m12, [r6+strideq*0], 2
+ vinserti32x4 m13, [r6+strideq*2], 2
+ vinserti32x4 m12, [r6+strideq*1], 3
+ vinserti32x4 m13, [r6+r3 ], 3
+ pxor m7, m7
+ REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m8, m3
+ packuswb m0, m8
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m2
+ paddw m9, m6
+ packuswb m1, m9
+ punpcklbw m2, m12, m7
+ punpckhbw m12, m7
+ paddw m2, m4
+ paddw m10, m12
+ packuswb m2, m10
+ punpcklbw m3, m13, m7
+ punpckhbw m13, m7
+ paddw m3, m5
+ paddw m11, m13
+ packuswb m3, m11
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r3 ], ym1, 1
+ vextracti32x4 [r4+strideq*0], m0, 2
+ vextracti32x4 [r4+strideq*1], m0, 3
+ vextracti32x4 [r4+strideq*2], m1, 2
+ vextracti32x4 [r4+r3 ], m1, 3
+ mova [r5+strideq*0], xm2
+ vextracti32x4 [r5+strideq*1], ym2, 1
+ mova [r5+strideq*2], xm3
+ vextracti32x4 [r5+r3 ], ym3, 1
+ vextracti32x4 [r6+strideq*0], m2, 2
+ vextracti32x4 [r6+strideq*1], m2, 3
+ vextracti32x4 [r6+strideq*2], m3, 2
+ vextracti32x4 [r6+r3 ], m3, 3
+ RET
+ALIGN function_align
+ IDCT16_MAIN
+ ret
+
+%macro IADST16_MAIN 0
+%if mmsize == 64
+.main_fast:
+%endif
+ punpcklwd m4, m3, m0 ; in7 in0
+ punpcklwd m11, m1, m2 ; in3 in4
+ punpckhwd m9, m2, m1 ; in5 in2
+ punpckhwd m7, m0, m3 ; in1 in6
+ ITX_MUL2X_PACK 4, 0, 6, 10, 11003_804, 12140_m16364, 116 ; t1a t0a
+ ITX_MUL2X_PACK 4, 5, 6, 10, m11003_804, m12140_m16364, 52 ; t9a t8a
+ ITX_MUL2X_PACK 11, 2, 6, 10, 5520_7005, 15426_m14811, 116 ; t5a t4a
+ ITX_MUL2X_PACK 11, 5, 6, 10, m5520_7005, m15426_m14811, 52 ; t13a t12a
+ ITX_MUL2X_PACK 9, 1, 6, 10, 8423_3981, 14053_m15893, 116 ; t3a t2a
+ ITX_MUL2X_PACK 9, 5, 6, 10, m8423_3981, m14053_m15893, 52 ; t11a t10a
+ ITX_MUL2X_PACK 7, 3, 6, 10, 2404_9760, 16207_m13160, 116 ; t7a t6a
+ ITX_MUL2X_PACK 7, 5, 6, 10, m2404_9760, m16207_m13160, 52 ; t15a t14a
+%if mmsize == 64 ; for the ymm variant we only ever use the fast path
+ jmp %%main2
+ALIGN function_align
+.main:
+ punpckhwd m8, m7, m0 ; in14 in1
+ punpcklwd m0, m7 ; in0 in15
+ punpcklwd m7, m6, m1 ; in12 in3
+ punpckhwd m1, m6 ; in2 in13
+ punpckhwd m6, m5, m2 ; in10 in5
+ punpcklwd m2, m5 ; in4 in11
+ punpcklwd m5, m4, m3 ; in8 in7
+ punpckhwd m3, m4 ; in6 in9
+ ADST_MULSUB_4W 0, 5, 4, 9, 11, 10, 804, 16364, 12140, 11003 ; t1a t0a, t9a t8a
+ ADST_MULSUB_4W 2, 7, 11, 5, 9, 10, 7005, 14811, 15426, 5520 ; t5a t4a, t13a t12a
+ ADST_MULSUB_4W 1, 6, 9, 5, 7, 10, 3981, 15893, 14053, 8423 ; t3a t2a, t11a t10a
+ ADST_MULSUB_4W 3, 8, 7, 5, 6, 10, 9760, 13160, 16207, 2404 ; t7a t6a, t15a t14a
+%%main2:
+%endif
+ psubw m5, m1, m3 ; t7 t6
+ paddw m6, m1, m3 ; t3 t2
+ psubw m1, m0, m2 ; t5 t4
+ paddw m2, m0 ; t1 t0
+ ADST_MULSUB_4W 4, 11, 8, 3, 0, 10, 3196, 16069, 16069, 3196, 1 ; t8a t9a, t12a t13a
+ ADST_MULSUB_4W 9, 7, 0, 3, 11, 10, 13623, 9102, 9102, 13623, 1 ; t10a t11a, t14a t15a
+ ADST_MULSUB_4W 1, 5, 11, 3, 7, 10, 6270, 15137, 15137, 6270, 2 ; out12 -out3, t7 t6
+ psubw m3, m2, m6 ; t3a t2a
+ paddw m2, m6 ; -out15 out0
+ ADST_MULSUB_4W 8, 0, 5, 6, 7, 10, 15137, 6270, 6270, 15137, 6 ; -out13 out2, t15a t14
+ vbroadcasti32x4 m12, [o(deint_shuf)]
+ paddw m0, m4, m9 ; -out1 out14
+ psubw m4, m9 ; t10 t11
+ pshufb m2, m12
+ pshufb m1, m12
+ pshufb m8, m12
+ pshufb m0, m12
+ punpcklqdq m6, m1, m8 ; out12 -out13
+ shufps m7, m0, m2, q1032 ; out14 -out15
+%endmacro
+
+%macro IADST16_PASS1_END 0
+ shufps m0, m2, m0, q1032 ; out0 -out1
+ punpckhqdq m1, m8, m1 ; out2 -out3
+ mova m2, m10
+ vpdpwssd m2, m5, [o(pw_m11585_m11585)] {bcstd} ; out5
+ mova m8, m10
+ vpdpwssd m8, m11, [o(pw_11585_11585)] {bcstd} ; out4
+ mova m9, m10
+ vpdpwssd m9, m5, [o(pw_m11585_11585)] {bcstd} ; out10
+ mova m5, m10
+ vpdpwssd m5, m11, [o(pw_11585_m11585)] {bcstd} ; out11
+ mova m11, m10
+ vpdpwssd m11, m3, [o(pw_m11585_m11585)] {bcstd} ; out7
+ mova m14, m10
+ vpdpwssd m14, m4, [o(pw_11585_11585)] {bcstd} ; out6
+ mova m12, m10
+ vpdpwssd m12, m3, [o(pw_m11585_11585)] {bcstd} ; out8
+ mova m3, m10
+ vpdpwssd m3, m4, [o(pw_m11585_11585)] {bcstd} ; out9
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct, 39-18
+INV_TXFM_16X16_FN adst, adst
+
+cglobal vp9_iadst_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2
+ mova m15, [o(itx_perm)]
+ psrlq m7, m15, 4
+ vpermq m0, m15, [cq+64*0] ; 0 1
+ vpermq m1, m7, [cq+64*1] ; 3 2
+ vpermq m2, m15, [cq+64*2] ; 4 5
+ vpermq m3, m7, [cq+64*3] ; 7 6
+ vpbroadcastd m10, [o(pd_8192)]
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+ sub eobd, 39
+ jl .pass1_fast
+ vpermq m4, m15, [cq+64*4] ; 8 9
+ vpermq m5, m7, [cq+64*5] ; 11 10
+ vpermq m6, m15, [cq+64*6] ; 12 13
+ vpermq m7, m7, [cq+64*7] ; 15 14
+ call .main
+ IADST16_PASS1_END
+ REPX {psrad x, 14}, m2, m8, m9, m5, m11, m14, m12, m3
+ packssdw m2, m8, m2 ; out4 out5
+ packssdw m5, m9, m5 ; out10 out11
+ packssdw m4, m12, m3 ; out8 out9
+ packssdw m3, m14, m11 ; out6 out7
+ pxor m9, m9
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ psubw m8, m9, m8
+ punpckhwd m1, m0, m8
+ punpcklwd m0, m8
+ punpckhwd m8, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m2, m8
+ punpcklwd m2, m8
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m4, m8
+ punpcklwd m4, m8
+ punpckhwd m8, m6, m7
+ punpcklwd m6, m7
+ psubw m8, m9, m8
+ punpckhwd m7, m6, m8
+ punpcklwd m6, m8
+ jmp m(vp9_idct_16x16_internal).pass1_end
+.pass1_fast:
+ WRAP_YMM IADST16_MAIN
+ WRAP_YMM IADST16_PASS1_END
+ vinserti32x8 m0, ym6, 1
+ vinserti32x8 m1, ym7, 1
+ vinserti32x8 m8, ym12, 1
+ vinserti32x8 m2, ym3, 1
+ vinserti32x8 m14, ym9, 1
+ vinserti32x8 m11, ym5, 1
+ pslld m14, 2
+ pslld m11, 2
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ vpmultishiftqb m14{k7}, m13, m8
+ vpmultishiftqb m11{k7}, m13, m2
+ psrlq m1, m15, 24
+ pxor m2, m2
+ psubw m2, m4
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ psrlq m2, m15, 28
+ punpckhwd m4, m14, m11
+ punpcklwd m14, m11
+ mova m5, m2
+ vpermi2q m2, m0, m14
+ vpermt2q m0, m1, m14
+ vpermi2q m1, m3, m4
+ vpermt2q m3, m5, m4
+ jmp tx2q
+.pass2:
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ test eobd, eobd
+ jl .pass2_fast
+ pshufd m5, m5, q1032
+ pshufd m7, m7, q1032
+ call .main
+ jmp .pass2_end
+.pass2_fast:
+ call .main_fast
+.pass2_end:
+ vbroadcasti32x4 m9, [o(pw_11585_m11585x2x4)]
+ vbroadcasti32x4 m10, [o(pw_m11585_11585x2x4)]
+ punpckhqdq m1, m8 ; -out3 out2
+ shufps m0, m2, q3210 ; -out1 out0
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ vbroadcasti32x4 m11, [o(pw_512)]
+ vpbroadcastd m12, [o(pw_512)]
+ punpcklqdq m8, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ shufps m3, m4, q3210 ; t3a t11
+ psubsw m4, m2, m3
+ paddsw m3, m2
+ paddsw m2, m5, m8
+ psubsw m5, m8
+ pmulhrsw m4, m9 ; out8 out9
+ pmulhrsw m3, m10 ; out7 out6
+ pmulhrsw m2, m10 ; out5 out4
+ pmulhrsw m5, m9 ; out10 out11
+ pmulhrsw m6, m11
+ pmulhrsw m7, m11
+ pshufd m11, m11, q1032
+ pmulhrsw m0, m11
+ pmulhrsw m1, m11
+ REPX {pmulhrsw x, m12}, m2, m3, m4, m5
+ psrldq m8, m15, 2
+ psrlq m12, m15, 20
+ psrldq m10, m15, 1
+ psrlq m13, m15, 12
+ mova m9, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m12, m3
+ mova m11, m10
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m13, m6
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+ jmp m(vp9_idct_16x16_internal).pass2_end2
+ALIGN function_align
+ IADST16_MAIN
+ ret
+
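+; Writes one pair of mirrored rows: row n gets even_half + odd_half and
+; row 31-n gets even_half - odd_half, with pw_512 providing the final
+; (x + 32) >> 6 rounding before adding to the destination pixels.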
+%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
+ pmovzxbw m10, [dstq+%3]
+ pmovzxbw m11, [r3 +%4]
+%if %2 < 8
+ paddw m8, m%2, m%1
+ psubw m9, m%2, m%1
+%else
+ mova m9, [rsp+64*(%2-8)]
+ paddw m8, m9, m%1
+ psubw m9, m%1
+%endif
+ pmulhrsw m8, m12
+ pmulhrsw m9, m12
+ paddw m8, m10
+ paddw m9, m11
+ packuswb m8, m9
+ vpermq m8, m13, m8
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+
+cglobal vp9_idct_idct_32x32_add, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r6, [o_base]
+ cmp eobd, 1
+ jne .pass1
+ movd xmm0, [o(pw_11585x2)]
+ pmulhrsw xmm3, xmm0, [cq]
+ pxor m2, m2
+ pmulhrsw xmm3, xmm0
+ pmulhrsw xmm3, [o(pw_512)]
+ movd [cq], xm2
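+ ; eob == 1 here as well; reusing its register (r3d) as the counter
+ ; gives 1 + 15 = 16 iterations of two 32-pixel rows.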
+ add r3d, 15
+ vpbroadcastw m3, xmm3
+.dconly_loop:
+ mova ym1, [dstq+strideq*0]
+ vinserti32x8 m1, [dstq+strideq*1], 1
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .dconly_loop
+ RET
+.pass1:
+ PROLOGUE 0, 7, 30, 64*16, dst, stride, c, eob
+ sub eobd, 135
+ jl .fast
+ mova m0, [cq+64* 0]
+ mova m14, [cq+64* 2]
+ mova m1, [cq+64* 4]
+ mova m15, [cq+64* 6]
+ mova m2, [cq+64* 8]
+ mova m16, [cq+64*10]
+ mova m3, [cq+64*12]
+ mova m17, [cq+64*14]
+ mova m4, [cq+64*16]
+ mova m18, [cq+64*18]
+ mova m5, [cq+64*20]
+ mova m19, [cq+64*22]
+ mova m6, [cq+64*24]
+ mova m20, [cq+64*26]
+ mova m7, [cq+64*28]
+ mova m21, [cq+64*30]
+ call .idct16
+ mova [rsp+64*0], m14
+ mova [rsp+64*1], m15
+ mova [rsp+64*2], m16
+ mova [rsp+64*3], m17
+ mova [rsp+64*4], m18
+ mova [rsp+64*5], m19
+ mova [rsp+64*6], m20
+ mova [rsp+64*7], m21
+ mova m22, [cq+64* 1]
+ mova m23, [cq+64* 3]
+ mova m24, [cq+64* 5]
+ mova m25, [cq+64* 7]
+ mova m26, [cq+64* 9]
+ mova m27, [cq+64*11]
+ mova m28, [cq+64*13]
+ mova m29, [cq+64*15]
+ mova m14, [cq+64*17]
+ mova m15, [cq+64*19]
+ mova m16, [cq+64*21]
+ mova m17, [cq+64*23]
+ mova m18, [cq+64*25]
+ mova m19, [cq+64*27]
+ mova m20, [cq+64*29]
+ mova m21, [cq+64*31]
+ call .main
+ psubw m13, m0, m29 ; 31
+ paddw m0, m29 ; 0
+ psubw m29, m1, m28 ; 30
+ paddw m1, m28 ; 1
+ psubw m28, m2, m27 ; 29
+ paddw m2, m27 ; 2
+ psubw m27, m3, m26 ; 28
+ paddw m3, m26 ; 3
+ psubw m26, m4, m25 ; 27
+ paddw m4, m25 ; 4
+ psubw m25, m5, m24 ; 26
+ paddw m5, m24 ; 5
+ psubw m24, m6, m23 ; 25
+ paddw m6, m23 ; 6
+ psubw m23, m7, m22 ; 24
+ paddw m7, m22 ; 7
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ punpcklqdq m5, m23, m27 ; d00 d08 d16 d24
+ punpckhqdq m23, m27 ; d01 d09 d17 d25
+ punpckhqdq m27, m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m25, m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m26, m9, m24 ; d07 d15 d23 d31
+ punpcklqdq m9, m24 ; d06 d14 d22 d30
+ mova [rsp+64*12], m23
+ mova [rsp+64*13], m27
+ mova [rsp+64*14], m25
+ mova [rsp+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova m2, [rsp+64*0]
+ mova m11, [rsp+64*1]
+ mova m12, [rsp+64*2]
+ mova m29, [rsp+64*3]
+ mova m27, [rsp+64*4]
+ mova m26, [rsp+64*5]
+ mova m4, [rsp+64*6]
+ mova m28, [rsp+64*7]
+ psubw m1, m2, m21 ; 23
+ paddw m2, m21 ; 8
+ psubw m21, m11, m20 ; 22
+ paddw m11, m20 ; 9
+ psubw m20, m12, m19 ; 21
+ paddw m12, m19 ; 10
+ psubw m19, m29, m18 ; 20
+ paddw m29, m18 ; 11
+ psubw m18, m27, m17 ; 19
+ paddw m27, m17 ; 12
+ psubw m17, m26, m16 ; 18
+ paddw m26, m16 ; 13
+ paddw m16, m4, m15 ; 14
+ psubw m4, m15 ; 17
+ mova m15, m6
+ psubw m6, m28, m14 ; 16
+ paddw m28, m14 ; 15
+ mova m14, m7
+ punpcklwd m7, m6, m4
+ punpckhwd m6, m4
+ punpckhwd m4, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+ punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m6, m4
+ punpckldq m6, m4
+ punpckhdq m4, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m7, m17
+ punpckldq m7, m17
+ punpcklqdq m17, m28, m12 ; b02 b10 b18 b26
+ punpckhqdq m28, m12 ; b03 b11 b19 b27
+ punpckhqdq m12, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpckhqdq m27, m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m29, m16, m11 ; b07 b15 b23 b31
+ punpcklqdq m16, m11 ; b06 b14 b22 b30
+ mova [rsp+64* 8], m12
+ mova [rsp+64* 9], m28
+ mova [rsp+64*10], m27
+ mova [rsp+64*11], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m7, m19 ; c01 c09 c17 c25
+ punpcklqdq m7, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m6, m18 ; c05 c13 c21 c29
+ punpcklqdq m6, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m4 ; c07 c15 c23 c31
+ punpcklqdq m21, m4 ; c06 c14 c22 c30
+ mov r3d, 64*28
+ pxor m4, m4
+.zero_loop:
+ mova [cq+r3+64*0], m4
+ mova [cq+r3+64*1], m4
+ mova [cq+r3+64*2], m4
+ mova [cq+r3+64*3], m4
+ sub r3d, 64*4
+ jge .zero_loop
+ vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08
+ vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08
+ vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12
+ vshufi32x4 m3, m1, m6, q3131 ; 12
+ vshufi32x4 m1, m6, q2020 ; 4
+ vshufi32x4 m6, m4, m2, q3131 ; 24
+ vshufi32x4 m4, m2, q2020 ; 16
+ vshufi32x4 m2, m0, m7, q3131 ; 8
+ vshufi32x4 m0, m7, q2020 ; 0
+ vshufi32x4 m7, m5, m8, q3131 ; 28
+ vshufi32x4 m5, m8, q2020 ; 20
+ vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10
+ vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10
+ vshufi32x4 m13, m21, m9, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m21, ym9, 1 ; c06 c14 d06 d14
+ vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14
+ vshufi32x4 m16, m14, m20, q3131 ; 10
+ vshufi32x4 m14, m20, q2020 ; 2
+ vshufi32x4 m20, m18, m17, q3131 ; 26
+ vshufi32x4 m18, m17, q2020 ; 18
+ vshufi32x4 m17, m15, m21, q3131 ; 14
+ vshufi32x4 m15, m21, q2020 ; 6
+ vshufi32x4 m21, m19, m13, q3131 ; 30
+ vshufi32x4 m19, m13, q2020 ; 22
+ call .idct16
+ mova [rsp+64*0], m14
+ mova [rsp+64*1], m15
+ mova [rsp+64*2], m16
+ mova [rsp+64*3], m17
+ mova [rsp+64*4], m18
+ mova [rsp+64*5], m19
+ mova [rsp+64*6], m20
+ mova [rsp+64*7], m21
+ mova m15, [rsp+64* 8]
+ mova m16, [rsp+64* 9]
+ mova m17, [rsp+64*10]
+ mova m19, [rsp+64*11]
+ mova m20, [rsp+64*12]
+ mova m21, [rsp+64*13]
+ mova m13, [rsp+64*14]
+ mova m18, [rsp+64*15]
+ vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15
+ vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31
+ vshufi32x4 m18, m14, m26, q3131 ; 25
+ vshufi32x4 m14, m26, q2020 ; 17
+ vshufi32x4 m19, m15, m27, q3131 ; 27
+ vshufi32x4 m15, m27, q2020 ; 19
+ vshufi32x4 m20, m16, m28, q3131 ; 29
+ vshufi32x4 m16, m28, q2020 ; 21
+ vshufi32x4 m21, m17, m29, q3131 ; 31
+ vshufi32x4 m17, m29, q2020 ; 23
+ vshufi32x4 m26, m22, m8, q3131 ; 9
+ vshufi32x4 m22, m8, q2020 ; 1
+ vshufi32x4 m27, m23, m9, q3131 ; 11
+ vshufi32x4 m23, m9, q2020 ; 3
+ vshufi32x4 m28, m24, m11, q3131 ; 13
+ vshufi32x4 m24, m11, q2020 ; 5
+ vshufi32x4 m29, m25, m12, q3131 ; 15
+ vshufi32x4 m25, m12, q2020 ; 7
+ call .main
+ jmp .end
+.fast:
+ mova m14, [o(dup16_perm)]
+ pmovzxbw m9, [cq+64*0]
+ pmovzxbw m6, [cq+64*8]
+ vpermb m8, m14, [cq+64* 2]
+ vpermb m0, m14, [cq+64*14]
+ vpermb m5, m14, [cq+64*10]
+ vpermb m1, m14, [cq+64* 6]
+ vpermb m7, m14, [cq+64* 4]
+ vpermb m3, m14, [cq+64*12]
+ vpbroadcastd m10, [o(pd_8192)]
+ vpbroadcastq m13, [o(int_mshift)]
+ packuswb m9, m9
+ packuswb m6, m6
+ vpcmpub k7, m13, m10, 6
+ IDCT16_MAIN 1
+ vpermb m21, m14, [cq+64* 1]
+ vpermb m17, m14, [cq+64*15]
+ vpermb m20, m14, [cq+64* 9]
+ vpermb m15, m14, [cq+64* 7]
+ vpermb m18, m14, [cq+64* 5]
+ vpermb m16, m14, [cq+64*11]
+ vpermb m19, m14, [cq+64*13]
+ vpermb m14, m14, [cq+64* 3]
+ call .main_packed_fast
+ punpcklwd m8, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m14, m16
+ punpckhwd m14, m16
+ punpcklwd m16, m15, m17
+ punpckhwd m15, m17
+ punpcklwd m17, m19, m21
+ punpckhwd m19, m21
+ punpckhwd m21, m18, m20
+ punpcklwd m18, m20
+ punpcklwd m20, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m7, m15
+ punpckhwd m7, m15
+ punpcklwd m15, m14, m16
+ punpckhwd m14, m16
+ punpckhwd m16, m18, m19
+ punpcklwd m18, m19
+ punpcklwd m19, m21, m17
+ punpckhwd m21, m17
+ punpcklwd m17, m8, m0 ; a2 a6 aa ae
+ punpckhwd m8, m0 ; a3 a7 ab af
+ punpcklwd m0, m20, m1 ; a0 a4 a8 ac
+ punpckhwd m20, m1 ; a1 a5 a9 ad
+ punpcklwd m1, m2, m5 ; b0 b4 b8 bc
+ punpckhwd m2, m5 ; b1 b5 b9 bd
+ punpcklwd m5, m3, m4 ; b2 b6 ba be
+ punpckhwd m3, m4 ; b3 b7 bb bf
+ punpcklwd m4, m6, m15 ; c0 c4 c8 cc
+ punpckhwd m6, m15 ; c1 c5 c9 cd
+ punpcklwd m15, m7, m14 ; c2 c6 ca ce
+ punpckhwd m7, m14 ; c3 c7 cb cf
+ punpcklwd m14, m18, m19 ; d0 d4 d8 dc
+ punpckhwd m18, m19 ; d1 d5 d9 dd
+ punpcklwd m9, m16, m21 ; d2 d6 da de
+ punpckhwd m16, m21 ; d3 d7 db df
+ mov r3d, 64*12
+ pxor ym21, ym21
+.fast_zero_loop:
+ mova [cq+r3+64*0], ym21
+ mova [cq+r3+64*1], ym21
+ mova [cq+r3+64*2], ym21
+ mova [cq+r3+64*3], ym21
+ sub r3d, 64*4
+ jge .fast_zero_loop
+ vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc
+ vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4
+ vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6
+ vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be
+ vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7
+ vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf
+ vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4
+ vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc
+ vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5
+ vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd
+ vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5
+ vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd
+ vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6
+ vshufi32x4 m15, m9, q3232 ; ca ce da de
+ vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7
+ vshufi32x4 m7, m16, q3232 ; cb cf db df
+ vshufi32x4 m22, m14, m2, q2020 ; 1
+ vshufi32x4 m24, m14, m2, q3131 ; 5
+ vshufi32x4 m23, m17, m9, q2020 ; 3
+ vshufi32x4 m25, m17, m9, q3131 ; 7
+ vshufi32x4 m16, m5, m15, q2020 ; 10
+ vshufi32x4 m17, m5, m15, q3131 ; 14
+ vshufi32x4 m14, m1, m18, q2020 ; 2
+ vshufi32x4 m15, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m21, m4, q3131 ; 12
+ vshufi32x4 m2, m21, m4, q2020 ; 8
+ vshufi32x4 m26, m20, m6, q2020 ; 9
+ vshufi32x4 m28, m20, m6, q3131 ; 13
+ vshufi32x4 m27, m19, m7, q2020 ; 11
+ vshufi32x4 m29, m19, m7, q3131 ; 15
+ call .idct16_fast
+ mova [rsp+64*0], m14
+ mova [rsp+64*1], m15
+ mova [rsp+64*2], m16
+ mova [rsp+64*3], m17
+ mova [rsp+64*4], m18
+ mova [rsp+64*5], m19
+ mova [rsp+64*6], m20
+ mova [rsp+64*7], m21
+ call .main_fast
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_512)]
+ movshdup m13, [o(itx_perm)]
+ lea r3, [dstq+r4*8]
+ lea r5, [strideq+r4] ; stride*4
+ add r3, r5 ; dst+stride*28
+ IDCT_32x32_END 29, 0, strideq*0, r4
+ IDCT_32x32_END 28, 1, strideq*1, strideq*2
+ IDCT_32x32_END 27, 2, strideq*2, strideq*1
+ IDCT_32x32_END 26, 3, r4 , strideq*0
+ IDCT_32x32_END 25, 4, strideq*0, r4
+ IDCT_32x32_END 24, 5, strideq*1, strideq*2
+ IDCT_32x32_END 23, 6, strideq*2, strideq*1
+ IDCT_32x32_END 22, 7, r4 , strideq*0
+ IDCT_32x32_END 21, 8, strideq*0, r4
+ IDCT_32x32_END 20, 9, strideq*1, strideq*2
+ IDCT_32x32_END 19, 10, strideq*2, strideq*1
+ IDCT_32x32_END 18, 11, r4 , strideq*0
+ IDCT_32x32_END 17, 12, strideq*0, r4
+ IDCT_32x32_END 16, 13, strideq*1, strideq*2
+ IDCT_32x32_END 15, 14, strideq*2, strideq*1
+ IDCT_32x32_END 14, 15, r4 , strideq*0
+ RET
+ALIGN function_align
+.idct16_fast:
+ vpbroadcastd m21, [o(pw_16305x2)]
+ vpbroadcastd m8, [o(pw_1606x2)]
+ vpbroadcastd m18, [o(pw_m10394x2)]
+ vpbroadcastd m9, [o(pw_12665x2)]
+ pmulhrsw m21, m14 ; t15a
+ vpbroadcastd m19, [o(pw_14449x2)]
+ pmulhrsw m14, m8 ; t8a
+ vpbroadcastd m8, [o(pw_7723x2)]
+ pmulhrsw m18, m17 ; t9a
+ vpbroadcastd m20, [o(pw_m4756x2)]
+ pmulhrsw m17, m9 ; t14a
+ vpbroadcastd m9, [o(pw_15679x2)]
+ pmulhrsw m19, m16 ; t13a
+ vpbroadcastd m5, [o(pw_m9102x2)]
+ pmulhrsw m16, m8 ; t10a
+ vpbroadcastd m8, [o(pw_13623x2)]
+ pmulhrsw m20, m15 ; t11a
+ vpbroadcastd m7, [o(pw_16069x2)]
+ pmulhrsw m15, m9 ; t12a
+ vpbroadcastd m9, [o(pw_3196x2)]
+ pmulhrsw m5, m3 ; t5a
+ vpbroadcastd m6, [o(pw_15137x2)]
+ pmulhrsw m3, m8 ; t6a
+ vpbroadcastd m8, [o(pw_6270x2)]
+ pmulhrsw m7, m1 ; t7a
+ vpbroadcastd m4, [o(pw_11585x2)]
+ pmulhrsw m1, m9 ; t4
+ vpbroadcastd m10, [o(pd_8192)]
+ pmulhrsw m6, m2 ; t3
+ pmulhrsw m2, m8 ; t2
+ pmulhrsw m4, m0 ; t0
+ mova m0, m4 ; t1
+ jmp .idct16b
+ALIGN function_align
+.idct16:
+ vpbroadcastd m10, [o(pd_8192)]
+ ITX_MULSUB_2W 14, 21, 8, 9, 10, 1606, 16305 ; t8a, t15a
+ ITX_MULSUB_2W 18, 17, 8, 9, 10, 12665, 10394 ; t9a, t14a
+ ITX_MULSUB_2W 16, 19, 8, 9, 10, 7723, 14449 ; t10a, t13a
+ ITX_MULSUB_2W 20, 15, 8, 9, 10, 15679, 4756 ; t11a, t12
+ ITX_MULSUB_2W 5, 3, 8, 9, 10, 13623, 9102 ; t5a, t6a
+ ITX_MULSUB_2W 1, 7, 8, 9, 10, 3196, 16069 ; t4a, t7a
+ ITX_MULSUB_2W 2, 6, 8, 9, 10, 6270, 15137 ; t2, t3
+ ITX_MULSUB_2W 0, 4, 8, 9, 10, 11585, 11585 ; t1, t0
+.idct16b:
+ paddw m8, m20, m16 ; t11
+ psubw m20, m16 ; t10
+ paddw m16, m15, m19 ; t12
+ psubw m15, m19 ; t13
+ psubw m19, m14, m18 ; t9
+ paddw m14, m18 ; t8
+ psubw m18, m21, m17 ; t14
+ paddw m21, m17 ; t15
+ vpbroadcastd m11, [o(pw_6270_15137)]
+ vpbroadcastd m12, [o(pw_m15137_6270)]
+ ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a
+ vpbroadcastd m11, [o(pw_m6270_m15137)]
+ ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
+ vpbroadcastd m11, [o(pw_11585_11585)]
+ vpbroadcastd m12, [o(pw_m11585_11585)]
+ paddw m9, m7, m3 ; t7
+ psubw m3, m7, m3 ; t6a
+ paddw m7, m1, m5 ; t4
+ psubw m1, m5 ; t5a
+ psubw m17, m14, m8 ; t11a
+ paddw m8, m14 ; t8a
+ paddw m14, m18, m15 ; t9
+ psubw m18, m15 ; t10
+ psubw m15, m19, m20 ; t13
+ paddw m19, m20 ; t14
+ paddw m20, m21, m16 ; t15a
+ psubw m16, m21, m16 ; t12a
+ ITX_MULSUB_2W 3, 1, 5, 21, 10, 11, 12 ; t5, t6
+ ITX_MULSUB_2W 15, 18, 5, 21, 10, 11, 12 ; t10a, t13a
+ ITX_MULSUB_2W 16, 17, 5, 21, 10, 11, 12 ; t11, t12
+ psubw m5, m0, m2 ; t2
+ paddw m2, m0 ; t1
+ paddw m0, m4, m6 ; t0
+ psubw m4, m6 ; t3
+ psubw m6, m2, m1 ; t6
+ paddw m1, m2 ; t1
+ paddw m2, m5, m3 ; t2
+ psubw m5, m3 ; t5
+ paddw m3, m4, m7 ; t3
+ psubw m4, m7 ; t4
+ psubw m7, m0, m9 ; t7
+ paddw m0, m9 ; t0
+ psubw m21, m0, m20 ; out15
+ paddw m0, m20 ; out0
+ psubw m20, m1, m19 ; out14
+ paddw m1, m19 ; out1
+ psubw m19, m2, m18 ; out13
+ paddw m2, m18 ; out2
+ psubw m18, m3, m17 ; out12
+ paddw m3, m17 ; out3
+ psubw m17, m4, m16 ; out11
+ paddw m4, m16 ; out4
+ psubw m16, m5, m15 ; out10
+ paddw m5, m15 ; out5
+ psubw m15, m6, m14 ; out9
+ paddw m6, m14 ; out6
+ psubw m14, m7, m8 ; out8
+ paddw m7, m8 ; out7
+ ret
+ALIGN function_align
+.main_fast:
+ vpbroadcastd m21, [o(pw_16364x2)]
+ vpbroadcastd m8, [o(pw_804x2)]
+ vpbroadcastd m14, [o(pw_m11003x2)]
+ vpbroadcastd m9, [o(pw_12140x2)]
+ pmulhrsw m21, m22 ; t31a
+ vpbroadcastd m17, [o(pw_14811x2)]
+ pmulhrsw m22, m8 ; t16a
+ vpbroadcastd m8, [o(pw_7005x2)]
+ pmulhrsw m14, m29 ; t30a
+ vpbroadcastd m18, [o(pw_m5520x2)]
+ pmulhrsw m29, m9 ; t17a
+ vpbroadcastd m9, [o(pw_15426x2)]
+ pmulhrsw m17, m26 ; t29a
+ vpbroadcastd m19, [o(pw_15893x2)]
+ pmulhrsw m26, m8 ; t18a
+ vpbroadcastd m8, [o(pw_3981x2)]
+ pmulhrsw m18, m25 ; t19a
+ vpbroadcastd m16, [o(pw_m8423x2)]
+ pmulhrsw m25, m9 ; t28a
+ vpbroadcastd m9, [o(pw_14053x2)]
+ pmulhrsw m19, m24 ; t27a
+ vpbroadcastd m15, [o(pw_13160x2)]
+ pmulhrsw m24, m8 ; t20a
+ vpbroadcastd m8, [o(pw_9760x2)]
+ pmulhrsw m16, m27 ; t21a
+ vpbroadcastd m20, [o(pw_m2404x2)]
+ pmulhrsw m27, m9 ; t26a
+ vpbroadcastd m9, [o(pw_16207x2)]
+ pmulhrsw m15, m28 ; t25a
+ pmulhrsw m28, m8 ; t22a
+ pmulhrsw m20, m23 ; t23a
+ pmulhrsw m23, m9 ; t24a
+ jmp .main2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2W 22, 21, 8, 9, 10, 804, 16364 ; t16a, t31a
+ ITX_MULSUB_2W 14, 29, 8, 9, 10, 12140, 11003 ; t17a, t30a
+ ITX_MULSUB_2W 26, 17, 8, 9, 10, 7005, 14811 ; t18a, t29a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 15426, 5520 ; t19a, t28a
+ ITX_MULSUB_2W 24, 19, 8, 9, 10, 3981, 15893 ; t20a, t27a
+ ITX_MULSUB_2W 16, 27, 8, 9, 10, 14053, 8423 ; t21a, t26a
+ ITX_MULSUB_2W 28, 15, 8, 9, 10, 9760, 13160 ; t22a, t25a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 16207, 2404 ; t23a, t24a
+.main2:
+ psubw m8, m22, m14 ; t17
+ paddw m22, m14 ; t16
+ paddw m14, m18, m26 ; t19
+ psubw m18, m26 ; t18
+ psubw m26, m24, m16 ; t21
+ paddw m24, m16 ; t20
+ psubw m16, m20, m28 ; t22
+ paddw m28, m20 ; t23
+ psubw m20, m23, m15 ; t25
+ paddw m23, m15 ; t24
+ psubw m15, m21, m29 ; t30
+ paddw m21, m29 ; t31
+ psubw m29, m19, m27 ; t26
+ paddw m19, m27 ; t27
+ paddw m27, m25, m17 ; t28
+ psubw m25, m17 ; t29
+ ITX_MULSUB_2W 15, 8, 9, 17, 10, 3196, 16069 ; t17a, t30a
+ ITX_MULSUB_2W 25, 18, 9, 17, 10, m16069, 3196 ; t18a, t29a
+ ITX_MULSUB_2W 29, 26, 9, 17, 10, 13623, 9102 ; t21a, t26a
+ ITX_MULSUB_2W 20, 16, 9, 17, 10, m9102, 13623 ; t22a, t25a
+ psubw m17, m21, m27 ; t28a
+ paddw m21, m27 ; t31a
+ psubw m27, m15, m25 ; t18
+ paddw m15, m25 ; t17
+ psubw m25, m20, m29 ; t21
+ paddw m20, m29 ; t22
+ psubw m29, m8, m18 ; t29
+ paddw m8, m18 ; t30
+ psubw m18, m22, m14 ; t19a
+ paddw m22, m14 ; t16a
+ psubw m14, m28, m24 ; t20a
+ paddw m24, m28 ; t23a
+ paddw m28, m16, m26 ; t25
+ psubw m16, m26 ; t26
+ psubw m26, m23, m19 ; t27a
+ paddw m23, m19 ; t24a
+ vpbroadcastd m12, [o(pw_m15137_6270)]
+ vpbroadcastd m11, [o(pw_6270_15137)]
+ ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a
+ ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28
+ vpbroadcastd m11, [o(pw_m6270_m15137)]
+ ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a
+ ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27
+ vpbroadcastd m12, [o(pw_m11585_11585)]
+ vpbroadcastd m11, [o(pw_11585_11585)]
+ psubw m19, m27, m25 ; t26
+ paddw m27, m25 ; t29
+ psubw m25, m17, m26 ; t20a
+ paddw m17, m26 ; t19a
+ paddw m26, m18, m14 ; t28a
+ psubw m18, m14 ; t27a
+ paddw m14, m22, m24 ; t16
+ psubw m22, m24 ; t23
+ psubw m24, m29, m16 ; t21
+ paddw m16, m29 ; t18
+ paddw m29, m21, m23 ; t31
+ psubw m21, m23 ; t24
+ psubw m23, m15, m20 ; t22a
+ paddw m15, m20 ; t17a
+ psubw m20, m8, m28 ; t25a
+ paddw m28, m8 ; t30a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27
+ ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
+ ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25
+ ret
+ALIGN function_align
+.main_packed_fast:
+ vpbroadcastd m8, [o(pw_804_16364x2)]
+ vpbroadcastd m9, [o(pw_m11003_12140x2)]
+ vpbroadcastd m11, [o(pw_7005_14811x2)]
+ vpbroadcastd m12, [o(pw_m5520_15426x2)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ vpbroadcastd m8, [o(pw_3981_15893x2)]
+ pmulhrsw m17, m9 ; t17a, t30a
+ vpbroadcastd m9, [o(pw_m8423_14053x2)]
+ pmulhrsw m20, m11 ; t18a, t29a
+ vpbroadcastd m11, [o(pw_9760_13160x2)]
+ pmulhrsw m15, m12 ; t19a, t28a
+ vpbroadcastd m12, [o(pw_m2404_16207x2)]
+ pmulhrsw m18, m8 ; t20a, t27a
+ pmulhrsw m16, m9 ; t21a, t26a
+ pmulhrsw m19, m11 ; t22a, t25a
+ pmulhrsw m14, m12 ; t23a, t24a
+ psubw m8, m21, m17 ; t17 t30
+ paddw m21, m17 ; t16 t31
+ psubw m17, m15, m20 ; t18 t29
+ paddw m20, m15 ; t19 t28
+ psubw m15, m18, m16 ; t21 t26
+ paddw m18, m16 ; t20 t27
+ psubw m16, m14, m19 ; t22 t25
+ paddw m14, m19 ; t23 t24
+ ITX_MUL2X_PACK 8, 9, 19, 10, 3196, 16069, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 9, 19, 10, m16069, 3196, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 9, 19, 10, 13623, 9102, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 9, 19, 10, m9102, 13623, 5 ; t22a t25a
+ vpbroadcastd m11, [o(pw_m15137_6270)]
+ psubw m19, m21, m20 ; t19a t28a
+ paddw m21, m20 ; t16a t31a
+ psubw m20, m14, m18 ; t20a t27a
+ paddw m14, m18 ; t23a t24a
+ psubw m18, m8, m17 ; t18 t29
+ paddw m8, m17 ; t17 t30
+ psubw m17, m16, m15 ; t21 t26
+ paddw m15, m16 ; t22 t25
+ ITX_MUL2X_PACK 18, 9, 16, 10, 6270_15137, 11, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 9, 16, 10, 6270_15137, 11, 20 ; t19 t28
+ ITX_MUL2X_PACK 20, 9, 16, 10, 11, m6270_m15137, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 9, 16, 10, 11, m6270_m15137, 36 ; t21a t26a
+ vbroadcasti32x4 m9, [o(deint_shuf)]
+ psubw m16, m21, m14 ; t23 t24
+ paddw m14, m21 ; t16 t31
+ psubw m21, m8, m15 ; t22a t25a
+ paddw m15, m8 ; t17a t30a
+ psubw m8, m18, m17 ; t21 t26
+ paddw m18, m17 ; t18 t29
+ paddw m17, m19, m20 ; t19a t28a
+ psubw m19, m20 ; t20a t27a
+ vpbroadcastd m11, [o(pw_m11585_11585)]
+ vpbroadcastd m12, [o(pw_11585_11585)]
+ REPX {pshufb x, m9}, m14, m15, m18, m17
+ mova m9, m10
+ vpdpwssd m9, m16, m11
+ mova m20, m10
+ vpdpwssd m20, m21, m11
+ psrad m9, 14
+ psrad m20, 14
+ packssdw m9, m20 ; t23a t22
+ mova m20, m10
+ vpdpwssd m20, m16, m12
+ mova m16, m10
+ vpdpwssd m16, m21, m12
+ psrad m20, 14
+ psrad m16, 14
+ packssdw m16, m20, m16 ; t24a t25
+ ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a
+ ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27
+ packssdw m11, m20 ; t27 t26a
+ packssdw m8, m21 ; t20 t21a
+ punpcklqdq m20, m14, m15 ; t16 t17a
+ punpckhqdq m14, m15 ; t31 t30a
+ punpckhqdq m15, m17, m18 ; t28a t29
+ punpcklqdq m17, m18 ; t19a t18
+ psubw m21, m0, m14 ; out31 out30
+ paddw m0, m14 ; out0 out1
+ psubw m14, m7, m20 ; out16 out17
+ paddw m7, m20 ; out15 out14
+ psubw m20, m1, m15 ; out28 out29
+ paddw m1, m15 ; out3 out2
+ psubw m15, m6, m17 ; out19 out18
+ paddw m6, m17 ; out12 out13
+ psubw m17, m4, m9 ; out23 out22
+ paddw m4, m9 ; out8 out9
+ psubw m18, m3, m16 ; out24 out25
+ paddw m3, m16 ; out7 out6
+ psubw m16, m5, m8 ; out20 out21
+ paddw m5, m8 ; out11 out10
+ psubw m19, m2, m11 ; out27 out26
+ paddw m2, m11 ; out4 out5
+ ret
+
+%endif
diff --git a/libavutil/mem_internal.h b/libavutil/mem_internal.h
index c027fa51c3..d58881d09c 100644
--- a/libavutil/mem_internal.h
+++ b/libavutil/mem_internal.h
@@ -131,4 +131,6 @@
#define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_D(32, t, v, __VA_ARGS__,,))
+#define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_D(64, t, v, __VA_ARGS__,,))
+
#endif /* AVUTIL_MEM_INTERNAL_H */
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index cecd0dee0f..bddc9a79fc 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -310,13 +310,13 @@ static int is_zero(const int16_t *c, int sz)
static void check_itxfm(void)
{
- LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);
- LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);
- LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
- LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
- LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);
- LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);
- LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);
+ LOCAL_ALIGNED_64(uint8_t, src, [32 * 32 * 2]);
+ LOCAL_ALIGNED_64(uint8_t, dst, [32 * 32 * 2]);
+ LOCAL_ALIGNED_64(uint8_t, dst0, [32 * 32 * 2]);
+ LOCAL_ALIGNED_64(uint8_t, dst1, [32 * 32 * 2]);
+ LOCAL_ALIGNED_64(int16_t, coef, [32 * 32 * 2]);
+ LOCAL_ALIGNED_64(int16_t, subcoef0, [32 * 32 * 2]);
+ LOCAL_ALIGNED_64(int16_t, subcoef1, [32 * 32 * 2]);
declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
VP9DSPContext dsp;
int y, x, tx, txtp, bit_depth, sub;