[FFmpeg-devel] [PATCH] vp9: add x86 simd (sse2/ssse3) for iadst4 10bpp functions.
Ronald S. Bultje
rsbultje at gmail.com
Tue Oct 6 17:42:10 CEST 2015
---
libavcodec/x86/vp9dsp_init.h | 6 ++
libavcodec/x86/vp9dsp_init_16bpp_template.c | 21 +++++-
libavcodec/x86/vp9itxfm.asm | 58 ----------------
libavcodec/x86/vp9itxfm_16bpp.asm | 100 ++++++++++++++++++++++------
libavcodec/x86/vp9itxfm_template.asm | 58 ++++++++++++++++
5 files changed, 161 insertions(+), 82 deletions(-)
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index 5d07b62..b3b0558 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -62,6 +62,12 @@ void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t
int16_t *block, \
int eob)
+#define decl_itxfm_funcs(size, bpp, opt) /* expands decl_itxfm_func for the four idct/iadst pairings of one size, bpp and opt; the user supplies the final ';' */ \
+decl_itxfm_func(idct, idct, size, bpp, opt); \
+decl_itxfm_func(iadst, idct, size, bpp, opt); \
+decl_itxfm_func(idct, iadst, size, bpp, opt); \
+decl_itxfm_func(iadst, iadst, size, bpp, opt)
+
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 6e12af3..93fc684 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -126,8 +126,11 @@ decl_ipred_fns(tm, BPC, mmxext, sse2);
decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
#if BPC == 10
-decl_itxfm_func(idct, idct, 4, BPC, mmxext);
-decl_itxfm_func(idct, idct, 4, BPC, ssse3);
+decl_itxfm_func(idct, idct, 4, BPC, mmxext);
+decl_itxfm_func(idct, iadst, 4, BPC, sse2);
+decl_itxfm_func(iadst, idct, 4, BPC, sse2);
+decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
+decl_itxfm_funcs(4, BPC, ssse3);
#endif
#endif /* HAVE_YASM */
@@ -169,6 +172,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
init_itx_func(idx, ADST_DCT, typea, typeb, size, bpp, opt); \
init_itx_func(idx, DCT_ADST, typea, typeb, size, bpp, opt); \
init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
+#define init_itx_funcs(idx, size, bpp, opt) /* invokes init_itx_func for the four idct/iadst pairings of one transform size; like the macro above it, the last line has no ';' or continuation, so the call site supplies the ';' */ \
+ init_itx_func(idx, DCT_DCT, idct, idct, size, bpp, opt); \
+ init_itx_func(idx, ADST_DCT, idct, iadst, size, bpp, opt); \
+ init_itx_func(idx, DCT_ADST, iadst, idct, size, bpp, opt); \
+ init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt)
if (EXTERNAL_MMXEXT(cpu_flags)) {
init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
@@ -185,13 +193,20 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
init_subpel3(1, avg, BPC, sse2);
init_lpf_funcs(BPC, sse2);
init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+#if BPC == 10
+ if (!bitexact) {
+ init_itx_func(TX_4X4, ADST_DCT, idct, iadst, 4, 10, sse2);
+ init_itx_func(TX_4X4, DCT_ADST, iadst, idct, 4, 10, sse2);
+ init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
+ }
+#endif
}
if (EXTERNAL_SSSE3(cpu_flags)) {
init_lpf_funcs(BPC, ssse3);
#if BPC == 10
if (!bitexact) {
- init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, ssse3);
+ init_itx_funcs(TX_4X4, 4, BPC, ssse3);
}
#endif
}
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index 200f15e..a3e0f86 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -289,64 +289,6 @@ IDCT_4x4_FN ssse3
; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------
-%macro VP9_IADST4_1D 0
- movq2dq xmm0, m0
- movq2dq xmm1, m1
- movq2dq xmm2, m2
- movq2dq xmm3, m3
-%if cpuflag(ssse3)
- paddw m3, m0
-%endif
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
- pmaddwd xmm1, xmm0, [pw_5283_13377]
- pmaddwd xmm4, xmm0, [pw_9929_13377]
-%if notcpuflag(ssse3)
- pmaddwd xmm6, xmm0, [pw_13377_0]
-%endif
- pmaddwd xmm0, [pw_15212_m13377]
- pmaddwd xmm3, xmm2, [pw_15212_9929]
-%if notcpuflag(ssse3)
- pmaddwd xmm7, xmm2, [pw_m13377_13377]
-%endif
- pmaddwd xmm2, [pw_m5283_m15212]
-%if cpuflag(ssse3)
- psubw m3, m2
-%else
- paddd xmm6, xmm7
-%endif
- paddd xmm0, xmm2
- paddd xmm3, xmm5
- paddd xmm2, xmm5
-%if notcpuflag(ssse3)
- paddd xmm6, xmm5
-%endif
- paddd xmm1, xmm3
- paddd xmm0, xmm3
- paddd xmm4, xmm2
- psrad xmm1, 14
- psrad xmm0, 14
- psrad xmm4, 14
-%if cpuflag(ssse3)
- pmulhrsw m3, [pw_13377x2] ; out2
-%else
- psrad xmm6, 14
-%endif
- packssdw xmm0, xmm0
- packssdw xmm1, xmm1
- packssdw xmm4, xmm4
-%if notcpuflag(ssse3)
- packssdw xmm6, xmm6
-%endif
- movdq2q m0, xmm0 ; out3
- movdq2q m1, xmm1 ; out0
- movdq2q m2, xmm4 ; out1
-%if notcpuflag(ssse3)
- movdq2q m3, xmm6 ; out2
-%endif
- SWAP 0, 1, 2, 3
-%endmacro
-
%macro IADST4_FN 5
INIT_MMX %5
cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
index 58987d3..11f080c 100644
--- a/libavcodec/x86/vp9itxfm_16bpp.asm
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -38,6 +38,15 @@ pw_m15137_6270: times 4 dw -15137, 6270
pw_6270_15137: times 4 dw 6270, 15137
pw_11585x2: times 8 dw 11585*2
+pw_5283_13377: times 4 dw 5283, 13377 ; word-pair tables read by VP9_IADST4_1D through pmaddwd
+pw_9929_13377: times 4 dw 9929, 13377
+pw_15212_m13377: times 4 dw 15212, -13377
+pw_15212_9929: times 4 dw 15212, 9929
+pw_m5283_m15212: times 4 dw -5283, -15212
+pw_13377x2: times 8 dw 13377*2 ; only read on the ssse3 path (pmulhrsw operand)
+pw_m13377_13377: times 4 dw -13377, 13377 ; only read on the non-ssse3 path
+pw_13377_0: times 4 dw 13377, 0 ; only read on the non-ssse3 path
+
SECTION .text
%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
@@ -126,6 +135,30 @@ IWHT4_FN 10, 1023
INIT_MMX mmxext
IWHT4_FN 12, 4095
+%macro VP9_IDCT4_WRITEOUT 0 ; rounds m0-m3 by 4 bits, then hands them to VP9_STORE_2X; users in this file zero m4 (pxor) before invoking
+%if cpuflag(ssse3)
+ mova m5, [pw_2048] ; pmulhrsw by 2048 yields (x + 8) >> 4 in a single instruction
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+%else
+ mova m5, [pw_8] ; rounding bias for the explicit add + arithmetic shift by 4
+ paddw m0, m5
+ paddw m1, m5
+ paddw m2, m5
+ paddw m3, m5
+ psraw m0, 4
+ psraw m1, 4
+ psraw m2, 4
+ psraw m3, 4
+%endif
+ mova m5, [pw_1023] ; passed as the "max" argument below (1023 is the largest 10-bit value)
+ VP9_STORE_2X 0, 1, 6, 7, 4, 5 ; regs m0/m1, temporaries m6/m7, min = m4, max = m5
+ lea dstq, [dstq+2*strideq] ; advance dstq by two strides
+ VP9_STORE_2X 2, 3, 6, 7, 4, 5 ; same for m2/m3
+%endmacro
+
; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
; in 15+1 words without additional effort, since the coefficients are 15bpp.
@@ -187,27 +220,7 @@ cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
pxor m4, m4
ZERO_BLOCK blockq, 16, 4, m4
-%if cpuflag(ssse3)
- mova m5, [pw_2048]
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- pmulhrsw m2, m5
- pmulhrsw m3, m5
-%else
- mova m5, [pw_8]
- paddw m0, m5
- paddw m1, m5
- paddw m2, m5
- paddw m3, m5
- psraw m0, 4
- psraw m1, 4
- psraw m2, 4
- psraw m3, 4
-%endif
- mova m5, [pw_1023]
- VP9_STORE_2X 0, 1, 6, 7, 4, 5
- lea dstq, [dstq+2*strideq]
- VP9_STORE_2X 2, 3, 6, 7, 4, 5
+ VP9_IDCT4_WRITEOUT
RET
%endmacro
@@ -215,3 +228,48 @@ INIT_MMX mmxext
IDCT4_10_FN
INIT_MMX ssse3
IDCT4_10_FN
+
+%macro IADST4_FN 4 ; %1, %3: transform names used in the symbol; %2, %4: names of the VP9_<name>_1D macros run as first and second pass
+cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8 ; the non-ssse3 branch of VP9_IADST4_1D writes xmm6 and xmm7
+%endif
+ movdqa xmm5, [pd_8192] ; constant VP9_IADST4_1D adds before its shifts by 14
+ mova m0, [blockq+0*16+0] ; each row is read as two 8-byte halves, 16 bytes per row
+ mova m4, [blockq+0*16+8]
+ mova m1, [blockq+1*16+0]
+ mova m5, [blockq+1*16+8] ; m5 is an MMX register here (see INIT_MMX below), so this presumably leaves xmm5 untouched
+ packssdw m0, m4 ; narrow the dwords to words with signed saturation
+ packssdw m1, m5
+ mova m2, [blockq+2*16+0]
+ mova m4, [blockq+2*16+8]
+ mova m3, [blockq+3*16+0]
+ mova m5, [blockq+3*16+8]
+ packssdw m2, m4
+ packssdw m3, m5
+
+%if cpuflag(ssse3)
+ mova m6, [pw_11585x2] ; NOTE(review): presumably consumed by VP9_IDCT4_1D, whose full body is not visible here
+%endif
+%ifnidn %1%3, iadstiadst
+ movdq2q m7, xmm5 ; low half of pd_8192 into m7; skipped when both passes are iadst
+%endif
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+
+ pxor m4, m4 ; zero m4: used to clear the block and as "min" in VP9_IDCT4_WRITEOUT
+ ZERO_BLOCK blockq, 16, 4, m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+INIT_MMX sse2 ; the three 4x4 pairings that contain an iadst pass, assembled with the sse2 cpuflags
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+INIT_MMX ssse3 ; the same three pairings, assembled with the ssse3 cpuflags
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm
index f1a05a5..d2f2257 100644
--- a/libavcodec/x86/vp9itxfm_template.asm
+++ b/libavcodec/x86/vp9itxfm_template.asm
@@ -82,3 +82,61 @@
VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
VP9_IDCT4_1D_FINALIZE
%endmacro
+
+%macro VP9_IADST4_1D 0 ; in: m0-m3 (MMX, words) plus xmm5 = dword constant added before the >>14; out: m0-m3
+ movq2dq xmm0, m0 ; copy the four MMX inputs into XMM registers for the pmaddwd work
+ movq2dq xmm1, m1
+ movq2dq xmm2, m2
+ movq2dq xmm3, m3
+%if cpuflag(ssse3)
+ paddw m3, m0 ; m3 = in3 + in0; xmm3 already holds the unmodified in3
+%endif
+ punpcklwd xmm0, xmm1 ; interleave in0/in1 words into pairs
+ punpcklwd xmm2, xmm3 ; interleave in2/in3 words into pairs
+ pmaddwd xmm1, xmm0, [pw_5283_13377]
+ pmaddwd xmm4, xmm0, [pw_9929_13377]
+%if notcpuflag(ssse3)
+ pmaddwd xmm6, xmm0, [pw_13377_0] ; 13377 * in0
+%endif
+ pmaddwd xmm0, [pw_15212_m13377]
+ pmaddwd xmm3, xmm2, [pw_15212_9929]
+%if notcpuflag(ssse3)
+ pmaddwd xmm7, xmm2, [pw_m13377_13377] ; 13377 * (in3 - in2)
+%endif
+ pmaddwd xmm2, [pw_m5283_m15212]
+%if cpuflag(ssse3)
+ psubw m3, m2 ; m3 = in0 + in3 - in2
+%else
+ paddd xmm6, xmm7 ; 13377 * (in0 + in3 - in2)
+%endif
+ paddd xmm0, xmm2
+ paddd xmm3, xmm5 ; fold the xmm5 constant in ahead of the shifts
+ paddd xmm2, xmm5
+%if notcpuflag(ssse3)
+ paddd xmm6, xmm5
+%endif
+ paddd xmm1, xmm3
+ paddd xmm0, xmm3
+ paddd xmm4, xmm2
+ psrad xmm1, 14
+ psrad xmm0, 14
+ psrad xmm4, 14
+%if cpuflag(ssse3)
+ pmulhrsw m3, [pw_13377x2] ; out2
+%else
+ psrad xmm6, 14
+%endif
+ packssdw xmm0, xmm0 ; narrow the dword results back to words
+ packssdw xmm1, xmm1
+ packssdw xmm4, xmm4
+%if notcpuflag(ssse3)
+ packssdw xmm6, xmm6
+%endif
+ movdq2q m0, xmm0 ; out3
+ movdq2q m1, xmm1 ; out0
+ movdq2q m2, xmm4 ; out1
+%if notcpuflag(ssse3)
+ movdq2q m3, xmm6 ; out2
+%endif
+ SWAP 0, 1, 2, 3 ; NOTE(review): presumably renames registers so m0..m3 = out0..out3; confirm against x86inc SWAP
+%endmacro
--
2.1.2
More information about the ffmpeg-devel
mailing list