[FFmpeg-devel] [PATCH] avcodec/vp9: add vp9_idct_idct_4x4_add_ssse3
Clément Bœsch
u at pkh.me
Mon Oct 28 20:56:29 CET 2013
---
libavcodec/x86/vp9dsp.asm | 141 +++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/vp9dsp_init.c | 3 +
2 files changed, 144 insertions(+)
diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm
index f81ac72..228a0a6 100644
--- a/libavcodec/x86/vp9dsp.asm
+++ b/libavcodec/x86/vp9dsp.asm
@@ -83,8 +83,149 @@ const filters_ssse3 ; smooth
F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2
F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1
+pw_11585x2: times 4 dw 23170 ; 2*11585, pmulhrsw multiplier producing t0/t1
+pw_6270x2: times 4 dw 12540 ; 2*6270, pmulhrsw multiplier producing t2 in VP9_IDCT2_1D
+pw_15137x2: times 4 dw 30274 ; 2*15137, pmulhrsw multiplier producing t3 in VP9_IDCT2_1D
+pw_t2_coef: dw -15137, 6270, -15137, 6270 ; pmaddwd word pairs producing t2 in VP9_IDCT4_1D
+pw_t3_coef: dw 6270, 15137, 6270, 15137 ; pmaddwd word pairs producing t3 in VP9_IDCT4_1D
+pd_round: times 2 dd 1<<13 ; rounding term added before the >>14 in VP9_MULSUB_2W_2X
+pw_2048: times 4 dw 2048 ; pmulhrsw by 2048 gives the final (x+8)>>4 rounding
+
SECTION .text
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+; per dword: dstN = (a*x + b*y + round) >> 14, with (a,b) word pairs from coefsN and (x,y) word pairs from src
+%macro VP9_MULSUB_2W_2X 6 ; dst1, dst2, src (unchanged), round, coefs1, coefs2
+ movq m%1, [%5] ; dst1 = coefs1 word pairs
+ movq m%2, [%6] ; dst2 = coefs2 word pairs
+ pmaddwd m%1, m%3 ; multiply words by src and add adjacent products into dwords
+ pmaddwd m%2, m%3
+ paddd m%1, m%4 ; add the rounding register
+ paddd m%2, m%4
+ psrad m%1, 14 ; arithmetic right shift by 14
+ psrad m%2, 14
+%endmacro
+
+%macro VP9_IDCT4_1D 0 ; one 1-D pass over m0-m3; m7 is used as the rounding register; clobbers m4-m6
+ SUMSUB_BA w, 2, 0, 4 ; m2=m0+m2, m0=m0-m2 (same convention as the SUMSUB_BA lines further down)
+ movq m4, [pw_11585x2]
+ pmulhrsw m0, m4 ; m0=t1
+ pmulhrsw m2, m4 ; m2=t0
+ movq m6, m3 ; keep a copy of m3 for the low-half interleave
+ punpckhwd m3, m1 ; interleave the high words of m3 and m1 into (m3,m1) pairs
+ VP9_MULSUB_2W_2X 4, 5, 3, 7, pw_t2_coef, pw_t3_coef ; m4=t2, m5=t3 for the high halves (dwords)
+ punpcklwd m6, m1 ; interleave the low words of the m3 copy and m1
+ VP9_MULSUB_2W_2X 1, 3, 6, 7, pw_t2_coef, pw_t3_coef ; m1=t2, m3=t3 for the low halves (dwords)
+ packssdw m1, m4 ; m1=t2 (low and high halves packed back to words)
+ packssdw m3, m5 ; m3=t3 (low and high halves packed back to words)
+ SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
+ SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
+ SWAP 0, 3 ; 3102 -> 0132
+ SWAP 3, 2 ; 0132 -> 0123
+%endmacro
+
+%macro VP9_IDCT2_1D 0 ; one 1-D pass using only m0 and m1 as inputs; m5/m6/m7 = pmulhrsw multipliers set by the caller
+ pmulhrsw m0, m5 ; m0=t1
+ movq m2, m0 ; m2=t0 (same value as t1 in this reduced pass)
+ movq m3, m1 ; copy m1 so that both t2 and t3 can be derived from it
+ pmulhrsw m1, m6 ; m1=t2
+ pmulhrsw m3, m7 ; m3=t3
+ SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
+ SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
+ SWAP 0, 3 ; 3102 -> 0132
+ SWAP 3, 2 ; 0132 -> 0123
+%endmacro
+
+%macro VP9_STORE_2X 2 ; %1, %2 = word operands added to the rows at dstq and dstq+strideq; needs m4 == 0; clobbers m6, m7
+ movd m6, [dstq] ; load 4 bytes from the first row
+ movd m7, [dstq+strideq] ; load 4 bytes from the second row
+ punpcklbw m6, m4 ; widen bytes to words by interleaving with m4 (zeroed by the callers)
+ punpcklbw m7, m4
+ paddw m6, %1 ; add the first operand to the first row
+ paddw m7, %2 ; add the second operand to the second row
+ packuswb m6, m4 ; pack words back to bytes with unsigned saturation
+ packuswb m7, m4
+ movd [dstq], m6 ; store 4 bytes to the first row
+ movd [dstq+strideq], m7 ; store 4 bytes to the second row
+%endmacro
+
+INIT_MMX ssse3
+cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
+
+ cmp eobd, 4 ; 2x2 or smaller
+ jg .idctfull ; eob > 4: run the full 4x4 transform
+
+ cmp eobd, 1 ; dc only
+ jne .idct2x2 ; eob != 1 (and <= 4): take the 2x2 path
+ movd m0, [blockq] ; load the first coefficient into word 0
+ movq m5, [pw_11585x2]
+ pmulhrsw m0, m5 ; first scaling by 11585x2
+ pmulhrsw m0, m5 ; second scaling by 11585x2
+ pshufw m0, m0, 0 ; broadcast word 0 to all four words
+ pxor m4, m4 ; zero register, used for the block reset and VP9_STORE_2X
+ movq [blockq], m4 ; zero block bytes 0-7
+ movq m5, [pw_2048]
+ pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+ VP9_STORE_2X m0, m0 ; add to the rows at dst and dst+stride
+ lea dstq, [dstq+2*strideq] ; advance dst by two rows
+ VP9_STORE_2X m0, m0 ; add to the following two rows
+ RET
+
+.idct2x2:
+ movd m0, [blockq+0] ; 4 bytes (two int16 coefficients) at byte offset 0
+ movd m1, [blockq+8] ; 4 bytes (two int16 coefficients) at byte offset 8
+ movq m5, [pw_11585x2] ; VP9_IDCT2_1D multiplier for t0/t1
+ movq m6, [pw_6270x2] ; VP9_IDCT2_1D multiplier for t2
+ movq m7, [pw_15137x2] ; VP9_IDCT2_1D multiplier for t3
+
+ VP9_IDCT2_1D ; first 1-D pass
+ TRANSPOSE4x4W 0, 1, 2, 3, 4 ; transpose m0-m3 between the passes (last argument presumably a scratch register)
+ VP9_IDCT2_1D ; second 1-D pass
+
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ movq [blockq+ 0], m4 ; zero block bytes 0-7
+ movq [blockq+ 8], m4 ; zero block bytes 8-15
+
+ movq m5, [pw_2048]
+ pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+ pmulhrsw m1, m5
+ VP9_STORE_2X m0, m1 ; add m0/m1 to the rows at dst and dst+stride
+ lea dstq, [dstq+2*strideq] ; advance dst by two rows
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ VP9_STORE_2X m2, m3 ; add m2/m3 to the following two rows
+ RET
+
+.idctfull:
+ movq m0, [blockq+ 0] ; four int16 coefficients per register, 32 bytes in total
+ movq m1, [blockq+ 8]
+ movq m2, [blockq+16]
+ movq m3, [blockq+24]
+
+ movq m7, [pd_round] ; rounding register consumed by VP9_MULSUB_2W_2X inside VP9_IDCT4_1D
+ VP9_IDCT4_1D ; first 1-D pass
+ TRANSPOSE4x4W 0, 1, 2, 3, 4 ; transpose m0-m3 between the passes (last argument presumably a scratch register)
+ VP9_IDCT4_1D ; second 1-D pass
+
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ movq [blockq+ 0], m4 ; zero all 32 bytes of the block
+ movq [blockq+ 8], m4
+ movq [blockq+16], m4
+ movq [blockq+24], m4
+
+ movq m5, [pw_2048]
+ pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+ pmulhrsw m1, m5
+ VP9_STORE_2X m0, m1 ; add m0/m1 to the rows at dst and dst+stride
+ lea dstq, [dstq+2*strideq] ; advance dst by two rows
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ VP9_STORE_2X m2, m3 ; add m2/m3 to the following two rows
+ RET
+
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index cf7a1a4..d131598 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -150,6 +150,8 @@ filters_8tap_1d_fn3(avg)
#undef filters_8tap_1d_fn3
#undef filter_8tap_1d_fn
+void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); /* presumably the cglobal vp9_idct_idct_4x4_add in vp9dsp.asm */
+
#endif /* HAVE_YASM */
av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
@@ -203,6 +205,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
if (cpu_flags & AV_CPU_FLAG_SSSE3) {
init_subpel3(0, put, ssse3);
init_subpel3(1, avg, ssse3);
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
}
#undef init_fpel
--
1.8.4.1
More information about the ffmpeg-devel
mailing list