[FFmpeg-devel] [PATCH 2/2] Faster 8x8 IDCT.
Ronald S. Bultje
rsbultje
Wed Feb 16 18:39:21 CET 2011
after (MMX)
user 0m21.346s
user 0m21.408s
user 0m21.498s
avg 21.417 sec
before (C)
user 0m24.664s
user 0m24.604s
user 0m24.777s
avg 24.682 sec
~15% faster overall.
---
libavcodec/x86/dsputil_mmx.c | 1 +
libavcodec/x86/dsputil_mmx.h | 4 +-
libavcodec/x86/vc1dsp_mmx.c | 3 +
libavcodec/x86/vc1dsp_yasm.asm | 139 +++++++++++++++++++++++++++++++++++++++-
4 files changed, 144 insertions(+), 3 deletions(-)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 39bf3f2..9d50058 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -48,6 +48,7 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x00040
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_12 ) = 0x000C000C000C000CULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index a095e1e..ae62dda 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -37,8 +37,9 @@ extern const xmm_reg ff_pw_3;
extern const xmm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
+extern const uint64_t ff_pw_12;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
extern const uint64_t ff_pw_20;
extern const xmm_reg ff_pw_27;
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 04b4aba..5d30bd0 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -712,6 +712,8 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
ff_vc1_h_loop_filter8_sse4(src, stride, pq);
ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
+
+void ff_vc1_inv_trans_8x8_mmx(int16_t block[64]);
#endif
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
@@ -775,6 +777,7 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
#if HAVE_YASM
if (mm_flags & AV_CPU_FLAG_MMX) {
ASSIGN_LF(mmx);
+ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_mmx;
}
return;
if (mm_flags & AV_CPU_FLAG_MMX2) {
diff --git a/libavcodec/x86/vc1dsp_yasm.asm b/libavcodec/x86/vc1dsp_yasm.asm
index 3ea9d8d..cc45fe7 100644
--- a/libavcodec/x86/vc1dsp_yasm.asm
+++ b/libavcodec/x86/vc1dsp_yasm.asm
@@ -1,6 +1,7 @@
;******************************************************************************
-;* VC1 deblocking optimizations
+;* VC1 deblocking and IDCT optimizations
;* Copyright (c) 2009 David Conrad
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje at gmail.com>
;*
;* This file is part of FFmpeg.
;*
@@ -22,8 +23,12 @@
%include "x86inc.asm"
%include "x86util.asm"
+cextern pw_1
cextern pw_4
cextern pw_5
+cextern pw_6
+cextern pw_12
+cextern pw_64
section .text
@@ -328,3 +333,135 @@ cglobal vc1_h_loop_filter8_sse4, 3,5,8
START_H_FILTER 8
VC1_H_LOOP_FILTER 8
RET
+
+INIT_MMX
+; %1=src, %2/3=dst[0/1], %4=add for t1/t2, %5=shift, %6=only set for second round
+%macro VC1_IDCT_1D 5-6
+ movq m0, [%1+ 0] ; 4x src[ 0]
+ movq m1, [%1+64] ; 4x src[32]
+ movq m2, [%1+32] ; 4x src[16]
+ movq m3, [%1+96] ; 4x src[48]
+ pmullw m0, [pw_12] ; 4x(src[ 0]*12)
+ pmullw m1, [pw_12] ; 4x(src[32]*12)
+ paddw m0, [%4] ; 4x(src[ 0]*12+4/64)
+ SUMSUB_BA m1, m0, m4 ; mm1=t1, mm0=t2
+ movq m4, m2
+ movq m5, m3
+ pmullw m3, [pw_6] ; 4x(src[48]* 6)
+ psllw m4, 4 ; 4x(src[16]*16)
+ psllw m5, 4 ; 4x(src[48]*16)
+ pmullw m2, [pw_6] ; 4x(src[16]* 6)
+ paddw m3, m4 ; mm3=t3
+ psubw m2, m5 ; mm2=t4
+
+ SUMSUB_BA m3, m1, m4 ; mm3=t5, mm1=t8
+ SUMSUB_BA m2, m0, m4 ; mm2=t6, mm0=t7
+ movq [rsp], m0 ; save t7
+
+ movq m5, [%1+16] ; 4x src[ 8]
+ movq m0, m5
+ psllw m5, 2 ; 4x(src[ 8]*4)
+ movq m7, m5 ; t4, part 1
+ paddw m5, m5 ; 4x(src[ 8]*8)
+ movq m6, m5
+ paddw m5, m5 ; t1, part 1
+ paddw m6, m0 ; t3, part 1
+ movq m4, m5
+ psubw m5, m0 ; t2, part 1
+
+ movq m0, [%1+48] ; 4x src[24]
+ psubw m4, m0
+ psubw m7, m0
+ psllw m0, 2 ; 4x(src[24]* 4)
+ psubw m5, m0 ; t2, part 1-2
+ paddw m0, m0 ; 4x(src[24]* 8)
+ psubw m7, m0 ; t4, part 1-2
+ paddw m0, m0 ; 4x(src[24]*16)
+ psubw m6, m0 ; t3, part 1-2
+ paddw m4, m0 ; t1, part 1-2
+
+ movq m0, [%1+80] ; 4x src[40]
+ paddw m4, m0
+ psubw m7, m0
+ psllw m0, 2 ; 4x(src[40]* 4)
+ paddw m6, m0 ; t3, part 1-3
+ paddw m0, m0 ; 4x(src[40]* 8)
+ paddw m4, m0 ; t1, part 1-3
+ paddw m0, m0 ; 4x(src[40]*16)
+ psubw m5, m0 ; t2, part 1-3
+ paddw m7, m0 ; t4, part 1-3
+
+ movq m0, [%1+112] ; 4x src[56]
+ psubw m5, m0
+ psubw m6, m0
+ psllw m0, 2 ; 4x(src[56]* 4)
+ paddw m4, m0 ; t1
+ paddw m0, m0 ; 4x(src[56]* 8)
+ psubw m5, m0 ; t2
+ paddw m0, m0 ; 4x(src[56]*16)
+ paddw m6, m0 ; t3
+ psubw m7, m0 ; t4
+
+ SUMSUB_BA m7, m1, m0 ; mm7=t8+t4, mm1=t8-t4
+ SUMSUB_BA m5, m2, m0 ; mm5=t6+t2, mm2=t6-t2
+ SUMSUB_BA m4, m3, m0 ; mm4=t5+t1, mm3=t5-t1
+ movq m0, [rsp] ; restore t7
+ SUMSUB_BA m6, m0 ; mm6=t7+t3, mm0=t7-t3
+%if %0 == 5
+ movq [rsp], m0 ; save t7-t3
+%endif
+ psraw m4, %5
+ psraw m5, %5
+ psraw m6, %5
+ psraw m7, %5
+%if %0 == 5
+ TRANSPOSE4x4W 4, 5, 6, 7, 0
+ movq m0, [rsp] ; restore t7-t3
+%endif
+ movq [%2+ 0], m4
+%if %0 == 6
+ movq m4, [pw_1]
+ paddw m1, m4
+ paddw m0, m4
+ paddw m2, m4
+ paddw m3, m4
+%endif
+ psraw m1, %5
+ psraw m0, %5
+ psraw m2, %5
+ psraw m3, %5
+%if %0 == 5
+ TRANSPOSE4x4W 1, 0, 2, 3, 4
+%endif
+ movq [%2+16], m5
+ movq [%2+32], m6
+ movq [%2+48], m7
+ movq [%3+ 0], m1
+ movq [%3+16], m0
+ movq [%3+32], m2
+ movq [%3+48], m3
+%endmacro
+
+; void ff_vc1_inv_trans_8x8_<opt>(int16_t block[64])
+cglobal vc1_inv_trans_8x8_mmx, 1, 2, 0
+ sub rsp, 128
+ mov r1d, 2
+.idct1d_a:
+ VC1_IDCT_1D r0, rsp, rsp+8, pw_4, 3
+ add r0, 8
+ add rsp, 64
+ dec r1d
+ jnz .idct1d_a
+
+ sub rsp, 128
+ sub r0, 16
+ mov r1d, 2
+.idct1d_b:
+ VC1_IDCT_1D rsp, r0, r0+64, pw_64, 7, 1
+ add r0, 8
+ add rsp, 8
+ dec r1d
+ jnz .idct1d_b
+
+ add rsp, 112
+ RET
--
1.7.2.1
More information about the ffmpeg-devel
mailing list