[FFmpeg-devel] [PATCH 04/10] h264_idct_add
James Darnley
jdarnley at obe.tv
Fri Mar 17 15:18:39 EET 2017
1.20x faster (658±0.8 vs. 547±0.2 decicycles) compared with mmxext
---
libavcodec/x86/h264_idct.asm | 33 ++++++++++++++++++++++++++++++++-
libavcodec/x86/h264dsp_init.c | 3 +++
2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index bc4dce4..24fb4d2 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -65,7 +65,15 @@ SECTION .text
IDCT4_1D w, 0, 1, 2, 3, 4, 5
mova m6, [pw_32]
- TRANSPOSE4x4W 0, 1, 2, 3, 4
+ %if mmsize == 8
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ %else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ SBUTTERFLY dq, 0, 2, 4
+ MOVHL m1, m0
+ MOVHL m3, m2
+ %endif
paddw m0, m6
IDCT4_1D w, 0, 1, 2, 3, 4, 5
pxor m7, m7
@@ -1131,3 +1139,26 @@ INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7
+
+INIT_XMM avx
+
+; %unmacro STORE_DIFFx2 8 ; would first remove the STORE_DIFFx2 macro defined in x86util.asm, but yasm does not support %unmacro yet
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride -- adds two shifted rows of words (add1, add2) to the 4-byte rows at [source] and [source+stride] and stores the saturated result back
+ movd %3, [%7] ; reg1 = 4 bytes loaded from [source]
+ movd %4, [%7+%8] ; reg2 = 4 bytes loaded from [source+stride]
+ psraw %1, %6 ; arithmetic right shift of add1's words by 'shift' (modifies add1 in place)
+ psraw %2, %6 ; same for add2
+ punpcklbw %3, %5 ; interleave reg1's low bytes with 'zero': widens bytes to words when the zero register is all-zero
+ punpcklbw %4, %5 ; same for reg2
+ paddw %3, %1 ; reg1 += add1 (16-bit lanes)
+ paddw %4, %2 ; reg2 += add2 (16-bit lanes)
+ packuswb %3, %5 ; narrow words back to bytes with unsigned saturation (clamps each value to 0..255)
+ packuswb %4, %5 ; same for reg2
+ movd [%7], %3 ; store the low 4 result bytes back to [source]
+ movd [%7+%8], %4 ; store the low 4 result bytes back to [source+stride]
+%endmacro
+
+cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ ; AVX entry point (INIT_XMM avx above); the numeric fields are presumably x86inc's arg count, GPR count and XMM register count -- confirm against x86inc.asm
+ movsxdifnidn stride_q, stride_d ; x86inc helper; presumably sign-extends the 32-bit stride into the full-width register when the two names differ -- the C prototype declares stride as int
+ IDCT4_ADD dst_q, block_q, stride_q ; expands the IDCT4_ADD macro (defined elsewhere in this file, not shown here) on dst, block and stride
+RET ; x86inc return macro
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 0643b37..8ba085f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int stride);
IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 10, mmxext)
@@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
}
+
+ c->h264_idct_add = ff_h264_idct_add_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
--
2.8.3
More information about the ffmpeg-devel
mailing list