[FFmpeg-devel] [PATCH 09/10] h264_idct8_add4
James Darnley
jdarnley at obe.tv
Fri Mar 17 15:18:44 EET 2017
1.00x faster (2884±63.9 vs. 2880±21.1 decicycles) compared with sse2
---
libavcodec/x86/h264_idct.asm | 60 +++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/h264dsp_init.c | 2 ++
2 files changed, 62 insertions(+)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index f1f2ce7..1515ea5 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1237,3 +1237,63 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, s
add16_sse2_cycle 6, 0x1e
add16_sse2_cycle 7, 0x26
RET
+
+; dst, block_offset, block, stride, nnzc, counter, coeff, dst2, picreg
+; 0 1 2 3 4 5 6 7 8 (register index rN bound to each name above; e.g. coeff_ is r6, block_offset_ is r1)
+cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst_, block_offset_, block_, stride_, nnzc_, counter_, coeff_, dst2_, picreg ; x86inc cglobal: 5 arguments, 8 + npicregs GPRs, 10 vector registers, then the register names
+ movsxdifnidn stride_q, stride_d ; x86inc helper: sign-extend the 32-bit stride to pointer width where needed
+ xor counter_q, counter_q ; counter = 0
+ %ifdef PIC
+ lea picregq, [scan8_mem] ; PIC builds: keep the address of scan8_mem in picreg
+ %endif
+
+ .next_block: ; loop head; counter advances by 4 per iteration until it reaches 16
+ movzx coeff_d, byte [scan8 + counter_q] ; coeff = byte at scan8[counter]
+ movzx coeff_d, byte [nnzc_q + coeff_q] ; coeff = nnzc[coeff] (byte, zero-extended)
+ test coeff_d, coeff_d
+ jz .skip_block ; nnzc byte is 0: nothing is done for this block
+
+ cmp coeff_d, 1
+ jnz .no_dc ; nnzc byte != 1: take the IDCT8_ADD_SSE path
+
+ movsx coeff_d, word [block_q] ; coeff = block[0], sign-extended 16-bit word
+ test coeff_d, coeff_d
+ jz .no_dc ; block[0] == 0: take the IDCT8_ADD_SSE path as well
+
+ mov word [block_q], 0 ; clear block[0]; its value is still held in coeff (r6)
+ DC_ADD_INIT r6 ; macro defined outside this hunk; r6 (coeff) is passed in and is used as stride3 afterwards - presumably the macro leaves 3*stride in it, TODO confirm
+ %define stride3 r6
+ %if ARCH_X86_64 == 0 ; x86-32 only: dst2 is aliased onto r1, the register that holds block_offset
+ %define dst2_q r1
+ %define dst2_d r1d
+ %endif
+
+ mov dst2_d, dword [block_offset_q + 4*counter_q] ; dst2 = block_offset[counter] (32-bit entries)
+ add dst2_q, dst_q ; dst2 = dst + block_offset[counter]
+ DC_ADD_MMXEXT_OP movq, dst2_q, stride_q, stride3 ; first DC-add macro call, at dst2
+ lea dst2_q, [dst2_q + 4*stride_q] ; advance dst2 by four strides
+ DC_ADD_MMXEXT_OP movq, dst2_q, stride_q, stride3 ; second DC-add macro call, four strides further on
+ %if ARCH_X86_64 == 0
+ mov block_offset_q, block_offset_m ; x86-32 only: reload block_offset, since r1 was overwritten as dst2
+ %endif
+
+ add counter_q, 4 ; counter += 4
+ add block_q, 128 ; block += 128 bytes (64 16-bit words)
+ cmp counter_q, 16
+ jl .next_block ; loop while counter < 16 (signed compare)
+ RET
+
+ .no_dc: ; reached when the nnzc byte is != 1, or when block[0] is 0
+ mov dst2_d, dword [block_offset_q + 4*counter_q] ; dst2 = block_offset[counter] (32-bit entries)
+ add dst2_q, dst_q ; dst2 = dst + block_offset[counter]
+ IDCT8_ADD_SSE dst2_q, block_q, stride_q, stride3 ; macro defined outside this hunk; stride3 (r6) is its 4th argument and still holds coeff here - presumably the macro sets it itself, TODO confirm
+ %if ARCH_X86_64 == 0
+ mov block_offset_q, block_offset_m ; x86-32 only: reload block_offset, since r1 was overwritten as dst2
+ %endif
+
+ .skip_block: ; loop tail for the skip and .no_dc paths; same four steps as the tail of the DC path
+ add counter_q, 4 ; counter += 4
+ add block_q, 128 ; block += 128 bytes (64 16-bit words)
+ cmp counter_q, 16
+ jl .next_block ; loop while counter < 16 (signed compare)
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 3396fd8..4050276 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -57,6 +57,7 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
+IDCT_ADD_REP_FUNC(8, 4, 8, avx)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
@@ -348,6 +349,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_avx;
c->h264_idct_add16 = ff_h264_idct_add16_8_avx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
--
2.8.3
More information about the ffmpeg-devel mailing list