[FFmpeg-devel] [PATCH 10/10] h264_idct_add16intra
James Darnley
jdarnley at obe.tv
Fri Mar 17 15:18:45 EET 2017
Known broken: this patch currently fails FATE.
1.02x faster (1580±4.8 vs. 1555±3.9 decicycles) compared with sse2
---
libavcodec/x86/h264_idct.asm | 43 +++++++++++++++++++++++++++++++++++++++++--
libavcodec/x86/h264dsp_init.c | 2 ++
2 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 1515ea5..16998dc 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -895,7 +895,7 @@ REP_RET
%else
add r0, r0m
%endif
- call h264_add8x4_idct_sse2
+ call h264_add8x4_idct_ %+ cpuname
jmp %%skip
%%trydc:
movsx r0, word [r2 ]
@@ -907,13 +907,15 @@ REP_RET
%else
add r0, r0m
%endif
- call h264_idct_dc_add8_mmxext
+ call h264_idct_dc_add8_ %+ cpuname
%%skip:
%if %1 < 7
add r2, 64
%endif
%endmacro
+%define h264_idct_dc_add8_sse2 h264_idct_dc_add8_mmxext
+
; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
@@ -1193,6 +1195,27 @@ ret
packuswb m1, m1
%endmacro
+ALIGN 16
+; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
+; Rounds the DC words at [r2] and [r2+32] as (x+32)>>6, zeroes them in memory, then applies them via DC_ADD_MMXEXT_OP
+h264_idct_dc_add8_avx:
+ movsxdifnidn r3, r3d
+ movd m0, [r2 ] ; 0 0 X D
+ mov word [r2+ 0], 0
+ punpcklwd m0, [r2+32] ; x X d D
+ mov word [r2+32], 0
+ paddsw m0, [pw_32]
+ psraw m0, 6
+ punpcklwd m0, m0 ; x x X X d d D D (low four words duplicated)
+ pxor m1, m1 ; 0 0 0 0
+ psubw m1, m0 ; -x-x-X-X-d-d-D-D
+ packuswb m0, m1 ; XMM: negated bytes land in the HIGH qword, positive bytes (x x X X d d D D) in the low qword
+ punpckhwd m1, m0, m0 ; low qword: -d-d-d-d-D-D-D-D (taken from the high half; pshuflw q3322 would pick the junk X/x lanes)
+ punpcklwd m0, m0 ; low qword: d d d d D D D D
+ lea r6, [r3*3]
+ DC_ADD_MMXEXT_OP movq, r0, r3, r6
+ret
+
cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
movsxdifnidn stride_q, stride_d
IDCT4_ADD dst_q, block_q, stride_q
@@ -1238,6 +1261,22 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, s
add16_sse2_cycle 7, 0x26
RET
+; Runs eight add16intra_sse2_cycle steps (indices 0..7). FIXME (from the patch author): this function produces incorrect output
+cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8, dst_, block_offset_, block_, stride_, nnzc_
+ movsxdifnidn stride_q, stride_d ; sign-extend the int stride to pointer width when the two registers differ
+ %if ARCH_X86_64
+ mov r7, r0 ; keep the incoming dst pointer; NOTE(review): presumably the cycle macro re-adds it where the non-64-bit path uses r0m - confirm
+ %endif
+ add16intra_sse2_cycle 0, 0xc ; args: cycle index, then a per-cycle constant (NOTE(review): looks like an nnzc offset - confirm against the macro)
+ add16intra_sse2_cycle 1, 0x14
+ add16intra_sse2_cycle 2, 0xe
+ add16intra_sse2_cycle 3, 0x16
+ add16intra_sse2_cycle 4, 0x1c
+ add16intra_sse2_cycle 5, 0x24
+ add16intra_sse2_cycle 6, 0x1e
+ add16intra_sse2_cycle 7, 0x26 ; last cycle (index 7)
+RET
+
; dst, block_offset, block, stride, nnzc, counter, coeff, dst2, picreg
; 0 1 2 3 4 5 6 7 8
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst_, block_offset_, block_, stride_, nnzc_, counter_, coeff_, dst2_, picreg
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 4050276..e09566d 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -68,6 +68,7 @@ IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
+IDCT_ADD_REP_FUNC(, 16intra, 8, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
@@ -350,6 +351,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_avx;
c->h264_idct_add16 = ff_h264_idct_add16_8_avx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_avx;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
--
2.8.3
More information about the ffmpeg-devel
mailing list