[FFmpeg-devel] [PATCH 6/6] avcodec/h264: mmx2, sse2, avx 10-bit 4:2:2 h chroma intra deblock/loop filter
James Darnley
jdarnley at obe.tv
Thu Dec 1 18:57:49 EET 2016
Yorkfield:
- mmx2: 1.21x faster (370 vs. 306 cycles)
- sse2: 1.20x faster (370 vs. 308 cycles)
Skylake:
- mmx2: 1.39x faster (240 vs. 172 cycles)
- sse2: 1.74x faster (240 vs. 138 cycles)
- avx: 1.78x faster (240 vs. 134 cycles)
---
libavcodec/x86/h264_deblock_10bit.asm | 27 +++++++++++++++++++++++++++
libavcodec/x86/h264dsp_init.c | 3 +++
2 files changed, 30 insertions(+)
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index fb10bfb..8832210 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -1100,6 +1100,33 @@ cglobal deblock_h_chroma_intra_10, 4, 6, 8, 2*mmsize, pix_, stride_, alpha_, bet
%endif
RET
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma422_intra_10(uint16_t *pix, int stride, int alpha,
+; int beta)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma422_intra_10, 4, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_ ; x86inc-style declaration: presumably 4 args, 7 GPRs, 8 vector regs, 2*mmsize bytes of stack scratch -- confirm against x86inc.asm
+ shl alpha_d, 2 ; alpha *= 4 (presumably rescales the threshold for 10-bit samples -- TODO confirm)
+ shl beta_d, 2 ; beta *= 4, same scaling as alpha
+
+ mov r5, pix_q ; r5 starts as a copy of pix
+ lea r6, [3*stride_q] ; r6 = 3*stride; NOTE(review): prototype declares 'int stride' but the full-width stride_q is used -- confirm upper bits are defined on x86-64
+ add r5, r6 ; r5 = pix + 3*stride
+
+ mov r4, -8 ; r4 = loop counter, counts up from -8 towards zero
+ .loop: ; each pass covers mmsize/2 strides (see the pointer advance below)
+
+ CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize] ; macro defined elsewhere; is handed r5, r6 and the two mmsize-byte stack slots (presumably loads the pixels around the edge -- verify)
+ LOAD_AB m4, m5, alpha_d, beta_d ; macro defined elsewhere; presumably puts the scaled alpha/beta thresholds into m4/m5
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 ; macro defined elsewhere; presumably builds the filter mask from m0-m3 and the thresholds
+ CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 ; macro defined elsewhere; presumably applies the intra filter to p0/q0
+ CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize] ; takes the same operands as CHROMA_H_LOAD above (presumably writes the result back)
+
+ lea pix_q, [pix_q + (mmsize/2)*stride_q] ; advance pix by mmsize/2 strides
+ lea r5, [r5 + (mmsize/2)*stride_q] ; keep r5 at pix + 3*stride for the next pass
+ add r4, (mmsize/4) ; counter += mmsize/4, so the body runs 8/(mmsize/4) times = 16 strides in total
+ jl .loop ; loop again while the counter is still negative
+RET
+
%endmacro
%if ARCH_X86_64 == 0
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 395b8c3..8154cd9 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -318,6 +318,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_10_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_10_mmxext;
}
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
@@ -357,6 +358,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_10_sse2;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_10_sse2;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
@@ -398,6 +400,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_10_avx;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
+ c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_10_avx;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
--
2.10.2
More information about the ffmpeg-devel mailing list