[FFmpeg-devel] [PATCH] libavcodec/vp9: ipred_dl_32x32_16 avx2 implementation
James Almer
jamrial at gmail.com
Mon Jun 5 20:41:44 EEST 2017
On 6/4/2017 2:52 PM, Ilia Valiakhmetov wrote:
> vp9_diag_downleft_32x32_8bpp_c: 580.2
> vp9_diag_downleft_32x32_8bpp_sse2: 75.6
> vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
> vp9_diag_downleft_32x32_8bpp_avx: 72.7
> vp9_diag_downleft_32x32_10bpp_c: 1101.2
> vp9_diag_downleft_32x32_10bpp_sse2: 145.4
> vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
> vp9_diag_downleft_32x32_10bpp_avx: 134.8
> vp9_diag_downleft_32x32_10bpp_avx2: 94.0
> vp9_diag_downleft_32x32_12bpp_c: 1108.5
> vp9_diag_downleft_32x32_12bpp_sse2: 145.5
> vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
> vp9_diag_downleft_32x32_12bpp_avx: 135.2
> vp9_diag_downleft_32x32_12bpp_avx2: 94.0
>
> ~30% faster than avx implementation
Nice.
>
> ---
> libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++
> libavcodec/x86/vp9intrapred_16bpp.asm | 63 +++++++++++++++++++++++++++++++++++
> 2 files changed, 65 insertions(+)
>
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
> index 4576ff1..d1b8fcd 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2);
> decl_ipred_fns(dc_top, 16, mmxext, sse2);
> decl_ipred_fns(dc_left, 16, mmxext, sse2);
> decl_ipred_fn(dl, 16, 16, avx2);
> +decl_ipred_fn(dl, 32, 16, avx2);
>
> #define decl_ipred_dir_funcs(type) \
> decl_ipred_fns(type, 16, sse2, sse2); \
> @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
> init_fpel_func(1, 1, 64, avg, _16, avx2);
> init_fpel_func(0, 1, 128, avg, _16, avx2);
> init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
> + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
> }
>
> #endif /* HAVE_YASM */
> diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
> index 212e413..5cd6a3e 100644
> --- a/libavcodec/x86/vp9intrapred_16bpp.asm
> +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
> DEFINE_ARGS dst, stride, stride3, cnt
> mov cntd, 2
> lea stride3q, [strideq*3]
> +
Trailing whitespace.
> .loop:
> mova [dstq+strideq*0], m0
> vpalignr m3, m2, m0, 2
> @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
> dec cntd
> jg .loop
> RET
> +
Same.
> +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
> + movifnidn aq, amp
> + mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop
> + mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345
> + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
> + vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx
> + vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq
> + vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr
> + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ
> + vperm2i128 m5, m1, m4, q0201 ; yz01234555555555
> + vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455
> + vpalignr m3, m5, m1, 4 ; stuvwxyz01234555
> + LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
> + vperm2i128 m2, m1, m4, q0201 ; Z......555555555
> + vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
> + DEFINE_ARGS dst, stride, stride3, cnt
> + lea stride3q, [strideq*3]
> + mov cntd, 4
> +
Same.
Ronald can fix them before pushing (I think the git hooks would prevent
him from pushing this with them anyway), so there is no need to resend a
fixed patch. Just keep it in mind for future patchsets. The same goes for
tabs in files other than Makefile stuff.
> +.loop:
> + mova [dstq+strideq*0 + 0], m0
> + mova [dstq+strideq*0 +32], m1
> + vpalignr m3, m5, m0, 2
> + vpalignr m4, m2, m1, 2
> + mova [dstq+strideq*1 + 0], m3
> + mova [dstq+strideq*1 +32], m4
> + vpalignr m3, m5, m0, 4
> + vpalignr m4, m2, m1, 4
> + mova [dstq+strideq*2 + 0], m3
> + mova [dstq+strideq*2 +32], m4
> + vpalignr m3, m5, m0, 6
> + vpalignr m4, m2, m1, 6
> + mova [dstq+stride3q*1+ 0], m3
> + mova [dstq+stride3q*1+32], m4
> + lea dstq, [dstq+strideq*4]
> + vpalignr m3, m5, m0, 8
> + vpalignr m4, m2, m1, 8
> + mova [dstq+strideq*0 + 0], m3
> + mova [dstq+strideq*0 +32], m4
> + vpalignr m3, m5, m0, 10
> + vpalignr m4, m2, m1, 10
> + mova [dstq+strideq*1 + 0], m3
> + mova [dstq+strideq*1 +32], m4
> + vpalignr m3, m5, m0, 12
> + vpalignr m4, m2, m1, 12
> + mova [dstq+strideq*2+ 0], m3
> + mova [dstq+strideq*2+32], m4
> + vpalignr m3, m5, m0, 14
> + vpalignr m4, m2, m1, 14
> + mova [dstq+stride3q+ 0], m3
> + mova [dstq+stride3q+ 32], m4
> + vpalignr m3, m5, m0, 16
> + vpalignr m4, m2, m1, 16
> + vperm2i128 m5, m3, m4, q0201
> + vperm2i128 m2, m4, m4, q0101
> + mova m0, m3
> + mova m1, m4
> + lea dstq, [dstq+strideq*4]
> + dec cntd
> + jg .loop
> + RET
> %endif
>
> %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
>
More information about the ffmpeg-devel
mailing list