[FFmpeg-devel] [PATCH] libavcodec/vp9: ipred_dl_32x32_16 avx2 implementation
Ronald S. Bultje
rsbultje at gmail.com
Tue Jun 6 15:13:06 EEST 2017
Hi,
On Mon, Jun 5, 2017 at 1:41 PM, James Almer <jamrial at gmail.com> wrote:
> On 6/4/2017 2:52 PM, Ilia Valiakhmetov wrote:
> > vp9_diag_downleft_32x32_8bpp_c: 580.2
> > vp9_diag_downleft_32x32_8bpp_sse2: 75.6
> > vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
> > vp9_diag_downleft_32x32_8bpp_avx: 72.7
> > vp9_diag_downleft_32x32_10bpp_c: 1101.2
> > vp9_diag_downleft_32x32_10bpp_sse2: 145.4
> > vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
> > vp9_diag_downleft_32x32_10bpp_avx: 134.8
> > vp9_diag_downleft_32x32_10bpp_avx2: 94.0
> > vp9_diag_downleft_32x32_12bpp_c: 1108.5
> > vp9_diag_downleft_32x32_12bpp_sse2: 145.5
> > vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
> > vp9_diag_downleft_32x32_12bpp_avx: 135.2
> > vp9_diag_downleft_32x32_12bpp_avx2: 94.0
> >
> > ~30% faster than avx implementation
>
> Nice.
>
> >
> > ---
> > libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++
> > libavcodec/x86/vp9intrapred_16bpp.asm | 63
> +++++++++++++++++++++++++++++++++++
> > 2 files changed, 65 insertions(+)
> >
> > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c
> b/libavcodec/x86/vp9dsp_init_16bpp.c
> > index 4576ff1..d1b8fcd 100644
> > --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> > @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2);
> > decl_ipred_fns(dc_top, 16, mmxext, sse2);
> > decl_ipred_fns(dc_left, 16, mmxext, sse2);
> > decl_ipred_fn(dl, 16, 16, avx2);
> > +decl_ipred_fn(dl, 32, 16, avx2);
> >
> > #define decl_ipred_dir_funcs(type) \
> > decl_ipred_fns(type, 16, sse2, sse2); \
> > @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext
> *dsp)
> > init_fpel_func(1, 1, 64, avg, _16, avx2);
> > init_fpel_func(0, 1, 128, avg, _16, avx2);
> > init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
> > + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
> > }
> >
> > #endif /* HAVE_YASM */
> > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm
> b/libavcodec/x86/vp9intrapred_16bpp.asm
> > index 212e413..5cd6a3e 100644
> > --- a/libavcodec/x86/vp9intrapred_16bpp.asm
> > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> > @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride,
> l, a
> > DEFINE_ARGS dst, stride, stride3, cnt
> > mov cntd, 2
> > lea stride3q, [strideq*3]
> > +
>
> Trailing whitespace.
>
> > .loop:
> > mova [dstq+strideq*0], m0
> > vpalignr m3, m2, m0, 2
> > @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst,
> stride, l, a
> > dec cntd
> > jg .loop
> > RET
> > +
>
> Same.
>
> > +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
> > + movifnidn aq, amp
> > + mova m0, [aq+mmsize*0+ 0] ;
> abcdefghijklmnop
> > + mova m1, [aq+mmsize*1+ 0] ;
> qrstuvwxyz012345
> > + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
> > + vperm2i128 m5, m0, m1, q0201 ;
> ijklmnopqrstuvwx
> > + vpalignr m2, m5, m0, 2 ;
> bcdefghijklmnopq
> > + vpalignr m3, m5, m0, 4 ;
> cdefghijklmnopqr
> > + LOWPASS 0, 2, 3 ;
> BCDEFGHIJKLMNOPQ
> > + vperm2i128 m5, m1, m4, q0201 ;
> yz01234555555555
> > + vpalignr m2, m5, m1, 2 ;
> rstuvwxyz0123455
> > + vpalignr m3, m5, m1, 4 ;
> stuvwxyz01234555
> > + LOWPASS 1, 2, 3 ;
> RSTUVWXYZ......5
> > + vperm2i128 m2, m1, m4, q0201 ;
> Z......555555555
> > + vperm2i128 m5, m0, m1, q0201 ;
> JKLMNOPQRSTUVWXY
> > + DEFINE_ARGS dst, stride, stride3, cnt
> > + lea stride3q, [strideq*3]
> > + mov cntd, 4
> > +
>
> Same.
>
> Ronald can fix them before pushing (I think the git hooks would prevent
> him from pushing this with them anyway), so there is no need to resend a
> fixed patch. Just keep it in mind for future patchsets. The same goes for
> tabs in files other than Makefile stuff.
Pushed with that fixed.
Ronald
More information about the ffmpeg-devel mailing list