[FFmpeg-devel] [PATCH 01/15] vp9/x86: save one register in loopfilter surface coverage.
Hendrik Leppkes
h.leppkes at gmail.com
Sat Dec 27 17:37:22 CET 2014
On Sat, Dec 27, 2014 at 5:24 PM, Clément Bœsch <u at pkh.me> wrote:
> On Sat, Dec 27, 2014 at 11:02:36AM -0500, Ronald S. Bultje wrote:
> > ---
> > libavcodec/x86/vp9lpf.asm | 56 +++++++++++++++++++++++------------------------
> > 1 file changed, 28 insertions(+), 28 deletions(-)
> >
> > diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
> > index 416f08f..e0f7386 100644
> > --- a/libavcodec/x86/vp9lpf.asm
> > +++ b/libavcodec/x86/vp9lpf.asm
> > @@ -278,22 +278,22 @@ SECTION .text
> > %endmacro
> >
> > %macro DEFINE_REAL_P7_TO_Q7 0-1 0
> > -%define P7 dst1q + 2*mstrideq + %1
> > -%define P6 dst1q + mstrideq + %1
> > -%define P5 dst1q + %1
> > -%define P4 dst1q + strideq + %1
> > -%define P3 dstq + 4*mstrideq + %1
> > -%define P2 dstq + mstride3q + %1
> > -%define P1 dstq + 2*mstrideq + %1
> > -%define P0 dstq + mstrideq + %1
> > -%define Q0 dstq + %1
> > -%define Q1 dstq + strideq + %1
> > -%define Q2 dstq + 2*strideq + %1
> > -%define Q3 dstq + stride3q + %1
> > -%define Q4 dstq + 4*strideq + %1
> > -%define Q5 dst2q + mstrideq + %1
> > -%define Q6 dst2q + %1
> > -%define Q7 dst2q + strideq + %1
> > +%define P7 dstq + 4*mstrideq + %1
> > +%define P6 dstq + mstride3q + %1
> > +%define P5 dstq + 2*mstrideq + %1
> > +%define P4 dstq + mstrideq + %1
> > +%define P3 dstq + %1
> > +%define P2 dstq + strideq + %1
> > +%define P1 dstq + 2* strideq + %1
> > +%define P0 dstq + stride3q + %1
> > +%define Q0 dstq + 4* strideq + %1
> > +%define Q1 dst2q + mstride3q + %1
> > +%define Q2 dst2q + 2*mstrideq + %1
> > +%define Q3 dst2q + mstrideq + %1
> > +%define Q4 dst2q + %1
> > +%define Q5 dst2q + strideq + %1
> > +%define Q6 dst2q + 2* strideq + %1
> > +%define Q7 dst2q + stride3q + %1
> > %endmacro
> >
> > ; ..............AB -> AAAAAAAABBBBBBBB
> > @@ -308,26 +308,26 @@ SECTION .text
> > %endmacro
> >
> > %macro LOOPFILTER 2 ; %1=v/h %2=size1
> > - lea mstrideq, [strideq]
> > - neg mstrideq
> > + mov mstrideq, strideq
> > + neg mstrideq
> >
> > - lea stride3q, [strideq+2*strideq]
> > - mov mstride3q, stride3q
> > - neg mstride3q
> > + lea stride3q, [strideq*3]
> > + lea mstride3q, [mstrideq*3]
> >
> > %ifidn %1, h
> > %if %2 > 16
> > %define movx movh
> > - lea dstq, [dstq + 8*strideq - 4]
> > + lea dstq, [dstq + 4*strideq - 4]
> > %else
> > %define movx movu
> > - lea dstq, [dstq + 8*strideq - 8] ; go from top center (h pos) to center left (v pos)
> > + lea dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
> > %endif
> > + lea dst2q, [dstq + 8*strideq]
> > +%else
> > + lea dstq, [dstq + 4*mstrideq]
> > + lea dst2q, [dstq + 8*strideq]
> > %endif
> >
> > - lea dst1q, [dstq + 2*mstride3q] ; dst1q = &dst[stride * -6]
> > - lea dst2q, [dstq + 2* stride3q] ; dst2q = &dst[stride * +6]
> > -
> > DEFINE_REAL_P7_TO_Q7
> >
> > %ifidn %1, h
> > @@ -795,9 +795,9 @@ SECTION .text
> >
> > %macro LPF_16_VH 2
> > INIT_XMM %2
> > -cglobal vp9_loop_filter_v_%1_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
> > +cglobal vp9_loop_filter_v_%1_16, 5,10,16, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
>
> Since you drop one register, it should probably become 5,9,16
>
Those are being updated in the second patch in a more complete manner.
- Hendrik
More information about the ffmpeg-devel mailing list