[FFmpeg-devel] [PATCH] mdct15: add inverse transform postrotation SIMD
Rostislav Pehlivanov
atomnuker at gmail.com
Sun Jul 30 00:55:55 EEST 2017
On 29 July 2017 at 22:37, James Almer <jamrial at gmail.com> wrote:
> On 7/29/2017 5:38 PM, Rostislav Pehlivanov wrote:
> > Speeds up decoding by 8% in total.
> >
> > 20ms frames:
> > Before: 17774 decicycles in postrotate, 262065 runs, 79 skips
> > After: 7169 decicycles in postrotate, 262104 runs, 40 skips
> >
> > 10ms frames:
> > Before: 9058 decicycles in postrotate, 524209 runs, 79 skips
> > After: 3915 decicycles in postrotate, 524236 runs, 52 skips
> >
> > 5ms frames:
> > Before: 4764 decicycles in postrotate, 1048466 runs, 110 skips
> > After: 2161 decicycles in postrotate, 1048515 runs, 61 skips
> >
> > 2.5ms frames:
> > Before: 2608 decicycles in postrotate, 2097030 runs, 122 skips
> > After: 1377 decicycles in postrotate, 2097097 runs, 55 skips
> >
> > Needs to overwrite the start of some buffers as well as the
> > end of them, hence the OVERALLOC stuff.
> >
> > Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>
> > ---
> >  libavcodec/mdct15.c          | 75 +++++++++++++++++++++++++++++++-------------
> >  libavcodec/mdct15.h          |  7 +++--
> >  libavcodec/x86/mdct15.asm    | 72 ++++++++++++++++++++++++++++++++++++++++++++++
> >  libavcodec/x86/mdct15_init.c |  4 +++
> >  4 files changed, 134 insertions(+), 24 deletions(-)
> >
> > diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c
> > index d68372c344..0a6c0069db 100644
> > --- a/libavcodec/mdct15.c
> > +++ b/libavcodec/mdct15.c
> > @@ -40,6 +40,29 @@
> >
> > #define CMUL3(c, a, b) CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
> >
> > +#define OVERALLOC_AMOUNT 32 /* 1 ymm register */
>
> Use AV_INPUT_BUFFER_PADDING_SIZE
>
> > +
> > +#define OVERALLOC(val, size)                                              \
> > +    {                                                                     \
> > +        (val) = NULL;                                                     \
> > +        uint8_t *temp = av_malloc((size) + 2*OVERALLOC_AMOUNT);           \
>
> Why two times?
>
>
Start and end.
> > +        if (temp) {                                                       \
> > +            memset(temp, 0, OVERALLOC_AMOUNT);                            \
> > +            memset(temp + (size) + OVERALLOC_AMOUNT, 0, OVERALLOC_AMOUNT); \
> > +            (val) = (void *)(temp + OVERALLOC_AMOUNT);                    \
>
> Can't you just keep the zero padding at the end like in every other
> padded buffer used by SIMD functions?
> This seems pretty unconventional, especially with the custom free code
> below.
>
>
No, the function starts in the middle and reads/writes towards both the
start and the end on every loop iteration.
The number of iterations is not a multiple of 8, so I have to pad both the
start and the end.
> Also, you could just use av_mallocz*.
>
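For illustration, a minimal sketch of the macro with both suggestions folded
in, i.e. av_mallocz() plus avcodec.h's AV_INPUT_BUFFER_PADDING_SIZE
(untested, not part of this patch):

/* Zero-initialized allocation with one padding block before and after the
 * usable region, so SIMD loads/stores may run past either end of it. */
#define OVERALLOC(val, size)                                                 \
    {                                                                        \
        uint8_t *temp = av_mallocz((size) + 2*AV_INPUT_BUFFER_PADDING_SIZE); \
        (val) = temp ? (void *)(temp + AV_INPUT_BUFFER_PADDING_SIZE) : NULL; \
    }

The zero-filling of both pads then comes for free, and on allocation failure
(val) is still left NULL.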
> > +        }                                                                 \
> > +    }
> > +
> > +#define OVERFREEP(val)                                                    \
> > +    {                                                                     \
> > +        uint8_t *temp = (uint8_t *)(val);                                 \
> > +        if (temp) {                                                       \
> > +            temp -= OVERALLOC_AMOUNT;                                     \
> > +            av_free(temp);                                                \
> > +        }                                                                 \
> > +        val = NULL;                                                       \
> > +    }
> > +
> > av_cold void ff_mdct15_uninit(MDCT15Context **ps)
> > {
> > MDCT15Context *s = *ps;
> > @@ -50,9 +73,9 @@ av_cold void ff_mdct15_uninit(MDCT15Context **ps)
> > ff_fft_end(&s->ptwo_fft);
> >
> > av_freep(&s->pfa_prereindex);
> > - av_freep(&s->pfa_postreindex);
> > - av_freep(&s->twiddle_exptab);
> > - av_freep(&s->tmp);
> > + OVERFREEP(s->pfa_postreindex);
> > + OVERFREEP(s->twiddle_exptab);
> > + OVERFREEP(s->tmp);
> >
> > av_freep(ps);
> > }
> > @@ -69,7 +92,7 @@ static inline int init_pfa_reindex_tabs(MDCT15Context *s)
> > if (!s->pfa_prereindex)
> > return 1;
> >
> > -    s->pfa_postreindex = av_malloc(15 * l_ptwo * sizeof(*s->pfa_postreindex));
> > +    OVERALLOC(s->pfa_postreindex, 15 * l_ptwo * sizeof(*s->pfa_postreindex));
>
> Not strictly related to this patch, but this should probably be using
> av_malloc_array().
>
>
Fixed in the overalloc macro.
> > if (!s->pfa_postreindex)
> > return 1;
> >
> > @@ -203,6 +226,21 @@ static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t str
> > }
> > }
> >
> > +static void postrotate_c(FFTComplex *out, FFTComplex *in, FFTComplex *exp,
> > +                         uint32_t *lut, int64_t len8)
> > +{
> > +    int i;
> > +
> > +    /* Reindex again, apply twiddles and output */
> > +    for (i = 0; i < len8; i++) {
> > +        const int i0 = len8 + i, i1 = len8 - i - 1;
> > +        const int s0 = lut[i0], s1 = lut[i1];
> > +
> > +        CMUL(out[i1].re, out[i0].im, in[s1].im, in[s1].re, exp[i1].im, exp[i1].re);
> > +        CMUL(out[i0].re, out[i1].im, in[s0].im, in[s0].re, exp[i0].im, exp[i0].re);
> > +    }
> > +}
> > +
> > static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
> >                          ptrdiff_t stride)
> > {
> > @@ -226,15 +264,7 @@ static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
> > s->ptwo_fft.fft_calc(&s->ptwo_fft, s->tmp + l_ptwo*i);
> >
> > /* Reindex again, apply twiddles and output */
> > -    for (i = 0; i < len8; i++) {
> > -        const int i0 = len8 + i, i1 = len8 - i - 1;
> > -        const int s0 = s->pfa_postreindex[i0], s1 = s->pfa_postreindex[i1];
> > -
> > -        CMUL(z[i1].re, z[i0].im, s->tmp[s1].im, s->tmp[s1].re,
> > -             s->twiddle_exptab[i1].im, s->twiddle_exptab[i1].re);
> > -        CMUL(z[i0].re, z[i1].im, s->tmp[s0].im, s->tmp[s0].re,
> > -             s->twiddle_exptab[i0].im, s->twiddle_exptab[i0].re);
> > -    }
> > +    s->postreindex(z, s->tmp, s->twiddle_exptab, s->pfa_postreindex, len8);
> > }
> >
> > av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
> > @@ -253,13 +283,14 @@ av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
> > if (!s)
> > return AVERROR(ENOMEM);
> >
> > -    s->fft_n      = N - 1;
> > -    s->len4       = len2 / 2;
> > -    s->len2       = len2;
> > -    s->inverse    = inverse;
> > -    s->fft15      = fft15_c;
> > -    s->mdct       = mdct15;
> > -    s->imdct_half = imdct15_half;
> > +    s->fft_n       = N - 1;
> > +    s->len4        = len2 / 2;
> > +    s->len2        = len2;
> > +    s->inverse     = inverse;
> > +    s->fft15       = fft15_c;
> > +    s->mdct        = mdct15;
> > +    s->imdct_half  = imdct15_half;
> > +    s->postreindex = postrotate_c;
> >
> > if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
> > goto fail;
> > @@ -267,11 +298,11 @@ av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
> > if (init_pfa_reindex_tabs(s))
> > goto fail;
> >
> > - s->tmp = av_malloc_array(len, 2 * sizeof(*s->tmp));
> > + OVERALLOC(s->tmp, len * 2 * sizeof(*s->tmp));
> > if (!s->tmp)
> > goto fail;
> >
> > -    s->twiddle_exptab = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
> > + OVERALLOC(s->twiddle_exptab, s->len4 * sizeof(*s->twiddle_exptab));
> > if (!s->twiddle_exptab)
> > goto fail;
> >
> > diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h
> > index 1c2149d436..301b238ec8 100644
> > --- a/libavcodec/mdct15.h
> > +++ b/libavcodec/mdct15.h
> > @@ -30,8 +30,8 @@ typedef struct MDCT15Context {
> > int len2;
> > int len4;
> > int inverse;
> > - int *pfa_prereindex;
> > - int *pfa_postreindex;
> > + uint32_t *pfa_prereindex;
> > + uint32_t *pfa_postreindex;
>
> What's the point changing these? It has no effect on hand written asm
> and the C version still uses const int when loading single values from
> them.
>
>
Wasn't sure if we require ints to be strictly 32 bits. Fixed.
> >
> > FFTContext ptwo_fft;
> > FFTComplex *tmp;
> > @@ -42,6 +42,9 @@ typedef struct MDCT15Context {
> > /* 15-point FFT */
> > void (*fft15)(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
> >
> > + /* PFA postrotate and exptab */
> > + void (*postreindex)(FFTComplex *out, FFTComplex *in, FFTComplex *exp, uint32_t *lut, int64_t len8);
>
> ptrdiff_t, not int64_t.
>
I thought it should be int64_t since it's not a stride; fixed.
>
> > +
> > /* Calculate a full 2N -> N MDCT */
> > void (*mdct)(struct MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride);
> >
> > diff --git a/libavcodec/x86/mdct15.asm b/libavcodec/x86/mdct15.asm
> > index f8b895944d..c5ea56c2c3 100644
> > --- a/libavcodec/x86/mdct15.asm
> > +++ b/libavcodec/x86/mdct15.asm
> > @@ -28,6 +28,10 @@ SECTION_RODATA
> >
> > sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
> >
> > +sign_adjust_r: times 4 dd 0x80000000, 0x00000000
> > +perm_neg: dd 0x2, 0x5, 0x3, 0x4, 0x6, 0x1, 0x7, 0x0
> > +perm_pos: dd 0x0, 0x7, 0x1, 0x6, 0x4, 0x3, 0x5, 0x2
>
> These are index values, so IMO use decimal. It's more readable that way.
>
> Also, SECTION_RODATA needs to be 32 byte aligned now.
>
>
Fixed both.
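That is, roughly (sign_adjust_r stays in hex, being a sign-bit mask rather
than an index table):

SECTION_RODATA 32

sign_adjust_r: times 4 dd 0x80000000, 0x00000000
perm_neg:      dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos:      dd 0, 7, 1, 6, 4, 3, 5, 2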
> > +
> > SECTION .text
> >
> > %macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
> > @@ -138,4 +142,72 @@ cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
> >
> > RET
> >
> > +%macro LUT_LOAD_4D 4
> > +    mov         r7d, [lutq + %4q*4 +  0]
> > +    movsd        %2, [inq + r7q*8]
> > +    mov         r7d, [lutq + %4q*4 +  4]
> > +    movhps       %2, [inq + r7q*8]
> > +    mov         r7d, [lutq + %4q*4 +  8]
> > +    movsd        %3, [inq + r7q*8]
> > +    mov         r7d, [lutq + %4q*4 + 12]
> > +    movhps       %3, [inq + r7q*8]
> > +    vinsertf128  %1, %1, %3, 1
> > +%endmacro
> > +
> > +;***************************************************************************************************
> > +;void ff_mdct15_postreindex_avx2(FFTComplex *out, FFTComplex *in, FFTComplex *exp, uint32_t *lut, int64_t len8);
> > +;***************************************************************************************************
> > +INIT_YMM avx2
> > +cglobal mdct15_postreindex, 5, 8, 10, out, in, exp, lut, len8, offset_p, offset_n
>
> You're using seven gprs, not 8.
>
>
No, look at LUT_LOAD_4D: I'm using r7 there, so it is 8 GPRs.
> > +
> > +    mova m7, [perm_pos]
> > +    mova m8, [perm_neg]
> > +    mova m9, [sign_adjust_r]
> > +
> > +    mov offset_pq, len8q
> > +    lea offset_nq, [len8q - 4]
> > +
> > +    shl len8q, 1
> > +
> > +    movu m10, [outq - mmsize]  ; backup from start - mmsize to start
> > +    movu m11, [outq + len8q*8] ; backup from end to end + mmsize
>
> And you seem to be using 12 xmm/ymm regs, not 10.
>
Fixed.
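That is, the prototype line would become:

cglobal mdct15_postreindex, 5, 8, 12, out, in, exp, lut, len8, offset_p, offset_n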
>
> > +
> > +.loop:
> > +    movups m0, [expq + offset_pq*8]    ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
> > +    movups m1, [expq + offset_nq*8]    ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im
> > +
> > +    LUT_LOAD_4D m3, xm3, xm4, offset_p ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
> > +    LUT_LOAD_4D m4, xm4, xm5, offset_n ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im
> > +
> > +    mulps m5, m3, m0                   ; in[p].reim * exp[p].reim
> > +    mulps m6, m4, m1                   ; in[n].reim * exp[n].reim
> > +
> > +    xorps m5, m9                       ; in[p].re *= -1, in[p].im *= 1
> > +    xorps m6, m9                       ; in[n].re *= -1, in[n].im *= 1
> > +
> > +    shufps m3, m3, m3, q2301           ; in[p].imre
> > +    shufps m4, m4, m4, q2301           ; in[n].imre
> > +
> > +    mulps m3, m0                       ; in[p].imre * exp[p].reim
> > +    mulps m4, m1                       ; in[n].imre * exp[n].reim
> > +
> > +    haddps m5, m4                      ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im
> > +    haddps m3, m6                      ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
> > +
> > +    vpermps m5, m7, m5                 ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
> > +    vpermps m3, m8, m3                 ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
> > +
> > +    movups [outq + offset_pq*8], m5
> > +    movups [outq + offset_nq*8], m3
> > +
> > +    sub offset_nq, 4
> > +    add offset_pq, 4
> > +    cmp offset_pq, len8q
> > +    jl .loop
> > +
> > +    movu [outq - mmsize],  m10
> > +    movu [outq + len8q*8], m11
> > +
> > +    RET
> > +
> > %endif
> > diff --git a/libavcodec/x86/mdct15_init.c b/libavcodec/x86/mdct15_init.c
> > index ba3d94c2ec..60d47e71ce 100644
> > --- a/libavcodec/x86/mdct15_init.c
> > +++ b/libavcodec/x86/mdct15_init.c
> > @@ -25,6 +25,7 @@
> > #include "libavutil/x86/cpu.h"
> > #include "libavcodec/mdct15.h"
> >
> > +void ff_mdct15_postreindex_avx2(FFTComplex *out, FFTComplex *in, FFTComplex *exp, uint32_t *lut, int64_t len8);
> > void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
> >
> > static void perm_twiddles(MDCT15Context *s)
> > @@ -90,6 +91,9 @@ av_cold void ff_mdct15_init_x86(MDCT15Context *s)
> > adjust_twiddles = 1;
> > }
> >
> > + if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
> > + s->postreindex = ff_mdct15_postreindex_avx2;
> > +
>
> Why didn't you write an SSE3 version of this function? The baseline for
> any function should be SSE/SSE3 for float, SSE2/SSE4 for integer.
>
>
I or someone else could do it later. The whole thing started as a test to
see how slow vgather is, so I had to use AVX2; the gather turned out to be
slower, so I dropped it but kept the function as AVX2.
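For reference, a rough sketch of what the gather-based load looked like
(a hypothetical reconstruction, not the exact code that was benchmarked).
Each FFTComplex is an 8-byte re/im pair, so a single vgatherdpd can fetch
four of them from four dword LUT indices:

%macro LUT_GATHER_4D 2 ; %1 - dst (ymm), %2 - offset reg; m2/m3 are scratch
    movu       xm2, [lutq + %2q*4]     ; load 4 dword indices from the LUT
    vpcmpeqd    m3, m3, m3             ; all-ones write mask, consumed below
    vgatherdpd  %1, [inq + xm2*8], m3  ; gather 4 qword re/im pairs from in[]
%endmacro

Note that the gather zeroes its mask register on completion, so the mask has
to be rebuilt for every call.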
> > if (adjust_twiddles)
> > perm_twiddles(s);
> > }
> >
>