[FFmpeg-devel] [PATCH v3] aacenc: add SIMD optimizations for abs_pow34 and quantization

James Almer jamrial at gmail.com
Tue Oct 18 01:45:07 EEST 2016


On 10/17/2016 6:24 PM, Rostislav Pehlivanov wrote:
> diff --git a/libavcodec/aacenc_utils.h b/libavcodec/aacenc_utils.h
> index ff9188a..f5cf77d 100644
> --- a/libavcodec/aacenc_utils.h
> +++ b/libavcodec/aacenc_utils.h
> @@ -37,7 +37,7 @@
>  #define ROUND_TO_ZERO 0.1054f
>  #define C_QUANT 0.4054f
>  
> -static inline void abs_pow34_v(float *out, const float *in, const int size)
> +static inline void abs_pow34_v(float *out, const float *in, const int64_t size)

Why int64_t? There's no need for values that big...

>  {
>      int i;
>      for (i = 0; i < size; i++) {

...And it's certainly not correct, seeing this for loop here.

> diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm
> new file mode 100644
> index 0000000..ff4019f
> --- /dev/null
> +++ b/libavcodec/x86/aacencdsp.asm
> @@ -0,0 +1,87 @@
> +;******************************************************************************
> +;* SIMD optimized AAC encoder DSP functions
> +;*
> +;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +float_abs_mask:  times 4 dd 0x7fffffff
> +
> +SECTION .text
> +
> +;*******************************************************************
> +;void ff_abs_pow34_sse(float *out, const float *in, const int64_t size);
> +;*******************************************************************
> +INIT_XMM sse
> +cglobal abs_pow34, 3, 3, 3, out, in, size
> +    mova   m2, [float_abs_mask]
> +    shl    sizeq, 2
> +    add    inq, sizeq
> +    add    outq, sizeq
> +    neg    sizeq
> +.loop:
> +    mova   m0, [inq+sizeq]
> +    andps  m0, m2
> +    sqrtps m1, m0
> +    mulps  m0, m1
> +    sqrtps m0, m0
> +    mova   [outq+sizeq], m0
> +    add    sizeq, mmsize
> +    jl     .loop
> +    RET
> +
> +;*******************************************************************
> +;void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled,
> +;                                int size, int is_signed, int maxval, const float Q34,
> +;                                const float rounding)
> +;*******************************************************************
> +INIT_XMM sse2
> +cglobal aac_quantize_bands, 6, 6, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding

5, 5, 6

No need to load maxval into a gpr on x86_32 and win64 when cvtsi2ss
can also load it directly from memory.

> +%if UNIX64 == 0
> +    movss     m0, Q34m
> +    movss     m1, roundingm
> +%endif
> +    SPLATD    m0
> +    SPLATD    m1

shufps m0, m0, 0
shufps m1, m1, 0

On sse2, SPLATD will expand to pshufd, so better stay in float domain.

If you add an AVX/FMA3 version of this function, you could instead use
vbroadcastss in them, and save yourself the movss.

> +    cvtsi2ss  m3, maxvald
> +    SPLATD    m3

%if UNIX64
    cvtsi2ss  m3, maxvald
%else
    cvtsi2ss  m3, dword maxvalm
%endif
    shufps    m3, m3, 0

After you made the function 5, 5, 6.

> +    shl       is_signedd, 31
> +    movd      m4, is_signedd
> +    SPLATD    m4

Use shufps here since the instruction using this register in the loop
should be a float one.

> +    shl       sizeq,   2

Even if it doesn't come from stack on win64, better use "sized" anyway.

> +    add       inq,     sizeq
> +    add       outq,    sizeq
> +    add       scaledq, sizeq
> +    neg       sizeq
> +.loop:
> +    mova      m2, [scaledq+sizeq]
> +    mulps     m2, m0

mulps m2, m0, [scaledq+sizeq]

> +    addps     m2, m1

You could combine the mulps and addps into a single fmaddps if you add
an FMA3 version. Something like

movaps  m2, [scaledq+sizeq]
fmaddps m2, m2, m0, m1

The movaps is needed because, unlike FMA4's fmaddps which is
non-destructive, FMA3 needs dst to be the same as one of the three src
registers.

> +    minps     m2, m3
> +    mova      m5, [inq+sizeq]

movaps m5, [inq+sizeq]

> +    pand      m5, m4

andps m5, m4

> +    orps      m2, m5
> +    cvttps2dq m2, m2
> +    mova      [outq+sizeq], m2
> +    add       sizeq, mmsize
> +    jl       .loop
> +    RET

Can't you make these function also process eight floats per loop?
YMM versions would be nice, seeing how much these functions affect
overall encoding speed.



More information about the ffmpeg-devel mailing list