[FFmpeg-devel] [PATCH 5/5] avcodec/dca: add DTS Express (LBR) decoder

Fri Apr 29 07:23:48 CEST 2016

On 4/27/2016 2:22 PM, foo86 wrote:
> ---
>  Changelog            |    1 +
>  libavcodec/Makefile  |    2 +-
>  libavcodec/dca_lbr.c | 1858 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/dca_lbr.h |  130 ++++
>  libavcodec/dcadata.c |  460 +++++++++++++
>  libavcodec/dcadata.h |   46 ++
>  libavcodec/dcadec.c  |   22 +-
>  libavcodec/dcadec.h  |    5 +-
>  libavcodec/dcadsp.c  |   27 +
>  libavcodec/dcadsp.h  |    4 +
>  libavcodec/dcahuff.c |  245 ++++++-
>  libavcodec/dcahuff.h |   13 +
>  12 files changed, 2807 insertions(+), 6 deletions(-)
>  create mode 100644 libavcodec/dca_lbr.c
>  create mode 100644 libavcodec/dca_lbr.h

[...]

> +#define SW0     0.022810893
> +#define SW1     0.41799772
> +#define SW2     0.9084481
> +#define SW3     0.99973983
> +
> +#define C1      0.068974845
> +#define C2      0.34675997
> +#define C3      0.29396889
> +#define C4      0.19642374
> +
> +#define AL1     0.30865827
> +#define AL2     0.038060233

Make sure these are float constants (add an f suffix, e.g. 0.022810893f). As
written they are double literals, so gcc, clang and icc all convert the
time_samples values below to double before multiplying, making the following
function much slower than it should be.

> +
> +static void transform_channel(DCALbrDecoder *s, int ch, float *output)
> +{
> +    LOCAL_ALIGNED(32, float, values, [DCA_LBR_SUBBANDS    ], [4]);
> +    LOCAL_ALIGNED(32, float, result, [DCA_LBR_SUBBANDS * 2], [4]);

Use LOCAL_ALIGNED_32(float, ...) here instead of LOCAL_ALIGNED(32, float, ...).

> +    int i, sf, nsubbands = s->nsubbands, noutsubbands = 8 << s->freq_range;
> +
> +    // Clear inactive subbands
> +    if (nsubbands < noutsubbands)
> +        memset(values[nsubbands], 0, (noutsubbands - nsubbands) * sizeof(values[0]));
> +
> +    for (sf = 0; sf < DCA_LBR_TIME_SAMPLES / 4; sf++) {
> +        // Short window and 8 point forward MDCT

According to perf, a lot of CPU time is spent on this and the aliasing cancellation
code below, at least with a mono sample I found in the wild. Fixing the constants
above helps a lot, though.

It looks like it shouldn't be hard to write using SIMD, so maybe it would be a good
idea to move this part to dcadsp.

> +        for (i = 0; i < nsubbands; i++) {
> +            float *samples = &s->time_samples[ch][i][sf * 4];
> +
> +            float a = samples[-4] * SW0 - samples[-1] * SW3;
> +            float b = samples[-3] * SW1 - samples[-2] * SW2;
> +            float c = samples[ 2] * SW1 + samples[ 1] * SW2;
> +            float d = samples[ 3] * SW0 + samples[ 0] * SW3;
> +
> +            values[i][0] = C1 * b - C2 * c + C4 * a - C3 * d;
> +            values[i][1] = C1 * d - C2 * a - C4 * b - C3 * c;
> +            values[i][2] = C3 * b + C2 * d - C4 * c + C1 * a;
> +            values[i][3] = C3 * a - C2 * b + C4 * d - C1 * c;
> +        }
> +
> +        // Aliasing cancellation for high frequencies
> +        for (i = 12; i < nsubbands - 1; i++) {
> +            float a = values[i  ][3] * AL1;
> +            float b = values[i+1][0] * AL1;
> +            values[i  ][3] += b - a;
> +            values[i+1][0] -= b + a;
> +            a = values[i  ][2] * AL2;
> +            b = values[i+1][1] * AL2;
> +            values[i  ][2] += b - a;
> +            values[i+1][1] -= b + a;
> +        }
> +
> +        base_func_synth(s, ch, values[0], sf);
> +
> +        s->imdct.imdct_calc(&s->imdct, result[0], values[0]);
> +
> +        // Long window and overlap-add
> +        s->fdsp->vector_fmul_add(output, result[0], s->window,
> +                                 s->history[ch], noutsubbands * 4);
> +        s->fdsp->vector_fmul_reverse(s->history[ch], result[noutsubbands],
> +                                     s->window, noutsubbands * 4);
> +        output += noutsubbands * 4;
> +    }
> +
> +    // Update history for LPC and forward MDCT
> +    for (i = 0; i < nsubbands; i++) {
> +        float *samples = s->time_samples[ch][i] - DCA_LBR_TIME_HISTORY;
> +        memcpy(samples, samples + DCA_LBR_TIME_SAMPLES, DCA_LBR_TIME_HISTORY * sizeof(float));
> +    }
> +}