[FFmpeg-devel] [PATCH] ALS decoder

Thu Aug 20 02:40:31 CEST 2009

Thilo Borgmann <thilo.borgmann at googlemail.com> writes:

> Hi,
>
> the second part of my GSoC project is the decoder itself.
>
> Refers to 14496-3, subpart 11.

Some comments below.  This is not a thorough review.  Someone who
actually understands these algorithms will have to do that.

> +typedef struct {
> +    uint32_t     als_id;                 ///< ALS identifier
> +    uint32_t     samp_freq;              ///< sampling frequency in Hz
> +    uint32_t     samples;                ///< number of samples (per channel), 0xFFFFFFFF if unknown
> +    int          channels;               ///< number of channels
> +    int          file_type;              ///< not used, provided for debugging
> +    int          resolution;             ///< 000 = 8-bit; 001 = 16-bit; 010 = 24-bit; 011 = 32-bit
> +    int          floating;               ///< 1 = IEEE 32-bit floating-point, 0 = integer
> +    int          msb_first;              ///< original byte order of the input audio data
> +    int          frame_length;           ///< frame Length
> +    int          random_access;          ///< distance between RA frames (in frames, 0...255)
> +    enum RA_Flag ra_flag;                ///< indicates where the size of ra units is stored
> +    int          adapt_order;            ///< adaptive order: 1 = on, 0 = off
> +    int          coef_table;             ///< table index of Rice code parameters
> +    int          long_term_prediction;   ///< long term prediction (LTP): 1 = on, 0 = off
> +    int          max_order;              ///< maximum prediction order (0..1023)
> +    int          block_switching;        ///< number of block switching levels
> +    int          bgmc_mode;              ///< BGMC Mode: 1 = on, 0 = off (Rice coding only)
> +    int          sb_part;                ///< sub-block partition
> +    int          joint_stereo;           ///< joint Stereo: 1 = on, 0 = off
> +    int          mc_coding;              ///< extended inter-channel coding: 1 = on, 0 = off
> +    int          chan_config;            ///< indicates that a chan_config_info field is present
> +    int          chan_sort;              ///< channel rearrangement: 1 = on, 0 = off
> +    int          crc_enabled;            ///< indicates that the crc field is present
> +    int          rlslms;                 ///< use RLS-LMS predictor: 1 = on, 0 = off
> +    int          aux_data_enabled;       ///< indicates that auxiliary data is present
> +    int          chan_config_info;       ///< mapping of channels to loudspeaker locations
> +    int          *chan_pos;              ///< original channel positions
> +    uint32_t     header_size;            ///< header size of original audio file in bytes, provided for debugging
> +    uint32_t     trailer_size;           ///< Trailer size of original audio file in bytes, provided for debugging
> +    uint32_t     crc;                    ///< 32-bit CCITT-32 CRC checksum
> +} ALSSpecificConfig;

Please try not to make *every* line exceed 80 columns.  You could
easily shorten them here by reducing the amount of whitespace between
the type and name and before the comment.

> +typedef struct {
> +    AVCodecContext    *avctx;
> +    ALSSpecificConfig sconf;
> +    GetBitContext     gb;                ///< a bit reader context
> +    unsigned int      num_frames;        ///< number of frames to decode, 0 if unknown
> +    unsigned int      cur_frame_length;  ///< length of the current frame to decode
> +    unsigned int      last_frame_length; ///< length of the last frame to decode, 0 if unknown
> +    unsigned int      frame_id;          ///< the frame id / number of the current frame
> +    unsigned int      js_switch;         ///< if true, joint-stereo decoding is enforced
> +    unsigned int      num_blocks;        ///< number of blocks used in the current frame
> +    int64_t           *quant_cof;        ///< quantized parcor coefficients
> +    int64_t           *lpc_cof;          ///< coefficients of the direct form prediction filter
> +    int64_t           *prev_raw_samples; ///< contains unshifted raw samples from the previous block
> +    int64_t           **raw_samples;     ///< decoded raw samples for each channel
> +    int64_t           *raw_buffer;       ///< contains all decoded raw samples including carryover samples
> +} ALSDecContext;

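Ditto: nearly every line in this struct also exceeds 80 columns, and
trimming the whitespace in the same way would shorten them.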

> +/** Computes ceil(log2(x)) using av_log2.
> + */
> +static inline int ceil_log2(int x) {
> +    return x > 0 ? av_log2((x - 1) << 1) : 0;
> +}

Little functions like this are likely to be needed again some time.
They should be placed in a common location.

> +/** Reads an ALSSpecificConfig from a buffer into the output struct.
> + */
> +static av_cold int read_specific_config(ALSDecContext *ctx)
> +{
> +    GetBitContext gb;
> +    uint64_t ht_size;
> +    int i, config_offset;
> +    MPEG4AudioConfig m4ac;
> +    ALSSpecificConfig *sconf = &ctx->sconf;
> +    const uint8_t *buffer    = ctx->avctx->extradata;
> +    int buffer_size          = ctx->avctx->extradata_size;
> +
> +    init_get_bits(&gb, buffer, buffer_size * 8);
> +
> +    config_offset = ff_mpeg4audio_get_config(&m4ac, buffer, buffer_size);
> +
> +    if (config_offset < 0)
> +        return -1;
> +
> +    skip_bits_long(&gb, config_offset);
> +    buffer_size -= config_offset >> 3;
> +
> +    if (buffer_size < 22)
> +        return -1;
> +
> +    // read the fixed items
> +    sconf->als_id               = get_bits_long(&gb, 32);
> +    sconf->samp_freq            = m4ac.sample_rate;
> +    skip_bits_long(&gb, 32);

Maybe comment what is being skipped here and elsewhere.

> +    sconf->samples              = get_bits_long(&gb, 32);
> +    sconf->channels             = m4ac.channels;
> +    skip_bits(&gb, 16);
> +    sconf->file_type            = get_bits(&gb, 3);
> +    sconf->resolution           = get_bits(&gb, 3);
> +    sconf->floating             = get_bits1(&gb);
> +    sconf->msb_first            = get_bits1(&gb);
> +    sconf->frame_length         = get_bits(&gb, 16) + 1;
> +    sconf->random_access        = get_bits(&gb, 8);
> +    sconf->ra_flag              = get_bits(&gb, 2);
> +    sconf->adapt_order          = get_bits1(&gb);
> +    sconf->coef_table           = get_bits(&gb, 2);
> +    sconf->long_term_prediction = get_bits1(&gb);
> +    sconf->max_order            = get_bits(&gb, 10);
> +    sconf->block_switching      = get_bits(&gb, 2);
> +    sconf->bgmc_mode            = get_bits1(&gb);
> +    sconf->sb_part              = get_bits1(&gb);
> +    sconf->joint_stereo         = get_bits1(&gb);
> +    sconf->mc_coding            = get_bits1(&gb);
> +    sconf->chan_config          = get_bits1(&gb);
> +    sconf->chan_sort            = get_bits1(&gb);
> +    sconf->crc_enabled          = get_bits1(&gb);
> +    sconf->rlslms               = get_bits1(&gb);
> +    skip_bits(&gb, 5);                                      // skip 5 reserved bits

Please lose some of that whitespace.  Compliments on the alignment
though.  Diego will love you.

> +    sconf->aux_data_enabled     = get_bits1(&gb);
> +    buffer_size -= 22;
> +
> +
> +    // check for ALSSpecificConfig struct
> +    if (sconf->als_id != MKBETAG('A','L','S','\0'))
> +        return -1;
> +
> +    ctx->cur_frame_length = sconf->frame_length;
> +
> +    // allocate quantized parcor coefficient buffer
> +    if (!(ctx->quant_cof = av_malloc(sizeof(int64_t) * sconf->max_order))) {
> +        av_log(ctx->avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    // allocate LPC coefficients
> +    if (!(ctx->lpc_cof = av_malloc(sizeof(int64_t) * sconf->max_order))) {
> +        av_log(ctx->avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    // calculate total number of frames to decode if possible
> +    if (sconf->samples != 0xFFFFFFFF) {
> +        ctx->num_frames        = ((sconf->samples - 1) / sconf->frame_length) + 1;
> +        ctx->last_frame_length = sconf->samples % ctx->sconf.frame_length;
> +        if (!ctx->last_frame_length) {
> +            ctx->last_frame_length = sconf->frame_length;
> +        }
> +    } else {
> +        ctx->num_frames        = 0;
> +        ctx->last_frame_length = 0;
> +    }
> +
> +
> +    // read channel config
> +    if (sconf->chan_config) {
> +        if (buffer_size < 2)
> +            return -1;
> +        sconf->chan_config_info = get_bits(&gb, 16);
> +        buffer_size -= 2;
> +        // TODO: use this to set avctx->channel_layout
> +    }
> +
> +
> +    // read channel sorting
> +    if (sconf->chan_sort && sconf->channels > 1) {
> +        int chan_pos_bits = ceil_log2(sconf->channels);
> +        int bytes_needed  = (sconf->channels * chan_pos_bits + 7) >> 3;
> +        if (buffer_size < bytes_needed)
> +            return -1;
> +
> +        if(!(sconf->chan_pos = av_malloc(sconf->channels * sizeof(int))))
> +            return -1;
> +
> +        for (i = 0; i < sconf->channels; i++) {
> +            sconf->chan_pos[i] = get_bits(&gb, chan_pos_bits);
> +        }

We usually omit {} for single-line for/if/while statements...

> +        align_get_bits(&gb);
> +        buffer_size -= bytes_needed;
> +        // TODO: use this to actually do channel sorting
> +    } else {
> +        sconf->chan_sort = 0;
> +    }

... except for one-line else clauses after a multiline if.

> +    // read fixed header and trailer sizes, if size = 0xFFFFFFFF then there is no data field!
> +    if (buffer_size < 8)
> +        return -1;
> +
> +    sconf->header_size  = get_bits_long(&gb, 32);
> +    sconf->trailer_size = get_bits_long(&gb, 32);
> +    if (sconf->header_size  == 0xFFFFFFFF)
> +        sconf->header_size  = 0;
> +    if (sconf->trailer_size == 0xFFFFFFFF)
> +        sconf->trailer_size = 0;
> +
> +    ht_size = sconf->header_size + sconf->trailer_size;
> +
> +    buffer_size -= 8;
> +
> +
> +    // skip the header and trailer data
> +    if (buffer_size < ht_size)
> +        return -1;
> +
> +    ht_size <<= 3;
> +
> +    while (ht_size > 0) {
> +        int len = FFMIN(ht_size, INT32_MAX);
> +        skip_bits_long(&gb, len);
> +        ht_size -= len;
> +    }
> +
> +    buffer_size -= ht_size >> 3;
> +
> +
> +    // read the crc data
> +    if (sconf->crc_enabled) {
> +        if (buffer_size < 4)
> +            return -1;
> +
> +        sconf->crc = get_bits_long(&gb, 32);
> +    }
> +
> +
> +    // no need to read the rest of ALSSpecificConfig (ra_unit_size & aux data)
> +#ifdef DEBUG
> +    dprint_specific_config(ctx);
> +#endif
> +    return 0;
> +}
> +
> +
> +/** Checks the ALSSpecificConfig for unsupported features.
> + */
> +static int check_specific_config(ALSDecContext *ctx)
> +{
> +    ALSSpecificConfig *sconf = &ctx->sconf;
> +    int error = 0;
> +
> +    if (sconf->floating) {
> +        av_log_missing_feature(ctx->avctx, "Floating point decoding", 0);
> +        error = -1;
> +    }
> +
> +    if (sconf->long_term_prediction) {
> +        av_log_missing_feature(ctx->avctx, "Long-term prediction", 0);
> +        error = -1;
> +    }
> +
> +    if (sconf->bgmc_mode) {
> +        av_log_missing_feature(ctx->avctx, "BGMC entropy decoding", 0);
> +        error = -1;
> +    }
> +
> +    if (sconf->mc_coding) {
> +        av_log_missing_feature(ctx->avctx, "Multi-channel correlation", 0);
> +        error = -1;
> +    }
> +
> +    if (sconf->chan_sort) {
> +        av_log_missing_feature(ctx->avctx, "Channel sorting", 0);
> +    }
> +
> +    if (sconf->rlslms) {
> +        av_log_missing_feature(ctx->avctx, "Adaptive RLS-LMS prediction", 0);
> +        error = -1;
> +    }
> +
> +    return error;
> +}
> +
> +
> +/** Parses the bs_info item to extract the block partitioning.
> + */
> +static void parse_bs_info(uint32_t bs_info, unsigned int n, unsigned int div,
> +                          unsigned int **div_blocks, unsigned int *num_blocks)
> +{
> +    if (n < 31 && ((bs_info >> (30 - n)) & 1)) {
> +        // if the level is valid and the investigated bit n is set
> +        // then recursively check both children at bits (2n+1) and (2n+2)
> +        n   *= 2;
> +        div += 1;
> +        parse_bs_info(bs_info, n + 1, div, div_blocks, num_blocks);
> +        parse_bs_info(bs_info, n + 2, div, div_blocks, num_blocks);
> +    } else {
> +        // else the bit is not set or the last level has been reached
> +        // (bit implicitly not set)
> +        **div_blocks = div;
> +        (*div_blocks)++;
> +        (*num_blocks)++;
> +    }
> +}
> +
> +
> +/** Reads and decodes a Rice codeword.
> + */
> +static int64_t decode_rice(GetBitContext *gb, unsigned int k)
> +{
> +    int64_t value = 0;
> +    int64_t q = 0;
> +    int     max = gb->size_in_bits - get_bits_count(gb) - k;
> +
> +    if (!k) {
> +        q = get_unary(gb, 0, max);
> +        return (q & 1) ? -((q + 1) >> 1) : ((q + 1) >> 1);
> +    } else if (k == 1) {
> +        q = get_unary(gb, 0, max);
> +        return get_bits1(gb) ? q : -(q + 1);
> +    } else {
> +        unsigned int r, sub_sign;
> +
> +        q         = get_unary(gb, 0, max);
> +        sub_sign  = get_bits1(gb);
> +        r         = get_bits_long(gb, k - 1);
> +
> +        value = (q << (k - 1)) + r;
> +
> +        return sub_sign ? value : -(value + 1);
> +    }
> +}

This function looks like it was designed specifically to make gcc fail:

1.  64-bit variables used unnecessarily.
2.  GCC hates complex conditionals.
3.  Compilers in general are bad at bit-hacks.

It seems to me that int is enough for 'q' in the first two cases
(k == 0 and k == 1).  get_unary() returns int, so q never needs more
bits than an int to begin with.  To avoid overflow in q+1 in the
k == 0 case, do this instead:

    int r = q & 1;
    return r ? -((q >> 1) + r) : ((q >> 1) + r);

Furthermore, gcc does stupid things with that code even with plain int
variables.  Rewriting in a few more steps helps massively:

    int r = q & 1;
    q = (q >> 1) + r;
    return r ? -q : q;

This reduces the number of instructions on ARM from 6 to 4 (from 4 to
3 with armcc).  On MIPS it goes from 9 with a branch to 5 branch-free.

Finally, -(v + 1) is equivalent to ~v on two's complement machines,
which we have previously agreed to assume we have.  Hence we should
write the return values in the second and third cases like this:

    return sign ? value : ~value;

This prevents 32-bit overflow in the k==1 case and gives better code
with several compilers in the final case, where we must resort to
64-bit maths.

> +/** Converts PARCOR coefficient k to direct filter coefficient.
> + */
> +static void parcor_to_lpc(unsigned int k, int64_t *par, int64_t *cof)
> +{
> +    int i;
> +    int64_t tmp1, tmp2;
> +
> +    for (i = 0; i < (k+1) >> 1; i++) {
> +        tmp1 = cof[    i    ] + ((par[k] * cof[k - i - 1] + (1 << 19)) >> 20);
> +        tmp2 = cof[k - i - 1] + ((par[k] * cof[    i    ] + (1 << 19)) >> 20);
> +        cof[k - i - 1] = tmp2;
> +        cof[    i    ] = tmp1;
> +    }
> +
> +    cof[k] = par[k];
> +}

This is again a perfect trap for gcc.  64-bit maths in tight loops is
something it is *really* bad at.  Is there no way to avoid 64-bit
elements here?  Is perhaps the range usually smaller, so we could
choose a 32-bit version most of the time and fall back on 64-bit only
when required?  Regardless of compiler, 64-bit multiplication is
expensive on any 32-bit machine.

> +/** Reformat block sizes from log2 format to direct form. Also assure that the
> + *  block sizes of the last frame correspond to the actual number of samples.
> + */
> +static void reconstruct_block_sizes(ALSDecContext *ctx, uint32_t *div_blocks)
> +{
> +    unsigned int b;
> +
> +    // The last frame may have an overdetermined block structure given in
> +    // the bitstream. In that case the defined block structure would need
> +    // more samples than available to be consistent.
> +    // The block structure is actually used but the block sizes are adapted
> +    // to fit the actual number of available samples.
> +    // Example: 5 samples, 2nd level block sizes: 2 2 2 2.
> +    // This results in the actual block sizes:    2 2 1 0.
> +    // This is not specified in 14496-3 but actually done by the reference
> +    // codec RM22 revision 2.
> +    // This appears to happen in case of an odd number of samples in the last
> +    // frame which is actually not allowed by the block length switching part
> +    // of 14496-3.
> +    // The ALS conformance files feature an odd number of samples in the last
> +    // frame.
> +    if (ctx->cur_frame_length == ctx->last_frame_length) {
> +        unsigned int remaining = ctx->cur_frame_length;
> +
> +        for (b = 0; b < ctx->num_blocks; b++) {
> +            div_blocks[b] = ctx->sconf.frame_length >> div_blocks[b];
> +
> +            if (remaining < div_blocks[b]) {
> +                div_blocks[b] = remaining;
> +                ctx->num_blocks = b + 1;
> +                break;
> +            } else {
> +                remaining -= div_blocks[b];
> +            }
> +        }
> +    } else {
> +        for (b = 0; b < ctx->num_blocks; b++)
> +            div_blocks[b] = ctx->sconf.frame_length >> div_blocks[b];
> +    }
> +}
> +
> +
> +/** Reads the block data.
> + */
> +static int read_block_data(ALSDecContext *ctx, unsigned int ra_block,
> +                            int64_t *raw_samples, unsigned int block_length,
> +                            unsigned int *js_blocks, int64_t *raw_other)
> +{
> +    ALSSpecificConfig *sconf = &ctx->sconf;
> +    AVCodecContext *avctx    = ctx->avctx;
> +    GetBitContext *gb        = &ctx->gb;
> +    unsigned int shift_lsbs  = 0;
> +    unsigned int block_type;
> +    unsigned int k;
> +
> +    block_type = get_bits1(gb);
> +
> +    if (block_type == 0) {
> +        unsigned int const_block;
> +        int32_t      const_val = 0;
> +
> +        const_block  = get_bits1(gb);    // 1 = constant value, 0 = zero block (silence)
> +        *js_blocks   = get_bits1(gb);
> +
> +        // skip 5 reserved bits
> +        skip_bits(gb, 5);
> +
> +        if (const_block) {
> +            unsigned int const_val_bits;
> +
> +            if (sconf->resolution == 2 || sconf->floating)
> +                const_val_bits = 24;
> +            else
> +                const_val_bits = avctx->bits_per_raw_sample;
> +
> +            const_val = get_sbits_long(gb, const_val_bits);
> +        }
> +
> +        // write raw samples into buffer
> +        for (k = 0; k < block_length; k++)
> +            raw_samples[k] = const_val;
> +    } else {
> +        unsigned int s[8];
> +        unsigned int sub_blocks, sb_length;
> +        unsigned int opt_order = 1;
> +        int64_t      *quant_cof = ctx->quant_cof;
> +        int64_t      *lpc_cof   = ctx->lpc_cof;
> +        unsigned int start = 0;
> +        int          sb, smp;
> +        int64_t      y;
> +
> +        *js_blocks  = get_bits1(gb);
> +
> +        // determine the number of sub blocks for entropy decoding
> +        if (!sconf->bgmc_mode && !sconf->sb_part)
> +            sub_blocks = 1;
> +        else if (sconf->bgmc_mode && sconf->sb_part)
> +            sub_blocks = 1 << get_bits(gb, 2);
> +        else
> +            sub_blocks = get_bits1(gb) ? 4 : 1;
> +
> +        // do not continue in case of a damaged stream since
> +        // block_length must be evenly divisible by sub_blocks
> +        if (block_length % sub_blocks) {
> +            av_log(avctx, AV_LOG_WARNING,
> +                   "Block length is not evenly divisible by the number of sub blocks.\n");
> +            return -1;
> +        }
> +
> +        sb_length = block_length / sub_blocks;
> +
> +
> +        if (!sconf->bgmc_mode) {
> +            s[0] = get_bits(gb, (sconf->resolution > 1) ? 5 : 4);
> +            for (k = 1; k < sub_blocks; k++)
> +                s[k] = s[k - 1] + decode_rice(gb, 0);
> +        } else {
> +            // TODO: BGMC mode
> +        }
> +
> +        if (get_bits1(gb)) {
> +            shift_lsbs = get_bits(gb, 4) + 1;
> +        }
> +
> +
> +        if (!sconf->rlslms) {
> +            int64_t quant_index;
> +
> +            if (sconf->adapt_order) {
> +                int opt_order_length =
> +                        FFMIN(
> +                        ceil_log2(sconf->max_order+1),
> +                        FFMAX(ceil_log2((block_length >> 3) - 1), 1)
> +                        );
> +                opt_order = get_bits(gb, opt_order_length);
> +            } else {
> +                opt_order = sconf->max_order;
> +            }
> +
> +            if (opt_order) {
> +                if (sconf->coef_table == 3) {
> +                    // read coefficient 0
> +                    quant_index = get_bits(gb, 7) - 64;
> +                    quant_cof[0] = parcor_scaled_values[quant_index + 64];
> +
> +                    // read coefficient 1
> +                    quant_index = get_bits(gb, 7) - 64;
> +                    quant_cof[1] = -parcor_scaled_values[quant_index + 64];
> +
> +                    // read coefficients 2 to opt_order
> +                    for (k = 2; k < opt_order; k++) {
> +                        quant_index = get_bits(gb, 7) - 64;
> +                        quant_cof[k] = (quant_index << 14) + (1 << 13);
> +                    }
> +                } else {
> +                    int offset, rice_param, k_max;
> +
> +                    // read coefficient 0
> +                    offset       = parcor_rice_table[sconf->coef_table][0][0];
> +                    rice_param   = parcor_rice_table[sconf->coef_table][0][1];
> +                    quant_index  = decode_rice(gb, rice_param) + offset;
> +                    quant_cof[0] = parcor_scaled_values[quant_index + 64];
> +
> +                    // read coefficient 1
> +                    offset       = parcor_rice_table[sconf->coef_table][1][0];
> +                    rice_param   = parcor_rice_table[sconf->coef_table][1][1];
> +                    quant_index  = decode_rice(gb, rice_param) + offset;
> +                    quant_cof[1] = -parcor_scaled_values[quant_index + 64];
> +
> +                    // read coefficients 2 to 19
> +                    k_max = FFMIN(20, opt_order);
> +                    for (k = 2; k < k_max; k++) {
> +                        offset       = parcor_rice_table[sconf->coef_table][k][0];
> +                        rice_param   = parcor_rice_table[sconf->coef_table][k][1];
> +                        quant_index  = decode_rice(gb, rice_param) + offset;
> +                        quant_cof[k] = (quant_index << 14) + (1 << 13);
> +                    }
> +
> +                    // read coefficients 20 to 126
> +                    k_max = FFMIN(127, opt_order);
> +                    for (k = 20; k < k_max; k++) {
> +                        offset       = k & 1;
> +                        rice_param   = 2;
> +                        quant_index  = decode_rice(gb, rice_param) + offset;
> +                        quant_cof[k] = (quant_index << 14) + (1 << 13);
> +                    }
> +
> +                    // read coefficients 127 to opt_order
> +                    for (k = 127; k < opt_order; k++) {
> +                        offset       = 0;
> +                        rice_param   = 1;
> +                        quant_index  = decode_rice(gb, rice_param) + offset;
> +                        quant_cof[k] = (quant_index << 14) + (1 << 13);
> +                    }
> +                }
> +            }
> +        }

The block nesting is very deep here.  Perhaps splitting this large
function into several smaller ones would help readability.  Even gcc
should manage to inline them.

> +        if (sconf->long_term_prediction) {
> +            // TODO: LTP mode
> +        }
> +
> +        start = 0;
> +
> +        // read first value and residuals in case of a random access block
> +        if (ra_block) {
> +            if (opt_order)
> +                raw_samples[0] = decode_rice(gb, avctx->bits_per_raw_sample - 4);
> +            if (opt_order > 1)
> +                raw_samples[1] = decode_rice(gb, s[0] + 3);
> +            if (opt_order > 2)
> +                raw_samples[2] = decode_rice(gb, s[0] + 1);
> +
> +            start = FFMIN(opt_order, 3);
> +        } else {
> +            for (k = 0; k < opt_order; k++)
> +                parcor_to_lpc(k, quant_cof, lpc_cof);
> +        }
> +
> +        // read all residuals
> +        // TODO: decode directly into ctx->raw_samples[] instead of storing the residuals
> +        if (sconf->bgmc_mode) {
> +            // TODO: BGMC mode
> +        } else {
> +            int64_t *current_res = raw_samples;
> +
> +            for (sb = 0; sb < sub_blocks; sb++) {
> +                for (k = start; k < sb_length; k++) {
> +                    current_res[k] = decode_rice(gb, s[sb]);
> +                }
> +                current_res += sb_length;
> +                start = 0;
> +            }
> +         }
> +
> +        // reconstruct all samples from residuals
> +        if (ra_block) {
> +            unsigned int progressive = FFMIN(block_length, opt_order);
> +
> +            for (smp = 0; smp < block_length; smp++) {
> +                unsigned int max, dequant;
> +
> +                dequant = smp < progressive;
> +                max     = dequant ? smp : progressive;
> +
> +                y = 1 << 19;
> +
> +                for (sb = 0; sb < max; sb++)
> +                    y += lpc_cof[sb] * raw_samples[smp - (sb + 1)];
> +
> +                raw_samples[smp] -= y >> 20;
> +                if (dequant)
> +                    parcor_to_lpc(smp, quant_cof, lpc_cof);
> +            }
> +        } else {
> +            int store_prev_samples = (*js_blocks && raw_other) || shift_lsbs;
> +
> +            // store previous smaples in case that they have to be altered
> +            if (store_prev_samples)
> +                memcpy(ctx->prev_raw_samples, raw_samples - sconf->max_order,
> +                       sizeof(int64_t) * sconf->max_order);
> +
> +            // reconstruct difference signal for prediction (joint-stereo)
> +            if (*js_blocks && raw_other) {
> +                int i;
> +                if (raw_other > raw_samples) {          // D = R - L
> +                    for (i = -1; i >= -sconf->max_order; i--)
> +                        raw_samples[i] = raw_other[i] - raw_samples[i];
> +                } else {                                // D = R - L

Are those two comments meant to be the same?

> +                    for (i = -1; i >= -sconf->max_order; i--)
> +                        raw_samples[i] = raw_samples[i] - raw_other[i];
> +                }
> +            }
> +
> +            // reconstruct shifted signal
> +            if (shift_lsbs) {
> +                for (smp = -1; smp >= -sconf->max_order; smp--)
> +                    raw_samples[smp] >>= shift_lsbs;
> +            }
> +
> +            // reconstruct raw samples
> +            for (smp = 0; smp < block_length; smp++) {
> +                y = 1 << 19;
> +
> +                for (sb = 0; sb < opt_order; sb++)
> +                    y += lpc_cof[sb] * raw_samples[smp - (sb + 1)];
> +
> +                raw_samples[smp] -= y >> 20;
> +            }
> +
> +            // restore previous samples in case that they have been altered
> +            if (store_prev_samples)
> +                memcpy(raw_samples - sconf->max_order, ctx->prev_raw_samples,
> +                       sizeof(int64_t) * sconf->max_order);
> +        }
> +    }
> +
> +    if (sconf->rlslms) {
> +        // TODO: read RLSLMS extension data
> +    }
> +
> +    if (!sconf->mc_coding || ctx->js_switch) {
> +        align_get_bits(gb);
> +    }
> +
> +    if (shift_lsbs) {
> +        for (k = 0; k < block_length; k++)
> +            raw_samples[k] <<= shift_lsbs;
> +    }
> +
> +    return 0;
> +}

Many of the loops above look like they could easily be vectorised
with SIMD.  I don't know how much time is spent in each, so I haven't
thought about it in detail.  Breaking the function apart would also
make it easier to profile and find where most of the time is spent.

> +
> +/** Reads the frame data.
> + */
> +static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
> +{
> +    ALSSpecificConfig *sconf = &ctx->sconf;
> +    GetBitContext *gb = &ctx->gb;
> +    unsigned int div_blocks[32];                ///< Block sizes.
> +    unsigned int c, b, ra_block;
> +    int64_t *raw_samples_L;
> +    int64_t *raw_samples_R;
> +    unsigned int js_blocks[2];
> +
> +    uint32_t bs_info = 0;
> +    unsigned int *ptr_div_blocks;
> +
> +    // skip ra_unit_size if present
> +    if (sconf->ra_flag == RA_FLAG_FRAMES && ra_frame)
> +        skip_bits_long(gb, 32);
> +
> +    if (sconf->mc_coding && sconf->joint_stereo) {
> +        ctx->js_switch = get_bits1(gb);
> +        align_get_bits(gb);
> +    }
> +
> +    if (!sconf->mc_coding || ctx->js_switch) {
> +        int independent_bs = !sconf->joint_stereo;
> +
> +        for (c = 0; c < sconf->channels; c++) {
> +            js_blocks[0] = 0;
> +            js_blocks[1] = 0;
> +
> +            if (sconf->block_switching) {
> +                unsigned int bs_info_len = 1 << (sconf->block_switching + 2);
> +                bs_info = get_bits_long(gb, bs_info_len);
> +                bs_info <<= (32 - bs_info_len);
> +            }
> +
> +            ctx->num_blocks = 0;
> +            ptr_div_blocks = &div_blocks[0];
> +            parse_bs_info(bs_info, 0, 0, &ptr_div_blocks, &ctx->num_blocks);
> +            reconstruct_block_sizes(ctx, div_blocks);
> +
> +            // if joint_stereo and block_switching is set, independent decoding
> +            // is signaled via the first bit of bs_info
> +            if(sconf->joint_stereo && sconf->block_switching) {
> +                if (bs_info >> 31)
> +                    independent_bs = 2;
> +            }
> +
> +            // if this is the last channel, it has to be decoded independently
> +            if (c == sconf->channels - 1)
> +                independent_bs = 1;
> +
> +            if (independent_bs) {
> +                raw_samples_L = ctx->raw_samples[c];
> +
> +                for (b = 0; b < ctx->num_blocks; b++) {
> +                    ra_block = !b && ra_frame;
> +                    if (read_block_data(ctx, ra_block, raw_samples_L,
> +                                        div_blocks[b], &js_blocks[0], NULL)) {
> +                        // damaged block, write zero for the rest of the frame
> +                        while (b < ctx->num_blocks) {
> +                            memset(raw_samples_L, 0, div_blocks[b]);
> +                            raw_samples_L += div_blocks[b];
> +                            b++;
> +                        }
> +                        return -1;
> +                    }
> +                    raw_samples_L += div_blocks[b];
> +                }
> +
> +                // store carryover raw samples
> +                memmove((ctx->raw_samples[c]) - sconf->max_order,
> +                        (ctx->raw_samples[c]) - sconf->max_order + sconf->frame_length,
> +                        sizeof(int64_t) * sconf->max_order);
> +
> +                if(independent_bs)
> +                    independent_bs--;
> +            } else {
> +                unsigned int offset = 0;
> +
> +                // decode all blocks
> +                for (b = 0; b < ctx->num_blocks; b++) {
> +                    unsigned int s;
> +                    raw_samples_L = ctx->raw_samples[c    ] + offset;
> +                    raw_samples_R = ctx->raw_samples[c + 1] + offset;
> +                    ra_block = !b && ra_frame;
> +                    if (read_block_data(ctx, ra_block, raw_samples_L, div_blocks[b],
> +                                        &js_blocks[0], raw_samples_R) ||
> +                        read_block_data(ctx, ra_block, raw_samples_R, div_blocks[b],
> +                                        &js_blocks[1], raw_samples_L)) {
> +                        // damaged block, write zero for the rest of the frame
> +                        while (b < ctx->num_blocks) {
> +                            memset(raw_samples_L, 0, div_blocks[b]);
> +                            memset(raw_samples_R, 0, div_blocks[b]);
> +                            raw_samples_L += div_blocks[b];
> +                            raw_samples_R += div_blocks[b];
> +                            b++;
> +                        }
> +                        return -1;
> +                    }

Again, very deep nesting makes code hard to read.  Consider making
some blocks separate functions.

> +                    // reconstruct joint-stereo blocks
> +                    if (js_blocks[0]) {
> +                        if (js_blocks[1])
> +                            av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel pair!\n");
> +
> +                        for (s = 0; s < div_blocks[b]; s++)
> +                            raw_samples_L[s] = raw_samples_R[s] - raw_samples_L[s];
> +                    } else if (js_blocks[1]) {
> +                        for (s = 0; s < div_blocks[b]; s++)
> +                            raw_samples_R[s] = raw_samples_R[s] + raw_samples_L[s];
> +                    }
> +
> +                    offset += div_blocks[b];
> +                }
> +
> +                // store carryover raw samples
> +                memmove((ctx->raw_samples[c]) - sconf->max_order,
> +                        (ctx->raw_samples[c]) - sconf->max_order + sconf->frame_length,
> +                        sizeof(int64_t) * sconf->max_order);
> +
> +                memmove((ctx->raw_samples[c + 1]) - sconf->max_order,
> +                        (ctx->raw_samples[c + 1]) - sconf->max_order + sconf->frame_length,
> +                        sizeof(int64_t) * sconf->max_order);
> +
> +                c++;
> +            }
> +        }
> +    } else { // multi-channel coding
> +        if (sconf->block_switching) {
> +            unsigned int bs_info_len = 1 << (sconf->block_switching + 2);
> +            bs_info = get_bits_long(gb, bs_info_len);
> +            bs_info <<= (32 - bs_info_len);
> +        }
> +
> +        ctx->num_blocks = 0;
> +        ptr_div_blocks = &div_blocks[0];
> +        parse_bs_info(bs_info, 0, 0, &ptr_div_blocks, &ctx->num_blocks);
> +        reconstruct_block_sizes(ctx, div_blocks);
> +
> +        // TODO: multi channel coding might use a temporary buffer instead as
> +        //       the actual channel is not known when read_block-data is called
> +        raw_samples_L = ctx->raw_samples[0];
> +
> +        for (b = 0; b < ctx->num_blocks; b++) {
> +            ra_block = !b && ra_frame;
> +            if (read_block_data(ctx, ra_block, raw_samples_L,
> +                                div_blocks[b], &js_blocks[0], NULL)) {
> +                // damaged block, write zero for the rest of the frame
> +                while (b < ctx->num_blocks) {
> +                    memset(raw_samples_L, 0, div_blocks[b]);
> +                    raw_samples_L += div_blocks[b];
> +                    b++;
> +                }
> +                return -1;
> +            }
> +            raw_samples_L += div_blocks[b];
> +            // TODO: read_channel_data
> +        }
> +    }
> +
> +    if (sconf->floating) {
> +        // TODO: read_diff_float_data
> +    }
> +
> +    return 0;
> +}

Maybe you already tried to fix it, but there's a scary number of
memcpy/memmove calls above.

> +/** Decodes an ALS frame.
> + */
> +static int decode_frame(AVCodecContext *avctx,
> +                        void *data, int *data_size,
> +                        AVPacket *avpkt)
> +{
> +    ALSDecContext *ctx       = avctx->priv_data;
> +    ALSSpecificConfig *sconf = &ctx->sconf;
> +    const uint8_t *buffer    = avpkt->data;
> +    int buffer_size          = avpkt->size;
> +    int invalid_frame        = 0;
> +    unsigned int c, sample, ra_frame, bytes_read, shift;
> +
> +    init_get_bits(&ctx->gb, buffer, buffer_size * 8);
> +    ra_frame = sconf->random_access && ((!ctx->frame_id) ||
> +               !(ctx->frame_id % sconf->random_access));
> +
> +    // the last frame to decode might have a different length
> +    if (ctx->num_frames && ctx->num_frames - 1 == ctx->frame_id) {
> +        ctx->cur_frame_length = ctx->last_frame_length;
> +    }
> +
> +    // decode the frame data
> +    if ((invalid_frame = read_frame_data(ctx, ra_frame))) {
> +        av_log(ctx->avctx, AV_LOG_WARNING,
> +               "Reading frame data failed. Skipping RA unit.\n");
> +    }
> +
> +    // increment the frame counter
> +    ctx->frame_id++;
> +
> +    // transform decoded frame into output format
> +    #define INTERLEAVE_OUTPUT(bps)                                                 \

That's an awful lot of whitespace.  Please cut it back so the lines
fit in 80 columns, which will be easily possible when you address my
next comment.

> +    {                                                                              \
> +        int##bps##_t *dest = (int##bps##_t*) data;                                 \
> +        shift = bps - ctx->avctx->bits_per_raw_sample;                             \
> +        for (sample = 0; sample < ctx->cur_frame_length; sample++) {               \
> +            for (c = 0; c < sconf->channels; c++) {                                \
> +                *(dest++) = (int##bps##_t) (ctx->raw_samples[c][sample] << shift); \

Useless parens: *dest++ is good.  Useless cast => more useless parens.

> +            }                                                                      \
> +        }                                                                          \
> +    }
> +
> +    if (ctx->avctx->bits_per_raw_sample <= 16) {
> +        INTERLEAVE_OUTPUT(16)
> +    } else {
> +        INTERLEAVE_OUTPUT(32)
> +    }
> +
> +    *data_size = ctx->cur_frame_length * sconf->channels
> +                 * (av_get_bits_per_sample_format(avctx->sample_fmt) >> 3);
> +
> +    bytes_read = invalid_frame ? buffer_size :
> +                                 (get_bits_count(&ctx->gb) + 7) >> 3;
> +
> +    return bytes_read;
> +}
> +
> +
> +/** Uninitializes the ALS decoder.
> + */
> +static av_cold int decode_end(AVCodecContext *avctx)
> +{
> +    ALSDecContext *ctx = avctx->priv_data;
> +
> +    av_freep(&ctx->sconf.chan_pos);
> +
> +    av_freep(&ctx->quant_cof);
> +    av_freep(&ctx->lpc_cof);
> +    av_freep(&ctx->prev_raw_samples);
> +    av_freep(&ctx->raw_samples);
> +    av_freep(&ctx->raw_buffer);
> +
> +    return 0;
> +}
> +
> +
> +/** Initializes the ALS decoder.
> + */
> +static av_cold int decode_init(AVCodecContext *avctx)
> +{
> +    unsigned int c;
> +    unsigned int channel_size;
> +    ALSDecContext *ctx = avctx->priv_data;
> +    ALSSpecificConfig *sconf = &ctx->sconf;
> +    ctx->avctx = avctx;
> +
> +    if (!avctx->extradata) {
> +        av_log(avctx, AV_LOG_ERROR, "Missing required ALS extradata.\n");
> +        return -1;
> +    }
> +
> +    if (read_specific_config(ctx)) {
> +        av_log(avctx, AV_LOG_ERROR, "Reading ALSSpecificConfig failed.\n");
> +        decode_end(avctx);
> +        return -1;
> +    }
> +
> +    if (check_specific_config(ctx)) {
> +        decode_end(avctx);
> +        return -1;
> +    }
> +
> +    avctx->sample_rate = sconf->samp_freq;
> +    avctx->channels    = sconf->channels;
> +
> +    if (sconf->floating) {
> +        avctx->sample_fmt          = SAMPLE_FMT_FLT;
> +        avctx->bits_per_raw_sample = 32;
> +    } else {
> +        avctx->sample_fmt          = sconf->resolution > 1
> +                                     ? SAMPLE_FMT_S32 : SAMPLE_FMT_S16;
> +        avctx->bits_per_raw_sample = (sconf->resolution + 1) * 8;
> +    }
> +
> +    avctx->frame_size = sconf->frame_length;
> +    channel_size      = sconf->frame_length + sconf->max_order;
> +
> +    // allocate previous raw sample buffer
> +    if (!(ctx->prev_raw_samples = av_malloc(sizeof(int64_t) * sconf->max_order))) {
> +        av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
> +        decode_end(avctx);
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    // allocate raw and carried sample buffer
> +    if (!(ctx->raw_buffer = av_mallocz(sizeof(int64_t) *
> +                                       avctx->channels * channel_size))) {
> +        av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
> +        decode_end(avctx);
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    // allocate raw sample array buffer
> +    if (!(ctx->raw_samples = av_malloc(sizeof(int64_t*) * avctx->channels))) {
> +        av_log(avctx, AV_LOG_ERROR, "Allocating buffer array failed.\n");
> +        decode_end(avctx);
> +        return AVERROR(ENOMEM);
> +    }

This looks very repetitive...

> +    // allocate raw and carried samples buffers
> +    ctx->raw_samples[0] = ctx->raw_buffer + sconf->max_order;
> +    for (c = 1; c < avctx->channels; c++) {
> +        ctx->raw_samples[c] = ctx->raw_samples[c - 1] + channel_size;
> +    }
> +
> +    return 0;
> +}
> +
> +
> +AVCodec als_decoder = {
> +    "als",
> +    CODEC_TYPE_AUDIO,
> +    CODEC_ID_MP4ALS,
> +    sizeof(ALSDecContext),
> +    decode_init,
> +    NULL,
> +    decode_end,
> +    decode_frame,
> +    .long_name = NULL_IF_CONFIG_SMALL("MPEG-4 Audio Lossless Coding (ALS)"),
> +};
> +
> Index: libavcodec/als_data.h
> ===================================================================
> --- libavcodec/als_data.h	(revision 0)
> +++ libavcodec/als_data.h	(revision 0)
> @@ -0,0 +1,147 @@
> +/*
> + * ALS header file for common data
> + * Copyright (c) 2009 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file libavcodec/als_data.h
> + * MPEG-4 ALS header file for common data
> + * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
> + */
> +
> +
> +#ifndef AVCODEC_ALS_DATA_H
> +#define AVCODEC_ALS_DATA_H
> +
> +
> +#include <stdint.h>
> +
> +/** Rice parameters and corresponding index offsets for decoding the
> + *  indices of scaled PARCOR values. The table chosen is set globally
> + *  by the encoder and stored in ALSSpecificConfig.
> + */
> +int8_t parcor_rice_table[3][20][2] = {
> +                        {
> +                        {-52, 4},

WTF happened to the indentation here?

> +
> +/** Scaled PARCOR values used for the first two PARCOR coefficients.
> + *  To be indexed by the Rice coded indices.
> + *  Generated by: parcor_scaled_values[i] = 32 + ((i * (i+1)) << 7) - (1 << 20)
> + */
> +int32_t parcor_scaled_values[] = {-1048544, -1048288, -1047776, -1047008,
> +                                  -1045984, -1044704, -1043168, -1041376,

And here.

> +#endif /* AVCODEC_ALS_DATA_H */
> Index: libavcodec/allcodecs.c
> ===================================================================
> --- libavcodec/allcodecs.c	(revision 19671)
> +++ libavcodec/allcodecs.c	(working copy)
> @@ -198,6 +198,7 @@
>      REGISTER_ENCDEC  (AAC, aac);
>      REGISTER_ENCDEC  (AC3, ac3);
>      REGISTER_ENCDEC  (ALAC, alac);
> +    REGISTER_DECODER (ALS, als);
>      REGISTER_DECODER (APE, ape);
>      REGISTER_DECODER (ATRAC3, atrac3);
>      REGISTER_DECODER (COOK, cook);
> Index: libavcodec/Makefile
> ===================================================================
> --- libavcodec/Makefile	(revision 19671)
> +++ libavcodec/Makefile	(working copy)
> @@ -42,6 +42,7 @@
>  OBJS-$(CONFIG_AC3_ENCODER)             += ac3enc.o ac3tab.o ac3.o
>  OBJS-$(CONFIG_ALAC_DECODER)            += alac.o
>  OBJS-$(CONFIG_ALAC_ENCODER)            += alacenc.o lpc.o
> +OBJS-$(CONFIG_ALS_DECODER)             += alsdec.o
>  OBJS-$(CONFIG_AMV_DECODER)             += sp5xdec.o mjpegdec.o mjpeg.o
>  OBJS-$(CONFIG_APE_DECODER)             += apedec.o
>  OBJS-$(CONFIG_ASV1_DECODER)            += asv1.o mpeg12data.o
> Index: Changelog
> ===================================================================
> --- Changelog	(revision 19671)
> +++ Changelog	(working copy)
> @@ -32,6 +32,7 @@
>  - RTMP support in libavformat
>  - noX handling for OPT_BOOL X options
>  - Wave64 demuxer
> +- MPEG-4 ALS decoder
>
> Index: doc/general.texi
> ===================================================================
> --- doc/general.texi	(revision 19671)
> +++ doc/general.texi	(working copy)
> @@ -564,6 +564,7 @@
>  @item MP2 (MPEG audio layer 2)  @tab IX  @tab IX
>  @item MP3 (MPEG audio layer 3)  @tab  E  @tab IX
>      @tab encoding supported through external library LAME, ADU MP3 and MP3onMP4 also supported
> +@item MPEG-4 Audio Lossless Coding (ALS)  @tab     @tab  X
>  @item Musepack SV7           @tab     @tab  X
>  @item Musepack SV8           @tab     @tab  X
>  @item Nellymoser Asao        @tab  X  @tab  X

-- 
Måns Rullgård
mans at mansr.com