[FFmpeg-devel] [PATCH] WMA Voice decoder

Sun Jan 31 02:06:57 CET 2010

Ronald S. Bultje wrote:
> Hi,
> 
> On Wed, Jan 20, 2010 at 4:56 PM, Ronald S. Bultje <rsbultje at gmail.com> wrote:
>> On Tue, Jan 19, 2010 at 7:38 PM, Ronald S. Bultje <rsbultje at gmail.com> wrote:
>>> my first decoder, please be kind. :-).
>> Updated reflecting Mans and Diego's comments. Thanks both!
> 
> V3, adapted to many comments all over.
> 
> This patch also uses ff_acelp_interpolatef() where appropriate

Thanks! We won't regret it the day people start SIMD-optimizing AMR :)

Second batch of comments:

> Index: ffmpeg-svn/libavcodec/Makefile
> ===================================================================
> --- ffmpeg-svn.orig/libavcodec/Makefile	2010-01-28 10:52:31.000000000 -0500
> +++ ffmpeg-svn/libavcodec/Makefile	2010-01-28 11:08:29.000000000 -0500
> @@ -338,6 +338,7 @@
>  OBJS-$(CONFIG_WMAV1_ENCODER)           += wmaenc.o wma.o
>  OBJS-$(CONFIG_WMAV2_DECODER)           += wmadec.o wma.o
>  OBJS-$(CONFIG_WMAV2_ENCODER)           += wmaenc.o wma.o
> +OBJS-$(CONFIG_WMAVOICE_DECODER)        += wmavoice.o

Lots of celp* acelp* lsp.o missing

> +/**
> + * Initialize decoder.
> + */
> +static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
> +{

Slightly redundant comment.

> +/**
> + * Apply first set of pitch-adaptive window pulses.
> + * @param s WMA Voice decoding context private data
> + * @param gb bit I/O context
> + * @param block_idx block index in frame [0, 1]
> + * @param fcb storage location for fixed codebook pulse info
> + */
> +static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
> +                          int block_idx, AMRFixed *fcb)
> +{
> +    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
> +    float v;
> +
> +    if (s->aw_n_pulses[block_idx]) {
> +        int n, v_mask, i_mask, sh, n_pulses;
> +
> +        if (s->aw_pitch_range == 24) { ///< 3 pulses, 1:sign + 3:index each
> +            n_pulses = 3;
> +            v_mask   = 8;
> +            i_mask   = 7;
> +            sh       = 4;
> +        } else { ///< 4 pulses, 1:sign + 2:index each
> +            n_pulses = 4;
> +            v_mask   = 4;
> +            i_mask   = 3;
> +            sh       = 3;
> +        }
> +
> +        for (n = 1; n <= n_pulses; n++, val >>= sh) {
> +            fcb->y[fcb->n] =  (val & v_mask) ? -1.0 : 1.0;
> +            fcb->x[fcb->n] = ((val & i_mask) + 1) * n_pulses - n +
> +                                  s->aw_first_pulse_off[block_idx];
> +            while (fcb->x[fcb->n] < 0)
> +                fcb->x[fcb->n] += fcb->pitch_lag;
> +            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2) fcb->n++;
> +        }
> +    } else {
> +        int num2 = (val & 0x1FF) >> 1, delta, idx;
> +
> +        if (num2 < 79)       delta = 1;
> +        else if (num2 < 156) delta = 2;
> +        else if (num2 < 231) delta = 3;
> +        else                 delta = 4;
> +        idx    = (delta * delta + num2) % 80;

         if      (num2 < 1*79) delta = 1; idx = num2 + 1;
         else if (num2 < 2*78) delta = 2; idx = num2 + 1 - 1*77;
         else if (num2 < 3*77) delta = 3; idx = num2 + 1 - 2*76;
         else                  delta = 4; idx = num2 + 1 - 3*75;

> +/**
> + * Parse FCB/ACB signal for a single block.
> + * @note see #synth_block().
> + */
> +static void synth_block_fcb_acb(AVCodecContext *ctx, GetBitContext *gb,
> +                                int block_idx, int size,
> +                                int block_pitch_sh2,
> +                                const struct frame_type_desc *frame_desc,
> +                                float *excitation)
> +{
> +    static const float gain_coeff[6] = {
> +        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
> +    };
> +    WMAVoiceContext *s = ctx->priv_data;
> +    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
> +    int n, idx;
> +    AMRFixed fcb;
> +
> +    assert(size <= MAX_FRAMESIZE / 2);
> +    memset(pulses, 0, sizeof(float) * size);
> +
> +    fcb.pitch_lag      = block_pitch_sh2 >> 2;
> +    fcb.pitch_fac      = 1.0;
> +    fcb.no_repeat_mask = 0;
> +    fcb.n              = 0;
> +
> +    /**
> +     * For the other frame types, this is where we apply the innovation
> +     * (fixed) codebook pulses of the speech signal.
> +     */
> +    if (frame_desc->fcb_type == 2) { ///< pitch-adapting window (AW) codebook
> +        aw_pulse_set1(s, gb, block_idx, &fcb);
> +        aw_pulse_set2(s, gb, block_idx, &fcb);
> +    } else { ///< innovation codebook
> +        int sh = 7 - frame_desc->fcb_type; ///< 3:4/4:3/5:2
> +
> +        fcb.no_repeat_mask = -1;
> +        for (n = 0; n < 5; n++) {
> +            float pulse = get_bits1(gb) ? 1.0 : -1.0;
> +            int idx1, idx2;
> +
> +            idx1           = get_bits(gb, sh);
> +            fcb.x[fcb.n]   = n + 5 * idx1;
> +            fcb.y[fcb.n++] = pulse;
> +            if (n < frame_desc->dbl_pulses) {
> +                idx2           = get_bits(gb, sh);
> +                fcb.x[fcb.n]   = n + 5 * idx2;
> +                fcb.y[fcb.n++] = (idx1 >= idx2) ? pulse : -pulse;
> +            }
> +        }
> +    }
> +    ff_set_fixed_vector(pulses, &fcb, 1.0, size);
> +
> +    /**
> +     * Calculate gain for adaptive & fixed codebook signal.
> +     * @note see #ff_amr_set_fixed_gain().
> +     */
> +    idx = get_bits(gb, 7);
> +    fcb_gain = ff_wmavoice_gain_codebook_fcb[idx] *
> +               expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
> +                    5.2409161640);
> +    acb_gain = ff_wmavoice_gain_codebook_acb[idx];
> +    pred_err = logf(av_clipf(ff_wmavoice_gain_codebook_fcb[idx], 0.05, 5.0));
> +

You can avoid the logf() if you replace the table by its logarithm and

     fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
                     5.2409161640 + ff_wmavoice_gain_codebook_fcb[idx]);
     pred_err = av_clipf(ff_wmavoice_gain_codebook_fcb[idx], log(0.05), 
log(5.0)));

> +    if (frame_desc->acb_type == 1) {
> +        for (n = 0; n < size; n++) {
> +            int pitch_sh8 = (s->last_pitch_val << 8) +
> +                ((s->pitch_diff_sh16 * (block_idx * size + n)) >> 8);
> +            int pitch = (pitch_sh8 + 0x6F) >> 8;
> +            idx = (((pitch << 8) - pitch_sh8) * 8 + 0x80) >> 8;
> +            //FIXME don't call this with len=1
> +            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
> +                                  ff_wmavoice_ipol1_coeffs, 17,
> +                                  (idx >= 0) | (abs(idx) << 1), 9, 1);

I think you can at least call this with len=s->pitch_diff_sh16 or 
something like that (since pitch_sh8 seems to change only after a 
certain fixed number of iterations). Also it will not stay in the way 
for SIMD optimizations.

> +static const uint8_t ff_wmavoice_dq_lsp10i[0xf00] = {
> +    125, 109,  84,  55,  34,  51, 109, 112, 118, 132,
> +    122, 102,  78,  80, 132, 119, 132, 132, 125, 131,

I think static tables should not have a "ff_" prefix.

Also, please also look at the doxygen warnings, there are quite a few 
that are really relevant.

-Vitor