[FFmpeg-devel] [RFC] Generic psychoacoustic model interface

Thu Aug 28 19:10:26 CEST 2008

On Wed, Aug 27, 2008 at 04:33:17PM +0200, Michael Niedermayer wrote:
> On Wed, Aug 27, 2008 at 11:35:20AM +0300, Kostya wrote:
> > Here's my first attempt to define a codec-agnostic psy model.
> > Here's an interface for it. I'm not sure about AC3, but
> > it should be possible to use it with DCA, Vorbis,
> > MPEG Audio Layers I-III and NBC, maybe WMA too.
> > In case somebody codes an implementation, of course.
> > Personally I plan to make my encoder use it backed with
> > already implemented 3GPP model.
> 
> [...]
> > /**
> >  * windowing related information
> >  */
> > typedef struct FFWindowInfo{
> >     int window_type[2];               ///< window type (short/long/transitional, etc.) - current and previous
> >     int window_shape;                 ///< window shape (sine/KBD/whatever)
> 
> >     void *additional_info;            ///< codec-dependent window information
> 
> passing opaque data from psy to encoder is not clean, it requires
> both to maintain a "hidden" compatible API

Of course, unless we can decide on what will be needed for all encoders. 

> > }FFWindowInfo;
> > 
> > /**
> >  * context used by psychoacoustic model
> >  */
> > typedef struct FFPsyContext{
> >     AVCodecContext *avctx;            ///< encoder context
> > 
> >     FFPsyBand bands[MAX_BANDS];       ///< frame bands information
> >     FFWindowInfo *win_info;           ///< frame window info
> > 
> 
> >     const uint8_t *long_bands;        ///< scalefactor band sizes for long frame
> >     int num_long_bands;               ///< number of scalefactor bands for long frame
> >     const uint8_t *short_bands;       ///< scalefactor band sizes for short frame
> >     int num_short_bands;              ///< number of scalefactor bands for short frame
> 
> Having only 2 band lists would be a problem for any codec that has more
> than 2 window lengths (like wma)
> 
> 
> [...]
> > /**
> >  * Suggest window sequence for channel.
> >  *
> >  * @param ctx       model context
> >  * @param audio     samples for the current frame
> >  * @param la        lookahead samples (NULL when unavailable)
> >  * @param channel   number of channel element to analyze
> >  * @param prev_type previous window type
> >  *
> >  * @return suggested window information in a structure
> >  */
> > FFWindowInfo* ff_psy_suggest_window(AACPsyContext *ctx, int16_t *audio, int16_t *la,
> >                                     int channel, int prev_type);
> 
> ...get/find/calculate_suggested...
> audio&la should be const
> 
> and maybe the return should be FFWindowInfo instead of FFWindowInfo* to 
> avoid memleak issues ...
> 
> 
> > 
> > /**
> >  * Perform psychoacoustic analysis and set band info.
> >  *
> >  * @param ctx   model context
> >  * @param tag   number of channel element to analyze
> >  * @param type  channel element type (e.g. ID_SCE or ID_CPE)
> >  * @param cpe   pointer to the current channel element
> >  */
> > void ff_psy_analyze(AACPsyContext *ctx, int tag, int type, ChannelElement *cpe);
> 
> ChannelElement is AAC specific

Those are leftovers from the AAC-specific code.

> [...]
> > /**
> >  * Preprocess several channel in audio frame in order to compress it better.
> >  *
> >  * @param ctx      preprocessing context
> >  * @param audio    samples to preprocess
> >  * @param dest     place to put filtered samples
> >  * @param tag      number of channel group
> >  * @param channels number of channel to preprocess (some additional work may be done on stereo pair)
> >  */
> > void ff_aac_psy_preprocess(struct FFPsyPreprocessContext *ctx, int16_t *audio, int16_t *dest, int tag, int channels);
> 
> audio is missing a const
> 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> There will always be a question for which you do not know the correct answer.
-------------- next part --------------
/*
 * audio encoder psychoacoustic model
 * Copyright (C) 2008 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef FFMPEG_AACPSY_H
#define FFMPEG_AACPSY_H

#include "avcodec.h"

/**
 * Maximum possible number of bands.
 * Used as the element count of the FFPsyBand array embedded in FFPsyContext.
 */
#define MAX_BANDS 128

/**
 * Psychoacoustic model recommendation on M/S coding of two channels.
 */
enum FFPsyMSDecision {
    FF_PSY_MS_SEPARATE  = 0, ///< code the two channels independently
    FF_PSY_MS_JOINT     = 1, ///< code the channels as an average and a difference
    FF_PSY_MS_UNDECIDED = 2, ///< no recommendation, the encoder decides
};

/**
 * Psychoacoustic measurements kept for one band.
 */
typedef struct FFPsyBand {
    float energy;             ///< band energy
    float threshold;          ///< band threshold (presumably the masking threshold - TODO confirm with the model)
    float perceptual_entropy; ///< perceptual entropy of the band
} FFPsyBand;

/**
 * Window selection data for one frame.
 */
typedef struct FFWindowInfo {
    /** window type (short/long/transitional, etc.) - current and previous */
    int window_type[2];
    /** window shape (sine/KBD/whatever) */
    int window_shape;
    /**
     * codec-dependent window information; its meaning should be
     * consistent between encoder and psy model
     */
    void *additional_info;
} FFWindowInfo;

/**
 * context used by psychoacoustic model
 */
typedef struct FFPsyContext{
    AVCodecContext *avctx;            ///< encoder context

    /* Named psy_bands so it does not collide with the scalefactor band
     * tables below: two members of one struct cannot share a name. */
    FFPsyBand psy_bands[MAX_BANDS];   ///< frame bands information
    FFWindowInfo *win_info;           ///< frame window info

    /* Same pointer types as the bands/num_bands arguments of ff_psy_init(). */
    const uint8_t **bands;            ///< scalefactor band sizes for possible frame sizes
    const int     *num_bands;         ///< number of scalefactor bands for possible frame sizes

    /* NOTE(review): these two look redundant now that bands/num_bands cover
     * every frame size; kept so existing users keep compiling - confirm and drop. */
    const uint8_t *short_bands;       ///< scalefactor band sizes for short frame
    int num_short_bands;              ///< number of scalefactor bands for short frame

    void* model_priv_data;            ///< psychoacoustic model implementation private data
}FFPsyContext;

/**
 * Initialize psychoacoustic model.
 *
 * @param ctx        model context to initialize
 * @param avctx      codec context of the encoder using the model
 * @param bands      scalefactor band lengths for all frame lengths
 *                   (array of tables; presumably one table per frame
 *                   length - TODO confirm indexing with the implementation)
 * @param num_bands  number of scalefactor bands for all frame lengths
 *                   (presumably one entry per table in bands - TODO confirm)
 *
 * @return zero if successful, a negative value if not
 */
int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx,
                const uint8_t **bands, const int* num_bands);

/**
 * Suggest window sequence for channel.
 *
 * The result is returned by value, so the FFWindowInfo structure itself
 * needs no freeing by the caller.
 * NOTE(review): ownership of FFWindowInfo.additional_info is not defined
 * by this header - confirm with the model implementation.
 *
 * @param ctx       model context
 * @param audio     samples for the current frame (not modified)
 * @param la        lookahead samples (NULL when unavailable)
 * @param channel   number of channel element to analyze
 * @param prev_type previous window type
 *
 * @return suggested window information in a structure
 */
FFWindowInfo ff_psy_suggest_window(FFPsyContext *ctx,
                                   const int16_t *audio, const int16_t *la,
                                   int channel, int prev_type);

/**
 * Get psychoacoustic model suggestion about coding two bands as M/S.
 *
 * @param ctx   model context
 * @param left  psychoacoustic information of the band in the left channel
 * @param right psychoacoustic information of the band in the right channel
 *
 * @return one of the FFPsyMSDecision values: FF_PSY_MS_SEPARATE,
 *         FF_PSY_MS_JOINT or FF_PSY_MS_UNDECIDED
 */
enum FFPsyMSDecision ff_psy_suggest_ms(FFPsyContext *ctx, FFPsyBand *left, FFPsyBand *right);

/**
 * Perform psychoacoustic analysis and set band info.
 *
 * NOTE(review): the band info is presumably stored in the FFPsyBand array
 * of the context - confirm with the model implementation.
 *
 * @param ctx    model context
 * @param tag    number of channel element to analyze
 * @param winfo  window information
 * @param coeffs transformed channel coefficients (not modified)
 */
void ff_psy_analyze(FFPsyContext *ctx, int tag, FFWindowInfo *winfo, const float *coeffs);

/**
 * Cleanup model context at the end.
 *
 * NOTE(review): whether the FFPsyContext structure itself is freed, or only
 * the data it owns, cannot be told from this header - confirm before use.
 *
 * @param ctx model context
 */
void ff_psy_end(FFPsyContext *ctx);

/**************************************************************************
 *                       Audio preprocessing stuff.                       *
 *       This should be moved into some audio filter eventually.          *
 **************************************************************************/
/**
 * Preprocessing context; only forward-declared in this header and
 * handled through pointers by the functions below.
 */
struct FFPsyPreprocessContext;

/**
 * psychoacoustic model audio preprocessing initialization
 *
 * @param avctx codec context
 *
 * @return pointer to the preprocessing context
 *         (presumably NULL on failure - TODO confirm with the implementation)
 */
struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *avctx);

/**
 * Preprocess several channels in an audio frame in order to compress it better.
 *
 * NOTE(review): the ff_aac_ prefix looks like a leftover; the sibling
 * functions use ff_psy_preprocess_*. Consider renaming for consistency.
 *
 * @param ctx      preprocessing context
 * @param audio    samples to preprocess (not modified)
 * @param dest     place to put filtered samples
 * @param tag      number of channel group
 * @param channels number of channels to preprocess (some additional work may be done on stereo pair)
 */
void ff_aac_psy_preprocess(struct FFPsyPreprocessContext *ctx,
                           const int16_t *audio, int16_t *dest,
                           int tag, int channels);

/**
 * Cleanup audio preprocessing module.
 *
 * @param ctx preprocessing context
 */
void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx);

#endif /* FFMPEG_AACPSY_H */