[FFmpeg-devel] [RFC] Generic psychoacoustic model interface
Kostya
kostya.shishkov
Thu Aug 28 19:10:26 CEST 2008
On Wed, Aug 27, 2008 at 04:33:17PM +0200, Michael Niedermayer wrote:
> On Wed, Aug 27, 2008 at 11:35:20AM +0300, Kostya wrote:
> > Here's my first attempt to define codec-agnostic psy model.
> > Here's an interface for it. I'm not sure about AC3, but
> > it should be possible to use it with DCA, Vorbis,
> > MPEG Audio Layers I-III and NBC, maybe WMA too.
> > In case somebody codes an implementation, of course.
> > Personally I plan to make my encoder use it backed with
> > already implemented 3GPP model.
>
> [...]
> > /**
> > * windowing related information
> > */
> > typedef struct FFWindowInfo{
> > int window_type[2]; ///< window type (short/long/transitional, etc.) - current and previous
> > int window_shape; ///< window shape (sine/KBD/whatever)
>
> > void *additional_info; ///< codec-dependent window information
>
> passing opaque data from psy to encoder is not clean, it requires
> both to maintain a "hidden" compatible API
Of course, unless we can decide on what will be needed for all encoders.
> > }FFWindowInfo;
> >
> > /**
> > * context used by psychoacoustic model
> > */
> > typedef struct FFPsyContext{
> > AVCodecContext *avctx; ///< encoder context
> >
> > FFPsyBand bands[MAX_BANDS]; ///< frame bands information
> > FFWindowInfo *win_info; ///< frame window info
> >
>
> > const uint8_t *long_bands; ///< scalefactor band sizes for long frame
> > int num_long_bands; ///< number of scalefactor bands for long frame
> > const uint8_t *short_bands; ///< scalefactor band sizes for short frame
> > int num_short_bands; ///< number of scalefactor bands for short frame
>
> Having only 2 band lists would be a problem for any codec that has more
> than 2 window lengths (like wma)
>
>
> [...]
> > /**
> > * Suggest window sequence for channel.
> > *
> > * @param ctx model context
> > * @param audio samples for the current frame
> > * @param la lookahead samples (NULL when unavailable)
> > * @param channel number of channel element to analyze
> > * @param prev_type previous window type
> > *
> > * @return suggested window information in a structure
> > */
> > FFWindowInfo* ff_psy_suggest_window(AACPsyContext *ctx, int16_t *audio, int16_t *la,
> > int channel, int prev_type);
>
> ...get/find/calculate_suggested...
> audio&la should be const
>
> and maybe the return should be FFWindowInfo instead of FFWindowInfo* to
> avoid memleak issues ...
>
>
> >
> > /**
> > * Perform psychoacoustic analysis and set band info.
> > *
> > * @param ctx model context
> > * @param tag number of channel element to analyze
> > * @param type channel element type (e.g. ID_SCE or ID_CPE)
> > * @param cpe pointer to the current channel element
> > */
> > void ff_psy_analyze(AACPsyContext *ctx, int tag, int type, ChannelElement *cpe);
>
> ChannelElement is AAC specific
Those are leftovers from the AAC-specific version.
> [...]
> > /**
> > * Preprocess several channel in audio frame in order to compress it better.
> > *
> > * @param ctx preprocessing context
> > * @param audio samples to preprocess
> > * @param dest place to put filtered samples
> > * @param tag number of channel group
> > * @param channels number of channel to preprocess (some additional work may be done on stereo pair)
> > */
> > void ff_aac_psy_preprocess(struct FFPsyPreprocessContext *ctx, int16_t *audio, int16_t *dest, int tag, int channels);
>
> audio is missing a const
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> There will always be a question for which you do not know the correct answer.
-------------- next part --------------
/*
* audio encoder psychoacoustic model
* Copyright (C) 2008 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef FFMPEG_AACPSY_H
#define FFMPEG_AACPSY_H
#include "avcodec.h"
/**
 * maximum possible number of bands
 *
 * Used as the element count of the per-frame band information array
 * embedded in FFPsyContext.
 */
#define MAX_BANDS 128
/**
 * Possible answers of the psychoacoustic model to the question of how
 * two channels should be coded relative to each other.
 */
enum FFPsyMSDecision {
    FF_PSY_MS_SEPARATE  = 0, ///< channels should be coded independently
    FF_PSY_MS_JOINT     = 1, ///< channels should be coded as an average and difference
    FF_PSY_MS_UNDECIDED = 2, ///< let encoder decide how to code channels
};
/**
 * Psychoacoustic information describing one single band.
 */
typedef struct FFPsyBand {
    /** energy of the band */
    float energy;
    /** threshold of the band (presumably the masking threshold - TODO confirm with the model) */
    float threshold;
    /** perceptual entropy of the band */
    float perceptual_entropy;
} FFPsyBand;
/**
 * Windowing related information.
 */
typedef struct FFWindowInfo {
    /** window type (short/long/transitional, etc.) - current and previous */
    int window_type[2];
    /** window shape (sine/KBD/whatever) */
    int window_shape;
    /**
     * codec-dependent window information, should be consistent between
     * encoder and psy model
     */
    void *additional_info;
} FFWindowInfo;
/**
 * context used by psychoacoustic model
 *
 * The per-band analysis data lives in psy_bands; bands and num_bands
 * describe the scalefactor band layout for the possible frame sizes.
 * The FFPsyBand array is named psy_bands so that it does not collide with
 * the scalefactor table member bands: two members of one struct cannot
 * share a name.
 */
typedef struct FFPsyContext {
    AVCodecContext *avctx;          ///< encoder context

    FFPsyBand psy_bands[MAX_BANDS]; ///< frame bands information
    FFWindowInfo *win_info;         ///< frame window info

    /**
     * scalefactor band sizes for possible frame sizes; has the same type as
     * the bands argument of ff_psy_init()
     */
    const uint8_t **bands;
    const int *num_bands;           ///< number of scalefactor bands for possible frame sizes

    /*
     * NOTE(review): the two members below look like leftovers of the earlier
     * long/short-only layout now generalized by bands/num_bands; they are kept
     * so existing users keep compiling - confirm whether they can be dropped.
     */
    const uint8_t *short_bands;     ///< scalefactor band sizes for short frame
    int num_short_bands;            ///< number of scalefactor bands for short frame

    void *model_priv_data;          ///< psychoacoustic model implementation private data
} FFPsyContext;
/**
 * Initialize psychoacoustic model.
 *
 * @param ctx       model context
 * @param avctx     codec context
 * @param bands     scalefactor band lengths for all frame lengths
 * @param num_bands number of scalefactor bands for all frame lengths
 *
 * @return zero if successful, a negative value if not
 *
 * @note NOTE(review): presumably bands[i] holds num_bands[i] entries for the
 *       i-th frame length; the number of frame lengths is not passed here -
 *       confirm how the model learns it.
 */
int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx,
const uint8_t **bands, const int* num_bands);
/**
 * Suggest window sequence for channel.
 *
 * The result is returned by value, so the structure itself does not have to
 * be freed by the caller.
 * NOTE(review): ownership of FFWindowInfo::additional_info in the returned
 * structure is not specified here - confirm with the model implementation.
 *
 * @param ctx       model context
 * @param audio     samples for the current frame
 * @param la        lookahead samples (NULL when unavailable)
 * @param channel   number of channel element to analyze
 * @param prev_type previous window type
 *
 * @return suggested window information in a structure
 */
FFWindowInfo ff_psy_suggest_window(FFPsyContext *ctx,
const int16_t *audio, const int16_t *la,
int channel, int prev_type);
/**
 * Get psychoacoustic model suggestion about coding two bands as M/S
 *
 * @param ctx   model context
 * @param left  information for the first band of the pair
 *              (presumably the left channel - confirm)
 * @param right information for the second band of the pair
 *              (presumably the right channel - confirm)
 *
 * @return one of the FFPsyMSDecision values
 */
enum FFPsyMSDecision ff_psy_suggest_ms(FFPsyContext *ctx, FFPsyBand *left, FFPsyBand *right);
/**
 * Perform psychoacoustic analysis and set band info.
 *
 * NOTE(review): the band info is presumably stored in the model context,
 * since the function returns nothing and coeffs is const - confirm.
 *
 * @param ctx    model context
 * @param tag    number of channel element to analyze
 * @param winfo  window information
 * @param coeffs transformed channel coefficients
 */
void ff_psy_analyze(FFPsyContext *ctx, int tag, FFWindowInfo *winfo, const float *coeffs);
/**
 * Cleanup model context at the end.
 *
 * Counterpart of ff_psy_init().
 *
 * @param ctx model context
 */
void ff_psy_end(FFPsyContext *ctx);
/**************************************************************************
 *                      Audio preprocessing stuff.                        *
 *        This should be moved into some audio filter eventually.         *
 **************************************************************************/
/* Only declared here, not defined: this header handles the preprocessing
 * context exclusively through pointers. */
struct FFPsyPreprocessContext;
/**
 * psychoacoustic model audio preprocessing initialization
 *
 * @param avctx codec context
 *
 * @return pointer to a preprocessing context
 *         (NOTE(review): presumably NULL on failure and to be released with
 *         ff_psy_preprocess_end() - confirm with the implementation)
 */
struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *avctx);
/**
 * Preprocess several channels of an audio frame in order to compress it better.
 *
 * NOTE(review): the ff_aac_ prefix is inconsistent with the sibling
 * ff_psy_preprocess_init() / ff_psy_preprocess_end() declarations; it looks
 * like a leftover of the AAC-specific interface - consider renaming.
 *
 * @param ctx      preprocessing context
 * @param audio    samples to preprocess
 * @param dest     place to put filtered samples
 * @param tag      number of channel group
 * @param channels number of channels to preprocess (some additional work may be done on stereo pair)
 */
void ff_aac_psy_preprocess(struct FFPsyPreprocessContext *ctx,
const int16_t *audio, int16_t *dest,
int tag, int channels);
/**
 * Cleanup audio preprocessing module.
 *
 * @param ctx preprocessing context
 */
void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx);
#endif /* FFMPEG_AACPSY_H */
More information about the ffmpeg-devel
mailing list