[FFmpeg-devel] [RFC] Generic psychoacoustic model interface
Kostya
kostya.shishkov
Sat Aug 30 12:21:54 CEST 2008
On Thu, Aug 28, 2008 at 10:36:57PM +0200, Michael Niedermayer wrote:
> On Thu, Aug 28, 2008 at 08:10:26PM +0300, Kostya wrote:
[...]
> > /**
> > * windowing related information
> > */
> > typedef struct FFWindowInfo{
>
> > int window_type[2]; ///< window type (short/long/transitional, etc.) - current and previous
>
> How is this "transitional" going to work with many different frame lengths?
> is there 1? N*N ?
That's for AAC (i.e. it requires slightly different windowing);
the encoder will set that to an internal value.
[...]
> > /**
> > * Get psychoacoustic model suggestion about coding two bands as M/S
> > */
> > enum FFPsyMSDecision ff_psy_suggest_ms(FFPsyContext *ctx, FFPsyBand *left, FFPsyBand *right);
>
> iam a little unsure about this one, but iam not objecting ...
dropped for now, may revive later
Here's another draft - it's a psychoacoustic model interface with a
partial implementation (there are some inaccuracies and debug leftovers in there,
but this is an RFC, not a final patch).
I plan to use it this way with my encoder.
General flow:
init
while(frame){
suggest window()
[encoder may ignore that]
set band info() = calculate thresholds for all bands with the provided window type
psy analyze() = get distortions and weights for a band quantized with a series of
quantizers; my encoder will use that for RD-aware quantization
}
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> I count him braver who overcomes his desires than him who conquers his
> enemies for the hardest victory is over self. -- Aristotle
-------------- next part --------------
/*
* audio encoder psychoacoustic model
* Copyright (C) 2008 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef FFMPEG_PSYMODEL_H
#define FFMPEG_PSYMODEL_H
#include "avcodec.h"
/** maximum possible number of bands per channel (ff_psy_init() allocates MAX_BANDS FFPsyBand entries for each channel) */
#define MAX_BANDS 128
/**
 * single band psychoacoustic information
 *
 * energy and threshold are filled in by ff_psy_set_band_info(); the remaining
 * fields are filled in per tried quantizer by ff_psy_analyze().
 */
typedef struct FFPsyBand{
int bits; ///< estimated number of bits needed to code the band with the tried quantizer
float energy; ///< band energy
float threshold; ///< band threshold calculated by the model
float distortion; ///< estimated quantization error for the tried quantizer
float perceptual_weight; ///< distortion divided by the band threshold
}FFPsyBand;
/**
 * windowing related information
 */
typedef struct FFPsyWindowInfo{
int window_type[2]; ///< window type (short/long/transitional, etc.) - current and previous
int window_shape; ///< window shape (sine/KBD/whatever)
int num_windows; ///< number of windows in a frame (the AAC model treats 8 as a short-window frame)
int grouping[8]; ///< window grouping (e.g. for AAC)
int *window_sizes; ///< sequence of window sizes inside one frame (e.g. for WMA)
}FFPsyWindowInfo;
/**
 * context used by psychoacoustic model
 *
 * bands and num_bands hold one entry per possible frame length. The AAC
 * model uses entry 0 for frames that are not split into 8 windows and
 * entry 1 for frames with 8 windows.
 */
typedef struct FFPsyContext{
AVCodecContext *avctx; ///< encoder context
FFPsyBand *psy_bands; ///< frame bands information, MAX_BANDS entries per channel
FFPsyWindowInfo *win_info; ///< frame window info
uint8_t **bands; ///< scalefactor band sizes for possible frame sizes
int *num_bands; ///< number of scalefactor bands for possible frame sizes
int num_lens; ///< number of scalefactor band sets
void* model_priv_data; ///< psychoacoustic model implementation private data
}FFPsyContext;
/**
 * Initialize psychoacoustic model.
 *
 * The bands and num_bands arrays are copied into the context (num_lens
 * entries each). Only the pointers stored in bands are copied, so the band
 * size tables they point to have to stay valid while the context is in use.
 * The model implementation is selected from avctx->codec_id.
 *
 * @param ctx model context
 * @param avctx codec context
 * @param num_lens number of possible frame lengths
 * @param bands scalefactor band lengths for all frame lengths
 * @param num_bands number of scalefactor bands for all frame lengths
 *
 * @return zero if successful, a negative value if not
 */
av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx,
int num_lens,
uint8_t **bands, int* num_bands);
/**
 * Suggest window sequence for channel.
 *
 * The suggestion is advisory; the encoder may ignore it.
 * NOTE: the implementation in this draft is a placeholder that performs
 * no signal analysis yet.
 *
 * @param ctx model context
 * @param audio samples for the current frame
 * @param la lookahead samples (NULL when unavailable)
 * @param channel number of channel element to analyze
 * @param prev_type previous window type
 *
 * @return suggested window information in a structure
 */
FFPsyWindowInfo ff_psy_suggest_window(FFPsyContext *ctx,
const int16_t *audio, const int16_t *la,
int channel, int prev_type);
/**
 * Perform psychoacoustic analysis and set band info (threshold, energy).
 *
 * The results are written to ctx->psy_bands. Codecs without a model
 * implementation are silently ignored.
 *
 * @param ctx model context
 * @param channel audio channel number
 * @param coeffs pointer to the transformed coefficients
 * @param wi window information
 */
void ff_psy_set_band_info(FFPsyContext *ctx, int channel, const float *coeffs,
FFPsyWindowInfo *wi);
/**
 * Analyze band and output perceptual information for given quantizers.
 *
 * One output entry is produced per tried quantizer, so bands must have
 * room for quants entries. Quantizers are read from Q[0], Q[qstep],
 * Q[2*qstep], ...
 *
 * @param ctx model context
 * @param channel audio channel number
 * @param band_no number of band to analyze
 * @param coeffs pointer to the transformed coefficients
 * @param length band length
 * @param quants number of quantizers to try
 * @param Q array of quantizers
 * @param qstep quantizers stride
 * @param bands output array where information will be stored
 */
void ff_psy_analyze(FFPsyContext *ctx, int channel, int band_no,
const float *coeffs, int length,
int quants, const float *Q, const int qstep,
FFPsyBand *bands);
/**
 * Cleanup model context at the end.
 *
 * Frees the arrays allocated by ff_psy_init() and the model private data.
 * The context structure itself is not freed.
 *
 * @param ctx model context
 */
av_cold void ff_psy_end(FFPsyContext *ctx);
/**************************************************************************
* Audio preprocessing stuff. *
* This should be moved into some audio filter eventually. *
**************************************************************************/
struct FFPsyPreprocessContext;
/**
 * psychoacoustic model audio preprocessing initialization
 *
 * @param avctx codec context; its channel count and frame size are used
 *              by the preprocessing functions
 *
 * @return newly allocated preprocessing context
 */
av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *avctx);
/**
 * Preprocess several channels of an audio frame in order to compress it better.
 *
 * Samples are read and written interleaved with a stride of avctx->channels;
 * avctx->frame_size samples are processed for every channel.
 *
 * @param ctx preprocessing context
 * @param audio samples to preprocess
 * @param dest place to put filtered samples
 * @param tag number of the first channel (selects the per-channel filter state)
 * @param channels number of channels to preprocess (some additional work may be done on stereo pair)
 */
void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx,
const int16_t *audio, int16_t *dest,
int tag, int channels);
/**
 * Cleanup audio preprocessing module.
 *
 * Releases the filter coefficients and the per-channel filter states.
 *
 * @param ctx preprocessing context
 */
av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx);
#endif /* FFMPEG_PSYMODEL_H */
-------------- next part --------------
/*
* audio encoder psychoacoustic model
* Copyright (C) 2008 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "avcodec.h"
#include "psymodel.h"
#include "iirfilter.h"
#ifdef ENABLE_AAC_ENCODER
#include "aac.h"
#include "aactab.h"
/**
 * Quantize one coefficient.
 *
 * The magnitude is multiplied by Q, raised to the power 3/4, offset by the
 * 0.4054 rounding constant and clipped to the range [0, 8191].
 *
 * @param coef coefficient to quantize (only its magnitude is used)
 * @param Q    multiplier applied to the magnitude before the power law
 * @return absolute value of the quantized coefficient
 * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
 */
static av_always_inline int quant(float coef, const float Q)
{
    const float scaled = fabsf(coef) * Q;

    return av_clip((int)(pow(scaled, 0.75) + 0.4054), 0, 8191);
}
/**
 * Estimate the error introduced by quantizing a run of coefficients.
 *
 * Each coefficient is quantized with quant(), reconstructed as
 * q^(4/3) * IQ and compared against the original magnitude.
 *
 * @param c    coefficients
 * @param size number of coefficients
 * @param Q    multiplier passed to quant()
 * @param IQ   multiplier applied to q^(4/3) on reconstruction
 * @return sum of squared differences divided by 512
 */
static inline float psy_aac_get_approximate_quant_error(const float *c, int size,
                                                        const float Q, const float IQ)
{
    float err = 0.0f;
    int n;

    for(n = 0; n < size; n++){
        const int   level    = quant(c[n], Q);
        const float mag      = fabs(c[n]);
        const float restored = (level * cbrt(level)) * IQ;
        const float diff     = mag - restored;

        err += diff * diff;
    }
    return err * 1.0 / 512.0;
}
//XXX: stub
/**
 * Roughly estimate the number of bits needed to code a run of coefficients.
 *
 * Coefficients are taken in pairs. Every nonzero quantized value costs one
 * extra bit, values above 16 additionally cost 2*log2(q) - 3 bits, and each
 * pair (values limited to 16) is looked up in row 10 of ff_aac_spectral_bits.
 *
 * @param c    coefficients; read in pairs, so c[size] is read when size is odd
 * @param size number of coefficients
 * @param Q    multiplier passed to quant()
 * @return estimated number of bits
 */
static inline int psy_aac_get_approximate_bits(const float *c, int size, const float Q)
{
    int total = 0;
    int pos;

    for(pos = 0; pos < size; pos += 2){
        int pair_idx = 0;
        int k;

        for(k = 0; k < 2; k++){
            int level = quant(c[pos+k], Q);

            level = FFABS(level);
            total += level != 0;
            if(level > 16)
                total += av_log2(level)*2 - 4 + 1;
            pair_idx = pair_idx*17 + FFMIN(level, 16);
        }
        total += ff_aac_spectral_bits[10][pair_idx];
    }
    return total;
}
#define PSY_3GPP_SPREAD_LOW 1.5f // spreading factor for ascending threshold spreading (15 dB/Bark)
#define PSY_3GPP_SPREAD_HI 3.0f // spreading factor for descending threshold spreading (30 dB/Bark)
// The two constants below limit the threshold in quiet of long frames relative
// to the previous frame (presumably the pre-echo control of 3GPP TS26.403 - TODO confirm).
#define PSY_3GPP_RPEMIN 0.01f // lower bound of the limited value, as a fraction of the unlimited one
#define PSY_3GPP_RPELEV 2.0f // maximum allowed growth relative to the previous frame's value
/**
 * information for single band used by 3GPP TS26.403-inspired psychoacoustic model
 *
 * energy, thr and thr_quiet are calculated by psy_aac_calc_thresholds().
 */
typedef struct Psy3gppBand{
float energy; ///< band energy
float ffac; ///< form factor
float thr; ///< energy threshold
float pe; ///< perceptual entropy
float thr_quiet; ///< threshold in quiet
}Psy3gppBand;
/**
 * single/pair channel context for psychoacoustic model
 *
 * Bands are stored with a stride of 16 entries per window, i.e. band g of
 * window number n is found at index n*16 + g.
 */
typedef struct Psy3gppChannel{
Psy3gppBand band[128]; ///< bands information
Psy3gppBand prev_band[128]; ///< bands information from the previous frame
float win_energy; ///< sliding average of channel energy
float iir_state[2]; ///< hi-pass IIR filter state
uint8_t next_grouping; ///< stored grouping scheme for the next frame (in case of 8 short window sequence)
enum WindowSequence next_window_seq; ///< window sequence to be used in the next frame
}Psy3gppChannel;
/**
 * psychoacoustic model frame type-dependent coefficients
 *
 * All tables are indexed by scalefactor band. spread_low[g] and spread_hi[g]
 * are derived from the Bark distance between bands g and g+1, so their last
 * used entry is the one before the top band.
 */
typedef struct Psy3gppCoeffs{
float ath [64]; ///< absolute threshold of hearing per bands
float barks [64]; ///< Bark value for each spectral band in long frame
float spread_low[64]; ///< spreading factor for low-to-high threshold spreading in long frame
float spread_hi [64]; ///< spreading factor for high-to-low threshold spreading in long frame
}Psy3gppCoeffs;
/**
 * 3GPP TS26.403-inspired psychoacoustic model specific data
 */
typedef struct Psy3gppContext{
Psy3gppCoeffs psy_coef[2]; ///< coefficients per frame type: [0] for frames without 8 windows, [1] for frames with 8 windows
Psy3gppChannel *ch; ///< per-channel model state, one entry for every channel
}Psy3gppContext;
/**
* Calculate Bark value for given line.
*/
static inline float calc_bark(float f)
{
return 13.3f * atanf(0.00076f * f) + 3.5f * atanf((f / 7500.0f) * (f / 7500.0f));
}
#define ATH_ADD 4
/**
* Calculate ATH value for given frequency.
* Borrowed from Lame.
*/
static inline float ath(float f, float add)
{
f /= 1000.0f;
return 3.64 * pow(f, -0.8)
- 6.8 * exp(-0.6 * (f - 3.4) * (f - 3.4))
+ 6.0 * exp(-0.15 * (f - 8.7) * (f - 8.7))
+ (0.6 + 0.04 * add) * 0.001 * f * f * f * f;
}
/**
 * Initialize the AAC (3GPP TS26.403-inspired) psychoacoustic model.
 *
 * Allocates the model private data and the per-channel state and fills the
 * per-band Bark values, spreading factors and absolute thresholds of hearing
 * for both scalefactor band sets (ctx->bands[0] and ctx->bands[1]).
 *
 * @param ctx model context; avctx, bands and num_bands must already be set
 * @return zero if successful, a negative value if a band set is too large
 *         for the coefficient tables or memory allocation fails
 */
static av_cold int psy_aac_init(FFPsyContext *ctx){
    Psy3gppContext *pctx;
    float barks[1024];
    int i, j, g, start;
    float prev, minscale, minath;

    // the coefficient tables have a fixed size, refuse band sets that do not fit
    for(j = 0; j < 2; j++){
        if(ctx->num_bands[j] > (int)(sizeof(pctx->psy_coef[0].ath) / sizeof(pctx->psy_coef[0].ath[0])))
            return -1;
    }

    ctx->model_priv_data = av_mallocz(sizeof(Psy3gppContext));
    if(!ctx->model_priv_data)
        return -1;
    pctx = (Psy3gppContext*) ctx->model_priv_data;

    for(i = 0; i < 1024; i++)
        barks[i] = calc_bark(i * ctx->avctx->sample_rate / 2048.0);
    minath = ath(3410, ATH_ADD);
    // NOTE(review): the Bark table and the frequencies below are computed for a
    // 1024-line spectrum and are used unchanged for the second band set - confirm
    // that this is intended for frames split into 8 windows.
    for(j = 0; j < 2; j++){
        Psy3gppCoeffs *coeffs = &pctx->psy_coef[j];
        i = 0;
        prev = 0.0;
        // band Bark value = mean of the values at the band edges
        for(g = 0; g < ctx->num_bands[j]; g++){
            i += ctx->bands[j][g];
            coeffs->barks[g] = (barks[i - 1] + prev) / 2.0;
            prev = barks[i - 1];
        }
        // spreading factors between band g and band g+1
        for(g = 0; g < ctx->num_bands[j] - 1; g++){
            coeffs->spread_low[g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_LOW);
            coeffs->spread_hi [g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_HI);
        }
        // ATH of a band = minimum over its lines, relative to the value at 3410 Hz
        start = 0;
        for(g = 0; g < ctx->num_bands[j]; g++){
            // every line of the band uses the same line-to-frequency scale
            minscale = ath(ctx->avctx->sample_rate * start / 2048.0, ATH_ADD);
            for(i = 1; i < ctx->bands[j][g]; i++){
                minscale = fminf(minscale, ath(ctx->avctx->sample_rate * (start + i) / 2048.0, ATH_ADD));
            }
            coeffs->ath[g] = minscale - minath;
            start += ctx->bands[j][g];
        }
    }

    pctx->ch = av_mallocz(sizeof(Psy3gppChannel) * ctx->avctx->channels);
    if(!pctx->ch){
        av_freep(&ctx->model_priv_data);
        return -1;
    }
    return 0;
}
/**
 * Free the AAC psychoacoustic model private data.
 *
 * Safe to call when the private data was never allocated.
 *
 * @param ctx model context
 */
static av_cold void psy_aac_end(FFPsyContext *ctx)
{
    Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data;

    // the private data is missing when model initialization did not complete
    if(pctx)
        av_freep(&pctx->ch);
    av_freep(&ctx->model_priv_data);
}
/**
 * Calculate band thresholds as suggested in 3GPP TS26.403
 *
 * For every window of the frame the band energies and initial thresholds
 * are computed, the thresholds are spread over neighbouring bands and then
 * combined with the threshold in quiet. Energies and thresholds are exported
 * to ctx->psy_bands and the band state is kept for the next frame.
 *
 * @param ctx     model context
 * @param channel channel number, selects the per-channel model state
 * @param coefs   transformed coefficients of all windows, stored back to back
 * @param wi      window information for the current frame
 */
static void psy_aac_calc_thresholds(FFPsyContext *ctx, int channel, const float *coefs,
                                    FFPsyWindowInfo *wi)
{
    Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data;
    Psy3gppChannel *pch  = &pctx->ch[channel];
    int start = 0;
    int i, w, g;
    const int is_short        = wi->num_windows == 8;
    const int num_bands       = ctx->num_bands[is_short];
    const uint8_t* band_sizes = ctx->bands[is_short];
    Psy3gppCoeffs *coeffs     = &pctx->psy_coef[is_short];

    //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
    for(w = 0; w < wi->num_windows*16; w += 16){
        for(g = 0; g < num_bands; g++){
            Psy3gppBand *band = &pch->band[w+g];
            // start from zero, the band still holds the value of the previous frame
            band->energy = 0.0f;
            for(i = 0; i < band_sizes[g]; i++)
                band->energy += coefs[start+i] * coefs[start+i];
            band->energy *= 1.0f / (512*512);
            band->thr = band->energy * 0.001258925f;
            start += band_sizes[g];
            ctx->psy_bands[channel*MAX_BANDS+w+g].energy = band->energy;
        }
    }
    //modify thresholds - spread, threshold in quiet - 5.4.3 "Spreaded Energy Calculation"
    for(w = 0; w < wi->num_windows*16; w += 16){
        Psy3gppBand *band = &pch->band[w];
        for(g = 1; g < num_bands; g++){
            band[g].thr = FFMAX(band[g].thr, band[g-1].thr * coeffs->spread_low[g-1]);
        }
        // spread_hi[g] is the factor between bands g and g+1
        for(g = num_bands - 2; g >= 0; g--){
            band[g].thr = FFMAX(band[g].thr, band[g+1].thr * coeffs->spread_hi [g]);
        }
        for(g = 0; g < num_bands; g++){
            band[g].thr_quiet = FFMAX(band[g].thr, coeffs->ath[g]);
            // limit the change against the previous frame, long frames only
            if(wi->num_windows != 8 && wi->window_type[1] != EIGHT_SHORT_SEQUENCE){
                band[g].thr_quiet = fmaxf(PSY_3GPP_RPEMIN*band[g].thr_quiet,
                                          fminf(band[g].thr_quiet,
                                                PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet));
            }
            band[g].thr = FFMAX(band[g].thr, band[g].thr_quiet * 0.25);
            ctx->psy_bands[channel*MAX_BANDS+w+g].threshold = band[g].thr;
        }
    }
    memcpy(pch->prev_band, pch->band, sizeof(pch->band));
}
#endif
/**
 * Initialize psychoacoustic model.
 *
 * Copies the bands and num_bands arrays (num_lens entries each, pointers in
 * bands are copied as they are), allocates MAX_BANDS band entries for every
 * channel and initializes the model selected by avctx->codec_id.
 * Nothing stays allocated in the context when initialization fails.
 *
 * @param ctx model context
 * @param avctx codec context
 * @param num_lens number of possible frame lengths
 * @param bands scalefactor band lengths for all frame lengths
 * @param num_bands number of scalefactor bands for all frame lengths
 *
 * @return zero if successful, a negative value if not
 */
av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx,
                        int num_lens,
                        uint8_t **bands, int* num_bands)
{
    int ret = 0;

    ctx->avctx           = avctx;
    ctx->num_lens        = num_lens;
    ctx->model_priv_data = NULL; // lets cleanup tell whether a model was set up
    ctx->psy_bands = av_mallocz(sizeof(FFPsyBand) * MAX_BANDS * avctx->channels);
    ctx->bands     = av_malloc(sizeof(ctx->bands[0]) * num_lens);
    ctx->num_bands = av_malloc(sizeof(ctx->num_bands[0]) * num_lens);
    if(!ctx->psy_bands || !ctx->bands || !ctx->num_bands){
        ret = -1;
        goto fail;
    }
    memcpy(ctx->bands, bands, sizeof(ctx->bands[0]) * num_lens);
    memcpy(ctx->num_bands, num_bands, sizeof(ctx->num_bands[0]) * num_lens);

    switch(ctx->avctx->codec_id){
    case CODEC_ID_AAC:
        ret = psy_aac_init(ctx);
        break;
    default:
        break;
    }
    if(ret < 0)
        goto fail;
    return 0;

fail:
    av_freep(&ctx->psy_bands);
    av_freep(&ctx->bands);
    av_freep(&ctx->num_bands);
    return ret;
}
/**
 * Suggest window sequence for channel.
 *
 * Placeholder: no signal analysis is done yet. A zero-initialized structure
 * describing a single window per frame is returned, with the previous window
 * type recorded in window_type[1].
 *
 * @param ctx model context (unused for now)
 * @param audio samples for the current frame (unused for now)
 * @param la lookahead samples, NULL when unavailable (unused for now)
 * @param channel number of channel element to analyze (unused for now)
 * @param prev_type previous window type
 *
 * @return suggested window information in a structure
 */
FFPsyWindowInfo ff_psy_suggest_window(FFPsyContext *ctx,
                                      const int16_t *audio, const int16_t *la,
                                      int channel, int prev_type)
{
    FFPsyWindowInfo wi;

    memset(&wi, 0, sizeof(wi));
    wi.window_type[1] = prev_type;
    wi.num_windows    = 1;
    return wi;
}
/**
 * Perform psychoacoustic analysis and set band info (threshold, energy).
 *
 * Dispatches to the model matching the codec; other codecs are ignored.
 *
 * @param ctx model context
 * @param channel audio channel number
 * @param coeffs pointer to the transformed coefficients
 * @param wi window information
 */
void ff_psy_set_band_info(FFPsyContext *ctx, int channel,
                          const float *coeffs, FFPsyWindowInfo *wi)
{
    if(ctx->avctx->codec_id == CODEC_ID_AAC)
        psy_aac_calc_thresholds(ctx, channel, coeffs, wi);
}
/**
 * Analyze band and output perceptual information for given quantizers.
 *
 * For every tried quantizer the band energy and threshold are copied from
 * the context and the quantization error, the approximate bit count and the
 * error weighted by the inverse threshold are estimated. Only AAC is handled.
 *
 * @param ctx model context
 * @param channel audio channel number, selects the band set in ctx->psy_bands
 * @param band_no number of band to analyze
 * @param coeffs pointer to the transformed coefficients
 * @param length band length
 * @param quants number of quantizers to try
 * @param Q array of quantizers
 * @param qstep quantizers stride
 * @param bands output array where information will be stored (quants entries)
 */
void ff_psy_analyze(FFPsyContext *ctx, int channel, int band_no,
                    const float *coeffs, int length,
                    int quants, const float *Q, const int qstep,
                    FFPsyBand *bands)
{
    // band info is stored with MAX_BANDS entries per channel
    const FFPsyBand *ref = &ctx->psy_bands[channel*MAX_BANDS + band_no];
    int i;
    float invthr;

    invthr = 1.0 / ref->threshold;
    switch(ctx->avctx->codec_id){
    case CODEC_ID_AAC:
        for(i = 0; i < quants; i++, Q += qstep){
            // the helpers multiply by their first factor, so they get the inverse of the quantizer
            const float IQ = 1.0 / Q[0];
            bands[i].threshold  = ref->threshold;
            bands[i].energy     = ref->energy;
            bands[i].distortion = psy_aac_get_approximate_quant_error(coeffs, length, IQ, Q[0]);
            bands[i].bits       = psy_aac_get_approximate_bits(coeffs, length, IQ);
            bands[i].perceptual_weight = bands[i].distortion * invthr;
        }
        break;
    default:
        break;
    }
}
/**
 * Cleanup model context at the end.
 *
 * Releases the arrays owned by the context, then the private data of the
 * model matching the codec.
 *
 * @param ctx model context
 */
av_cold void ff_psy_end(FFPsyContext *ctx)
{
    const int is_aac = ctx->avctx->codec_id == CODEC_ID_AAC;

    av_freep(&ctx->bands);
    av_freep(&ctx->num_bands);
    av_freep(&ctx->psy_bands);
    if(is_aac)
        psy_aac_end(ctx);
}
/**
 * audio preprocessing context
 */
typedef struct FFPsyPreprocessContext{
AVCodecContext *avctx; ///< codec context, provides channel count and frame size
float stereo_att; ///< NOTE(review): not used by the code in this file - presumably a stereo attenuation factor
struct FFIIRFilterCoeffs *fcoeffs; ///< lowpass filter coefficients, NULL when the filter could not be set up
struct FFIIRFilterState **fstate; ///< filter state for every channel, NULL when filtering is disabled
}FFPsyPreprocessContext;
#define FILT_ORDER 4
/**
 * psychoacoustic model audio preprocessing initialization
 *
 * Sets up a Butterworth lowpass filter of order FILT_ORDER with one filter
 * state per channel. When the filter coefficients or the state array cannot
 * be obtained, the context is still returned with fstate left NULL.
 *
 * @param avctx codec context
 * @return newly allocated preprocessing context, NULL if it cannot be allocated
 */
av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *avctx)
{
    FFPsyPreprocessContext *ctx;
    int i;

    ctx = av_mallocz(sizeof(FFPsyPreprocessContext));
    if(!ctx)
        return NULL;
    ctx->avctx = avctx;
    ctx->fcoeffs = ff_iir_filter_init_coeffs(FF_FILTER_TYPE_BUTTERWORTH, FF_FILTER_MODE_LOWPASS,
                                             FILT_ORDER, 0.25, 0.0, 0.0);
    if(ctx->fcoeffs){
        ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
        // without the state array fstate stays NULL
        if(ctx->fstate){
            for(i = 0; i < avctx->channels; i++)
                ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
        }
    }
    return ctx;
}
/**
 * Preprocess several channels of an audio frame.
 *
 * Every selected channel is run through the lowpass filter; when no filter
 * state is available the samples are copied unchanged. Samples are
 * interleaved with a stride of avctx->channels and avctx->frame_size samples
 * are handled per channel.
 *
 * @param ctx preprocessing context
 * @param audio samples to preprocess
 * @param dest place to put filtered samples
 * @param tag number of the first channel, selects the filter state
 * @param channels number of channels to preprocess
 */
void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx,
                       const int16_t *audio, int16_t *dest,
                       int tag, int channels)
{
    const int stride  = ctx->avctx->channels;
    const int samples = ctx->avctx->frame_size;
    int c, n;

    if(!ctx->fstate){
        // no filter available: plain copy of the selected channels
        for(c = 0; c < channels; c++)
            for(n = 0; n < samples; n++)
                dest[n*stride + c] = audio[n*stride + c];
        return;
    }

    for(c = 0; c < channels; c++)
        ff_iir_filter(ctx->fcoeffs, ctx->fstate[tag+c], samples,
                      audio + c, stride,
                      dest  + c, stride);
}
/**
 * Cleanup audio preprocessing module.
 *
 * Releases the filter coefficients and the per-channel filter states.
 * NOTE(review): the context structure itself is not freed here - confirm
 * who owns it.
 *
 * @param ctx preprocessing context
 */
av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx)
{
    int i;

    ff_iir_filter_free_coeffs(ctx->fcoeffs);
    // fstate is NULL when the filter could not be set up during init
    if(ctx->fstate){
        for(i = 0; i < ctx->avctx->channels; i++){
            ff_iir_filter_free_state(ctx->fstate[i]);
        }
    }
    av_freep(&ctx->fstate);
}
More information about the ffmpeg-devel
mailing list