[FFmpeg-devel] AAC decoder round 7
Michael Niedermayer
michaelni
Tue Aug 12 17:21:03 CEST 2008
On Tue, Aug 12, 2008 at 03:08:09PM +0100, Robert Swain wrote:
> 2008/8/11 Michael Niedermayer <michaelni at gmx.at>:
> > On Mon, Aug 11, 2008 at 12:50:09PM +0100, Robert Swain wrote:
> >> $subj
> >>
> >> Regards,
> >> Rob
> >
> > [...]
> >
> >> Index: libavcodec/aactab.c
> >> ===================================================================
> >> --- libavcodec/aactab.c (revision 14694)
> >> +++ libavcodec/aactab.c (working copy)
> >> @@ -32,6 +32,11 @@
> >>
> >> #include <stdint.h>
> >>
> >> +DECLARE_ALIGNED(16, float, ff_aac_kbd_long_1024[1024]);
> >> +DECLARE_ALIGNED(16, float, ff_aac_kbd_short_128[128]);
> >
> >> +DECLARE_ALIGNED(16, float, ff_aac_sine_long_1024[1024]);
> >> +DECLARE_ALIGNED(16, float, ff_aac_sine_short_128[128]);
> >
> > i think these can be shared with at least wma
> > i dont remember what vorbis used?
>
> WMA has variable window sizes and I think it would be awkward for it
> to use just these two arrays unless it has all its arrays declared in
> a similar way.
but it would not be awkward for aac to use 2 of the wma windows, if they
where in a common c/h file and non static.
>
> "Vorbis windows all use the slope function $y = \sin(.5*\pi \,
> \sin^2((x+.5)/n*\pi))$."
>
> > [...]
> >> @@ -380,7 +465,42 @@
> >> ics->use_kb_window[0] = get_bits1(gb);
> >> ics->num_window_groups = 1;
> >> ics->group_len[0] = 1;
> >> + if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
> >> + int i;
> >> + ics->max_sfb = get_bits(gb, 4);
> >> + for (i = 0; i < 7; i++) {
> >> + if (get_bits1(gb)) {
> >> + ics->group_len[ics->num_window_groups-1]++;
> >> + } else {
> >> + ics->num_window_groups++;
> >> + ics->group_len[ics->num_window_groups-1] = 1;
> >> + }
> >> + }
> >
> >> + ics->swb_offset = swb_offset_128[ac->m4ac.sampling_index];
> >> + ics->num_swb = ff_aac_num_swb_128[ac->m4ac.sampling_index];
> >> + ics->num_windows = 8;
> >> + ics->tns_max_bands = tns_max_bands_128[ac->m4ac.sampling_index];
> >> + } else {
> >> + ics->max_sfb = get_bits(gb, 6);
> >> + ics->swb_offset = swb_offset_1024[ac->m4ac.sampling_index];
> >> + ics->num_swb = ff_aac_num_swb_1024[ac->m4ac.sampling_index];
> >> + ics->num_windows = 1;
> >> + ics->tns_max_bands = tns_max_bands_1024[ac->m4ac.sampling_index];
> >
> > vertical align
>
> Would you prefer to have them aligned such that the array indices are
> aligned as well? I think it looks better this way but you may not.
yes i prefer them aligned as well
>
> >> /**
> >> + * Decode spectral data; reference: table 4.50.
> >> + *
> >> + * @param band_type array of the used band type
> >> + * @param icoef array of quantized spectral data
> >> + *
> >> + * @return Returns error status. 0 - OK, !0 - error
> >> + */
> >> +static int decode_spectrum(AACContext * ac, int icoef[1024], GetBitContext * gb,
> >> + const IndividualChannelStream * ics, enum BandType band_type[120]) {
> >> + int i, k, g, idx = 0;
> >> + const uint16_t * offsets = ics->swb_offset;
> >> +
> >> + for (g = 0; g < ics->num_window_groups; g++) {
> >> + for (i = 0; i < ics->max_sfb; i++, idx++) {
> >> + const int cur_band_type = band_type[idx];
> >> + const int dim = cur_band_type >= FIRST_PAIR_BT ? 2 : 4;
> >> + const int is_cb_unsigned = IS_CODEBOOK_UNSIGNED(cur_band_type);
> >> + int group;
> >> + if (cur_band_type == ZERO_BT) {
> >> + for (group = 0; group < ics->group_len[g]; group++) {
> >> + memset(icoef + group * 128 + offsets[i], 0, (offsets[i+1] - offsets[i])*sizeof(int));
> >> + }
> >> + }else if (cur_band_type != NOISE_BT && cur_band_type != INTENSITY_BT2 && cur_band_type != INTENSITY_BT) {
> >> + for (group = 0; group < ics->group_len[g]; group++) {
> >> + for (k = offsets[i]; k < offsets[i+1]; k += dim) {
> >> + const int index = get_vlc2(gb, vlc_spectral[cur_band_type - 1].table, 6, 3);
> >> + const int coef_idx = (group << 7) + k;
> >> + const int8_t *vq_ptr;
> >> + int j;
> >> + if(index >= ff_aac_spectral_sizes[cur_band_type - 1]) {
> >> + av_log(ac->avccontext, AV_LOG_ERROR,
> >> + "Read beyond end of ff_aac_codebook_vectors[%d][]. index %d >= %d\n",
> >> + cur_band_type - 1, index, ff_aac_spectral_sizes[cur_band_type - 1]);
> >> + return -1;
> >> + }
> >> + vq_ptr = &ff_aac_codebook_vectors[cur_band_type - 1][index * dim];
> >> + if (is_cb_unsigned) {
> >> + for (j = 0; j < dim; j++)
> >> + if (vq_ptr[j])
> >> + icoef[coef_idx + j] = 1 - 2*get_bits1(gb);
> >> + }else {
> >> + for (j = 0; j < dim; j++)
> >> + icoef[coef_idx + j] = 1;
> >> + }
> >> + if (cur_band_type == ESC_BT) {
> >> + for (j = 0; j < 2; j++) {
> >> + if (vq_ptr[j] == 16) {
> >> + int n = 4;
> >> + /* The total length of escape_sequence must be < 22 bits according
> >> + to the specification (i.e. max is 11111111110xxxxxxxxxx). */
> >> + while (get_bits1(gb) && n < 15) n++;
> >> + if(n == 15) {
> >> + av_log(ac->avccontext, AV_LOG_ERROR, "error in spectral data, ESC overflow\n");
> >> + return -1;
> >> + }
> >> + icoef[coef_idx + j] *= (1<<n) + get_bits(gb, n);
> >> + }else
> >> + icoef[coef_idx + j] *= vq_ptr[j];
> >> + }
> >> + }else
> >> + for (j = 0; j < dim; j++)
> >> + icoef[coef_idx + j] *= vq_ptr[j];
> >> + }
> >> + }
> >> + }
> >> + }
> >> + icoef += ics->group_len[g]<<7;
> >> + }
> >> + return 0;
> >> +}
> >> +
> >> +/**
> >> * Add pulses with particular amplitudes to the quantized spectral data; reference: 4.6.3.3.
> >> *
> >> * @param pulse pointer to pulse data struct
> >> @@ -538,6 +780,46 @@
> >> }
> >>
> >> /**
> >> + * Dequantize and scale spectral data; reference: 4.6.3.3.
> >> + *
> >> + * @param icoef array of quantized spectral data
> >> + * @param band_type array of the used band type
> >> + * @param sf array of scalefactors or intensity stereo positions
> >> + * @param coef array of dequantized, scaled spectral data
> >> + */
> >> +static void dequant(AACContext * ac, float coef[1024], const int icoef[1024], float sf[120],
> >> + const IndividualChannelStream * ics, enum BandType band_type[120]) {
> >> + const uint16_t * offsets = ics->swb_offset;
> >> + const int c = 1024/ics->num_windows;
> >> + int g, i, group, k, idx = 0;
> >> +
> >> + for (g = 0; g < ics->num_windows; g++)
> >> + memset(coef + g * 128 + offsets[ics->max_sfb], 0, sizeof(float)*(c - offsets[ics->max_sfb]));
> >> +
> >> + for (g = 0; g < ics->num_window_groups; g++) {
> >> + for (i = 0; i < ics->max_sfb; i++, idx++) {
> >> + if (band_type[idx] == NOISE_BT) {
> >> + const float scale = sf[idx] / ((offsets[i+1] - offsets[i]) * PNS_MEAN_ENERGY);
> >> + for (group = 0; group < ics->group_len[g]; group++) {
> >> + for (k = offsets[i]; k < offsets[i+1]; k++) {
> >> + ac->random_state = lcg_random(ac->random_state);
> >> + coef[group*128+k] = ac->random_state * scale;
> >> + }
> >> + }
> >> + } else if (band_type[idx] != INTENSITY_BT && band_type[idx] != INTENSITY_BT2) {
> >> + for (group = 0; group < ics->group_len[g]; group++) {
> >> + for (k = offsets[i]; k < offsets[i+1]; k++) {
> >> + coef[group*128+k] = ivquant(icoef[group*128+k]) * sf[idx];
> >> + }
> >> + }
> >> + }
> >> + }
> >> + coef += ics->group_len[g]*128;
> >> + icoef += ics->group_len[g]*128;
> >> + }
> >> +}
> >> +
> >> +/**
> >
> > dequant and decode_spectrum() can be merged, especially the VQ tables can
> > already include ivquant().
> > and yes i know that this means add_pulses will be more tricky
>
> See attached (20080812-1422-merge_decspec_dequant.diff). It's not that
> nice to look at and svn diff makes it worse. Suggestions to improve it
> very welcome. Making add_pulse() recursive seemed like a good solution
> to avoid refactoring the main loops and conditions but maybe you have
> a better idea.
yes i do :)
for(each pulse){
float c= coeff[ pos[i] ];
float ic= c / sqrtf(sqrtf(fabsf(c))) + amp[i];
coeff[ pos[i] ]= cbrtf(fabsf(ic)) * ic;
}
This idea is of course based on the assumtation that there are few pulses
compared to the number of coefficients, which as far as i understand is true
as there are max 4 pulses IIRC
>
> >> * Decode an individual_channel_stream payload; reference: table 4.44.
> >> *
> >> * @param common_window Channels have independent [0], or shared [1], Individual Channel Stream information.
> >> @@ -597,6 +879,71 @@
> >> }
> >>
> >> /**
> >> + * Mid/Side stereo decoding; reference: 4.6.8.1.3.
> >> + */
> >> +static void apply_mid_side_stereo(ChannelElement * cpe) {
> >> + const IndividualChannelStream * ics = &cpe->ch[0].ics;
> >> + float *ch0 = cpe->ch[0].coeffs;
> >> + float *ch1 = cpe->ch[1].coeffs;
> >> + int g, i, k, group, idx = 0;
> >> + const uint16_t * offsets = ics->swb_offset;
> >
> >> + for (g = 0; g < ics->num_window_groups; g++) {
> >> + for (i = 0; i < ics->max_sfb; i++, idx++) {
> >> + if (cpe->ms_mask[idx] &&
> >> + cpe->ch[0].band_type[idx] < NOISE_BT && cpe->ch[1].band_type[idx] < NOISE_BT) {
> >> + for (group = 0; group < ics->group_len[g]; group++) {
> >> + for (k = offsets[i]; k < offsets[i+1]; k++) {
> >> + float tmp = ch0[group*128 + k] - ch1[group*128 + k];
> >> + ch0[group*128 + k] += ch1[group*128 + k];
> >> + ch1[group*128 + k] = tmp;
> >> + }
> >> + }
> >> + }
> >> + }
> >> + ch0 += ics->group_len[g]*128;
> >> + ch1 += ics->group_len[g]*128;
> >> + }
> >
> > please try the following, iam curious if its faster (START/STOP_TIMER),
> > it does more if() but accesses elements in sequential order
> >
> > for (g = 0; g < ics->num_window_groups; g++) {
> > for (group = 0; group < ics->group_len[g]; group++) {
> > for (i = 0; i < ics->max_sfb; i++, idx++) {
> > if (cpe->ms_mask[idx] &&
> > cpe->ch[0].band_type[idx] < NOISE_BT && cpe->ch[1].band_type[idx] < NOISE_BT) {
> > for (k = offsets[i]; k < offsets[i+1]; k++) {
> > float tmp = ch0[k] - ch1[k];
> > ch0[k] += ch1[k];
> > ch1[k] = tmp;
> > }
> > }
> > }
> > idx -= ics->max_sfb;
> > ch0 += 128;
> > ch1 += 128;
> > }
> > idx += ics->max_sfb;
> > }
>
> Original:
>
> 102076 dezicycles in apply_mid_side_stereo, 4091 runs, 5 skips
> 100861 dezicycles in apply_mid_side_stereo, 8183 runs, 9 skips
> 97587 dezicycles in apply_mid_side_stereo, 16366 runs, 18 skips
> 96453 dezicycles in apply_mid_side_stereo, 32745 runs, 23 skips
>
> Patched:
>
> 98167 dezicycles in apply_mid_side_stereo, 4095 runs, 1 skips
> 98988 dezicycles in apply_mid_side_stereo, 8189 runs, 3 skips
> 98271 dezicycles in apply_mid_side_stereo, 16379 runs, 5 skips
> 98045 dezicycles in apply_mid_side_stereo, 32758 runs, 10 skips
from that old looks faster so lets keep that, though next time please
run the benchmark 3 times each not once!
[...]
> > [...]
> >> +
> >> +/**
> >> + * Convert spectral data to float samples, applying all supported tools as appropriate.
> >> + */
> >> +static void spectral_to_sample(AACContext * ac) {
> >> + int i, type;
> >> + for (i = 0; i < MAX_ELEM_ID; i++) {
> >> + for(type = 0; type < 4; type++) {
> >> + ChannelElement *che = ac->che[type][i];
> >> + if(che) {
> >> + if(type == TYPE_CCE && che->coup.coupling_point == BEFORE_TNS)
> >> + apply_channel_coupling(ac, che, apply_dependent_coupling);
> >> + if(che->ch[0].tns.present)
> >> + apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
> >> + if(che->ch[1].tns.present)
> >> + apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
> >> + if(type == TYPE_CCE && che->coup.coupling_point == BETWEEN_TNS_AND_IMDCT)
> >> + apply_channel_coupling(ac, che, apply_dependent_coupling);
> >> + imdct_and_windowing(ac, &che->ch[0]);
> >> + if(type == TYPE_CPE)
> >> + imdct_and_windowing(ac, &che->ch[1]);
> >> + if(type == TYPE_CCE && che->coup.coupling_point == AFTER_IMDCT)
> >> + apply_channel_coupling(ac, che, apply_independent_coupling);
> >
> > the 3 TYPE_CCE checks are unneeded when coupling_point matches neiter of the 3
>
> So if type != TYPE_CCE you want me to set coupling_point to the
> 'invalid' value (2, IIRC)? Or do you want me to rework what the valid
> values are such that the 'invalid' value is 0? Or something else?
>
> I don't really like using the 'invalid' value to remove these checks
> as this is not speed critical code. But that's just my opinion.
no i meant that you could use a value of 4 or -1 not the "invalid" one
>
> > and maybe its possible to put the whole in a for(ch=0; ch<2; ch++) loop
> > this would need some chnages to apply_channel_coupling() though
> > iam not sure if this would end up cleaner or not, probably depends mostly on
> > apply_channel_coupling() being clean when its just doing one channel ...
>
> ch could be passed to apply_channel_coupling() and if(ch) return;. And
> imdct_and_windowing() would still need a check for whether the current
> che is a CPE or not. So overall I think it would add more lines than
> it would remove.
ok, then forget it for now
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
I hate to see young programmers poisoned by the kind of thinking
Ulrich Drepper puts forward since it is simply too narrow -- Roman Shaposhnik
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080812/7e58dd82/attachment.pgp>
More information about the ffmpeg-devel
mailing list