[FFmpeg-devel] [PATCH/RFC] H.264 FMO+ASO decoding

Sun Sep 26 16:43:45 CEST 2010

On Wed, Jul 21, 2010 at 10:20:36PM +0200, Stefan Gehrer wrote:
> Hi,
>
> attached is a patch that implements the header stuff for FMO decoding
> in H.264 baseline streams. It decodes the slice group map and also
> provides means that at the start of a slice the actual x/y position
> of the first macroblock can be determined from first_mb_in_slice.
> If this is done, slices can be decoded out of order, i.e.
> first_mb_in_slice does not have to increase for slices of the same
> picture (aka ASO).
> Let's take as example a tiny video of 4x3 macroblocks which
> has a slice group map with two slice groups like this:
>
> 1  1  1  1
> 1  0  0  1
> 1  1  1  1
>
> Each slice group itself (0 or 1) is decoded in raster-scan order,
> so that the macroblock addresses are as follows:
>
> 2  3  4  5
> 6  0  1  7
> 8  9 10 11
>
> So if a slice comes along with first_mb_in_slice equal to 7 we need
> to start decoding at MB position x=3 and y=1.
>
> Unfortunately, the real challenge starts here. A lot of neighbor
> context handling (i4x4 modes, non-zero counts, MVs) has to be
> handled differently when the assumption of raster-scan order of
> macroblocks is not true anymore. Also, deblocking can only be done
> after the picture has been fully decoded. This is because deblocking
> goes across slice boundaries and slice group boundaries and the
> neighbor MB might just happen to be the last to be decoded in the
> picture.
> Considering the heavy optimizations that have been done in the
> normal decoding paths I guess there would be some outcry if in
> many places in the MB decoding a conditional like
> if(pps->slice_group_count > 1)
> would appear.
> So my feeling is that if FMO is to be implemented it may be best
> to have a new code path for the slice data decoding, a
> baseline-FMO-specific version of decode_slice() and some of its
> subfunctions maybe?
> Opinions welcome.

My guess is you won't finish FMO.
Also, we need templating for it ...
Still, parts of your patch could be useful and move us a tiny step closer to
FMO support.

>
> Stefan

>  h264.c    |   32 +++++++---
>  h264.h    |    6 +
>  h264_ps.c |  187 ++++++++++++++++++++++++++++++++++++++++++++++++++------------
>  3 files changed, 182 insertions(+), 43 deletions(-)
> 6228649db31722b1b6bbf9ff33a04c52d2baaf14  h264_fmo.diff
> diff --git a/libavcodec/h264.c b/libavcodec/h264.c
> index d1662fc..bfb65d6 100644
> --- a/libavcodec/h264.c
> +++ b/libavcodec/h264.c
> @@ -1968,8 +1968,6 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
>          av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
>          return -1;
>      }
> -    s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
> -    s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
>      if (s->picture_structure == PICT_BOTTOM_FIELD)
>          s->resync_mb_y = s->mb_y = s->mb_y + 1;
>      assert(s->mb_y < s->mb_height);

> @@ -2153,11 +2151,18 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
>      }
>      h->qp_thresh= 15 + 52 - FFMIN(h->slice_alpha_c0_offset, h->slice_beta_offset) - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
>  
> -#if 0 //FMO
> -    if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
> -        slice_group_change_cycle= get_bits(&s->gb, ?);
> -#endif
> +    if(h->pps.slice_group_count > 1){
> +        int addr = -1;
>  
> +        if(h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
> +            ff_h264_draw_slice_group(h, &h->pps, s->mb_width, s->mb_height);
> +        h->slice_group_current = 0;
> +        for(j=0;j<=first_mb_in_slice;j++)
> +            addr = ff_h264_fmo_next_mb(h, addr);

this is too slow with many slices and groups

> +        first_mb_in_slice = addr;
> +    }
> +    s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
> +    s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
>      h0->last_slice_type = slice_type;
>      h->slice_num = ++h0->current_slice;
>      if(h->slice_num >= MAX_SLICES){

> @@ -2608,6 +2613,14 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg){
>                  return -1;
>              }
>  
> +            if(h->pps.slice_group_count > 1){
> +                int addr = ff_h264_fmo_next_mb(h, s->mb_y*s->mb_width+s->mb_x);
> +
> +                if(addr < 0)
> +                    goto end;
> +                s->mb_x =  addr % s->mb_width;
> +                s->mb_y = (addr / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
> +            } else
>              if(++s->mb_x >= s->mb_width){
>                  s->mb_x=0;
>                  loop_filter(h);
> @@ -2620,7 +2633,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg){
>                  }
>                  if(s->mb_y >= s->mb_height){
>                      tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
> -
> +                end:
>                      if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
>                          ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
>  

I am not a fan of this kind of goto spaghetti code

> @@ -3364,8 +3377,11 @@ av_cold void ff_h264_free_context(H264Context *h)
>      for(i = 0; i < MAX_SPS_COUNT; i++)
>          av_freep(h->sps_buffers + i);
>  
> -    for(i = 0; i < MAX_PPS_COUNT; i++)
> +    for(i = 0; i < MAX_PPS_COUNT; i++){
> +        if(h->pps_buffers[i])
> +            av_free(h->pps_buffers[i]->slice_group_map);
>          av_freep(h->pps_buffers + i);
> +    }
>  }
>  
>  av_cold int ff_h264_decode_end(AVCodecContext *avctx)
> diff --git a/libavcodec/h264.h b/libavcodec/h264.h
> index 7158d97..db2ec75 100644
> --- a/libavcodec/h264.h
> +++ b/libavcodec/h264.h
> @@ -220,6 +220,9 @@ typedef struct PPS{
>      int pic_order_present;      ///< pic_order_present_flag
>      int slice_group_count;      ///< num_slice_groups_minus1 + 1
>      int mb_slice_group_map_type;
> +    uint8_t *slice_group_map;
> +    int slice_group_dir_flag;
> +    int slice_group_change_rate;
>      unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
>      int weighted_pred;          ///< weighted_pred_flag
>      int weighted_bipred_idc;
> @@ -589,6 +592,7 @@ typedef struct H264Context{
>      int sei_buffering_period_present;  ///< Buffering period SEI flag
>      int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs
>  
> +    int slice_group_current;
>      //SVQ3 specific fields
>      int halfpel_flag;
>      int thirdpel_flag;
> @@ -619,6 +623,8 @@ int ff_h264_decode_seq_parameter_set(H264Context *h);
>   */
>  int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length);
>  
> +int ff_h264_fmo_next_mb(H264Context *h, int addr);
> +int ff_h264_draw_slice_group(H264Context *h, PPS *pps, int width, int height);
>  /**
>   * Decode a network abstraction layer unit.
>   * @param consumed is the number of bytes used as input
> diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
> index 7648e2c..811d81d 100644
> --- a/libavcodec/h264_ps.c
> +++ b/libavcodec/h264_ps.c
> @@ -420,6 +420,143 @@ build_qp_table(PPS *pps, int t, int index)
>          pps->chroma_qp_table[t][i] = ff_h264_chroma_qp[av_clip(i + index, 0, 51)];
>  }
>  
> +static int first_in_group(H264Context *h){
> +    int map_units = h->s.mb_width * h->s.mb_height;
> +    int i;
> +
> +    for(i=0;i<map_units;i++){
> +        if(h->pps.slice_group_map[i] == h->slice_group_current)
> +      return i;
> +    }
> +    return -1;

indentation

> +}
> +
> +int ff_h264_fmo_next_mb(H264Context *h, int addr){
> +    int map_units = h->s.mb_width * h->s.mb_height;
> +
> +    while(h->pps.slice_group_map[++addr] != h->slice_group_current){
> +        if(addr == map_units){
> +            h->slice_group_current++;
> +            if(h->slice_group_current == h->pps.slice_group_count)
> +                return -1;
> +            addr = first_in_group(h);
> +        }
> +    }

This is not efficient if there are many groups, if I understand it correctly,
and you call it per MB, so it must be efficient.

> +    return addr;
> +}
> +
> +int ff_h264_draw_slice_group(H264Context *h, PPS *pps, int width, int height){
> +    MpegEncContext * const s = &h->s;
> +    int map_units = width * height;
> +    int group, length, bits, x, y, i = 0;
> +    int top, left, bottom, right, xdir, ydir, vacant;
> +    int run_length[8];
> +    int dir_flag = pps->slice_group_dir_flag;
> +
> +    switch(pps->mb_slice_group_map_type) {
> +    case 0: /* interleaved slice group map */
> +        for(group = 0; group < pps->slice_group_count; group++)
> +            run_length[group] = get_ue_golomb(&s->gb) + 1;
> +        do {
> +            for(group=0; group<pps->slice_group_count && i<map_units;
> +                group++) {
> +                memset(pps->slice_group_map + i, group,
> +                       FFMIN(run_length[group], map_units - i));
> +                i += run_length[group];
> +            }
> +        } while(i < map_units);
> +        break;
> +    case 1: /* dispersed slice group map */
> +        for(y=0;y<height; y++)
> +            for(x=0;x<width;x++)
> +                pps->slice_group_map[i++] =
> +                    ( x + ((y * pps->slice_group_count)>>1) ) %
> +                    pps->slice_group_count;

please don't do % per MB

similar comments about efficiency apply to other parts too

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The misfortune of the wise is better than the prosperity of the fool.
-- Epicurus
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20100926/5962fd35/attachment.pgp>