[FFmpeg-devel] [PATCH 11/11] avcodec/vvcdec: move frame tab memset from the main thread to worker threads

Nuo Mi nuomi2021 at gmail.com
Thu Aug 15 15:45:34 EEST 2024


On Sun, Aug 11, 2024 at 10:01 PM Nuo Mi <nuomi2021 at gmail.com> wrote:

>
>
> On Sun, Jul 28, 2024 at 11:19 AM Nuo Mi <nuomi2021 at gmail.com> wrote:
>
>> Zeroing the tables with memset in the main thread can become a bottleneck for the decoder.
>> For example, if it takes 1% of the processing time for one core, the
>> maximum achievable FPS will be 100.
>> Moving the memset to worker threads fixes the issue.
>>
> will apply next week if there are no objections
>
Done

> ---
>>  libavcodec/vvc/dec.c    |  13 ++++-
>>  libavcodec/vvc/thread.c | 122 ++++++++++++++++++++++++----------------
>>  libavcodec/vvc/thread.h |   1 +
>>  3 files changed, 85 insertions(+), 51 deletions(-)
>>
>> diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c
>> index 575bcfa33d..d34713296d 100644
>> --- a/libavcodec/vvc/dec.c
>> +++ b/libavcodec/vvc/dec.c
>> @@ -82,7 +82,13 @@ static int tl_create(TabList *l)
>>              if (!*t->tab)
>>                  return AVERROR(ENOMEM);
>>          }
>> -    } else if (l->zero) {
>> +    }
>> +    return 0;
>> +}
>> +
>> +static int tl_zero(TabList *l)
>> +{
>> +    if (l->zero) {
>>          for (int i = 0; i < l->nb_tabs; i++) {
>>              Tab *t = l->tabs + i;
>>              memset(*t->tab, 0, t->size);
>> @@ -404,6 +410,11 @@ static int pic_arrays_init(VVCContext *s,
>> VVCFrameContext *fc)
>>      return 0;
>>  }
>>
>> +int ff_vvc_per_frame_init(VVCFrameContext *fc)
>> +{
>> +    return frame_context_for_each_tl(fc, tl_zero);
>> +}
>> +
>>  static int min_positive(const int idx, const int diff, const int
>> min_diff)
>>  {
>>      return diff > 0 && (idx < 0 || diff < min_diff);
>> diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c
>> index 28065d726f..74f8e4e9d0 100644
>> --- a/libavcodec/vvc/thread.c
>> +++ b/libavcodec/vvc/thread.c
>> @@ -40,6 +40,7 @@ typedef struct ProgressListener {
>>  } ProgressListener;
>>
>>  typedef enum VVCTaskStage {
>> +    VVC_TASK_STAGE_INIT,                    // for CTU(0, 0) only
>>      VVC_TASK_STAGE_PARSE,
>>      VVC_TASK_STAGE_INTER,
>>      VVC_TASK_STAGE_RECON,
>> @@ -175,10 +176,14 @@ static int task_has_target_score(VVCTask *t, const
>> VVCTaskStage stage, const uin
>>      uint8_t target = 0;
>>      VVCFrameContext *fc = t->fc;
>>
>> +    if (stage == VVC_TASK_STAGE_INIT)
>> +        return 1;
>> +
>>      if (stage == VVC_TASK_STAGE_PARSE) {
>> -        const H266RawSPS *rsps = fc->ps.sps->r;
>> -        const int wpp = rsps->sps_entropy_coding_sync_enabled_flag &&
>> !is_first_row(fc, t->rx, t->ry);
>> -        target = 2 + wpp - 1;                           //left parse +
>> colocation + wpp - no previous stage
>> +        const H266RawSPS *rsps   = fc->ps.sps->r;
>> +        const int wpp            =
>> rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx,
>> t->ry);
>> +        const int no_prev_stage  = t->rs > 0;
>> +        target = 2 + wpp - no_prev_stage;
>>  //left parse + colocation + wpp - no_prev_stage
>>      } else if (stage == VVC_TASK_STAGE_INTER) {
>>          target = atomic_load(&t->target_inter_score);
>>      } else {
>> @@ -399,6 +404,55 @@ static int task_priority_higher(const AVTask *_a,
>> const AVTask *_b)
>>      return a->ry < b->ry;
>>  }
>>
>> +static void check_colocation(VVCContext *s, VVCTask *t)
>> +{
>> +    const VVCFrameContext *fc = t->fc;
>> +
>> +    if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag ||
>> fc->ps.sps->r->sps_sbtmvp_enabled_flag) {
>> +        VVCFrame *col       = fc->ref->collocated_ref;
>> +        const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx];
>> +        if (col && first_col) {
>> +            //we depend on bottom and right boundary, do not - 1 for y
>> +            const int y = (t->ry << fc->ps.sps->ctb_log2_size_y);
>> +            add_progress_listener(col, &t->col_listener, t, s,
>> VVC_PROGRESS_MV, y);
>> +            return;
>> +        }
>> +    }
>> +    frame_thread_add_score(s, fc->ft, t->rx, t->ry,
>> VVC_TASK_STAGE_PARSE);
>> +}
>> +
>> +static void submit_entry_point(VVCContext *s, VVCFrameThread *ft,
>> SliceContext *sc, EntryPoint *ep)
>> +{
>> +    const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start];
>> +    VVCTask *t   = ft->tasks + rs;
>> +
>> +    frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
>> +}
>> +
>> +static int run_init(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
>> +{
>> +    VVCFrameContext *fc = lc->fc;
>> +    VVCFrameThread *ft  = fc->ft;
>> +    const int ret       = ff_vvc_per_frame_init(fc);
>> +
>> +    if (ret < 0)
>> +        return ret;
>> +
>> +    for (int i = 0; i < fc->nb_slices; i++) {
>> +        SliceContext *sc = fc->slices[i];
>> +        for (int j = 0; j < sc->nb_eps; j++) {
>> +            EntryPoint *ep = sc->eps + j;
>> +            for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
>> +                const int rs = sc->sh.ctb_addr_in_curr_slice[k];
>> +                VVCTask *t   = ft->tasks + rs;
>> +                check_colocation(s, t);
>> +            }
>> +            submit_entry_point(s, ft, sc, ep);
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>>  static void report_frame_progress(VVCFrameContext *fc,
>>     const int ry, const VVCProgress idx)
>>  {
>> @@ -547,6 +601,7 @@ static int run_alf(VVCContext *s, VVCLocalContext
>> *lc, VVCTask *t)
>>  #define VVC_THREAD_DEBUG
>>  #ifdef VVC_THREAD_DEBUG
>>  const static char* task_name[] = {
>> +    "INIT",
>>      "P",
>>      "I",
>>      "R",
>> @@ -567,6 +622,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s,
>> VVCLocalContext *lc)
>>      VVCFrameThread *ft       = fc->ft;
>>      const VVCTaskStage stage = t->stage;
>>      static const run_func run[] = {
>> +        run_init,
>>          run_parse,
>>          run_inter,
>>          run_recon,
>> @@ -726,7 +782,7 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc)
>>
>>      for (int rs = 0; rs < ft->ctu_count; rs++) {
>>          VVCTask *t = ft->tasks + rs;
>> -        task_init(t, VVC_TASK_STAGE_PARSE, fc, rs % ft->ctu_width, rs /
>> ft->ctu_width);
>> +        task_init(t, rs ? VVC_TASK_STAGE_PARSE : VVC_TASK_STAGE_INIT,
>> fc, rs % ft->ctu_width, rs / ft->ctu_width);
>>      }
>>
>>      memset(&ft->row_progress[0], 0, sizeof(ft->row_progress));
>> @@ -745,59 +801,25 @@ fail:
>>      return AVERROR(ENOMEM);
>>  }
>>
>> -static void check_colocation(VVCContext *s, VVCTask *t)
>> -{
>> -    const VVCFrameContext *fc = t->fc;
>> -
>> -    if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag ||
>> fc->ps.sps->r->sps_sbtmvp_enabled_flag) {
>> -        VVCFrame *col       = fc->ref->collocated_ref;
>> -        const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx];
>> -        if (col && first_col) {
>> -            //we depend on bottom and right boundary, do not - 1 for y
>> -            const int y = (t->ry << fc->ps.sps->ctb_log2_size_y);
>> -            add_progress_listener(col, &t->col_listener, t, s,
>> VVC_PROGRESS_MV, y);
>> -            return;
>> -        }
>> -    }
>> -    frame_thread_add_score(s, fc->ft, t->rx, t->ry,
>> VVC_TASK_STAGE_PARSE);
>> -}
>> -
>> -static void submit_entry_point(VVCContext *s, VVCFrameThread *ft,
>> SliceContext *sc, EntryPoint *ep)
>> -{
>> -    const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start];
>> -    VVCTask *t   = ft->tasks + rs;
>> -
>> -    frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
>> -}
>> -
>>  int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc)
>>  {
>>      VVCFrameThread *ft = fc->ft;
>>
>> -    // We'll handle this in two passes:
>> -    // Pass 0 to initialize tasks with parser, this will help detect bit
>> stream error
>> -    // Pass 1 to shedule location check and submit the entry point
>> -    for (int pass = 0; pass < 2; pass++) {
>> -        for (int i = 0; i < fc->nb_slices; i++) {
>> -            SliceContext *sc = fc->slices[i];
>> -            for (int j = 0; j < sc->nb_eps; j++) {
>> -                EntryPoint *ep = sc->eps + j;
>> -                for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
>> -                    const int rs = sc->sh.ctb_addr_in_curr_slice[k];
>> -                    VVCTask *t   = ft->tasks + rs;
>> -                    if (pass) {
>> -                        check_colocation(s, t);
>> -                    } else {
>> -                        const int ret = task_init_parse(t, sc, ep, k);
>> -                        if (ret < 0)
>> -                            return ret;
>> -                    }
>> -                }
>> -                if (pass)
>> -                    submit_entry_point(s, ft, sc, ep);
>> +    for (int i = 0; i < fc->nb_slices; i++) {
>> +        SliceContext *sc = fc->slices[i];
>> +        for (int j = 0; j < sc->nb_eps; j++) {
>> +            EntryPoint *ep = sc->eps + j;
>> +            for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
>> +                const int rs = sc->sh.ctb_addr_in_curr_slice[k];
>> +                VVCTask *t   = ft->tasks + rs;
>> +                const int ret = task_init_parse(t, sc, ep, k);
>> +                if (ret < 0)
>> +                    return ret;
>>              }
>>          }
>>      }
>> +    frame_thread_add_score(s, ft, 0, 0, VVC_TASK_STAGE_INIT);
>> +
>>      return 0;
>>  }
>>
>> diff --git a/libavcodec/vvc/thread.h b/libavcodec/vvc/thread.h
>> index 8ac59b2ecf..7b15dbee59 100644
>> --- a/libavcodec/vvc/thread.h
>> +++ b/libavcodec/vvc/thread.h
>> @@ -32,5 +32,6 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc);
>>  void ff_vvc_frame_thread_free(VVCFrameContext *fc);
>>  int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc);
>>  int ff_vvc_frame_wait(VVCContext *s, VVCFrameContext *fc);
>> +int ff_vvc_per_frame_init(VVCFrameContext *fc);
>>
>>  #endif // AVCODEC_VVC_THREAD_H
>> --
>> 2.34.1
>>
>>


More information about the ffmpeg-devel mailing list