[FFmpeg-devel] [PATCH v6 14/14] vvcdec: add full vvc decoder

Andreas Rheinhardt andreas.rheinhardt at outlook.com
Fri Dec 8 14:19:08 EET 2023


Nuo Mi:
> vvc decoder plug-in to avcodec.
> split frames into slices/tiles and send them to vvc_thread for further decoding
> reorder and wait for the frame decoding to be done and output the frame
> 
> Features:
>     + Support I, P, B frames
>     + Support 8/10/12 bits, chroma 400, 420, 422, and 444 and range extension
>     + Support VVC new tools like MIP, CCLM, AFFINE, GPM, DMVR, PROF, BDOF, LMCS, ALF
>     + 295 conformance clips passed
>     - Not support RPR, IBC, PALETTE, and other minor features yet
> 
> Performance:
>     C code FPS on i7-12700 (x86):
>         BQTerrace_1920x1080_60_10_420_22_RA.vvc      93.0
>         Chimera_8bit_1080P_1000_frames.vvc          184.3
>         NovosobornayaSquare_1920x1080.bin           191.3
>         RitualDance_1920x1080_60_10_420_32_LD.266   150.7
>         RitualDance_1920x1080_60_10_420_37_RA.266   170.0
>         Tango2_3840x2160_60_10_420_27_LD.266         33.7
> 
>     C code FPS on M1 Mac Pro (ARM):
>         BQTerrace_1920x1080_60_10_420_22_RA.vvc     58.7
>         Chimera_8bit_1080P_1000_frames.vvc          153.3
>         NovosobornayaSquare_1920x1080.bin           150.3
>         RitualDance_1920x1080_60_10_420_32_LD.266   105.0
>         RitualDance_1920x1080_60_10_420_37_RA.266   133.0
>         Tango2_3840x2160_60_10_420_27_LD.266        21.7
> 
>     Asm optimizations are still a work in progress. Please check
>     https://github.com/ffvvc/FFmpeg/wiki#performance-data for the latest data.
> 
> Contributors (based on code merge order):
>     Nuo Mi <nuomi2021 at gmail.com>
>     Xu Mu <toxumu at outlook.com>
>     frankplow <post at frankplowman.com>
>     Shaun Loo <shaunloo10 at gmail.com>
> ---
>  libavcodec/vvc/vvcdec.c | 1007 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 1007 insertions(+)
> 
> diff --git a/libavcodec/vvc/vvcdec.c b/libavcodec/vvc/vvcdec.c
> index 3c591ce875..e40eb7339f 100644
> --- a/libavcodec/vvc/vvcdec.c
> +++ b/libavcodec/vvc/vvcdec.c
> @@ -21,28 +21,1035 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>   */
>  #include "libavcodec/codec_internal.h"
> +#include "libavcodec/decode.h"
>  #include "libavcodec/profiles.h"
> +#include "libavcodec/refstruct.h"
> +#include "libavutil/cpu.h"
>  
>  #include "vvcdec.h"
> +#include "vvc_ctu.h"
> +#include "vvc_data.h"
> +#include "vvc_refs.h"
> +#include "vvc_thread.h"
> +
> +static int vvc_frame_start(VVCContext *s, VVCFrameContext *fc, SliceContext *sc)
> +{
> +    const VVCPH *ph                 = &fc->ps.ph;
> +    const H266RawSliceHeader *rsh   = sc->sh.r;
> +    int ret;
> +
> +    // 8.3.1 Decoding process for picture order count
> +    if (!s->temporal_id && !ph->r->ph_non_ref_pic_flag && !(IS_RASL(s) || IS_RADL(s)))
> +        s->poc_tid0 = ph->poc;
> +
> +    if ((ret = ff_vvc_set_new_ref(s, fc, &fc->frame)) < 0)
> +        goto fail;
> +
> +    if (!IS_IDR(s))
> +        ff_vvc_bump_frame(s, fc);
> +
> +    av_frame_unref(fc->output_frame);
> +
> +    if ((ret = ff_vvc_output_frame(s, fc, fc->output_frame,rsh->sh_no_output_of_prior_pics_flag, 0)) < 0)
> +        goto fail;
> +
> +    if ((ret = ff_vvc_frame_rpl(s, fc, sc)) < 0)
> +        goto fail;
> +
> +    if ((ret = ff_vvc_frame_thread_init(fc)) < 0)
> +        goto fail;
> +    return 0;
> +fail:
> +    if (fc->ref)
> +        ff_vvc_unref_frame(fc, fc->ref, ~0);
> +    fc->ref = NULL;
> +    return ret;
> +}
> +
> +static void ctb_arrays_free(VVCFrameContext *fc)
> +{
> +    av_freep(&fc->tab.deblock);
> +    av_freep(&fc->tab.sao);
> +    av_freep(&fc->tab.alf);
> +    av_freep(&fc->tab.slice_idx);
> +    av_freep(&fc->tab.coeffs);
> +    if (fc->tab.ctus) {
> +        for (int i = 0; i < fc->tab.ctu_count; i++)
> +            ff_vvc_ctu_free_cus(fc->tab.ctus + i);
> +        av_freep(&fc->tab.ctus);
> +    }
> +    ff_refstruct_pool_uninit(&fc->rpl_tab_pool);
> +}
> +
> +static int ctb_arrays_init(VVCFrameContext *fc, const int ctu_count, const int ctu_size)
> +{
> +    if (fc->tab.ctu_count != ctu_count || fc->tab.ctu_size != ctu_size) {
> +        ctb_arrays_free(fc);
> +        fc->tab.deblock         = av_calloc(ctu_count, sizeof(*fc->tab.deblock));
> +        fc->tab.sao             = av_calloc(ctu_count, sizeof(*fc->tab.sao));
> +        fc->tab.alf             = av_calloc(ctu_count, sizeof(*fc->tab.alf));
> +        fc->tab.ctus            = av_calloc(ctu_count, sizeof(*fc->tab.ctus));
> +        fc->tab.slice_idx       = av_malloc(ctu_count * sizeof(*fc->tab.slice_idx));
> +        if (!fc->tab.deblock || !fc->tab.sao || !fc->tab.alf || !fc->tab.ctus || !fc->tab.slice_idx )
> +            return AVERROR(ENOMEM);
> +        fc->tab.coeffs = av_malloc(ctu_count * sizeof(*fc->tab.coeffs) * ctu_size * VVC_MAX_SAMPLE_ARRAYS);
> +        if (!fc->tab.coeffs)
> +            return AVERROR(ENOMEM);
> +        fc->rpl_tab_pool = ff_refstruct_pool_alloc(ctu_count * sizeof(RefPicListTab), 0);
> +        if (!fc->rpl_tab_pool)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        memset(fc->tab.deblock, 0, ctu_count * sizeof(*fc->tab.deblock));
> +        memset(fc->tab.sao, 0, ctu_count * sizeof(*fc->tab.sao));
> +        memset(fc->tab.alf, 0, ctu_count * sizeof(*fc->tab.alf));
> +        for (int i = 0; i < fc->tab.ctu_count; i++)
> +            ff_vvc_ctu_free_cus(fc->tab.ctus + i);
> +        memset(fc->tab.ctus, 0, ctu_count * sizeof(*fc->tab.ctus));
> +    }
> +    memset(fc->tab.slice_idx, -1, ctu_count * sizeof(*fc->tab.slice_idx));
> +
> +    return 0;
> +}
> +
> +static void min_cb_arrays_free(VVCFrameContext *fc)
> +{
> +    for (int i = LUMA; i <= CHROMA; i++) {
> +        av_freep(&fc->tab.cb_pos_x[i]);
> +        av_freep(&fc->tab.cb_pos_y[i]);
> +        av_freep(&fc->tab.cb_width[i]);
> +        av_freep(&fc->tab.cb_height[i]);
> +        av_freep(&fc->tab.cqt_depth[i]);
> +        av_freep(&fc->tab.cpm[i]);
> +        av_freep(&fc->tab.cp_mv[i]);
> +    }
> +
> +    av_freep(&fc->tab.ipm);
> +    av_freep(&fc->tab.imf);
> +    av_freep(&fc->tab.imtf);
> +    av_freep(&fc->tab.imm);
> +    av_freep(&fc->tab.skip);
> +}
> +
> +static int min_cb_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_cb)
> +{
> +    if (fc->tab.pic_size_in_min_cb != pic_size_in_min_cb) {
> +        min_cb_arrays_free(fc);
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            fc->tab.cb_pos_x[i]  = av_mallocz(pic_size_in_min_cb * sizeof(int));
> +            fc->tab.cb_pos_y[i]  = av_mallocz(pic_size_in_min_cb * sizeof(int));
> +            fc->tab.cb_width[i]  = av_mallocz(pic_size_in_min_cb);
> +            fc->tab.cb_height[i] = av_mallocz(pic_size_in_min_cb);
> +            fc->tab.cqt_depth[i] = av_mallocz(pic_size_in_min_cb);
> +            if (!fc->tab.cb_pos_x[i] || !fc->tab.cb_pos_y[i] || !fc->tab.cb_width[i] || !fc->tab.cb_height[i] || !fc->tab.cqt_depth[i])
> +                return AVERROR(ENOMEM);
> +
> +            fc->tab.cpm[i]   = av_mallocz(pic_size_in_min_cb);
> +            fc->tab.cp_mv[i] = av_mallocz(pic_size_in_min_cb * sizeof(Mv) * MAX_CONTROL_POINTS);
> +            if (!fc->tab.cpm[i] || !fc->tab.cp_mv[i])
> +                return AVERROR(ENOMEM);
> +        }
> +
> +        fc->tab.ipm  = av_mallocz(pic_size_in_min_cb);
> +        fc->tab.imf  = av_mallocz(pic_size_in_min_cb);
> +        fc->tab.imtf = av_mallocz(pic_size_in_min_cb);
> +        fc->tab.imm  = av_mallocz(pic_size_in_min_cb);
> +        fc->tab.skip = av_mallocz(pic_size_in_min_cb);
> +        if (!fc->tab.ipm || !fc->tab.imf || !fc->tab.imtf || !fc->tab.imm || !fc->tab.skip)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            memset(fc->tab.cb_pos_x[i], 0, pic_size_in_min_cb * sizeof(int));
> +            memset(fc->tab.cb_pos_y[i], 0, pic_size_in_min_cb * sizeof(int));
> +            memset(fc->tab.cb_width[i], 0, pic_size_in_min_cb);
> +            memset(fc->tab.cb_height[i], 0, pic_size_in_min_cb);
> +            memset(fc->tab.cqt_depth[i], 0, pic_size_in_min_cb);
> +            memset(fc->tab.cpm[i], 0, pic_size_in_min_cb);
> +            memset(fc->tab.cp_mv[i], 0, pic_size_in_min_cb * sizeof(Mv) * MAX_CONTROL_POINTS);
> +        }
> +
> +        memset(fc->tab.ipm, 0, pic_size_in_min_cb);
> +        memset(fc->tab.imf, 0, pic_size_in_min_cb);
> +        memset(fc->tab.imtf, 0, pic_size_in_min_cb);
> +        memset(fc->tab.imm, 0, pic_size_in_min_cb);
> +        memset(fc->tab.skip, 0, pic_size_in_min_cb);
> +    }
> +    return 0;
> +}
> +
> +static void min_tu_arrays_free(VVCFrameContext *fc)
> +{
> +    for (int i = LUMA; i <= CHROMA; i++) {
> +        av_freep(&fc->tab.tb_pos_x0[i]);
> +        av_freep(&fc->tab.tb_pos_y0[i]);
> +        av_freep(&fc->tab.tb_width[i]);
> +        av_freep(&fc->tab.tb_height[i]);
> +        av_freep(&fc->tab.pcmf[i]);
> +    }
> +
> +    for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +        av_freep(&fc->tab.qp[i]);
> +        av_freep(&fc->tab.tu_coded_flag[i]);
> +    }
> +
> +    av_freep(&fc->tab.tu_joint_cbcr_residual_flag);
> +}
> +
> +static int min_tu_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_tu)
> +{
> +    if (fc->tab.pic_size_in_min_tu != pic_size_in_min_tu) {
> +        min_tu_arrays_free(fc);
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            fc->tab.tb_pos_x0[i] = av_mallocz(pic_size_in_min_tu * sizeof(*fc->tab.tb_pos_x0[0]));
> +            fc->tab.tb_pos_y0[i] = av_mallocz(pic_size_in_min_tu * sizeof(*fc->tab.tb_pos_y0[0])) ;
> +            fc->tab.tb_width[i]  = av_mallocz(pic_size_in_min_tu);
> +            fc->tab.tb_height[i] = av_mallocz(pic_size_in_min_tu);
> +            fc->tab.pcmf[i]      = av_mallocz(pic_size_in_min_tu);
> +            if (!fc->tab.tb_pos_x0[i] || !fc->tab.tb_pos_y0[i] ||
> +                !fc->tab.tb_width[i] || !fc->tab.tb_height[i] || !fc->tab.pcmf[i])
> +                return AVERROR(ENOMEM);
> +        }
> +
> +        for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +            fc->tab.tu_coded_flag[i] = av_mallocz(pic_size_in_min_tu);
> +            if (!fc->tab.tu_coded_flag[i])
> +                return AVERROR(ENOMEM);
> +
> +            fc->tab.qp[i] = av_mallocz(pic_size_in_min_tu);
> +            if (!fc->tab.qp[i])
> +                return AVERROR(ENOMEM);
> +        }
> +
> +        fc->tab.tu_joint_cbcr_residual_flag  = av_mallocz(pic_size_in_min_tu);
> +        if (!fc->tab.tu_joint_cbcr_residual_flag)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            memset(fc->tab.tb_pos_x0[i], 0, pic_size_in_min_tu * sizeof(*fc->tab.tb_pos_x0[0]));
> +            memset(fc->tab.tb_pos_y0[i], 0, pic_size_in_min_tu * sizeof(*fc->tab.tb_pos_y0[0])) ;
> +            memset(fc->tab.tb_width[i], 0, pic_size_in_min_tu);
> +            memset(fc->tab.tb_height[i], 0, pic_size_in_min_tu);
> +            memset(fc->tab.pcmf[i], 0, pic_size_in_min_tu);
> +        }
> +
> +        for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +            memset(fc->tab.tu_coded_flag[i], 0, pic_size_in_min_tu);
> +            memset(fc->tab.qp[i], 0, pic_size_in_min_tu);
> +        }
> +        memset(fc->tab.tu_joint_cbcr_residual_flag, 0, pic_size_in_min_tu);
> +    }
> +    return 0;
> +}
> +
> +static void min_pu_arrays_free(VVCFrameContext *fc)
> +{
> +    av_freep(&fc->tab.mvf);
> +    av_freep(&fc->tab.msf);
> +    av_freep(&fc->tab.iaf);
> +    av_freep(&fc->tab.mmi);
> +    ff_refstruct_pool_uninit(&fc->tab_dmvr_mvf_pool);
> +}
> +
> +static int min_pu_arrays_init(VVCFrameContext *fc, const int pic_size_in_min_pu)
> +{
> +    if (fc->tab.pic_size_in_min_pu != pic_size_in_min_pu) {
> +        min_pu_arrays_free(fc);
> +        fc->tab.msf  = av_mallocz(pic_size_in_min_pu);
> +        fc->tab.iaf  = av_mallocz(pic_size_in_min_pu);
> +        fc->tab.mmi  = av_mallocz(pic_size_in_min_pu);
> +        fc->tab.mvf  = av_mallocz(pic_size_in_min_pu * sizeof(*fc->tab.mvf));

Do these have to be separate allocations? If they were allocated
jointly, one memset below would suffice.
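Untested sketch of what I mean (putting mvf first keeps its alignment;
the one-byte-per-PU sizes of msf/iaf/mmi are taken from the allocations
in the patch and are assumed to be plain byte arrays):

    const size_t mvf_size = pic_size_in_min_pu * sizeof(*fc->tab.mvf);
    /* one joint allocation for mvf, msf, iaf and mmi */
    fc->tab.mvf = av_mallocz(mvf_size + 3 * pic_size_in_min_pu);
    if (!fc->tab.mvf)
        return AVERROR(ENOMEM);
    fc->tab.msf = (uint8_t *)fc->tab.mvf + mvf_size;
    fc->tab.iaf = fc->tab.msf + pic_size_in_min_pu;
    fc->tab.mmi = fc->tab.iaf + pic_size_in_min_pu;
    /* the else branch then shrinks to a single
     * memset(fc->tab.mvf, 0, mvf_size + 3 * pic_size_in_min_pu); */

min_pu_arrays_free() would then of course only free mvf.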

> +        if (!fc->tab.msf || !fc->tab.iaf || !fc->tab.mmi || !fc->tab.mvf)
> +            return AVERROR(ENOMEM);
> +        fc->tab_dmvr_mvf_pool  = ff_refstruct_pool_alloc(pic_size_in_min_pu * sizeof(MvField), FF_REFSTRUCT_POOL_FLAG_ZERO_EVERY_TIME);
> +        if (!fc->tab_dmvr_mvf_pool)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        memset(fc->tab.msf, 0, pic_size_in_min_pu);
> +        memset(fc->tab.iaf, 0, pic_size_in_min_pu);
> +        memset(fc->tab.mmi, 0, pic_size_in_min_pu);
> +        memset(fc->tab.mvf, 0, pic_size_in_min_pu * sizeof(*fc->tab.mvf));
> +    }
> +
> +    return 0;
> +}
> +
> +static void bs_arrays_free(VVCFrameContext *fc)
> +{
> +    for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +        av_freep(&fc->tab.horizontal_bs[i]);
> +        av_freep(&fc->tab.vertical_bs[i]);
> +    }
> +    av_freep(&fc->tab.horizontal_q);
> +    av_freep(&fc->tab.horizontal_p);
> +    av_freep(&fc->tab.vertical_p);
> +    av_freep(&fc->tab.vertical_q);
> +}
> +
> +static int bs_arrays_init(VVCFrameContext *fc, const int bs_width, const int bs_height)
> +{
> +    if (fc->tab.bs_width != bs_width || fc->tab.bs_height != bs_height) {
> +        bs_arrays_free(fc);
> +        for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +            fc->tab.horizontal_bs[i] = av_calloc(bs_width, bs_height);
> +            fc->tab.vertical_bs[i]   = av_calloc(bs_width, bs_height);
> +            if (!fc->tab.horizontal_bs[i] || !fc->tab.vertical_bs[i])
> +                return AVERROR(ENOMEM);
> +        }
> +        fc->tab.horizontal_q = av_calloc(bs_width, bs_height);
> +        fc->tab.horizontal_p = av_calloc(bs_width, bs_height);
> +        fc->tab.vertical_p   = av_calloc(bs_width, bs_height);
> +        fc->tab.vertical_q   = av_calloc(bs_width, bs_height);
> +        if (!fc->tab.horizontal_q || !fc->tab.horizontal_p || !fc->tab.vertical_p || !fc->tab.vertical_q)
> +            return AVERROR(ENOMEM);
> +    } else {
> +        for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +            memset(fc->tab.horizontal_bs[i], 0, bs_width * bs_height);
> +            memset(fc->tab.vertical_bs[i], 0, bs_width * bs_height);
> +        }
> +        memset(fc->tab.horizontal_q, 0, bs_width * bs_height);
> +        memset(fc->tab.horizontal_p, 0, bs_width * bs_height);
> +        memset(fc->tab.vertical_p, 0, bs_width * bs_height);
> +        memset(fc->tab.vertical_q, 0, bs_width * bs_height);
> +    }
> +    return 0;
> +}
> +
> +static void pixel_buffer_free(VVCFrameContext *fc)
> +{
> +    for (int i = 0; i < VVC_MAX_SAMPLE_ARRAYS; i++) {
> +        av_freep(&fc->tab.sao_pixel_buffer_h[i]);
> +        av_freep(&fc->tab.sao_pixel_buffer_v[i]);
> +        for (int j = 0; j < 2; j++) {
> +            av_freep(&fc->tab.alf_pixel_buffer_h[i][j]);
> +            av_freep(&fc->tab.alf_pixel_buffer_v[i][j]);
> +        }
> +    }
> +}
> +
> +static int pixel_buffer_init(VVCFrameContext *fc, const int width, const int height,
> +    const int ctu_width, const int ctu_height, const int chroma_format_idc, const int ps)
> +{
> +    const VVCSPS *sps = fc->ps.sps;
> +    const int c_end   = chroma_format_idc ? VVC_MAX_SAMPLE_ARRAYS : 1;
> +
> +    if (fc->tab.chroma_format_idc != chroma_format_idc ||
> +        fc->tab.width != width || fc->tab.height != height ||
> +        fc->tab.ctu_width != ctu_width || fc->tab.ctu_height != ctu_height) {
> +        pixel_buffer_free(fc);
> +        for (int c_idx = 0; c_idx < c_end; c_idx++) {
> +            const int w = width >> sps->hshift[c_idx];
> +            const int h = height >> sps->vshift[c_idx];
> +            fc->tab.sao_pixel_buffer_h[c_idx] = av_malloc((w * 2 * ctu_height) << ps);
> +            fc->tab.sao_pixel_buffer_v[c_idx] = av_malloc((h * 2 * ctu_width)  << ps);
> +            if (!fc->tab.sao_pixel_buffer_h[c_idx] || !fc->tab.sao_pixel_buffer_v[c_idx])
> +                return AVERROR(ENOMEM);
> +        }
> +
> +        for (int c_idx = 0; c_idx < c_end; c_idx++) {
> +            const int w = width >> sps->hshift[c_idx];
> +            const int h = height >> sps->vshift[c_idx];
> +            const int border_pixels = c_idx ? ALF_BORDER_CHROMA : ALF_BORDER_LUMA;
> +            for (int i = 0; i < 2; i++) {
> +                fc->tab.alf_pixel_buffer_h[c_idx][i] = av_malloc((w * border_pixels * ctu_height) << ps);
> +                fc->tab.alf_pixel_buffer_v[c_idx][i] = av_malloc(h * ALF_PADDING_SIZE * ctu_width);
> +                if (!fc->tab.alf_pixel_buffer_h[c_idx][i] || !fc->tab.alf_pixel_buffer_v[c_idx][i])
> +                    return AVERROR(ENOMEM);
> +            }
> +        }
> +    }
> +    return 0;
> +}
> +
> +static void pic_arrays_free(VVCFrameContext *fc)
> +{
> +    ctb_arrays_free(fc);
> +    min_cb_arrays_free(fc);
> +    min_pu_arrays_free(fc);
> +    min_tu_arrays_free(fc);
> +    bs_arrays_free(fc);
> +    ff_refstruct_pool_uninit(&fc->cu_pool);
> +    ff_refstruct_pool_uninit(&fc->tu_pool);
> +    pixel_buffer_free(fc);
> +
> +    for (int i = 0; i < 2; i++)
> +        av_freep(&fc->tab.msm[i]);
> +    av_freep(&fc->tab.ispmf);
> +
> +    fc->tab.ctu_count = 0;
> +    fc->tab.ctu_size  = 0;
> +    fc->tab.pic_size_in_min_cb = 0;
> +    fc->tab.pic_size_in_min_pu = 0;
> +    fc->tab.pic_size_in_min_tu = 0;
> +    fc->tab.width              = 0;
> +    fc->tab.height             = 0;
> +    fc->tab.ctu_width          = 0;
> +    fc->tab.ctu_height         = 0;
> +    fc->tab.bs_width           = 0;
> +    fc->tab.bs_height          = 0;
> +}
> +
> +static int pic_arrays_init(VVCContext *s, VVCFrameContext *fc)
> +{
> +    const VVCSPS *sps               = fc->ps.sps;
> +    const VVCPPS *pps               = fc->ps.pps;
> +    const int ctu_size              = 1 << sps->ctb_log2_size_y << sps->ctb_log2_size_y;
> +    const int pic_size_in_min_cb    = pps->min_cb_width * pps->min_cb_height;
> +    const int pic_size_in_min_pu    = pps->min_pu_width * pps->min_pu_height;
> +    const int pic_size_in_min_tu    = pps->min_tu_width * pps->min_tu_height;
> +    const int w32                   = AV_CEIL_RSHIFT(pps->width,  5);
> +    const int h32                   = AV_CEIL_RSHIFT(pps->height,  5);
> +    const int w64                   = AV_CEIL_RSHIFT(pps->width,  6);
> +    const int h64                   = AV_CEIL_RSHIFT(pps->height,  6);
> +    const int bs_width              = (fc->ps.pps->width >> 2) + 1;
> +    const int bs_height             = (fc->ps.pps->height >> 2) + 1;
> +    int ret;
> +
> +    if ((ret = ctb_arrays_init(fc, pps->ctb_count, ctu_size)) < 0)
> +        goto fail;
> +
> +    if ((ret = min_cb_arrays_init(fc, pic_size_in_min_cb)) < 0)
> +        goto fail;
> +
> +    if ((ret = min_pu_arrays_init(fc, pic_size_in_min_pu)) < 0)
> +        goto fail;
> +
> +    if ((ret = min_tu_arrays_init(fc, pic_size_in_min_tu)) < 0)
> +        goto fail;
> +
> +    if ((ret = bs_arrays_init(fc, bs_width, bs_height)) < 0)
> +        goto fail;
> +
> +    if ((ret = pixel_buffer_init(fc, pps->width, pps->height, pps->ctb_width, pps->ctb_height,
> +        sps->r->sps_chroma_format_idc, sps->pixel_shift)) < 0)
> +        goto fail;
> +
> +    if (AV_CEIL_RSHIFT(fc->tab.width,  5) != w32 || AV_CEIL_RSHIFT(fc->tab.height,  5) != h32) {
> +        for (int i = LUMA; i <= CHROMA; i++) {
> +            av_freep(&fc->tab.msm[i]);
> +            fc->tab.msm[i] = av_calloc(w32, h32);
> +            if (!fc->tab.msm[i])
> +                goto fail;
> +        }
> +    } else {
> +        for (int i = LUMA; i <= CHROMA; i++)
> +            memset(fc->tab.msm[i], 0, w32 * h32);
> +    }
> +    if (AV_CEIL_RSHIFT(fc->tab.width,  6) != w64 || AV_CEIL_RSHIFT(fc->tab.height,  6) != h64) {
> +        av_freep(&fc->tab.ispmf);
> +        fc->tab.ispmf = av_calloc(w64, h64);
> +        if (!fc->tab.ispmf)
> +            goto fail;
> +    } else {
> +        memset(fc->tab.ispmf, 0, w64 * h64);
> +    }
> +
> +    if (!fc->cu_pool) {
> +        fc->cu_pool = ff_refstruct_pool_alloc(sizeof(CodingUnit), 0);
> +        if (!fc->cu_pool)
> +            goto fail;

The size of the objects contained in this pool doesn't depend on any
bitstream parameters. You can therefore simply use a single pool (in
VVCContext) that is allocated in vvc_decode_init() and freed in
vvc_decode_free().
The same goes for tu_pool below.
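I.e. something like this (untested), with the pools moved into VVCContext:

    /* in vvc_decode_init() */
    s->cu_pool = ff_refstruct_pool_alloc(sizeof(CodingUnit), 0);
    s->tu_pool = ff_refstruct_pool_alloc(sizeof(TransformUnit), 0);
    if (!s->cu_pool || !s->tu_pool)
        return AVERROR(ENOMEM);

    /* in vvc_decode_free() */
    ff_refstruct_pool_uninit(&s->cu_pool);
    ff_refstruct_pool_uninit(&s->tu_pool);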


> +    }
> +
> +    if (!fc->tu_pool) {
> +        fc->tu_pool = ff_refstruct_pool_alloc(sizeof(TransformUnit), 0);
> +        if (!fc->tu_pool)
> +            goto fail;
> +    }
> +
> +    fc->tab.ctu_count = pps->ctb_count;
> +    fc->tab.ctu_size  = ctu_size;
> +    fc->tab.pic_size_in_min_cb = pic_size_in_min_cb;
> +    fc->tab.pic_size_in_min_pu = pic_size_in_min_pu;
> +    fc->tab.pic_size_in_min_tu = pic_size_in_min_tu;
> +    fc->tab.width              = pps->width;
> +    fc->tab.height             = pps->height;
> +    fc->tab.ctu_width          = pps->ctb_width;
> +    fc->tab.ctu_height         = pps->ctb_height;
> +    fc->tab.chroma_format_idc  = sps->r->sps_chroma_format_idc;
> +    fc->tab.pixel_shift        = sps->pixel_shift;
> +    fc->tab.bs_width           = bs_width;
> +    fc->tab.bs_height          = bs_height;
> +
> +    return 0;
> +fail:
> +    pic_arrays_free(fc);
> +    return ret;
> +}
> +
> +static int min_positive(const int idx, const int diff, const int min_diff)
> +{
> +    return diff > 0 && (idx < 0 || diff < min_diff);
> +}
> +
> +static int max_negtive(const int idx, const int diff, const int max_diff)
> +{
> +    return diff < 0 && (idx < 0 || diff > max_diff);
> +}
> +
> +typedef int (*smvd_find_fxn)(const int idx, const int diff, const int old_diff);
> +
> +static int8_t smvd_find(const VVCFrameContext *fc, const SliceContext *sc, int lx, smvd_find_fxn find)
> +{
> +    const H266RawSliceHeader *rsh   = sc->sh.r;
> +    const RefPicList *rpl           = sc->rpl + lx;
> +    const int poc                   = fc->ref->poc;
> +    int8_t idx                      = -1;
> +    int old_diff                    = -1;
> +    for (int i = 0; i < rsh->num_ref_idx_active[lx]; i++) {
> +        if (!rpl->isLongTerm[i]) {
> +            int diff = poc - rpl->list[i];
> +            if (find(idx, diff, old_diff)) {
> +                idx = i;
> +                old_diff = diff;
> +            }
> +        }
> +    }
> +    return idx;
> +}
> +
> +static void vvc_smvd_ref_idx(const VVCFrameContext *fc, SliceContext *sc)
> +{
> +    VVCSH *sh = &sc->sh;
> +    if (IS_B(sh->r)) {
> +        sh->ref_idx_sym[0] = smvd_find(fc, sc, 0, min_positive);
> +        sh->ref_idx_sym[1] = smvd_find(fc, sc, 1, max_negtive);
> +        if (sh->ref_idx_sym[0] == -1 || sh->ref_idx_sym[1] == -1) {
> +            sh->ref_idx_sym[0] = smvd_find(fc, sc, 0, max_negtive);
> +            sh->ref_idx_sym[1] = smvd_find(fc, sc, 1, min_positive);
> +        }
> +    }
> +}
> +
> +static void eps_free(SliceContext *slice)
> +{
> +    av_freep(&slice->eps);
> +}
> +
> +static void slices_free(VVCFrameContext *fc)
> +{
> +    if (fc->slices) {
> +        for (int i = 0; i < fc->nb_slices_allocated; i++) {
> +            SliceContext *slice = fc->slices[i];
> +            if (slice) {
> +                ff_refstruct_unref(&slice->sh.r);
> +                eps_free(slice);
> +                av_free(slice);
> +            }
> +        }
> +        av_freep(&fc->slices);
> +    }
> +    fc->nb_slices_allocated = 0;
> +    fc->nb_slices = 0;
> +}
> +
> +static int slices_realloc(VVCFrameContext *fc)
> +{
> +    void *p;
> +    const int size = (fc->nb_slices_allocated + 1) * 3 / 2;
> +
> +    if (fc->nb_slices < fc->nb_slices_allocated)
> +        return 0;
> +
> +    p = av_realloc(fc->slices, size * sizeof(*fc->slices));

av_realloc_array()
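I.e.

    p = av_realloc_array(fc->slices, size, sizeof(*fc->slices));

which also checks the multiplication for overflow.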

> +    if (!p)
> +        return AVERROR(ENOMEM);
> +
> +    fc->slices = p;
> +    for (int i = fc->nb_slices_allocated; i < size; i++) {
> +        fc->slices[i] = av_calloc(1, sizeof(*fc->slices[0]));

av_mallocz().
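I.e.

    fc->slices[i] = av_mallocz(sizeof(*fc->slices[0]));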

> +        if (!fc->slices[i]) {
> +            for (int j = fc->nb_slices_allocated; j < i; j++)
> +                av_freep(&fc->slices[j]);
> +            return AVERROR(ENOMEM);

Can't you simply set fc->nb_slices_allocated to i in order to avoid this
loop?
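I.e. (untested):

        if (!fc->slices[i]) {
            /* keep nb_slices_allocated in sync with what is actually
             * allocated; slices_free() cleans up the rest later */
            fc->nb_slices_allocated = i;
            return AVERROR(ENOMEM);
        }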

> +        }
> +        fc->slices[i]->slice_idx = i;
> +    }
> +    fc->nb_slices_allocated = size;
> +    return 0;
> +}
> +
> +static void ep_init_cabac_decoder(SliceContext *sc, const int index, const H2645NAL *nal, GetBitContext *gb)
> +{
> +    const H266RawSliceHeader *rsh   = sc->sh.r;
> +    EntryPoint *ep                  = sc->eps + index;
> +    int size;
> +
> +    if (index < rsh->num_entry_points) {
> +        int skipped = 0;
> +        int64_t start =  (gb->index >> 3);
> +        int64_t end = start + rsh->sh_entry_point_offset_minus1[index] + 1;
> +        while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] <= start) {
> +            skipped++;
> +        }
> +        while (skipped < nal->skipped_bytes && nal->skipped_bytes_pos[skipped] < end) {
> +            end--;
> +            skipped++;
> +        }
> +        size = end - start;
> +    } else {
> +        size = get_bits_left(gb) / 8;
> +    }
> +    ff_init_cabac_decoder (&ep->cc, gb->buffer + get_bits_count(gb) / 8, size);
> +    skip_bits(gb, size * 8);
> +}
> +
> +static int init_slice_context(SliceContext *sc, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
> +{
> +    const VVCSH *sh             = &sc->sh;
> +    const H266RawSlice *slice   = (const H266RawSlice *)unit->content;

Please no pointless casts. Also, why is there unnecessary whitespace in
front of '='?

> +    int nb_eps                  = sh->r->num_entry_points + 1;
> +    int ctu_addr                = 0;
> +    GetBitContext gb;
> +
> +    if (sc->nb_eps != nb_eps) {
> +        eps_free(sc);
> +        sc->eps = av_calloc(nb_eps, sizeof(*sc->eps));
> +        if (!sc->eps)
> +            return AVERROR(ENOMEM);

In case of error, sc->eps is NULL, yet sc->nb_eps may be != 0. Stuff
like this can (and does) lead to crashes.
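A minimal fix (untested) would be to keep nb_eps consistent with eps:

    if (sc->nb_eps != nb_eps) {
        eps_free(sc);
        sc->nb_eps = 0;
        sc->eps = av_calloc(nb_eps, sizeof(*sc->eps));
        if (!sc->eps)
            return AVERROR(ENOMEM);
        sc->nb_eps = nb_eps;
    }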

> +        sc->nb_eps = nb_eps;
> +    }
> +
> +    init_get_bits8(&gb, slice->data, slice->data_size);
> +    for (int i = 0; i < sc->nb_eps; i++)
> +    {
> +        EntryPoint *ep = sc->eps + i;
> +
> +        ep->ctu_start = ctu_addr;
> +        ep->ctu_end   = (i + 1 == sc->nb_eps ? sh->num_ctus_in_curr_slice : sh->entry_point_start_ctu[i]);
> +
> +        for (int j = ep->ctu_start; j < ep->ctu_end; j++) {
> +            const int rs = sc->sh.ctb_addr_in_curr_slice[j];
> +            fc->tab.slice_idx[rs] = sc->slice_idx;
> +        }
> +
> +        ep_init_cabac_decoder(sc, i, nal, &gb);
> +
> +        if (i + 1 < sc->nb_eps)
> +            ctu_addr = sh->entry_point_start_ctu[i];
> +    }
> +
> +    return 0;
> +}
> +
> +static VVCFrameContext* get_frame_context(const VVCContext *s, const VVCFrameContext *fc, const int delta)
> +{
> +    const int size = s->nb_fcs;
> +    const int idx = (fc - s->fcs + delta  + size) % size;
> +    return s->fcs + idx;
> +}
> +
> +static int vvc_ref_frame(VVCFrameContext *fc, VVCFrame *dst, VVCFrame *src)

src should be const.

> +{
> +    int ret;
> +
> +    ret = av_frame_ref(dst->frame, src->frame);
> +    if (ret < 0)
> +        return ret;
> +
> +    ff_refstruct_replace(&dst->progress, src->progress);
> +
> +    ff_refstruct_replace(&dst->tab_dmvr_mvf, src->tab_dmvr_mvf);
> +
> +    ff_refstruct_replace(&dst->rpl_tab, src->rpl_tab);
> +    ff_refstruct_replace(&dst->rpl, src->rpl);
> +    dst->nb_rpl_elems = src->nb_rpl_elems;
> +
> +    dst->poc = src->poc;
> +    dst->ctb_count = src->ctb_count;
> +    dst->flags = src->flags;
> +    dst->sequence = src->sequence;
> +
> +    return 0;
> +}
> +
> +static av_cold void frame_context_free(VVCFrameContext *fc)
> +{
> +    slices_free(fc);
> +
> +    for (int i = 0; i < FF_ARRAY_ELEMS(fc->DPB); i++) {
> +        ff_vvc_unref_frame(fc, &fc->DPB[i], ~0);
> +        av_frame_free(&fc->DPB[i].frame);
> +    }
> +
> +    ff_vvc_frame_thread_free(fc);
> +    pic_arrays_free(fc);
> +    av_frame_free(&fc->output_frame);
> +    ff_vvc_frame_ps_free(&fc->ps);
> +    av_freep(&fc->avctx);
> +}
> +
> +static av_cold int frame_context_init(VVCFrameContext *fc, AVCodecContext *avctx)
> +{
> +
> +    fc->avctx = av_memdup(avctx, sizeof(*avctx));

When I read this, I presumed you were using multiple AVCodecContexts to
store the ever-changing state of the AVCodecContext fields, similarly to
update_context_from_thread() in pthread_frame.c. But it seems you don't.
These contexts are only used a) as log contexts (where the actual
user-facing AVCodecContext should be used, so that the user can make
sense of the log messages!), b) in ff_thread_get_buffer() and c) in
export_frame_params(), where only some basic fields
(dimension-related + pix_fmt) are set. Presumably c) is done for b).

But the user is allowed to change the provided callbacks in the master
context at any time. E.g. the call to ff_thread_get_buffer() in
vvc_refs.c currently uses the VVCFrameContext's avctx and therefore uses
the get_buffer2 callback that was in place at the time of the
av_memdup(). This is wrong.

I think you can just remove VVCFrameContext.avctx and use the
user-facing AVCodecContext if you set the AVFrame properties that are
normally derived from the AVCodecContext directly on the AVFrame before
ff_thread_get_buffer().
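Roughly like this at the allocation site in vvc_refs.c (untested; the
variable names are just placeholders for whatever is used there):

    /* set the frame parameters directly instead of deriving them
     * from a cloned AVCodecContext */
    frame->frame->width  = fc->ps.pps->width;
    frame->frame->height = fc->ps.pps->height;
    frame->frame->format = fc->ps.sps->pix_fmt;
    ret = ff_thread_get_buffer(s->avctx, frame->frame, AV_GET_BUFFER_FLAG_REF);
    if (ret < 0)
        return ret;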

> +    if (!fc->avctx)
> +        goto fail;
> +
> +    fc->output_frame = av_frame_alloc();
> +    if (!fc->output_frame)
> +        goto fail;
> +
> +    for (int j = 0; j < FF_ARRAY_ELEMS(fc->DPB); j++) {
> +        fc->DPB[j].frame = av_frame_alloc();
> +        if (!fc->DPB[j].frame)
> +            goto fail;
> +    }
> +
> +    return 0;
> +fail:
> +    return AVERROR(ENOMEM);
> +}
> +
> +static int frame_context_setup(VVCFrameContext *fc, VVCContext *s)
> +{
> +    int ret = 0;
> +
> +    // copy refs from the last frame
> +    if (s->nb_frames && s->nb_fcs > 1) {
> +        VVCFrameContext *prev = get_frame_context(s, fc, -1);
> +        for (int i = 0; i < FF_ARRAY_ELEMS(fc->DPB); i++) {
> +            ff_vvc_unref_frame(fc, &fc->DPB[i], ~0);
> +            if (prev->DPB[i].frame->buf[0]) {
> +                ret = vvc_ref_frame(fc, &fc->DPB[i], &prev->DPB[i]);
> +                if (ret < 0)
> +                    goto fail;
> +            }
> +        }
> +    }
> +
> +    if (IS_IDR(s)) {
> +        s->seq_decode = (s->seq_decode + 1) & 0xff;
> +        ff_vvc_clear_refs(fc);
> +    }
> +
> +    ret = pic_arrays_init(s, fc);
> +    if (ret < 0)
> +        goto fail;
> +    ff_vvc_dsp_init(&fc->vvcdsp, fc->ps.sps->bit_depth);
> +    ff_videodsp_init(&fc->vdsp, fc->ps.sps->bit_depth);
> +
> +fail:
> +    return ret;
> +}
> +
> +static void export_frame_params(VVCFrameContext *fc)
> +{
> +    AVCodecContext *c   = fc->avctx;
> +    const VVCSPS *sps   = fc->ps.sps;
> +    const VVCPPS *pps   = fc->ps.pps;
> +
> +    c->pix_fmt          = sps->pix_fmt;
> +    c->coded_width      = pps->width;
> +    c->coded_height     = pps->height;
> +    c->width            = pps->width  - pps->r->pps_conf_win_left_offset - pps->r->pps_conf_win_right_offset;
> +    c->height           = pps->height - pps->r->pps_conf_win_top_offset - pps->r->pps_conf_win_bottom_offset;
> +}
> +
> +static int decode_slice(VVCContext *s, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
> +{
> +    int ret = 0;
> +    SliceContext *sc;
> +    VVCSH *sh;
> +    const int is_first_slice = !fc->nb_slices;
> +
> +    ret = slices_realloc(fc);
> +    if (ret < 0)
> +        return ret;
> +    sc = fc->slices[fc->nb_slices];
> +
> +    sh = &sc->sh;
> +
> +    if (ret < 0)
> +        goto fail;
> +
> +    s->vcl_unit_type = nal->type;
> +    if (is_first_slice) {
> +        //first slice
> +        ret = ff_vvc_decode_frame_ps(&fc->ps, s);
> +        if (ret < 0)
> +            return ret;
> +
> +        ret = frame_context_setup(fc, s);
> +        if (ret < 0)
> +            goto fail;
> +
> +        export_frame_params(fc);
> +    }
> +
> +    ret = ff_vvc_decode_sh(&sc->sh, &fc->ps, unit);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (is_first_slice) {
> +        ret = vvc_frame_start(s, fc, sc);
> +        if (ret < 0)
> +            return ret;
> +    } else if (fc->ref) {
> +        if (!IS_I(sh->r)) {
> +            ret = ff_vvc_slice_rpl(s, fc, sc);
> +            if (ret < 0) {
> +                av_log(fc->avctx, AV_LOG_WARNING,
> +                       "Error constructing the reference lists for the current slice.\n");
> +                return ret;
> +            }
> +        }
> +    } else {
> +        av_log(fc->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
> +        return ret;
> +    }
> +
> +    if (!IS_I(sh->r))
> +        vvc_smvd_ref_idx(fc, sc);
> +
> +    ret = init_slice_context(sc, fc, nal, unit);
> +    if (ret < 0)
> +        goto fail;
> +    fc->nb_slices++;
> +
> +fail:
> +    return ret;
> +}
> +
> +static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const H2645NAL *nal, const CodedBitstreamUnit *unit)
> +{
> +    int  ret;
> +
> +    s->temporal_id   = nal->temporal_id;
> +
> +    switch (unit->type) {
> +    case VVC_VPS_NUT:
> +    case VVC_SPS_NUT:
> +    case VVC_PPS_NUT:
> +        /* vps, sps, pps cached by s->cbc */
> +        break;
> +    case VVC_TRAIL_NUT:
> +    case VVC_STSA_NUT:
> +    case VVC_RADL_NUT:
> +    case VVC_RASL_NUT:
> +    case VVC_IDR_W_RADL:
> +    case VVC_IDR_N_LP:
> +    case VVC_CRA_NUT:
> +    case VVC_GDR_NUT:
> +        ret = decode_slice(s, fc, nal, unit);
> +        if (ret < 0)
> +            goto fail;
> +        break;
> +    case VVC_PREFIX_APS_NUT:
> +    case VVC_SUFFIX_APS_NUT:
> +        ret = ff_vvc_decode_aps(&s->ps, unit);
> +        if (ret < 0)
> +            goto fail;
> +        break;
> +    default:
> +        av_log(s->avctx, AV_LOG_INFO,
> +               "Skipping NAL unit %d\n", unit->type);

This will probably be very noisy (and warn for every SEI). I don't think
it is even needed, as h2645_parse.c already contains debug log messages
to display the unit type.

> +    }
> +
> +    return 0;
> +fail:
> +    return ret;

A fail label that only does "return ret" is pointless (not only here).

> +}
> +
> +static int decode_nal_units(VVCContext *s, VVCFrameContext *fc, AVPacket *avpkt)
> +{
> +    const CodedBitstreamH266Context *h266   = (const CodedBitstreamH266Context *)s->cbc->priv_data;
> +    CodedBitstreamFragment *frame           = &s->current_frame;
> +    int i, ret = 0;
> +    int eos_at_start = 1;
> +    s->last_eos = s->eos;
> +    s->eos = 0;
> +
> +    ff_cbs_fragment_reset(frame);
> +    ret = ff_cbs_read_packet(s->cbc, frame, avpkt);
> +    if (ret < 0) {
> +        av_log(s->avctx, AV_LOG_ERROR, "Failed to read packet.\n");
> +        return ret;
> +    }
> +    /* decode the NAL units */
> +    for (i = 0; i < frame->nb_units; i++) {
> +        const H2645NAL *nal             = h266->common.read_packet.nals + i;
> +        const CodedBitstreamUnit *unit  = frame->units + i;
> +
> +        if (unit->type == VVC_EOB_NUT || unit->type == VVC_EOS_NUT) {
> +            if (eos_at_start)
> +                s->last_eos = 1;
> +            else
> +                s->eos = 1;
> +        } else {
> +            ret = decode_nal_unit(s, fc, nal, unit);
> +            if (ret < 0) {
> +                av_log(s->avctx, AV_LOG_WARNING,
> +                        "Error parsing NAL unit #%d.\n", i);
> +                goto fail;
> +            }
> +        }
> +    }
> +    return 0;
> +
> +fail:
> +    if (fc->ref)
> +        ff_vvc_report_frame_finished(fc->ref);
> +    return ret;
> +}
> +
> +static int set_output_format(const VVCContext *s, const AVFrame *output)
> +{
> +    AVCodecContext *c = s->avctx;
> +    int ret;
> +
> +    if (output->width != c->width || output->height != c->height) {
> +        if ((ret = ff_set_dimensions(c, output->width, output->height)) < 0)
> +            return ret;
> +    }
> +    c->pix_fmt = output->format;
> +    return 0;
> +}
> +
> +static int wait_delayed_frame(VVCContext *s, AVFrame *output, int *got_output)
> +{
> +    VVCFrameContext *delayed = get_frame_context(s, s->fcs, s->nb_frames - s->nb_delayed);
> +    int ret = ff_vvc_frame_wait(s, delayed);
> +
> +    if (!ret && delayed->output_frame->buf[0]) {
> +        av_frame_move_ref(output, delayed->output_frame);
> +        ret = set_output_format(s, output);
> +        if (!ret)
> +            *got_output = 1;
> +    }
> +    s->nb_delayed--;
> +
> +    return ret;
> +}
> +
> +static int submit_frame(VVCContext *s, VVCFrameContext *fc, AVFrame *output, int *got_output)
> +{
> +    int ret;
> +    s->nb_frames++;
> +    s->nb_delayed++;
> +    ff_vvc_frame_submit(s, fc);
> +    if (s->nb_delayed >= s->nb_fcs) {
> +        if ((ret = wait_delayed_frame(s, output, got_output)) < 0)
> +            return ret;
> +    }
> +    return 0;
> +}
>  
>  static int vvc_decode_frame(AVCodecContext *avctx, AVFrame *output,
>      int *got_output, AVPacket *avpkt)
>  {
> +    VVCContext *s = avctx->priv_data;
> +    VVCFrameContext *fc;
> +    int ret;
> +
> +    if (!avpkt->size) {
> +        while (s->nb_delayed) {
> +            if ((ret = wait_delayed_frame(s, output, got_output)) < 0)
> +                return ret;
> +            if (*got_output)
> +                return 0;
> +        }
> +        if (s->nb_frames) {
> +            //we still have frames cached in dpb.
> +            VVCFrameContext *last = get_frame_context(s, s->fcs, s->nb_frames - 1);
> +
> +            ret = ff_vvc_output_frame(s, last, output, 0, 1);
> +            if (ret < 0)
> +                return ret;
> +            if (ret) {
> +                *got_output = ret;
> +                if ((ret = set_output_format(s, output)) < 0)
> +                    return ret;
> +            }
> +        }
> +        return 0;
> +    }
> +
> +    fc = get_frame_context(s, s->fcs, s->nb_frames);
> +
> +    fc->nb_slices = 0;
> +    fc->decode_order = s->nb_frames;
> +
> +    ret = decode_nal_units(s, fc, avpkt);
> +    if (ret < 0)
> +        return ret;
> +
> +    ret = submit_frame(s, fc, output, got_output);
> +    if (ret < 0)
> +        return ret;
> +
>      return avpkt->size;
>  }
>  
>  static void vvc_decode_flush(AVCodecContext *avctx)

Should also be av_cold

>  {
> +    VVCContext *s = avctx->priv_data;
> +    int got_output;
> +    AVFrame *output = av_frame_alloc();

Allocating a frame for flushing is bad enough, but you are only flushing
if said allocation succeeds. If it does not, then we never wait for
frames which are currently being decoded by other threads, do we? So
there can be races and even crashes when this function is called from
vvc_decode_free() and the allocation fails.
Instead you could pass NULL to wait_delayed_frame() and make it unref
the frames (instead of moving them) in case the output frame is NULL.
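I.e. roughly (untested):

    /* in wait_delayed_frame(): tolerate output == NULL */
    if (!ret && delayed->output_frame->buf[0]) {
        if (output) {
            av_frame_move_ref(output, delayed->output_frame);
            ret = set_output_format(s, output);
            if (!ret)
                *got_output = 1;
        } else
            av_frame_unref(delayed->output_frame);
    }

    /* vvc_decode_flush() then simply becomes */
    while (s->nb_delayed)
        wait_delayed_frame(s, NULL, NULL);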

> +
> +    if (output) {
> +        while (s->nb_delayed) {
> +            wait_delayed_frame(s, output, &got_output);
> +            if (got_output) {
> +                av_frame_unref(output);
> +            }
> +        }
> +        av_frame_free(&output);
> +    }
>  }
>  
>  static av_cold int vvc_decode_free(AVCodecContext *avctx)
>  {
> +    VVCContext *s = avctx->priv_data;
> +    int i;
> +
> +    ff_cbs_fragment_free(&s->current_frame);

Is it certain that the fragment is not in use (given that other threads
may still be running at this point, before vvc_decode_flush())?

> +    vvc_decode_flush(avctx);
> +    ff_vvc_executor_free(&s->executor);
> +    if (s->fcs) {
> +        for (i = 0; i < s->nb_fcs; i++)

for (int i = 0; is better as it has smaller scope; in this case, it also
allows you to save a line of code. Something similar is possible in
decode_nal_units(), please check the other patches, too.

> +            frame_context_free(s->fcs + i);
> +        av_free(s->fcs);
> +    }
> +    ff_vvc_ps_uninit(&s->ps);
> +    ff_cbs_close(&s->cbc);
> +
>      return 0;
>  }
>  
> +#define VVC_MAX_FRMAE_DELAY 16

typo

>  static av_cold int vvc_decode_init(AVCodecContext *avctx)
>  {
> +    VVCContext *s       = avctx->priv_data;
> +    int ret;
> +
> +    s->avctx = avctx;
> +
> +    if (ff_cbs_init(&s->cbc, AV_CODEC_ID_VVC, avctx))
> +        goto fail;

Forward the error code.
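I.e.

    ret = ff_cbs_init(&s->cbc, AV_CODEC_ID_VVC, avctx);
    if (ret < 0)
        return ret;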

> +
> +    s->nb_fcs = (avctx->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 : FFMIN(av_cpu_count(), VVC_MAX_FRMAE_DELAY);

This may evaluate av_cpu_count() multiple times. Furthermore I don't
know why this define is used here at all: With frame threading, the
number of frame threads is not limited by the delay/number of reordering
frames at all (we even have frame-threading for decoders without
frame-reordering at all).

But worst of all, you do not check avctx->thread_count.
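Something along the lines of (untested; whether an additional upper
bound is still wanted is up to you):

    int nb_threads = avctx->thread_count ? avctx->thread_count
                                         : av_cpu_count();
    s->nb_fcs = (avctx->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 : nb_threads;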

> +    s->fcs = av_calloc(s->nb_fcs, sizeof(*s->fcs));
> +    if (!s->fcs)
> +        goto fail;
> +
> +    for (int i = 0; i < s->nb_fcs; i++) {
> +        VVCFrameContext *fc = s->fcs + i;
> +        ret = frame_context_init(fc, avctx);
> +        if (ret < 0)
> +            goto fail;
> +    }
> +
> +    s->executor = ff_vvc_executor_alloc(s, s->nb_fcs);
> +    if (!s->executor)
> +        goto fail;
> +
> +    s->eos = 1;
> +    GDR_SET_RECOVERED(s);
> +    memset(&ff_vvc_default_scale_m, 16, sizeof(ff_vvc_default_scale_m));

This needs to be done once (i.e. protected by an AVOnce) and not every
time a decoder is set up. Otherwise there might be data races.
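I.e. something like (untested, needs libavutil/thread.h):

    static AVOnce init_static_once = AV_ONCE_INIT;

    static av_cold void vvc_init_static_data(void)
    {
        memset(&ff_vvc_default_scale_m, 16, sizeof(ff_vvc_default_scale_m));
    }

    /* in vvc_decode_init() */
    ff_thread_once(&init_static_once, vvc_init_static_data);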

> +
>      return 0;
> +
> +fail:
> +    vvc_decode_free(avctx);

Unnecessary, as this decoder has the FF_CODEC_CAP_INIT_CLEANUP flag set. In
fact, given that vvc_decode_free() uses av_free() instead of av_freep()
for s->fcs, calling vvc_decode_free() here can lead to a use-after-free
(namely when vvc_decode_free() is called generically later).

> +    return AVERROR(ENOMEM);
>  }
>  
>  const FFCodec ff_vvc_decoder = {


