[FFmpeg-devel] [PATCH] avcodec/libjxldec: emit proper PTS to decoded AVFrame

Anton Khirnov anton at khirnov.net
Thu Dec 14 10:28:35 EET 2023


Quoting Leo Izen (2023-12-08 18:31:06)
> If a sequence of JXL images is encapsulated in a container that has PTS
> information, we should use the PTS information from the container. At
> this time there is no container that does this, but if JPEG XL support
> is ever added to NUT, AVTransport, or some other container, this commit
> should allow the PTS information those containers provide to work as
> expected.
> 
> Signed-off-by: Leo Izen <leo.izen at gmail.com>
> ---
>  libavcodec/libjxldec.c | 77 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 57 insertions(+), 20 deletions(-)
> 
> diff --git a/libavcodec/libjxldec.c b/libavcodec/libjxldec.c
> index 002740d9c1..494060ac8c 100644
> --- a/libavcodec/libjxldec.c
> +++ b/libavcodec/libjxldec.c
> @@ -370,6 +370,7 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>  
>      while (1) {
>          size_t remaining;
> +        JxlFrameHeader header;
>  
>          if (!pkt->size) {
>              av_packet_unref(pkt);
> @@ -428,13 +429,16 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>              }
>              if ((ret = ff_set_dimensions(avctx, ctx->basic_info.xsize, ctx->basic_info.ysize)) < 0)
>                  return ret;
> +            /*
> +             * If animation is present, we use the timebase provided by
> +             *    the animated image itself.
> +             * If the image is not animated, we use ctx->pts
> +             *    to refer to the frame number, not an actual
> +             *    PTS value, thus we may leave ctx->timebase unset.
> +             */
>              if (ctx->basic_info.have_animation)
>                  ctx->timebase = av_make_q(ctx->basic_info.animation.tps_denominator,
>                                            ctx->basic_info.animation.tps_numerator);
> -            else if (avctx->pkt_timebase.num)
> -                ctx->timebase = avctx->pkt_timebase;
> -            else
> -                ctx->timebase = AV_TIME_BASE_Q;
>              continue;
>          case JXL_DEC_COLOR_ENCODING:
>              av_log(avctx, AV_LOG_DEBUG, "COLOR_ENCODING event emitted\n");
> @@ -462,23 +466,24 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>  #endif
>              continue;
>          case JXL_DEC_FRAME:
> +            /* Frame here refers to the Frame bundle, not a decoded picture */
>              av_log(avctx, AV_LOG_DEBUG, "FRAME event emitted\n");
> -            if (!ctx->basic_info.have_animation || ctx->prev_is_last) {
> +            if (ctx->prev_is_last) {
> +                /*
> +                 * The last frame sent was tagged as "is_last" which
> +                 * means this is a new image file altogether.
> +                 */
>                  ctx->frame->pict_type = AV_PICTURE_TYPE_I;
>                  ctx->frame->flags |= AV_FRAME_FLAG_KEY;
>              }
> -            if (ctx->basic_info.have_animation) {
> -                JxlFrameHeader header;
> -                if (JxlDecoderGetFrameHeader(ctx->decoder, &header) != JXL_DEC_SUCCESS) {
> -                    av_log(avctx, AV_LOG_ERROR, "Bad libjxl dec frame event\n");
> -                    return AVERROR_EXTERNAL;
> -                }
> -                ctx->prev_is_last = header.is_last;
> -                ctx->frame_duration = header.duration;
> -            } else {
> -                ctx->prev_is_last = 1;
> -                ctx->frame_duration = 1;
> +            if (JxlDecoderGetFrameHeader(ctx->decoder, &header) != JXL_DEC_SUCCESS) {
> +                av_log(avctx, AV_LOG_ERROR, "Bad libjxl dec frame event\n");
> +                return AVERROR_EXTERNAL;
>              }
> +            ctx->prev_is_last = header.is_last;
> +            /* zero duration for animation means the frame is not presented */
> +            if (ctx->basic_info.have_animation && header.duration)
> +                ctx->frame_duration = header.duration;
>              continue;
>          case JXL_DEC_FULL_IMAGE:
>              /* full image is one frame, even if animated */
> @@ -490,12 +495,44 @@ static int libjxl_receive_frame(AVCodecContext *avctx, AVFrame *frame)
>                  /* ownership is transfered, and it is not ref-ed */
>                  ctx->iccp = NULL;
>              }
> -            if (avctx->pkt_timebase.num) {
> -                ctx->frame->pts = av_rescale_q(ctx->pts, ctx->timebase, avctx->pkt_timebase);
> -                ctx->frame->duration = av_rescale_q(ctx->frame_duration, ctx->timebase, avctx->pkt_timebase);
> +            if (ctx->basic_info.have_animation) {
> +                if (avctx->pkt_timebase.num) {
> +                    /*
> +                     * ideally, the demuxer set avctx->pkt_timebase to equal the animation's timebase
> +                     * or something strictly finer. This is true about the jpegxl_anim demuxer.
> +                     */
> +                    ctx->frame->pts = av_rescale_q(ctx->pts, ctx->timebase, avctx->pkt_timebase);
> +                    ctx->frame->duration = av_rescale_q(ctx->frame_duration, ctx->timebase, avctx->pkt_timebase);
> +                } else {
> +                    /*
> +                     * If we don't know the container timebase, we have to set the frame->timebase,
> +                     * even if it is currently ignored by most users. We don't have permission
> +                     * to set avctx->pkt_timebase.
> +                     */
> +                    ctx->frame->time_base = ctx->timebase;
> +                    ctx->frame->pts = ctx->pts;
> +                    ctx->frame->duration = ctx->frame_duration;
> +                }
> +            } else if (avctx->pkt_timebase.num) {
> +                if (pkt->pts != AV_NOPTS_VALUE) {
> +                    /* The container has provided the PTS for us, so we don't need to count frames. */
> +                    ctx->frame->pts = pkt->pts;
> +                } else {
> +                    /*
> +                     * The demuxer has provided us with a timebase, but not with PTS information.
> +                     * We use 1/1 as a dummy timebase, for 1fps as a dummy framerate, and set the
> +                     * PTS based on frame count.
> +                     */
> +                    const AVRational dummy = {.num = 1, .den = 1};
> +                    ctx->frame->pts = av_rescale_q(ctx->pts, dummy, avctx->pkt_timebase);
> +                }
> +                ctx->frame->duration = pkt->duration;
>              } else {
> +                /*
> +                 * There is no timing information. Set the frame PTS to frame counter.
> +                 */
>                  ctx->frame->pts = ctx->pts;
> -                ctx->frame->duration = ctx->frame_duration;
> +                ctx->frame->duration = 0;

This logic seems shady to me. The decoder should mess with pts as little
as possible and, whenever it can, just copy the packet value to the frame.
Any codec-level timestamps should not be trusted.

Now, this does not work when a single packet decodes into multiple
frames; in that case you have to add increments of the frame duration to
the original packet pts. But you should still preserve the original value
as the base - it might not start at 0.

Also, decoders are not allowed to set AVFrame.time_base. And you should
probably set AVCodecContext.framerate.

-- 
Anton Khirnov


More information about the ffmpeg-devel mailing list