[FFmpeg-devel] [PATCH v2 3/5] libavcodec/webp: add support for animated WebP decoding
James Zern
jzern at google.com
Wed Jul 12 03:20:23 EEST 2023
On Thu, Jul 6, 2023 at 4:28 AM Thilo Borgmann <thilo.borgmann at mail.de> wrote:
>
> From: Josef Zlomek <josef at pex.com>
>
> Fixes: 4907
>
> Adds support for decoding of animated WebP.
>
> The WebP decoder adds the animation related features according to the specs:
> https://developers.google.com/speed/webp/docs/riff_container#animation
> The frames of the animation may be smaller than the image canvas.
> Therefore, the frame is decoded to a temporary frame,
> then it is blended into the canvas, the canvas is copied to the output frame,
> and finally the frame is disposed from the canvas.
>
> The output to AV_PIX_FMT_YUVA420P/AV_PIX_FMT_YUV420P is still supported.
> The background color is specified only as BGRA in the WebP file
> so it is converted to YUVA if YUV formats are output.
>
> Signed-off-by: Josef Zlomek <josef at pex.com>
> ---
> Changelog | 1 +
> libavcodec/codec_desc.c | 3 +-
> libavcodec/version.h | 2 +-
> libavcodec/webp.c | 714 ++++++++++++++++++++++++++++++++++++----
> 4 files changed, 658 insertions(+), 62 deletions(-)
>
> diff --git a/Changelog b/Changelog
> index 3876082844..271926ed8f 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -25,6 +25,7 @@ version <next>:
> - Raw VVC bitstream parser, muxer and demuxer
> - Bitstream filter for editing metadata in VVC streams
> - Bitstream filter for converting VVC from MP4 to Annex B
> +- animated WebP parser/decoder
>
> version 6.0:
> - Radiance HDR image support
> diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
> index 4406dd8318..47a38a4036 100644
> --- a/libavcodec/codec_desc.c
> +++ b/libavcodec/codec_desc.c
> @@ -1259,8 +1259,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
> .type = AVMEDIA_TYPE_VIDEO,
> .name = "webp",
> .long_name = NULL_IF_CONFIG_SMALL("WebP"),
> - .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
> - AV_CODEC_PROP_LOSSLESS,
> + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
The frames are all intra, though with animation they may be smaller
than the canvas and combined with the previous frame.
> .mime_types= MT("image/webp"),
> },
> {
> diff --git a/libavcodec/version.h b/libavcodec/version.h
> index 9411511e04..9f55381cf1 100644
> --- a/libavcodec/version.h
> +++ b/libavcodec/version.h
> @@ -30,7 +30,7 @@
> #include "version_major.h"
>
> #define LIBAVCODEC_VERSION_MINOR 22
> -#define LIBAVCODEC_VERSION_MICRO 100
> +#define LIBAVCODEC_VERSION_MICRO 101
>
> #define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
> LIBAVCODEC_VERSION_MINOR, \
> diff --git a/libavcodec/webp.c b/libavcodec/webp.c
> index 15152ec8fb..bee43fcf19 100644
> --- a/libavcodec/webp.c
> +++ b/libavcodec/webp.c
> @@ -35,12 +35,16 @@
> * Exif metadata
> * ICC profile
> *
> + * @author Josef Zlomek, Pexeso Inc. <josef at pex.com>
> + * Animation
> + *
> * Unimplemented:
> - * - Animation
> * - XMP metadata
> */
>
> +#include "libavcodec/packet.h"
> #include "libavutil/imgutils.h"
> +#include "libavutil/colorspace.h"
>
> #define BITSTREAM_READER_LE
> #include "avcodec.h"
> @@ -178,6 +182,8 @@ typedef struct ImageContext {
> typedef struct WebPContext {
> VP8Context v; /* VP8 Context used for lossy decoding */
> GetBitContext gb; /* bitstream reader for main image chunk */
> + ThreadFrame canvas_frame; /* ThreadFrame for canvas */
> + AVFrame *frame; /* AVFrame for decoded frame */
> AVFrame *alpha_frame; /* AVFrame for alpha data decompressed from VP8L */
> AVPacket *pkt; /* AVPacket to be passed to the underlying VP8 decoder */
> AVCodecContext *avctx; /* parent AVCodecContext */
> @@ -189,9 +195,24 @@ typedef struct WebPContext {
> int alpha_data_size; /* alpha chunk data size */
> int has_exif; /* set after an EXIF chunk has been processed */
> int has_iccp; /* set after an ICCP chunk has been processed */
> - int width; /* image width */
> - int height; /* image height */
> - int lossless; /* indicates lossless or lossy */
> + int vp8x_flags; /* global flags from VP8X chunk */
> + int canvas_width; /* canvas width */
> + int canvas_height; /* canvas height */
> + int anmf_flags; /* frame flags from ANMF chunk */
> + int width; /* frame width */
> + int height; /* frame height */
> + int pos_x; /* frame position X */
> + int pos_y; /* frame position Y */
> + int prev_anmf_flags; /* previous frame flags from ANMF chunk */
> + int prev_width; /* previous frame width */
> + int prev_height; /* previous frame height */
> + int prev_pos_x; /* previous frame position X */
> + int prev_pos_y; /* previous frame position Y */
> + int await_progress; /* value of progress to wait for */
> + uint8_t background_argb[4]; /* background color in ARGB format */
> + uint8_t background_yuva[4]; /* background color in YUVA format */
> + const uint8_t *background_data[4]; /* "planes" for background color in YUVA format */
> + uint8_t transparent_yuva[4]; /* transparent black in YUVA format */
>
> int nb_transforms; /* number of transforms */
> enum TransformType transforms[4]; /* transformations used in the image, in order */
> @@ -555,7 +576,7 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
> img->frame->height = h;
>
> if (role == IMAGE_ROLE_ARGB && !img->is_alpha_primary) {
> - ret = ff_thread_get_buffer(s->avctx, img->frame, 0);
> + ret = ff_get_buffer(s->avctx, img->frame, 0);
> } else
> ret = av_frame_get_buffer(img->frame, 1);
> if (ret < 0)
> @@ -1053,7 +1074,7 @@ static int apply_color_indexing_transform(WebPContext *s)
> return 0;
> }
>
> -static void update_canvas_size(AVCodecContext *avctx, int w, int h)
> +static void update_frame_size(AVCodecContext *avctx, int w, int h)
> {
> WebPContext *s = avctx->priv_data;
> if (s->width && s->width != w) {
> @@ -1076,7 +1097,6 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
> int w, h, ret, i, used;
>
> if (!is_alpha_chunk) {
> - s->lossless = 1;
> avctx->pix_fmt = AV_PIX_FMT_ARGB;
> }
>
> @@ -1093,7 +1113,7 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
> w = get_bits(&s->gb, 14) + 1;
> h = get_bits(&s->gb, 14) + 1;
>
> - update_canvas_size(avctx, w, h);
> + update_frame_size(avctx, w, h);
>
> ret = ff_set_dimensions(avctx, s->width, s->height);
> if (ret < 0)
> @@ -1290,7 +1310,6 @@ static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
> s->v.actually_webp = 1;
> }
> avctx->pix_fmt = s->has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
> - s->lossless = 0;
>
> if (data_size > INT_MAX) {
> av_log(avctx, AV_LOG_ERROR, "unsupported chunk size\n");
> @@ -1308,7 +1327,7 @@ static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
> if (!*got_frame)
> return AVERROR_INVALIDDATA;
>
> - update_canvas_size(avctx, avctx->width, avctx->height);
> + update_frame_size(avctx, avctx->width, avctx->height);
>
> if (s->has_alpha) {
> ret = vp8_lossy_decode_alpha(avctx, p, s->alpha_data,
> @@ -1318,41 +1337,17 @@ static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
> }
> return ret;
> }
> +int init_canvas_frame(WebPContext *s, int format, int key_frame);
>
> -static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> - int *got_frame, AVPacket *avpkt)
> +static int webp_decode_frame_common(AVCodecContext *avctx, uint8_t *data, int size,
> + int *got_frame, int key_frame)
The indentation of the continuation line is off; align the second parameter line with the first parameter.
> {
> WebPContext *s = avctx->priv_data;
> GetByteContext gb;
> int ret;
> uint32_t chunk_type, chunk_size;
> - int vp8x_flags = 0;
>
> - s->avctx = avctx;
> - s->width = 0;
> - s->height = 0;
> - *got_frame = 0;
> - s->has_alpha = 0;
> - s->has_exif = 0;
> - s->has_iccp = 0;
> - bytestream2_init(&gb, avpkt->data, avpkt->size);
> -
> - if (bytestream2_get_bytes_left(&gb) < 12)
> - return AVERROR_INVALIDDATA;
> -
> - if (bytestream2_get_le32(&gb) != MKTAG('R', 'I', 'F', 'F')) {
> - av_log(avctx, AV_LOG_ERROR, "missing RIFF tag\n");
> - return AVERROR_INVALIDDATA;
> - }
> -
> - chunk_size = bytestream2_get_le32(&gb);
> - if (bytestream2_get_bytes_left(&gb) < chunk_size)
> - return AVERROR_INVALIDDATA;
> -
> - if (bytestream2_get_le32(&gb) != MKTAG('W', 'E', 'B', 'P')) {
> - av_log(avctx, AV_LOG_ERROR, "missing WEBP tag\n");
> - return AVERROR_INVALIDDATA;
> - }
> + bytestream2_init(&gb, data, size);
>
> while (bytestream2_get_bytes_left(&gb) > 8) {
> char chunk_str[5] = { 0 };
> @@ -1363,6 +1358,10 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> return AVERROR_INVALIDDATA;
> chunk_size += chunk_size & 1;
>
> + // we need to dive into RIFF chunk
> + if (chunk_type == MKTAG('R', 'I', 'F', 'F'))
> + chunk_size = 4;
> +
> if (bytestream2_get_bytes_left(&gb) < chunk_size) {
> /* we seem to be running out of data, but it could also be that the
> bitstream has trailing junk leading to bogus chunk_size. */
> @@ -1370,10 +1369,26 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> }
>
> switch (chunk_type) {
> + case MKTAG('R', 'I', 'F', 'F'):
> + if (bytestream2_get_le32(&gb) != MKTAG('W', 'E', 'B', 'P')) {
> + av_log(avctx, AV_LOG_ERROR, "missing WEBP tag\n");
> + return AVERROR_INVALIDDATA;
> + }
> + s->vp8x_flags = 0;
> + s->canvas_width = 0;
> + s->canvas_height = 0;
> + s->has_exif = 0;
> + s->has_iccp = 0;
> + ff_thread_release_ext_buffer(avctx, &s->canvas_frame);
> + break;
> case MKTAG('V', 'P', '8', ' '):
> if (!*got_frame) {
> - ret = vp8_lossy_decode_frame(avctx, p, got_frame,
> - avpkt->data + bytestream2_tell(&gb),
> + ret = init_canvas_frame(s, AV_PIX_FMT_YUVA420P, key_frame);
> + if (ret < 0)
> + return ret;
> +
> + ret = vp8_lossy_decode_frame(avctx, s->frame, got_frame,
> + data + bytestream2_tell(&gb),
> chunk_size);
> if (ret < 0)
> return ret;
> @@ -1382,8 +1397,13 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> break;
> case MKTAG('V', 'P', '8', 'L'):
> if (!*got_frame) {
> - ret = vp8_lossless_decode_frame(avctx, p, got_frame,
> - avpkt->data + bytestream2_tell(&gb),
> + ret = init_canvas_frame(s, AV_PIX_FMT_ARGB, key_frame);
> + if (ret < 0)
> + return ret;
> + ff_thread_finish_setup(s->avctx);
> +
> + ret = vp8_lossless_decode_frame(avctx, s->frame, got_frame,
> + data + bytestream2_tell(&gb),
> chunk_size, 0);
> if (ret < 0)
> return ret;
> @@ -1392,14 +1412,16 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> bytestream2_skip(&gb, chunk_size);
> break;
> case MKTAG('V', 'P', '8', 'X'):
> - if (s->width || s->height || *got_frame) {
> + if (s->canvas_width || s->canvas_height || *got_frame) {
> av_log(avctx, AV_LOG_ERROR, "Canvas dimensions are already set\n");
> return AVERROR_INVALIDDATA;
> }
> - vp8x_flags = bytestream2_get_byte(&gb);
> + s->vp8x_flags = bytestream2_get_byte(&gb);
> bytestream2_skip(&gb, 3);
> s->width = bytestream2_get_le24(&gb) + 1;
> s->height = bytestream2_get_le24(&gb) + 1;
> + s->canvas_width = s->width;
> + s->canvas_height = s->height;
> ret = av_image_check_size(s->width, s->height, 0, avctx);
> if (ret < 0)
> return ret;
> @@ -1407,7 +1429,7 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> case MKTAG('A', 'L', 'P', 'H'): {
> int alpha_header, filter_m, compression;
>
> - if (!(vp8x_flags & VP8X_FLAG_ALPHA)) {
> + if (!(s->vp8x_flags & VP8X_FLAG_ALPHA)) {
> av_log(avctx, AV_LOG_WARNING,
> "ALPHA chunk present, but alpha bit not set in the "
> "VP8X header\n");
> @@ -1416,8 +1438,9 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> av_log(avctx, AV_LOG_ERROR, "invalid ALPHA chunk size\n");
> return AVERROR_INVALIDDATA;
> }
> +
> alpha_header = bytestream2_get_byte(&gb);
> - s->alpha_data = avpkt->data + bytestream2_tell(&gb);
> + s->alpha_data = data + bytestream2_tell(&gb);
> s->alpha_data_size = chunk_size - 1;
> bytestream2_skip(&gb, s->alpha_data_size);
>
> @@ -1444,14 +1467,13 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> av_log(avctx, AV_LOG_VERBOSE, "Ignoring extra EXIF chunk\n");
> goto exif_end;
> }
> - if (!(vp8x_flags & VP8X_FLAG_EXIF_METADATA))
> + if (!(s->vp8x_flags & VP8X_FLAG_EXIF_METADATA))
> av_log(avctx, AV_LOG_WARNING,
> "EXIF chunk present, but Exif bit not set in the "
> "VP8X header\n");
>
> s->has_exif = 1;
> - bytestream2_init(&exif_gb, avpkt->data + exif_offset,
> - avpkt->size - exif_offset);
> + bytestream2_init(&exif_gb, data + exif_offset, size - exif_offset);
> if (ff_tdecode_header(&exif_gb, &le, &ifd_offset) < 0) {
> av_log(avctx, AV_LOG_ERROR, "invalid TIFF header "
> "in Exif data\n");
> @@ -1464,7 +1486,7 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> goto exif_end;
> }
>
> - av_dict_copy(&p->metadata, exif_metadata, 0);
> + av_dict_copy(&s->frame->metadata, exif_metadata, 0);
>
> exif_end:
> av_dict_free(&exif_metadata);
> @@ -1479,21 +1501,64 @@ exif_end:
> bytestream2_skip(&gb, chunk_size);
> break;
> }
> - if (!(vp8x_flags & VP8X_FLAG_ICC))
> + if (!(s->vp8x_flags & VP8X_FLAG_ICC))
> av_log(avctx, AV_LOG_WARNING,
> "ICCP chunk present, but ICC Profile bit not set in the "
> "VP8X header\n");
>
> s->has_iccp = 1;
> - sd = av_frame_new_side_data(p, AV_FRAME_DATA_ICC_PROFILE, chunk_size);
> + sd = av_frame_new_side_data(s->frame, AV_FRAME_DATA_ICC_PROFILE, chunk_size);
> if (!sd)
> return AVERROR(ENOMEM);
>
> bytestream2_get_buffer(&gb, sd->data, chunk_size);
> break;
> }
> - case MKTAG('A', 'N', 'I', 'M'):
> + case MKTAG('A', 'N', 'I', 'M'): {
> + const AVPixFmtDescriptor *desc;
> + int a, r, g, b;
> + if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION)) {
> + av_log(avctx, AV_LOG_WARNING,
> + "ANIM chunk present, but animation bit not set in the "
> + "VP8X header\n");
> + }
> + // background is stored as BGRA, we need ARGB
> + s->background_argb[3] = b = bytestream2_get_byte(&gb);
> + s->background_argb[2] = g = bytestream2_get_byte(&gb);
> + s->background_argb[1] = r = bytestream2_get_byte(&gb);
> + s->background_argb[0] = a = bytestream2_get_byte(&gb);
> +
> + // convert the background color to YUVA
> + desc = av_pix_fmt_desc_get(AV_PIX_FMT_YUVA420P);
> + s->background_yuva[desc->comp[0].plane] = RGB_TO_Y_CCIR(r, g, b);
> + s->background_yuva[desc->comp[1].plane] = RGB_TO_U_CCIR(r, g, b, 0);
> + s->background_yuva[desc->comp[2].plane] = RGB_TO_V_CCIR(r, g, b, 0);
> + s->background_yuva[desc->comp[3].plane] = a;
> +
> + bytestream2_skip(&gb, 2); // loop count is ignored
> + break;
> + }
> case MKTAG('A', 'N', 'M', 'F'):
> + if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION)) {
> + av_log(avctx, AV_LOG_WARNING,
> + "ANMF chunk present, but animation bit not set in the "
> + "VP8X header\n");
> + }
> + s->pos_x = bytestream2_get_le24(&gb) * 2;
> + s->pos_y = bytestream2_get_le24(&gb) * 2;
> + s->width = bytestream2_get_le24(&gb) + 1;
> + s->height = bytestream2_get_le24(&gb) + 1;
> + bytestream2_skip(&gb, 3); // duration
> + s->anmf_flags = bytestream2_get_byte(&gb);
> +
> + if (s->width + s->pos_x > s->canvas_width ||
> + s->height + s->pos_y > s->canvas_height) {
> + av_log(avctx, AV_LOG_ERROR,
> + "frame does not fit into canvas\n");
> + return AVERROR_INVALIDDATA;
> + }
> + s->vp8x_flags |= VP8X_FLAG_ANIMATION;
> + break;
> case MKTAG('X', 'M', 'P', ' '):
> AV_WL32(chunk_str, chunk_type);
> av_log(avctx, AV_LOG_WARNING, "skipping unsupported chunk: %s\n",
> @@ -1509,21 +1574,508 @@ exif_end:
> }
> }
>
> - if (!*got_frame) {
> - av_log(avctx, AV_LOG_ERROR, "image data not found\n");
> - return AVERROR_INVALIDDATA;
> + return size;
> +}
> +
> +int init_canvas_frame(WebPContext *s, int format, int key_frame)
> +{
> + AVFrame *canvas = s->canvas_frame.f;
> + int height;
> + int ret;
> +
> + // canvas is needed only for animation
> + if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION))
> + return 0;
> +
> + // avoid init for non-key frames whose format and size did not change
> + if (!key_frame &&
> + canvas->data[0] &&
> + canvas->format == format &&
> + canvas->width == s->canvas_width &&
> + canvas->height == s->canvas_height)
> + return 0;
> +
> + // canvas changes within IPPP sequences will loose thread sync
Typo: "loose" should be "lose".
> + // because of the ThreadFrame reallocation and will wait forever
> + // so if frame-threading is used, forbid canvas changes and unlock
> + // previous frames
I think this might need some additional work after landing. The
canvas, as webp defines it, is fixed. Individual frames can be
smaller. I think mixed content (lossy and lossless) might trigger this
path and result in corruption.
> + if (!key_frame && canvas->data[0]) {
> + if (s->avctx->thread_count > 1) {
> + av_log(s->avctx, AV_LOG_WARNING, "Canvas change detected. The output will be damaged. Use -threads 1 to try decoding with best effort.\n");
> + // unlock previous frames that have sent an _await() call
> + ff_thread_report_progress(&s->canvas_frame, INT_MAX, 0);
> + return AVERROR_PATCHWELCOME;
> + } else {
> + // warn for damaged frames
> + av_log(s->avctx, AV_LOG_WARNING, "Canvas change detected. The output will be damaged.\n");
> + }
> + }
> +
> [...]
> +
> +static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> + int *got_frame, AVPacket *avpkt)
> +{
> + WebPContext *s = avctx->priv_data;
> + int ret;
> + int key_frame = avpkt->flags & AV_PKT_FLAG_KEY;
> +
> + for (int i = 0; i < avpkt->side_data_elems; ++i) {
> + if (avpkt->side_data[i].type == AV_PKT_DATA_NEW_EXTRADATA) {
> + ret = webp_decode_frame_common(avctx, avpkt->side_data[i].data,
> + avpkt->side_data[i].size,
> + got_frame, key_frame);
The indentation of these continuation lines is off; align them with the first argument of the call.
More information about the ffmpeg-devel
mailing list