[FFmpeg-devel] [PATCH v2 3/5] libavcodec/webp: add support for animated WebP decoding
James Zern
jzern at google.com
Wed Jul 12 03:20:23 EEST 2023
On Thu, Jul 6, 2023 at 4:28 AM Thilo Borgmann <thilo.borgmann at mail.de> wrote:
>
> From: Josef Zlomek <josef at pex.com>
>
> Fixes: 4907
>
> Adds support for decoding of animated WebP.
>
> The WebP decoder adds the animation related features according to the specs:
> https://developers.google.com/speed/webp/docs/riff_container#animation
> The frames of the animation may be smaller than the image canvas.
> Therefore, the frame is decoded to a temporary frame,
> then it is blended into the canvas, the canvas is copied to the output frame,
> and finally the frame is disposed from the canvas.
>
> The output to AV_PIX_FMT_YUVA420P/AV_PIX_FMT_YUV420P is still supported.
> The background color is specified only as BGRA in the WebP file
> so it is converted to YUVA if YUV formats are output.
>
> Signed-off-by: Josef Zlomek <josef at pex.com>
> ---
> Changelog | 1 +
> libavcodec/codec_desc.c | 3 +-
> libavcodec/version.h | 2 +-
> libavcodec/webp.c | 714 ++++++++++++++++++++++++++++++++++++----
> 4 files changed, 658 insertions(+), 62 deletions(-)
>
> diff --git a/Changelog b/Changelog
> index 3876082844..271926ed8f 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -25,6 +25,7 @@ version <next>:
> - Raw VVC bitstream parser, muxer and demuxer
> - Bitstream filter for editing metadata in VVC streams
> - Bitstream filter for converting VVC from MP4 to Annex B
> +- animated WebP parser/decoder
>
> version 6.0:
> - Radiance HDR image support
> diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
> index 4406dd8318..47a38a4036 100644
> --- a/libavcodec/codec_desc.c
> +++ b/libavcodec/codec_desc.c
> @@ -1259,8 +1259,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
> .type = AVMEDIA_TYPE_VIDEO,
> .name = "webp",
> .long_name = NULL_IF_CONFIG_SMALL("WebP"),
> - .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
> - AV_CODEC_PROP_LOSSLESS,
> + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
The frames are all intra, though with animation they may be smaller
than the canvas and combined with the previous frame.
> .mime_types= MT("image/webp"),
> },
> {
> diff --git a/libavcodec/version.h b/libavcodec/version.h
> index 9411511e04..9f55381cf1 100644
> --- a/libavcodec/version.h
> +++ b/libavcodec/version.h
> @@ -30,7 +30,7 @@
> #include "version_major.h"
>
> #define LIBAVCODEC_VERSION_MINOR 22
> -#define LIBAVCODEC_VERSION_MICRO 100
> +#define LIBAVCODEC_VERSION_MICRO 101
>
> #define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
> LIBAVCODEC_VERSION_MINOR, \
> diff --git a/libavcodec/webp.c b/libavcodec/webp.c
> index 15152ec8fb..bee43fcf19 100644
> --- a/libavcodec/webp.c
> +++ b/libavcodec/webp.c
> @@ -35,12 +35,16 @@
> * Exif metadata
> * ICC profile
> *
> + * @author Josef Zlomek, Pexeso Inc. <josef at pex.com>
> + * Animation
> + *
> * Unimplemented:
> - * - Animation
> * - XMP metadata
> */
>
> +#include "libavcodec/packet.h"
> #include "libavutil/imgutils.h"
> +#include "libavutil/colorspace.h"
>
> #define BITSTREAM_READER_LE
> #include "avcodec.h"
> @@ -178,6 +182,8 @@ typedef struct ImageContext {
> typedef struct WebPContext {
> VP8Context v; /* VP8 Context used for lossy decoding */
> GetBitContext gb; /* bitstream reader for main image chunk */
> + ThreadFrame canvas_frame; /* ThreadFrame for canvas */
> + AVFrame *frame; /* AVFrame for decoded frame */
> AVFrame *alpha_frame; /* AVFrame for alpha data decompressed from VP8L */
> AVPacket *pkt; /* AVPacket to be passed to the underlying VP8 decoder */
> AVCodecContext *avctx; /* parent AVCodecContext */
> @@ -189,9 +195,24 @@ typedef struct WebPContext {
> int alpha_data_size; /* alpha chunk data size */
> int has_exif; /* set after an EXIF chunk has been processed */
> int has_iccp; /* set after an ICCP chunk has been processed */
> - int width; /* image width */
> - int height; /* image height */
> - int lossless; /* indicates lossless or lossy */
> + int vp8x_flags; /* global flags from VP8X chunk */
> + int canvas_width; /* canvas width */
> + int canvas_height; /* canvas height */
> + int anmf_flags; /* frame flags from ANMF chunk */
> + int width; /* frame width */
> + int height; /* frame height */
> + int pos_x; /* frame position X */
> + int pos_y; /* frame position Y */
> + int prev_anmf_flags; /* previous frame flags from ANMF chunk */
> + int prev_width; /* previous frame width */
> + int prev_height; /* previous frame height */
> + int prev_pos_x; /* previous frame position X */
> + int prev_pos_y; /* previous frame position Y */
> + int await_progress; /* value of progress to wait for */
> + uint8_t background_argb[4]; /* background color in ARGB format */
> + uint8_t background_yuva[4]; /* background color in YUVA format */
> + const uint8_t *background_data[4]; /* "planes" for background color in YUVA format */
> + uint8_t transparent_yuva[4]; /* transparent black in YUVA format */
>
> int nb_transforms; /* number of transforms */
> enum TransformType transforms[4]; /* transformations used in the image, in order */
> @@ -555,7 +576,7 @@ static int decode_entropy_coded_image(WebPContext *s, enum ImageRole role,
> img->frame->height = h;
>
> if (role == IMAGE_ROLE_ARGB && !img->is_alpha_primary) {
> - ret = ff_thread_get_buffer(s->avctx, img->frame, 0);
> + ret = ff_get_buffer(s->avctx, img->frame, 0);
> } else
> ret = av_frame_get_buffer(img->frame, 1);
> if (ret < 0)
> @@ -1053,7 +1074,7 @@ static int apply_color_indexing_transform(WebPContext *s)
> return 0;
> }
>
> -static void update_canvas_size(AVCodecContext *avctx, int w, int h)
> +static void update_frame_size(AVCodecContext *avctx, int w, int h)
> {
> WebPContext *s = avctx->priv_data;
> if (s->width && s->width != w) {
> @@ -1076,7 +1097,6 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
> int w, h, ret, i, used;
>
> if (!is_alpha_chunk) {
> - s->lossless = 1;
> avctx->pix_fmt = AV_PIX_FMT_ARGB;
> }
>
> @@ -1093,7 +1113,7 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
> w = get_bits(&s->gb, 14) + 1;
> h = get_bits(&s->gb, 14) + 1;
>
> - update_canvas_size(avctx, w, h);
> + update_frame_size(avctx, w, h);
>
> ret = ff_set_dimensions(avctx, s->width, s->height);
> if (ret < 0)
> @@ -1290,7 +1310,6 @@ static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
> s->v.actually_webp = 1;
> }
> avctx->pix_fmt = s->has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
> - s->lossless = 0;
>
> if (data_size > INT_MAX) {
> av_log(avctx, AV_LOG_ERROR, "unsupported chunk size\n");
> @@ -1308,7 +1327,7 @@ static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
> if (!*got_frame)
> return AVERROR_INVALIDDATA;
>
> - update_canvas_size(avctx, avctx->width, avctx->height);
> + update_frame_size(avctx, avctx->width, avctx->height);
>
> if (s->has_alpha) {
> ret = vp8_lossy_decode_alpha(avctx, p, s->alpha_data,
> @@ -1318,41 +1337,17 @@ static int vp8_lossy_decode_frame(AVCodecContext *avctx, AVFrame *p,
> }
> return ret;
> }
> +int init_canvas_frame(WebPContext *s, int format, int key_frame);
>
> -static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> - int *got_frame, AVPacket *avpkt)
> +static int webp_decode_frame_common(AVCodecContext *avctx, uint8_t *data, int size,
> + int *got_frame, int key_frame)
The indentation of the continuation line is off; align the second parameter line with the first parameter.
> {
> WebPContext *s = avctx->priv_data;
> GetByteContext gb;
> int ret;
> uint32_t chunk_type, chunk_size;
> - int vp8x_flags = 0;
>
> - s->avctx = avctx;
> - s->width = 0;
> - s->height = 0;
> - *got_frame = 0;
> - s->has_alpha = 0;
> - s->has_exif = 0;
> - s->has_iccp = 0;
> - bytestream2_init(&gb, avpkt->data, avpkt->size);
> -
> - if (bytestream2_get_bytes_left(&gb) < 12)
> - return AVERROR_INVALIDDATA;
> -
> - if (bytestream2_get_le32(&gb) != MKTAG('R', 'I', 'F', 'F')) {
> - av_log(avctx, AV_LOG_ERROR, "missing RIFF tag\n");
> - return AVERROR_INVALIDDATA;
> - }
> -
> - chunk_size = bytestream2_get_le32(&gb);
> - if (bytestream2_get_bytes_left(&gb) < chunk_size)
> - return AVERROR_INVALIDDATA;
> -
> - if (bytestream2_get_le32(&gb) != MKTAG('W', 'E', 'B', 'P')) {
> - av_log(avctx, AV_LOG_ERROR, "missing WEBP tag\n");
> - return AVERROR_INVALIDDATA;
> - }
> + bytestream2_init(&gb, data, size);
>
> while (bytestream2_get_bytes_left(&gb) > 8) {
> char chunk_str[5] = { 0 };
> @@ -1363,6 +1358,10 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> return AVERROR_INVALIDDATA;
> chunk_size += chunk_size & 1;
>
> + // we need to dive into RIFF chunk
> + if (chunk_type == MKTAG('R', 'I', 'F', 'F'))
> + chunk_size = 4;
> +
> if (bytestream2_get_bytes_left(&gb) < chunk_size) {
> /* we seem to be running out of data, but it could also be that the
> bitstream has trailing junk leading to bogus chunk_size. */
> @@ -1370,10 +1369,26 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> }
>
> switch (chunk_type) {
> + case MKTAG('R', 'I', 'F', 'F'):
> + if (bytestream2_get_le32(&gb) != MKTAG('W', 'E', 'B', 'P')) {
> + av_log(avctx, AV_LOG_ERROR, "missing WEBP tag\n");
> + return AVERROR_INVALIDDATA;
> + }
> + s->vp8x_flags = 0;
> + s->canvas_width = 0;
> + s->canvas_height = 0;
> + s->has_exif = 0;
> + s->has_iccp = 0;
> + ff_thread_release_ext_buffer(avctx, &s->canvas_frame);
> + break;
> case MKTAG('V', 'P', '8', ' '):
> if (!*got_frame) {
> - ret = vp8_lossy_decode_frame(avctx, p, got_frame,
> - avpkt->data + bytestream2_tell(&gb),
> + ret = init_canvas_frame(s, AV_PIX_FMT_YUVA420P, key_frame);
> + if (ret < 0)
> + return ret;
> +
> + ret = vp8_lossy_decode_frame(avctx, s->frame, got_frame,
> + data + bytestream2_tell(&gb),
> chunk_size);
> if (ret < 0)
> return ret;
> @@ -1382,8 +1397,13 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> break;
> case MKTAG('V', 'P', '8', 'L'):
> if (!*got_frame) {
> - ret = vp8_lossless_decode_frame(avctx, p, got_frame,
> - avpkt->data + bytestream2_tell(&gb),
> + ret = init_canvas_frame(s, AV_PIX_FMT_ARGB, key_frame);
> + if (ret < 0)
> + return ret;
> + ff_thread_finish_setup(s->avctx);
> +
> + ret = vp8_lossless_decode_frame(avctx, s->frame, got_frame,
> + data + bytestream2_tell(&gb),
> chunk_size, 0);
> if (ret < 0)
> return ret;
> @@ -1392,14 +1412,16 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> bytestream2_skip(&gb, chunk_size);
> break;
> case MKTAG('V', 'P', '8', 'X'):
> - if (s->width || s->height || *got_frame) {
> + if (s->canvas_width || s->canvas_height || *got_frame) {
> av_log(avctx, AV_LOG_ERROR, "Canvas dimensions are already set\n");
> return AVERROR_INVALIDDATA;
> }
> - vp8x_flags = bytestream2_get_byte(&gb);
> + s->vp8x_flags = bytestream2_get_byte(&gb);
> bytestream2_skip(&gb, 3);
> s->width = bytestream2_get_le24(&gb) + 1;
> s->height = bytestream2_get_le24(&gb) + 1;
> + s->canvas_width = s->width;
> + s->canvas_height = s->height;
> ret = av_image_check_size(s->width, s->height, 0, avctx);
> if (ret < 0)
> return ret;
> @@ -1407,7 +1429,7 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> case MKTAG('A', 'L', 'P', 'H'): {
> int alpha_header, filter_m, compression;
>
> - if (!(vp8x_flags & VP8X_FLAG_ALPHA)) {
> + if (!(s->vp8x_flags & VP8X_FLAG_ALPHA)) {
> av_log(avctx, AV_LOG_WARNING,
> "ALPHA chunk present, but alpha bit not set in the "
> "VP8X header\n");
> @@ -1416,8 +1438,9 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> av_log(avctx, AV_LOG_ERROR, "invalid ALPHA chunk size\n");
> return AVERROR_INVALIDDATA;
> }
> +
> alpha_header = bytestream2_get_byte(&gb);
> - s->alpha_data = avpkt->data + bytestream2_tell(&gb);
> + s->alpha_data = data + bytestream2_tell(&gb);
> s->alpha_data_size = chunk_size - 1;
> bytestream2_skip(&gb, s->alpha_data_size);
>
> @@ -1444,14 +1467,13 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> av_log(avctx, AV_LOG_VERBOSE, "Ignoring extra EXIF chunk\n");
> goto exif_end;
> }
> - if (!(vp8x_flags & VP8X_FLAG_EXIF_METADATA))
> + if (!(s->vp8x_flags & VP8X_FLAG_EXIF_METADATA))
> av_log(avctx, AV_LOG_WARNING,
> "EXIF chunk present, but Exif bit not set in the "
> "VP8X header\n");
>
> s->has_exif = 1;
> - bytestream2_init(&exif_gb, avpkt->data + exif_offset,
> - avpkt->size - exif_offset);
> + bytestream2_init(&exif_gb, data + exif_offset, size - exif_offset);
> if (ff_tdecode_header(&exif_gb, &le, &ifd_offset) < 0) {
> av_log(avctx, AV_LOG_ERROR, "invalid TIFF header "
> "in Exif data\n");
> @@ -1464,7 +1486,7 @@ static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> goto exif_end;
> }
>
> - av_dict_copy(&p->metadata, exif_metadata, 0);
> + av_dict_copy(&s->frame->metadata, exif_metadata, 0);
>
> exif_end:
> av_dict_free(&exif_metadata);
> @@ -1479,21 +1501,64 @@ exif_end:
> bytestream2_skip(&gb, chunk_size);
> break;
> }
> - if (!(vp8x_flags & VP8X_FLAG_ICC))
> + if (!(s->vp8x_flags & VP8X_FLAG_ICC))
> av_log(avctx, AV_LOG_WARNING,
> "ICCP chunk present, but ICC Profile bit not set in the "
> "VP8X header\n");
>
> s->has_iccp = 1;
> - sd = av_frame_new_side_data(p, AV_FRAME_DATA_ICC_PROFILE, chunk_size);
> + sd = av_frame_new_side_data(s->frame, AV_FRAME_DATA_ICC_PROFILE, chunk_size);
> if (!sd)
> return AVERROR(ENOMEM);
>
> bytestream2_get_buffer(&gb, sd->data, chunk_size);
> break;
> }
> - case MKTAG('A', 'N', 'I', 'M'):
> + case MKTAG('A', 'N', 'I', 'M'): {
> + const AVPixFmtDescriptor *desc;
> + int a, r, g, b;
> + if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION)) {
> + av_log(avctx, AV_LOG_WARNING,
> + "ANIM chunk present, but animation bit not set in the "
> + "VP8X header\n");
> + }
> + // background is stored as BGRA, we need ARGB
> + s->background_argb[3] = b = bytestream2_get_byte(&gb);
> + s->background_argb[2] = g = bytestream2_get_byte(&gb);
> + s->background_argb[1] = r = bytestream2_get_byte(&gb);
> + s->background_argb[0] = a = bytestream2_get_byte(&gb);
> +
> + // convert the background color to YUVA
> + desc = av_pix_fmt_desc_get(AV_PIX_FMT_YUVA420P);
> + s->background_yuva[desc->comp[0].plane] = RGB_TO_Y_CCIR(r, g, b);
> + s->background_yuva[desc->comp[1].plane] = RGB_TO_U_CCIR(r, g, b, 0);
> + s->background_yuva[desc->comp[2].plane] = RGB_TO_V_CCIR(r, g, b, 0);
> + s->background_yuva[desc->comp[3].plane] = a;
> +
> + bytestream2_skip(&gb, 2); // loop count is ignored
> + break;
> + }
> case MKTAG('A', 'N', 'M', 'F'):
> + if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION)) {
> + av_log(avctx, AV_LOG_WARNING,
> + "ANMF chunk present, but animation bit not set in the "
> + "VP8X header\n");
> + }
> + s->pos_x = bytestream2_get_le24(&gb) * 2;
> + s->pos_y = bytestream2_get_le24(&gb) * 2;
> + s->width = bytestream2_get_le24(&gb) + 1;
> + s->height = bytestream2_get_le24(&gb) + 1;
> + bytestream2_skip(&gb, 3); // duration
> + s->anmf_flags = bytestream2_get_byte(&gb);
> +
> + if (s->width + s->pos_x > s->canvas_width ||
> + s->height + s->pos_y > s->canvas_height) {
> + av_log(avctx, AV_LOG_ERROR,
> + "frame does not fit into canvas\n");
> + return AVERROR_INVALIDDATA;
> + }
> + s->vp8x_flags |= VP8X_FLAG_ANIMATION;
> + break;
> case MKTAG('X', 'M', 'P', ' '):
> AV_WL32(chunk_str, chunk_type);
> av_log(avctx, AV_LOG_WARNING, "skipping unsupported chunk: %s\n",
> @@ -1509,21 +1574,508 @@ exif_end:
> }
> }
>
> - if (!*got_frame) {
> - av_log(avctx, AV_LOG_ERROR, "image data not found\n");
> - return AVERROR_INVALIDDATA;
> + return size;
> +}
> +
> +int init_canvas_frame(WebPContext *s, int format, int key_frame)
> +{
> + AVFrame *canvas = s->canvas_frame.f;
> + int height;
> + int ret;
> +
> + // canvas is needed only for animation
> + if (!(s->vp8x_flags & VP8X_FLAG_ANIMATION))
> + return 0;
> +
> + // avoid init for non-key frames whose format and size did not change
> + if (!key_frame &&
> + canvas->data[0] &&
> + canvas->format == format &&
> + canvas->width == s->canvas_width &&
> + canvas->height == s->canvas_height)
> + return 0;
> +
> + // canvas changes within IPPP sequences will loose thread sync
Typo: "loose" should be "lose".
> + // because of the ThreadFrame reallocation and will wait forever
> + // so if frame-threading is used, forbid canvas changes and unlock
> + // previous frames
I think this might need some additional work after landing. The
canvas, as webp defines it, is fixed. Individual frames can be
smaller. I think mixed content (lossy and lossless) might trigger this
path and result in corruption.
> + if (!key_frame && canvas->data[0]) {
> + if (s->avctx->thread_count > 1) {
> + av_log(s->avctx, AV_LOG_WARNING, "Canvas change detected. The output will be damaged. Use -threads 1 to try decoding with best effort.\n");
> + // unlock previous frames that have sent an _await() call
> + ff_thread_report_progress(&s->canvas_frame, INT_MAX, 0);
> + return AVERROR_PATCHWELCOME;
> + } else {
> + // warn for damaged frames
> + av_log(s->avctx, AV_LOG_WARNING, "Canvas change detected. The output will be damaged.\n");
> + }
> + }
> +
> [...]
> +
> +static int webp_decode_frame(AVCodecContext *avctx, AVFrame *p,
> + int *got_frame, AVPacket *avpkt)
> +{
> + WebPContext *s = avctx->priv_data;
> + int ret;
> + int key_frame = avpkt->flags & AV_PKT_FLAG_KEY;
> +
> + for (int i = 0; i < avpkt->side_data_elems; ++i) {
> + if (avpkt->side_data[i].type == AV_PKT_DATA_NEW_EXTRADATA) {
> + ret = webp_decode_frame_common(avctx, avpkt->side_data[i].data,
> + avpkt->side_data[i].size,
> + got_frame, key_frame);
The indentation of these continuation lines is off; align them with the first argument of the call.
More information about the ffmpeg-devel
mailing list