[FFmpeg-devel] [PATCH v2 1/2] libavcodec: add support for animated WebP decoding

Wed Jul 29 18:43:07 EEST 2020

Jul 29, 2020, 14:05 by josef at pex.com:

> Fixes: 4907
>
> Adds support for decoding of animated WebP.
> +
> +    ff_thread_release_buffer(s->avctx, &s->canvas_frame);
> +    ret = ff_thread_get_buffer(s->avctx, &s->canvas_frame, AV_GET_BUFFER_FLAG_REF);
> +    if (ret < 0)
> +        return ret;
> +
> +    if (canvas->format == AV_PIX_FMT_ARGB) {
> +        height = canvas->height;
> +        memset(canvas->data[0], 0, height * canvas->linesize[0]);
>

At least some of those image-wide memsets can be replaced with
av_image_fill_black.

> +// divide by 255 and round to nearest
> +// apply a fast variant: (X+127)/255 = ((X+127)*257+257)>>16 = ((X+128)*257)>>16
> +#define FAST_DIV255(x) ((((x) + 128) * 257) >> 16)
>

If you don't need nearest rounding a div would likely be faster, especially at this range.

> +        if (canvas->format == AV_PIX_FMT_ARGB) {
> +            width  = s->width;
> +            height = s->height;
> +            pos_x  = s->pos_x;
> +            pos_y  = s->pos_y;
> +
> +            for (y = 0; y < height; y++) {
> +                const uint8_t *src = frame->data[0] + y * frame->linesize[0];
> +                uint8_t *dst = canvas->data[0] + (y + pos_y) * canvas->linesize[0] + pos_x * sizeof(uint32_t);
> +                for (x = 0; x < width; x++) {
> +                    int src_alpha = src[0];
> +                    int dst_alpha = dst[0];
> +
> +                    if (src_alpha == 255) {
> +                        memcpy(dst, src, 4);
> +                    } else if (dst_alpha == 255) {
> +                        dst[0] = 255;
> +                        dst[1] = FAST_DIV255(src[1] * src_alpha + dst[1] * (255 - src_alpha));
> +                        dst[2] = FAST_DIV255(src[2] * src_alpha + dst[2] * (255 - src_alpha));
> +                        dst[3] = FAST_DIV255(src[3] * src_alpha + dst[3] * (255 - src_alpha));
> +                    } else if (src_alpha + dst_alpha == 0) {
> +                        memset(dst, 0, 4);
> +                    } else {
> +                        int dst_alpha2 = dst_alpha - FAST_DIV255(src_alpha * dst_alpha);
> +                        int blend_alpha = src_alpha + dst_alpha2;
> +                        av_assert0(blend_alpha);
>

Why is there an assert here or in fact anywhere in the pixel path?
We really don't want to abort() in a library if someone sends some wrong data.

> +                    // calculate the average alpha of the tile
> +                    int src_alpha = 0;
> +                    int dst_alpha = 0;
> +                    for (yy = 0; yy < tile_h; yy++) {
> +                        for (xx = 0; xx < tile_w; xx++) {
> +                            src_alpha += frame->data[plane_a][(y * tile_h + yy) * frame->linesize[plane_a] + (x * tile_w + xx)];
> +                            dst_alpha += canvas->data[plane_a][((y + pos_y) * tile_h + yy) * canvas->linesize[plane_a] + ((x + pos_x) * tile_w + xx)];
> +                        }
> +                    }
> +                    src_alpha = RSHIFT(src_alpha, desc->log2_chroma_w + desc->log2_chroma_h);
> +                    dst_alpha = RSHIFT(dst_alpha, desc->log2_chroma_w + desc->log2_chroma_h);
>

This is some pretty horrible and iffy code to see in a decoder, sadly no choice
but to live with it. And blame google. A lot. For everything.

> +                    if (src_alpha == 255) {
> +                        *dst_u = *src_u;
> +                        *dst_v = *src_v;
> +                    } else if (dst_alpha == 255) {
> +                        *dst_u = FAST_DIV255(*src_u * src_alpha + *dst_u * (255 - src_alpha));
> +                        *dst_v = FAST_DIV255(*src_v * src_alpha + *dst_v * (255 - src_alpha));
> +                    } else if (src_alpha + dst_alpha == 0) {
> +                        *dst_u = s->transparent_yuva[1];
> +                        *dst_v = s->transparent_yuva[2];
> +                    } else {
> +                        int dst_alpha2 = dst_alpha - FAST_DIV255(src_alpha * dst_alpha);
> +                        int blend_alpha = src_alpha + dst_alpha2;
> +                        av_assert0(blend_alpha);
> +
> +                        *dst_u = ROUNDED_DIV(*src_u * src_alpha + *dst_u * dst_alpha2, blend_alpha);
> +                        *dst_v = ROUNDED_DIV(*src_v * src_alpha + *dst_v * dst_alpha2, blend_alpha);
> +                    }
>

Are you sure a branch is faster here? A mispredicted branch is something like 30 cycles,
while the FAST_DIV255 macro seems like it has less instructions than that.

> +        for (y = 0; y < height; y++) {
> +            const uint8_t *src = canvas->data[0] + y * canvas->linesize[0];
> +            uint8_t *dst = frame->data[0] + y * frame->linesize[0];
> +            for (x = 0; x < width; x++) {
> +                int src_alpha = src[0];
> +
> +                if (src_alpha == 255) {
> +                    memcpy(dst, src, 4);
> +                } else if (background_alpha == 255) {
> +                    dst[0] = 255;
> +                    dst[1] = FAST_DIV255(src[1] * src_alpha + s->background_argb[1] * (255 - src_alpha));
> +                    dst[2] = FAST_DIV255(src[2] * src_alpha + s->background_argb[2] * (255 - src_alpha));
> +                    dst[3] = FAST_DIV255(src[3] * src_alpha + s->background_argb[3] * (255 - src_alpha));
> +                } else if (src_alpha + background_alpha == 0) {
> +                    memset(dst, 0, 4);
> +                } else {
> +                    int dst_alpha2 = background_alpha - FAST_DIV255(src_alpha * background_alpha);
> +                    int blend_alpha = src_alpha + dst_alpha2;
> +                    av_assert0(blend_alpha);
> +
> +                    dst[0] = blend_alpha;
> +                    dst[1] = ROUNDED_DIV(src[1] * src_alpha + s->background_argb[1] * dst_alpha2, blend_alpha);
> +                    dst[2] = ROUNDED_DIV(src[2] * src_alpha + s->background_argb[2] * dst_alpha2, blend_alpha);
> +                    dst[3] = ROUNDED_DIV(src[3] * src_alpha + s->background_argb[3] * dst_alpha2, blend_alpha);
> +                }
> +                src += 4;
> +                dst += 4;
> +            }
> +        }
>

Could you abstract some blending code in the function? I feel like its repeated a lot of times.

>  AVCodec ff_webp_decoder = {
>  .name           = "webp",
>  .long_name      = NULL_IF_CONFIG_SMALL("WebP image"),
>  .type           = AVMEDIA_TYPE_VIDEO,
>  .id             = AV_CODEC_ID_WEBP,
>  .priv_data_size = sizeof(WebPContext),
> +    .update_thread_context = ONLY_IF_THREADS_ENABLED(webp_update_thread_context),
> +    .init           = webp_decode_init,
>  .decode         = webp_decode_frame,
>  .close          = webp_decode_close,
>  .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
> +    .caps_internal  = FF_CODEC_CAP_ALLOCATE_PROGRESS,
>  };
>

Don't you need to add a flush function to enable proper seeking?
It gets called when you seek. You won't get any data from in between the seek start and end.

The blending code really needs some abstraction. Its 80% of the whole patch.
>From the few glimpses of non-blending code I saw, that part looked okay.