[FFmpeg-devel] [PATCH 1] avfilter/vf_avgblur: switch to faster algorithm
Paul B Mahol
onemda at gmail.com
Sun Sep 26 15:11:29 EEST 2021
On Sun, Sep 26, 2021 at 1:27 PM mypopy at gmail.com <mypopy at gmail.com> wrote:
> On Sun, Sep 26, 2021 at 4:11 PM Paul B Mahol <onemda at gmail.com> wrote:
> >
> > Signed-off-by: Paul B Mahol <onemda at gmail.com>
> > ---
> > libavfilter/vf_avgblur.c | 311 ++++++++++++++------------
> > tests/ref/fate/filter-refcmp-psnr-yuv | 80 +++----
> > 2 files changed, 211 insertions(+), 180 deletions(-)
> >
> > diff --git a/libavfilter/vf_avgblur.c b/libavfilter/vf_avgblur.c
> > index 3e222a43fa..a838285bb4 100644
> > --- a/libavfilter/vf_avgblur.c
> > +++ b/libavfilter/vf_avgblur.c
> > @@ -20,6 +20,7 @@
> > * SOFTWARE.
> > */
> >
> > +#include "libavutil/avassert.h"
> > #include "libavutil/imgutils.h"
> > #include "libavutil/opt.h"
> > #include "libavutil/pixdesc.h"
> > @@ -36,13 +37,15 @@ typedef struct AverageBlurContext {
> > int planes;
> >
> > int depth;
> > + int max;
> > + int area;
> > int planewidth[4];
> > int planeheight[4];
> > - float *buffer;
> > + void *buffer;
> > + uint16_t lut[256 * 256 * 256];
> > int nb_planes;
> >
> > - int (*filter_horizontally)(AVFilterContext *ctx, void *arg, int
> jobnr, int nb_jobs);
> > - int (*filter_vertically)(AVFilterContext *ctx, void *arg, int
> jobnr, int nb_jobs);
> > + int (*filter[2])(AVFilterContext *ctx, void *arg, int jobnr, int
> nb_jobs);
> > } AverageBlurContext;
> >
> > #define OFFSET(x) offsetof(AverageBlurContext, x)
> > @@ -60,124 +63,138 @@ AVFILTER_DEFINE_CLASS(avgblur);
> > typedef struct ThreadData {
> > int height;
> > int width;
> > - uint8_t *ptr;
> > - int linesize;
> > + const void *ptr;
> > + void *dptr;
> > + int linesize, dlinesize;
> > } ThreadData;
> >
> > -#define HORIZONTAL_FILTER(name, type)
> \
> > -static int filter_horizontally_##name(AVFilterContext *ctx, void *arg,
> int jobnr, int nb_jobs)\
> > -{
> \
> > - AverageBlurContext *s = ctx->priv;
> \
> > - ThreadData *td = arg;
> \
> > - const int height = td->height;
> \
> > - const int width = td->width;
> \
> > - const int slice_start = (height * jobnr ) / nb_jobs;
> \
> > - const int slice_end = (height * (jobnr+1)) / nb_jobs;
> \
> > - const int radius = FFMIN(s->radius, width / 2);
> \
> > - const int linesize = td->linesize / sizeof(type);
> \
> > - float *buffer = s->buffer;
> \
> > - const type *src;
> \
> > - float *ptr;
> \
> > - int y, x;
> \
> > -
> \
> > - /* Filter horizontally along each row */
> \
> > - for (y = slice_start; y < slice_end; y++) {
> \
> > - float acc = 0;
> \
> > - int count = 0;
> \
> > -
> \
> > - src = (const type *)td->ptr + linesize * y;
> \
> > - ptr = buffer + width * y;
> \
> > -
> \
> > - for (x = 0; x < radius; x++) {
> \
> > - acc += src[x];
> \
> > - }
> \
> > - count += radius;
> \
> > -
> \
> > - for (x = 0; x <= radius; x++) {
> \
> > - acc += src[x + radius];
> \
> > - count++;
> \
> > - ptr[x] = acc / count;
> \
> > - }
> \
> > -
> \
> > - for (; x < width - radius; x++) {
> \
> > - acc += src[x + radius] - src[x - radius - 1];
> \
> > - ptr[x] = acc / count;
> \
> > - }
> \
> > -
> \
> > - for (; x < width; x++) {
> \
> > - acc -= src[x - radius];
> \
> > - count--;
> \
> > - ptr[x] = acc / count;
> \
> > - }
> \
> > - }
> \
> > -
> \
> > - return 0;
> \
> > +#define LUT_DIV(sum, area) (lut[(sum)])
> > +#define SLOW_DIV(sum, area) ((sum) / (area))
> > +
> > +#define FILTER(name, type, btype, lutunused, areaunused, lutdiv)
> \
> > +static int filter_##name(AVFilterContext *ctx, void *arg, int jobnr,
> int nb_jobs) \
> > +{
> \
> > + AverageBlurContext *s = ctx->priv;
> \
> > + ThreadData *td = arg;
> \
> > + areaunused const int area = s->area;
> \
> > + lutunused const uint16_t *lut = s->lut;
> \
> > + const int size_w = s->radius;
> \
> > + const int size_h = s->radiusV;
> \
> > + btype *col_sum = (btype *)s->buffer + size_w;
> \
> > + const int dlinesize = td->dlinesize / sizeof(type);
> \
> > + const int linesize = td->linesize / sizeof(type);
> \
> > + const int height = td->height;
> \
> > + const int width = td->width;
> \
> > + const type *src = td->ptr;
> \
> > + type *dst = td->dptr;
> \
> > + btype sum = 0;
> \
> > +
> \
> > + for (int x = -size_w; x < 0; x++) {
> \
> > + sum = src[0] * size_h;
> \
> > + for (int y = 0; y <= size_h; y++)
> \
> > + sum += src[y * linesize];
> \
> > + av_assert2(sum >= 0);
> \
> > + col_sum[x] = sum;
> \
> > + }
> \
> > +
> \
> > + for (int x = 0; x < width; x++) {
> \
> > + sum = src[x] * size_h;
> \
> > + for (int y = 0; y <= size_h; y++)
> \
> > + sum += src[x + y * linesize];
> \
> > + av_assert2(sum >= 0);
> \
> > + col_sum[x] = sum;
> \
> > + }
> \
> > +
> \
> > + for (int x = width; x < width + size_w; x++) {
> \
> > + sum = src[width - 1] * size_h;
> \
> > + for (int y = 0; y <= size_h; y++)
> \
> > + sum += src[width - 1 + y * linesize];
> \
> > + av_assert2(sum >= 0);
> \
> > + col_sum[x] = sum;
> \
> > + }
> \
> > +
> \
> > + sum = 0;
> \
> > + for (int x = -size_w; x <= size_w; x++)
> \
> > + sum += col_sum[x];
> \
> > + av_assert2(sum >= 0);
> \
> > + dst[0] = lutdiv(sum, area);
> \
> > +
> \
> > + for (int x = 1; x < width; x++) {
> \
> > + sum = sum - col_sum[x - size_w - 1] + col_sum[x + size_w];
> \
> > + av_assert2(sum >= 0);
> \
> > + dst[x] = lutdiv(sum, area);
> \
> > + }
> \
> > +
> \
> > + src = td->ptr;
> \
> > + src += linesize;
> \
> > + dst += dlinesize;
> \
> > +
> \
> > + for (int y = 1; y < height; y++) {
> \
> > + const int syp = FFMIN(size_h, height - y - 1) * linesize;
> \
> > + const int syn = FFMIN(y, size_h + 1) * linesize;
> \
> > +
> \
> > + sum = 0;
> \
> > +
> \
> > + for (int x = -size_w; x < 0; x++)
> \
> > + col_sum[x] += src[0 + syp] - src[0 - syn];
> \
> > +
> \
> > + for (int x = 0; x < width; x++)
> \
> > + col_sum[x] += src[x + syp] - src[x - syn];
> \
> > +
> \
> > + for (int x = width; x < width + size_w; x++)
> \
> > + col_sum[x] += src[width - 1 + syp] - src[width - 1 - syn];
> \
> > +
> \
> > + for (int x = -size_w; x <= size_w; x++)
> \
> > + sum += col_sum[x];
> \
> > + av_assert2(sum >= 0);
> \
> > + dst[0] = lutdiv(sum, area);
> \
> > +
> \
> > + for (int x = 1; x < width; x++) {
> \
> > + sum = sum - col_sum[x - size_w - 1] + col_sum[x + size_w];
> \
> > + av_assert2(sum >= 0);
> \
> > + dst[x] = lutdiv(sum, area);
> \
> > + }
> \
> > +
> \
> > + src += linesize;
> \
> > + dst += dlinesize;
> \
> > + }
> \
> > +
> \
> > + return 0;
> \
> > }
> >
> > -HORIZONTAL_FILTER(8, uint8_t)
> > -HORIZONTAL_FILTER(16, uint16_t)
> > -
> > -#define VERTICAL_FILTER(name, type)
> \
> > -static int filter_vertically_##name(AVFilterContext *ctx, void *arg,
> int jobnr, int nb_jobs) \
> > -{
> \
> > - AverageBlurContext *s = ctx->priv;
> \
> > - ThreadData *td = arg;
> \
> > - const int height = td->height;
> \
> > - const int width = td->width;
> \
> > - const int slice_start = (width * jobnr ) / nb_jobs;
> \
> > - const int slice_end = (width * (jobnr+1)) / nb_jobs;
> \
> > - const int radius = FFMIN(s->radiusV, height / 2);
> \
> > - const int linesize = td->linesize / sizeof(type);
> \
> > - type *buffer = (type *)td->ptr;
> \
> > - const float *src;
> \
> > - type *ptr;
> \
> > - int i, x;
> \
> > -
> \
> > - /* Filter vertically along each column */
> \
> > - for (x = slice_start; x < slice_end; x++) {
> \
> > - float acc = 0;
> \
> > - int count = 0;
> \
> > -
> \
> > - src = s->buffer + x;
> \
> > -
> \
> > - for (i = 0; i < radius; i++) {
> \
> > - acc += src[0];
> \
> > - src += width;
> \
> > - }
> \
> > - count += radius;
> \
> > -
> \
> > - src = s->buffer + x;
> \
> > - ptr = buffer + x;
> \
> > - for (i = 0; i + radius < height && i <= radius; i++) {
> \
> > - acc += src[(i + radius) * width];
> \
> > - count++;
> \
> > - ptr[i * linesize] = acc / count;
> \
> > - }
> \
> > -
> \
> > - for (; i < height - radius; i++) {
> \
> > - acc += src[(i + radius) * width] - src[(i - radius - 1) *
> width]; \
> > - ptr[i * linesize] = acc / count;
> \
> > - }
> \
> > -
> \
> > - for (; i < height; i++) {
> \
> > - acc -= src[(i - radius) * width];
> \
> > - count--;
> \
> > - ptr[i * linesize] = acc / count;
> \
> > - }
> \
> > - }
> \
> > -
> \
> > - return 0;
> \
> > -}
> > +FILTER(lut8, uint8_t, int32_t, , av_unused, LUT_DIV)
> > +FILTER(lut16, uint16_t, int64_t, , av_unused, LUT_DIV)
> > +
> > +FILTER(slow8, uint8_t, int32_t, av_unused, , SLOW_DIV)
> > +FILTER(slow16, uint16_t, int64_t, av_unused, , SLOW_DIV)
> > +
> > +static void build_lut(AVFilterContext *ctx, int max)
> > +{
> > + AverageBlurContext *s = ctx->priv;
> > + const int area = (2 * s->radiusV + 1) * (2 * s->radius + 1);
> > +
> > + s->area = area;
> > + if (max * area >= FF_ARRAY_ELEMS(s->lut))
> > + return;
> > +
> > + for (int i = 0, j = 0, k = 0; i < max * area; i++, j++) {
> > + if (j == area) {
> > + k++;
> > + j = 0;
> > + }
> >
> > -VERTICAL_FILTER(8, uint8_t)
> > -VERTICAL_FILTER(16, uint16_t)
> > + s->lut[i] = k;
> > + }
> > +}
> >
> > static int config_input(AVFilterLink *inlink)
> > {
> > + AVFilterContext *ctx = inlink->dst;
> > const AVPixFmtDescriptor *desc =
> av_pix_fmt_desc_get(inlink->format);
> > - AverageBlurContext *s = inlink->dst->priv;
> > + AverageBlurContext *s = ctx->priv;
> >
> > s->depth = desc->comp[0].depth;
> > + s->max = 1 << s->depth;
> > s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w,
> desc->log2_chroma_w);
> > s->planewidth[0] = s->planewidth[3] = inlink->w;
> > s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h,
> desc->log2_chroma_h);
> > @@ -185,21 +202,20 @@ static int config_input(AVFilterLink *inlink)
> >
> > s->nb_planes = av_pix_fmt_count_planes(inlink->format);
> >
> > - s->buffer = av_malloc_array(inlink->w, inlink->h *
> sizeof(*s->buffer));
> > + s->buffer = av_calloc(inlink->w + (1024 * 2 + 1), 4 * ((s->depth +
> 7) / 8));
> > if (!s->buffer)
> > return AVERROR(ENOMEM);
> >
> > - if (s->radiusV <= 0) {
> > + if (s->radiusV <= 0)
> > s->radiusV = s->radius;
> > - }
> >
> > - if (s->depth == 8) {
> > - s->filter_horizontally = filter_horizontally_8;
> > - s->filter_vertically = filter_vertically_8;
> > - } else {
> > - s->filter_horizontally = filter_horizontally_16;
> > - s->filter_vertically = filter_vertically_16;
> > - }
> > + s->filter[0] = s->depth <= 8 ? filter_lut8 : filter_lut16;
> > + s->filter[1] = s->depth <= 8 ? filter_slow8 : filter_slow16;
> > +
> > + s->radius = FFMIN(s->planewidth[1] / 2, s->radius);
> > + s->radiusV = FFMIN(s->planeheight[1] / 2, s->radiusV);
> > +
> > + build_lut(ctx, s->max);
> >
> > return 0;
> > }
> > @@ -209,19 +225,16 @@ static void averageiir2d(AVFilterContext *ctx,
> AVFrame *in, AVFrame *out, int pl
> > AverageBlurContext *s = ctx->priv;
> > const int width = s->planewidth[plane];
> > const int height = s->planeheight[plane];
> > - const int nb_threads = ff_filter_get_nb_threads(ctx);
> > + const int slow = (s->max * s->area) >= FF_ARRAY_ELEMS(s->lut);
> > ThreadData td;
> >
> > td.width = width;
> > td.height = height;
> > td.ptr = in->data[plane];
> > td.linesize = in->linesize[plane];
> > - ff_filter_execute(ctx, s->filter_horizontally, &td,
> > - NULL, FFMIN(height, nb_threads));
> > - td.ptr = out->data[plane];
> > - td.linesize = out->linesize[plane];
> > - ff_filter_execute(ctx, s->filter_vertically, &td,
> > - NULL, FFMIN(width, nb_threads));
> > + td.dptr = out->data[plane];
> > + td.dlinesize = out->linesize[plane];
> > + s->filter[slow](ctx, &td, 0, 0);
> > }
> >
> > static int query_formats(AVFilterContext *ctx)
> > @@ -259,16 +272,12 @@ static int filter_frame(AVFilterLink *inlink,
> AVFrame *in)
> > AVFrame *out;
> > int plane;
> >
> > - if (av_frame_is_writable(in)) {
> > - out = in;
> > - } else {
> > - out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> > - if (!out) {
> > - av_frame_free(&in);
> > - return AVERROR(ENOMEM);
> > - }
> > - av_frame_copy_props(out, in);
> > + out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> > + if (!out) {
> > + av_frame_free(&in);
> > + return AVERROR(ENOMEM);
> > }
> > + av_frame_copy_props(out, in);
> >
> > for (plane = 0; plane < s->nb_planes; plane++) {
> > const int height = s->planeheight[plane];
> > @@ -285,11 +294,33 @@ static int filter_frame(AVFilterLink *inlink,
> AVFrame *in)
> > averageiir2d(ctx, in, out, plane);
> > }
> >
> > - if (out != in)
> > - av_frame_free(&in);
> > + av_frame_free(&in);
> > return ff_filter_frame(outlink, out);
> > }
> >
> > +static int process_command(AVFilterContext *ctx, const char *cmd, const
> char *args,
> > + char *res, int res_len, int flags)
> > +{
> > + AverageBlurContext *s = ctx->priv;
> > + const int area = s->area;
> > + int ret;
> > +
> > + ret = ff_filter_process_command(ctx, cmd, args, res, res_len,
> flags);
> > + if (ret < 0)
> > + return ret;
> > +
> > + if (s->radiusV <= 0)
> > + s->radiusV = s->radius;
> > +
> > + s->radius = FFMIN(s->planewidth[1] / 2, s->radius);
> > + s->radiusV = FFMIN(s->planeheight[1] / 2, s->radiusV);
> > +
> > + if (area != (2 * s->radiusV + 1) * (2 * s->radius + 1))
> > + build_lut(ctx, s->max);
> > +
> > + return 0;
> > +}
> > +
> > static av_cold void uninit(AVFilterContext *ctx)
> > {
> > AverageBlurContext *s = ctx->priv;
> > @@ -322,6 +353,6 @@ const AVFilter ff_vf_avgblur = {
> > .query_formats = query_formats,
> > FILTER_INPUTS(avgblur_inputs),
> > FILTER_OUTPUTS(avgblur_outputs),
> > - .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC |
> AVFILTER_FLAG_SLICE_THREADS,
> > - .process_command = ff_filter_process_command,
> > + .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,
> > + .process_command = process_command,
> > };
> > diff --git a/tests/ref/fate/filter-refcmp-psnr-yuv
> b/tests/ref/fate/filter-refcmp-psnr-yuv
> > index 0e634ed0e4..196d3da74e 100644
> > --- a/tests/ref/fate/filter-refcmp-psnr-yuv
> > +++ b/tests/ref/fate/filter-refcmp-psnr-yuv
> > @@ -1,45 +1,45 @@
> > frame:0 pts:0 pts_time:0
> > -lavfi.psnr.mse.y=222.06
> > -lavfi.psnr.psnr.y=24.67
> > -lavfi.psnr.mse.u=339.38
> > -lavfi.psnr.psnr.u=22.82
> > -lavfi.psnr.mse.v=705.41
> > -lavfi.psnr.psnr.v=19.65
> > -lavfi.psnr.mse_avg=372.23
> > -lavfi.psnr.psnr_avg=22.42
> > +lavfi.psnr.mse.y=218.435333
> > +lavfi.psnr.psnr.y=24.737576
> > +lavfi.psnr.mse.u=336.693390
> > +lavfi.psnr.psnr.u=22.858458
> > +lavfi.psnr.mse.v=698.968384
> > +lavfi.psnr.psnr.v=19.686228
> > +lavfi.psnr.mse_avg=368.133118
> > +lavfi.psnr.psnr_avg=22.470755
> > frame:1 pts:1 pts_time:1
> > -lavfi.psnr.mse.y=236.74
> > -lavfi.psnr.psnr.y=24.39
> > -lavfi.psnr.mse.u=416.17
> > -lavfi.psnr.psnr.u=21.94
> > -lavfi.psnr.mse.v=704.98
> > -lavfi.psnr.psnr.v=19.65
> > -lavfi.psnr.mse_avg=398.66
> > -lavfi.psnr.psnr_avg=22.12
> > +lavfi.psnr.mse.y=232.656189
> > +lavfi.psnr.psnr.y=24.463657
> > +lavfi.psnr.mse.u=413.841064
> > +lavfi.psnr.psnr.u=21.962467
> > +lavfi.psnr.mse.v=693.103577
> > +lavfi.psnr.psnr.v=19.722822
> > +lavfi.psnr.mse_avg=393.064240
> > +lavfi.psnr.psnr_avg=22.186169
> > frame:2 pts:2 pts_time:2
> > -lavfi.psnr.mse.y=234.79
> > -lavfi.psnr.psnr.y=24.42
> > -lavfi.psnr.mse.u=435.72
> > -lavfi.psnr.psnr.u=21.74
> > -lavfi.psnr.mse.v=699.60
> > -lavfi.psnr.psnr.v=19.68
> > -lavfi.psnr.mse_avg=401.23
> > -lavfi.psnr.psnr_avg=22.10
> > +lavfi.psnr.mse.y=230.470032
> > +lavfi.psnr.psnr.y=24.504660
> > +lavfi.psnr.mse.u=433.524109
> > +lavfi.psnr.psnr.u=21.760672
> > +lavfi.psnr.mse.v=693.391174
> > +lavfi.psnr.psnr.v=19.721020
> > +lavfi.psnr.mse_avg=396.963837
> > +lavfi.psnr.psnr_avg=22.143293
> > frame:3 pts:3 pts_time:3
> > -lavfi.psnr.mse.y=250.88
> > -lavfi.psnr.psnr.y=24.14
> > -lavfi.psnr.mse.u=479.73
> > -lavfi.psnr.psnr.u=21.32
> > -lavfi.psnr.mse.v=707.55
> > -lavfi.psnr.psnr.v=19.63
> > -lavfi.psnr.mse_avg=422.26
> > -lavfi.psnr.psnr_avg=21.88
> > +lavfi.psnr.mse.y=247.346817
> > +lavfi.psnr.psnr.y=24.197741
> > +lavfi.psnr.mse.u=476.365723
> > +lavfi.psnr.psnr.u=21.351398
> > +lavfi.psnr.mse.v=700.987549
> > +lavfi.psnr.psnr.v=19.673700
> > +lavfi.psnr.mse_avg=418.011719
> > +lavfi.psnr.psnr_avg=21.918919
> > frame:4 pts:4 pts_time:4
> > -lavfi.psnr.mse.y=241.05
> > -lavfi.psnr.psnr.y=24.31
> > -lavfi.psnr.mse.u=505.04
> > -lavfi.psnr.psnr.u=21.10
> > -lavfi.psnr.mse.v=716.00
> > -lavfi.psnr.psnr.v=19.58
> > -lavfi.psnr.mse_avg=425.79
> > -lavfi.psnr.psnr_avg=21.84
> > +lavfi.psnr.mse.y=237.129654
> > +lavfi.psnr.psnr.y=24.380945
> > +lavfi.psnr.mse.u=503.722931
> > +lavfi.psnr.psnr.u=21.108887
> > +lavfi.psnr.mse.v=708.932678
> > +lavfi.psnr.psnr.v=19.624754
> > +lavfi.psnr.mse_avg=421.728729
> > +lavfi.psnr.psnr_avg=21.880472
> > --
> > 2.33.0
> >
>
> Do you have some performance data after applying the faster algorithm
> in your test bed? I think the data will help others, thx
>
The previous algorithm used floats and did everything in two passes.
This code is several times faster than the previous code, or runs at the same
speed (when using large size values).
For small radii, divisions are avoided by using LUTs, and this gives the biggest
speed-up.
I have numbers, but they are only useful for my setup. The filter is now even
faster than the heavily optimized gblur filter under clang-12.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>
More information about the ffmpeg-devel
mailing list