[FFmpeg-devel] [PATCH 1/2] avfilter/vf_zscale: add slice threading
Pavel Koshevoy
pkoshevoy at gmail.com
Fri May 31 22:44:08 EEST 2019
On Fri, May 31, 2019 at 4:46 AM Paul B Mahol <onemda at gmail.com> wrote:
>
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
> libavfilter/vf_zscale.c | 335 +++++++++++++++++++++++++---------------
> 1 file changed, 214 insertions(+), 121 deletions(-)
>
> diff --git a/libavfilter/vf_zscale.c b/libavfilter/vf_zscale.c
> index f0309272fa..c53bb08ccc 100644
> --- a/libavfilter/vf_zscale.c
> +++ b/libavfilter/vf_zscale.c
> @@ -74,6 +74,16 @@ enum var_name {
> VARS_NB
> };
>
> +typedef struct ZScaleThreadContext {
> + void *tmp;
> + size_t tmp_size;
> +
> + zimg_image_format src_format, dst_format;
> + zimg_image_format alpha_src_format, alpha_dst_format;
> + zimg_graph_builder_params alpha_params, params;
> + zimg_filter_graph *alpha_graph, *graph;
> +} ZScaleThreadContext;
> +
> typedef struct ZScaleContext {
> const AVClass *class;
>
> @@ -100,6 +110,8 @@ typedef struct ZScaleContext {
> double nominal_peak_luminance;
> int approximate_gamma;
>
> + int nb_threads;
> +
> char *w_expr; ///< width expression string
> char *h_expr; ///< height expression string
>
> @@ -110,13 +122,7 @@ typedef struct ZScaleContext {
>
> int force_original_aspect_ratio;
>
> - void *tmp;
> - size_t tmp_size;
> -
> - zimg_image_format src_format, dst_format;
> - zimg_image_format alpha_src_format, alpha_dst_format;
> - zimg_graph_builder_params alpha_params, params;
> - zimg_filter_graph *alpha_graph, *graph;
> + ZScaleThreadContext *ztd;
>
> enum AVColorSpace in_colorspace, out_colorspace;
> enum AVColorTransferCharacteristic in_trc, out_trc;
> @@ -204,6 +210,12 @@ static int config_props(AVFilterLink *outlink)
> int ret;
> int factor_w, factor_h;
>
> + s->nb_threads = ff_filter_get_nb_threads(ctx);
> + av_freep(&s->ztd);
> + s->ztd = av_calloc(s->nb_threads, sizeof(*s->ztd));
> + if (!s->ztd)
> + return AVERROR(ENOMEM);
> +
> var_values[VAR_IN_W] = var_values[VAR_IW] = inlink->w;
> var_values[VAR_IN_H] = var_values[VAR_IH] = inlink->h;
> var_values[VAR_OUT_W] = var_values[VAR_OW] = NAN;
> @@ -458,10 +470,12 @@ static int convert_range(enum AVColorRange color_range)
> }
>
> static void format_init(zimg_image_format *format, AVFrame *frame, const AVPixFmtDescriptor *desc,
> - int colorspace, int primaries, int transfer, int range, int location)
> + int colorspace, int primaries, int transfer, int range, int location,
> + int width, int height,
> + double active_top, double active_height, int set_active)
> {
> - format->width = frame->width;
> - format->height = frame->height;
> + format->width = width;
> + format->height = height;
> format->subsample_w = desc->log2_chroma_w;
> format->subsample_h = desc->log2_chroma_h;
> format->depth = desc->comp[0].depth;
> @@ -472,6 +486,10 @@ static void format_init(zimg_image_format *format, AVFrame *frame, const AVPixFm
> format->transfer_characteristics = transfer == - 1 ? convert_trc(frame->color_trc) : transfer;
> format->pixel_range = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? ZIMG_RANGE_FULL : range == -1 ? convert_range(frame->color_range) : range;
> format->chroma_location = location == -1 ? convert_chroma_location(frame->chroma_location) : location;
> + if (!set_active)
> + return;
> + format->active_region.top = active_top;
> + format->active_region.height = active_height;
> }
>
> static int graph_build(zimg_filter_graph **graph, zimg_graph_builder_params *params,
> @@ -502,16 +520,163 @@ static int graph_build(zimg_filter_graph **graph, zimg_graph_builder_params *par
> return 0;
> }
>
> +typedef struct ThreadData {
> + AVFrame *in, *out;
> + const AVPixFmtDescriptor *desc, *odesc;
> +} ThreadData;
> +
> +static int prepare_graph(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
> +{
> + ZScaleContext *s = ctx->priv;
> + ThreadData *td = arg;
> + AVFrame *in = td->in;
> + AVFrame *out = td->out;
> + const AVPixFmtDescriptor *desc = td->desc;
> + const AVPixFmtDescriptor *odesc = td->odesc;
> + const int in_slice_start = (in->height * jobnr) / nb_jobs;
> + const int in_slice_end = (in->height * (jobnr+1)) / nb_jobs;
> + const int out_slice_start = (out->height * jobnr) / nb_jobs;
> + const int out_slice_end = (out->height * (jobnr+1)) / nb_jobs;
> + const double scale_h = (double)in->height / (double)out->height;
> + double active_top = out_slice_start * scale_h;
> + double active_height = (out_slice_end - out_slice_start) * scale_h;
> + int ret;
> +
> + zimg_image_format_default(&s->ztd[jobnr].src_format, ZIMG_API_VERSION);
> + zimg_image_format_default(&s->ztd[jobnr].dst_format, ZIMG_API_VERSION);
> + zimg_graph_builder_params_default(&s->ztd[jobnr].params, ZIMG_API_VERSION);
> +
> + s->ztd[jobnr].params.dither_type = s->dither;
> + s->ztd[jobnr].params.cpu_type = ZIMG_CPU_AUTO;
> + s->ztd[jobnr].params.resample_filter = s->filter;
> + s->ztd[jobnr].params.resample_filter_uv = s->filter;
> + s->ztd[jobnr].params.nominal_peak_luminance = s->nominal_peak_luminance;
> + s->ztd[jobnr].params.allow_approximate_gamma = s->approximate_gamma;
> +
> + format_init(&s->ztd[jobnr].src_format, in, desc, s->colorspace_in,
> + s->primaries_in, s->trc_in, s->range_in, s->chromal_in,
> + in->width, in->height,
> + active_top, active_height, 1);
> + format_init(&s->ztd[jobnr].dst_format, out, odesc, s->colorspace,
> + s->primaries, s->trc, s->range, s->chromal,
> + out->width, out_slice_end - out_slice_start,
> + 0, 0, 0);
> +
> + ret = graph_build(&s->ztd[jobnr].graph, &s->ztd[jobnr].params, &s->ztd[jobnr].src_format, &s->ztd[jobnr].dst_format,
> + &s->ztd[jobnr].tmp, &s->ztd[jobnr].tmp_size);
> + if (ret)
> + return ret;
> +
> + if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> + zimg_image_format_default(&s->ztd[jobnr].alpha_src_format, ZIMG_API_VERSION);
> + zimg_image_format_default(&s->ztd[jobnr].alpha_dst_format, ZIMG_API_VERSION);
> + zimg_graph_builder_params_default(&s->ztd[jobnr].alpha_params, ZIMG_API_VERSION);
> +
> + s->ztd[jobnr].alpha_params.dither_type = s->dither;
> + s->ztd[jobnr].alpha_params.cpu_type = ZIMG_CPU_AUTO;
> + s->ztd[jobnr].alpha_params.resample_filter = s->filter;
> +
> + s->ztd[jobnr].alpha_src_format.width = in->width;
> + s->ztd[jobnr].alpha_src_format.height = in->height;
> + s->ztd[jobnr].alpha_src_format.depth = desc->comp[0].depth;
> + s->ztd[jobnr].alpha_src_format.pixel_type = (desc->flags & AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : desc->comp[0].depth > 8 ? ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE;
> + s->ztd[jobnr].alpha_src_format.color_family = ZIMG_COLOR_GREY;
> + s->ztd[jobnr].alpha_src_format.active_region.left = 0;
> + s->ztd[jobnr].alpha_src_format.active_region.top = in_slice_start;
> + s->ztd[jobnr].alpha_src_format.active_region.width = in->width;
> + s->ztd[jobnr].alpha_src_format.active_region.height = in_slice_end - in_slice_start;
> +
> + s->ztd[jobnr].alpha_dst_format.width = out->width;
> + s->ztd[jobnr].alpha_dst_format.height = out->height;
> + s->ztd[jobnr].alpha_dst_format.depth = odesc->comp[0].depth;
> + s->ztd[jobnr].alpha_dst_format.pixel_type = (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : odesc->comp[0].depth > 8 ? ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE;
> + s->ztd[jobnr].alpha_dst_format.color_family = ZIMG_COLOR_GREY;
> +
> + zimg_filter_graph_free(s->ztd[jobnr].alpha_graph);
> + s->ztd[jobnr].alpha_graph = zimg_filter_graph_build(&s->ztd[jobnr].alpha_src_format, &s->ztd[jobnr].alpha_dst_format, &s->ztd[jobnr].alpha_params);
> + if (!s->ztd[jobnr].alpha_graph) {
> + return print_zimg_error(ctx);
> + }
> + }
> +
> + return 0;
> +}
> +
> +static int zscale_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
> +{
> + ZScaleContext *s = ctx->priv;
> + ThreadData *td = arg;
> + AVFrame *in = td->in;
> + AVFrame *out = td->out;
> + const AVPixFmtDescriptor *desc = td->desc;
> + const AVPixFmtDescriptor *odesc = td->odesc;
> + zimg_image_buffer_const src_buf = { ZIMG_API_VERSION };
> + zimg_image_buffer dst_buf = { ZIMG_API_VERSION };
> + int ret = AVERROR(EINVAL);
> +
> + for (int plane = 0; plane < 3; plane++) {
> + const int height = plane > 0 ? AV_CEIL_RSHIFT(out->height, odesc->log2_chroma_h) : out->height;
> + const int out_slice_start = (height * jobnr) / nb_jobs;
> + int p = desc->comp[plane].plane;
> +
> + src_buf.plane[plane].data = in->data[p];
> + src_buf.plane[plane].stride = in->linesize[p];
> + src_buf.plane[plane].mask = -1;
> +
> + p = odesc->comp[plane].plane;
> + dst_buf.plane[plane].data = out->data[p] + out_slice_start * out->linesize[p];
> + dst_buf.plane[plane].stride = out->linesize[p];
> + dst_buf.plane[plane].mask = -1;
> + }
> +
> + if (s->ztd[jobnr].graph)
> + ret = zimg_filter_graph_process(s->ztd[jobnr].graph, &src_buf, &dst_buf, s->ztd[jobnr].tmp, 0, 0, 0, 0);
> + if (ret)
> + return print_zimg_error(ctx);
> +
> + if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> + const int out_slice_start = (out->height * jobnr) / nb_jobs;
> +
> + src_buf.plane[0].data = in->data[3];
> + src_buf.plane[0].stride = in->linesize[3];
> + src_buf.plane[0].mask = -1;
> +
> + dst_buf.plane[0].data = out->data[3] + out_slice_start * out->linesize[3];
> + dst_buf.plane[0].stride = out->linesize[3];
> + dst_buf.plane[0].mask = -1;
> +
> + ret = zimg_filter_graph_process(s->ztd[jobnr].alpha_graph, &src_buf, &dst_buf, s->ztd[jobnr].tmp, 0, 0, 0, 0);
> + if (ret)
> + return print_zimg_error(ctx);
> + } else if (odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> + int x, y;
> +
> + if (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) {
> + for (y = 0; y < out->height; y++) {
> + for (x = 0; x < out->width; x++) {
> + AV_WN32(out->data[3] + x * odesc->comp[3].step + y * out->linesize[3],
> + av_float2int(1.0f));
> + }
> + }
> + } else {
> + for (y = 0; y < out->height; y++)
> + memset(out->data[3] + y * out->linesize[3], 0xff, out->width);
> + }
> + }
> +
> + return 0;
> +}
> +
> static int filter_frame(AVFilterLink *link, AVFrame *in)
> {
> - ZScaleContext *s = link->dst->priv;
> - AVFilterLink *outlink = link->dst->outputs[0];
> + AVFilterContext *ctx = link->dst;
> + ZScaleContext *s = ctx->priv;
> + AVFilterLink *outlink = ctx->outputs[0];
> const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(link->format);
> const AVPixFmtDescriptor *odesc = av_pix_fmt_desc_get(outlink->format);
> - zimg_image_buffer_const src_buf = { ZIMG_API_VERSION };
> - zimg_image_buffer dst_buf = { ZIMG_API_VERSION };
> char buf[32];
> - int ret = 0, plane;
> + int ret = 0;
> + ThreadData td;
> AVFrame *out;
>
> out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> @@ -552,41 +717,28 @@ static int filter_frame(AVFilterLink *link, AVFrame *in)
> return ret;
> }
>
> - zimg_image_format_default(&s->src_format, ZIMG_API_VERSION);
> - zimg_image_format_default(&s->dst_format, ZIMG_API_VERSION);
> - zimg_graph_builder_params_default(&s->params, ZIMG_API_VERSION);
> -
> - s->params.dither_type = s->dither;
> - s->params.cpu_type = ZIMG_CPU_AUTO;
> - s->params.resample_filter = s->filter;
> - s->params.resample_filter_uv = s->filter;
> - s->params.nominal_peak_luminance = s->nominal_peak_luminance;
> - s->params.allow_approximate_gamma = s->approximate_gamma;
> -
> - format_init(&s->src_format, in, desc, s->colorspace_in,
> - s->primaries_in, s->trc_in, s->range_in, s->chromal_in);
> - format_init(&s->dst_format, out, odesc, s->colorspace,
> - s->primaries, s->trc, s->range, s->chromal);
> + td.out = out;
> + td.in = in;
> + td.desc = desc;
> + td.odesc = odesc;
> + ret = ctx->internal->execute(ctx, prepare_graph, &td, NULL, FFMIN3(in->height, out->height, s->nb_threads));
> + if (ret)
> + goto fail;
>
> if (s->colorspace != -1)
> - out->colorspace = (int)s->dst_format.matrix_coefficients;
> + out->colorspace = (int)s->ztd[0].dst_format.matrix_coefficients;
>
> if (s->primaries != -1)
> - out->color_primaries = (int)s->dst_format.color_primaries;
> + out->color_primaries = (int)s->ztd[0].dst_format.color_primaries;
>
> if (s->range != -1)
> - out->color_range = (int)s->dst_format.pixel_range + 1;
> + out->color_range = (int)s->ztd[0].dst_format.pixel_range + 1;
>
> if (s->trc != -1)
> - out->color_trc = (int)s->dst_format.transfer_characteristics;
> + out->color_trc = (int)s->ztd[0].dst_format.transfer_characteristics;
>
> if (s->chromal != -1)
> - out->chroma_location = (int)s->dst_format.chroma_location - 1;
> -
> - ret = graph_build(&s->graph, &s->params, &s->src_format, &s->dst_format,
> - &s->tmp, &s->tmp_size);
> - if (ret < 0)
> - goto fail;
> + out->chroma_location = (int)s->ztd[0].dst_format.chroma_location - 1;
>
> s->in_colorspace = in->colorspace;
> s->in_trc = in->color_trc;
> @@ -596,101 +748,38 @@ static int filter_frame(AVFilterLink *link, AVFrame *in)
> s->out_trc = out->color_trc;
> s->out_primaries = out->color_primaries;
> s->out_range = out->color_range;
> -
> - if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> - zimg_image_format_default(&s->alpha_src_format, ZIMG_API_VERSION);
> - zimg_image_format_default(&s->alpha_dst_format, ZIMG_API_VERSION);
> - zimg_graph_builder_params_default(&s->alpha_params, ZIMG_API_VERSION);
> -
> - s->alpha_params.dither_type = s->dither;
> - s->alpha_params.cpu_type = ZIMG_CPU_AUTO;
> - s->alpha_params.resample_filter = s->filter;
> -
> - s->alpha_src_format.width = in->width;
> - s->alpha_src_format.height = in->height;
> - s->alpha_src_format.depth = desc->comp[0].depth;
> - s->alpha_src_format.pixel_type = (desc->flags & AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : desc->comp[0].depth > 8 ? ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE;
> - s->alpha_src_format.color_family = ZIMG_COLOR_GREY;
> -
> - s->alpha_dst_format.width = out->width;
> - s->alpha_dst_format.height = out->height;
> - s->alpha_dst_format.depth = odesc->comp[0].depth;
> - s->alpha_dst_format.pixel_type = (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : odesc->comp[0].depth > 8 ? ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE;
> - s->alpha_dst_format.color_family = ZIMG_COLOR_GREY;
> -
> - zimg_filter_graph_free(s->alpha_graph);
> - s->alpha_graph = zimg_filter_graph_build(&s->alpha_src_format, &s->alpha_dst_format, &s->alpha_params);
> - if (!s->alpha_graph) {
> - ret = print_zimg_error(link->dst);
> - goto fail;
> - }
> - }
> }
>
> if (s->colorspace != -1)
> - out->colorspace = (int)s->dst_format.matrix_coefficients;
> + out->colorspace = (int)s->ztd[0].dst_format.matrix_coefficients;
>
> if (s->primaries != -1)
> - out->color_primaries = (int)s->dst_format.color_primaries;
> + out->color_primaries = (int)s->ztd[0].dst_format.color_primaries;
>
> if (s->range != -1)
> - out->color_range = (int)s->dst_format.pixel_range;
> + out->color_range = (int)s->ztd[0].dst_format.pixel_range;
>
> if (s->trc != -1)
> - out->color_trc = (int)s->dst_format.transfer_characteristics;
> + out->color_trc = (int)s->ztd[0].dst_format.transfer_characteristics;
> +
> + if (s->chromal != -1)
> + out->chroma_location = (int)s->ztd[0].dst_format.chroma_location - 1;
>
> av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
> (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
> (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
> INT_MAX);
>
> - for (plane = 0; plane < 3; plane++) {
> - int p = desc->comp[plane].plane;
> - src_buf.plane[plane].data = in->data[p];
> - src_buf.plane[plane].stride = in->linesize[p];
> - src_buf.plane[plane].mask = -1;
> -
> - p = odesc->comp[plane].plane;
> - dst_buf.plane[plane].data = out->data[p];
> - dst_buf.plane[plane].stride = out->linesize[p];
> - dst_buf.plane[plane].mask = -1;
> - }
> -
> - ret = zimg_filter_graph_process(s->graph, &src_buf, &dst_buf, s->tmp, 0, 0, 0, 0);
> - if (ret) {
> - ret = print_zimg_error(link->dst);
> + if (!s->ztd[0].graph) {
> + ret = AVERROR(EINVAL);
> goto fail;
> }
>
> - if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> - src_buf.plane[0].data = in->data[3];
> - src_buf.plane[0].stride = in->linesize[3];
> - src_buf.plane[0].mask = -1;
> -
> - dst_buf.plane[0].data = out->data[3];
> - dst_buf.plane[0].stride = out->linesize[3];
> - dst_buf.plane[0].mask = -1;
> -
> - ret = zimg_filter_graph_process(s->alpha_graph, &src_buf, &dst_buf, s->tmp, 0, 0, 0, 0);
> - if (ret) {
> - ret = print_zimg_error(link->dst);
> - goto fail;
> - }
> - } else if (odesc->flags & AV_PIX_FMT_FLAG_ALPHA) {
> - int x, y;
> -
> - if (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) {
> - for (y = 0; y < out->height; y++) {
> - for (x = 0; x < out->width; x++) {
> - AV_WN32(out->data[3] + x * odesc->comp[3].step + y * out->linesize[3],
> - av_float2int(1.0f));
> - }
> - }
> - } else {
> - for (y = 0; y < outlink->h; y++)
> - memset(out->data[3] + y * out->linesize[3], 0xff, outlink->w);
> - }
> - }
> + td.out = out;
> + td.in = in;
> + td.desc = desc;
> + td.odesc = odesc;
> + ret = ctx->internal->execute(ctx, zscale_slice, &td, NULL, FFMIN3(in->height, out->height, s->nb_threads));
>
> fail:
> av_frame_free(&in);
> @@ -706,10 +795,13 @@ static void uninit(AVFilterContext *ctx)
> {
> ZScaleContext *s = ctx->priv;
>
> - zimg_filter_graph_free(s->graph);
> - zimg_filter_graph_free(s->alpha_graph);
> - av_freep(&s->tmp);
> - s->tmp_size = 0;
> + for (int i = 0; i < s->nb_threads; i++) {
> + zimg_filter_graph_free(s->ztd[i].graph);
> + zimg_filter_graph_free(s->ztd[i].alpha_graph);
> + av_freep(&s->ztd[i].tmp);
> + s->ztd[i].tmp_size = 0;
> + }
> + av_freep(&s->ztd);
> }
>
> static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
> @@ -890,4 +982,5 @@ AVFilter ff_vf_zscale = {
> .inputs = avfilter_vf_zscale_inputs,
> .outputs = avfilter_vf_zscale_outputs,
> .process_command = process_command,
> + .flags = AVFILTER_FLAG_SLICE_THREADS,
> };
> --
> 2.17.1
I've had to use zscale to convert 10-bit 4k60p video from HLG HDR to
SDR (bt709). It was ~36x slower than real time. What I ended
up doing to speed it up was to generate a CLUT image (16-bit yuv444
65x65x65 sampling of input color space), lay it out as a 2D image
(512x537), and run it through zscale to generate the HDR->SDR
transform CLUT. Then I used the CLUT instead of zscale for every
frame... that got me to about 3.5x slower than realtime
converting 60fps 10-bit 4k HLG to SDR (and I don't know any assembly,
so I didn't attempt to optimize the CLUT trilinear interpolation with
SIMD, so maybe it could be faster still). I then ported to CUDA and
was able to convert 4k60p HLG->SDR faster than realtime on a Pascal
GPU.
So, I'm not sure that adding slice threading to zscale is the best
optimization for it. I think capturing the effect of zscale in a CLUT
would be a more significant optimization.
Just my 2 cents, hope this helps.
Pavel.
More information about the ffmpeg-devel
mailing list