[FFmpeg-devel] [PATCH] lavfi: add opencl tonemap filter.

Tue May 22 03:18:30 EEST 2018

On 21/05/18 07:50, Ruiling Song wrote:
> This filter does HDR(HDR10/HLG) to SDR conversion with tone-mapping.
> 
> An example command to use this filter with vaapi codecs:
> FFMPEG -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \
> opencl=ocl at va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format \
> vaapi -i INPUT -filter_hw_device ocl -filter_complex \
> '[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1]; \
> [x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2 OUTPUT
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---

I assume you're testing with Beignet for this sort of mapping to work?  I tried it with Beignet on Coffee Lake with 10-bit videos and it looks sensible, though it is rather hard to tell whether it is in some sense "correct".

Given a non-P010 input video it fails with build errors when compling the kernels:

[Parsed_tonemap_opencl_1 @ 0x55b700e51540] Failed to build program: -11.
[Parsed_tonemap_opencl_1 @ 0x55b700e51540] Build log:
/home/mrt/video/ffmpeg/opencl/libavfilter/opencl/colorspace_basic.cl:125:19: error: use of undeclared identifier 'null'; did you mean 'all'?
stringInput.cl:7:21: note: expanded from macro 'rgb_matrix'

That case should probably be caught earlier and rejected with a clear message.

On Mali:

$ ./ffmpeg_g -v 55 -y -i ~/test/The\ World\ in\ HDR.mkv -init_hw_device opencl -filter_hw_device opencl0 -an -vf 'format=p010,hwupload,tonemap_opencl=t=bt2020:tonemap=linear:format=p010,hwdownload,format=p010' -c:v libx264 out.mp4
...
[tonemap_opencl @ 0x8201d7c0] Filter input: opencl, 3840x2160 (0).
[Parsed_tonemap_opencl_2 @ 0x8201d760] Failed to enqueue kernel: -5.

That's an RK3288 with a Mali T760, clinfo: <https://0x0.st/se5r.txt>, full log: <https://0x0.st/se5s.log>.

(The Rockchip hardware decoder can do H.265 Main 10, but the output format isn't P010 so it's easier to use VP9 here.)

Some more thoughts below, I haven't read through all of it carefully.

Thanks,

- Mark

>  configure                              |   1 +
>  libavfilter/Makefile                   |   2 +
>  libavfilter/allfilters.c               |   1 +
>  libavfilter/colorspace_basic.c         |  89 ++++++
>  libavfilter/colorspace_basic.h         |  40 +++
>  libavfilter/opencl/colorspace_basic.cl | 179 +++++++++++
>  libavfilter/opencl/tonemap.cl          | 258 +++++++++++++++
>  libavfilter/opencl_source.h            |   2 +
>  libavfilter/vf_tonemap_opencl.c        | 560 +++++++++++++++++++++++++++++++++
>  9 files changed, 1132 insertions(+)
>  create mode 100644 libavfilter/colorspace_basic.c
>  create mode 100644 libavfilter/colorspace_basic.h
>  create mode 100644 libavfilter/opencl/colorspace_basic.cl
>  create mode 100644 libavfilter/opencl/tonemap.cl
>  create mode 100644 libavfilter/vf_tonemap_opencl.c
> 
> ...
> diff --git a/libavfilter/opencl/tonemap.cl b/libavfilter/opencl/tonemap.cl
> new file mode 100644
> index 0000000..03cf3e2
> --- /dev/null
> +++ b/libavfilter/opencl/tonemap.cl
> @@ -0,0 +1,258 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#define REFERENCE_WHITE 100.0f
> +extern float3 lrgb2yuv(float3);
> +extern float3 yuv2lrgb(float3);
> +extern float3 lrgb2lrgb(float3);
> +extern float get_luma_src(float3);
> +extern float get_luma_dst(float3);
> +extern float3 ootf(float3);
> +extern float3 inverse_ootf(float3);
> +struct detection_result {
> +    float peak;
> +    float average;
> +};
> +
> +float hable_f(float in) {
> +    float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f;
> +    return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f;
> +}
> +
> +float direct(float s, float peak) {
> +    return s;
> +}
> +
> +float linear(float s, float peak) {
> +    return s * tone_param / peak;
> +}
> +
> +float gamma(float s, float peak) {
> +    float p = s > 0.05f ? s /peak : 0.05f / peak;
> +    float v = pow(p, 1.0f / tone_param);
> +    return s > 0.05f ? v : (s * v /0.05f);
> +}
> +
> +float clip(float s, float peak) {
> +    return clamp(s * tone_param, 0.0f, 1.0f);
> +}
> +
> +float reinhard(float s, float peak) {
> +    return s / (s + tone_param) * (peak + tone_param) / peak;
> +}
> +
> +float hable(float s, float peak) {
> +    return hable_f(s)/hable_f(peak);
> +}
> +
> +float mobius(float s, float peak) {
> +    float j = tone_param;
> +    float a, b;
> +
> +    if (s <= j)
> +        return s;
> +
> +    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
> +    b = (j * j - 2.0f * j * peak + peak) / max(peak - 1.0f, 1e-6f);
> +
> +    return (b * b + 2.0f * b * j + j * j) / (b - a) * (s + a) / (s + b);
> +}
> +
> +// detect peak/average signal of a frame, the algorithm was ported from:
> +// libplacebo (https://github.com/haasn/libplacebo)
> +struct detection_result
> +detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
> +            float signal, float peak) {
> +    global uint *avg_buf = util_buf;
> +    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
> +    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
> +    global uint *max_total_p = counter_wg_p + 1;
> +    global uint *avg_total_p = max_total_p + 1;
> +    global uint *frame_idx_p = avg_total_p + 1;
> +    global uint *scene_frame_num_p = frame_idx_p + 1;
> +
> +    uint frame_idx = *frame_idx_p;
> +    uint scene_frame_num = *scene_frame_num_p;
> +
> +    size_t lidx = get_local_id(0);
> +    size_t lidy = get_local_id(1);
> +    size_t lsizex = get_local_size(0);
> +    size_t lsizey = get_local_size(1);
> +    uint num_wg = get_num_groups(0) * get_num_groups(1);
> +    size_t group_idx = get_group_id(0);
> +    size_t group_idy = get_group_id(1);
> +    struct detection_result r = {peak, sdr_avg};
> +    *sum_wg = 0;

This is technically a data race - maybe set it in only the first workitem?

> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    // update workgroup sum
> +    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));

I think the numbers you're adding together here sum to at most something like 16 * 16 * 100 * 1023?  Can you make sure this can't overflow and add a comment on that.

> +    barrier(CLK_LOCAL_MEM_FENCE);
> +
> +    // update frame peak/avg using work-group-average.
> +    if (lidx == 0 && lidy == 0) {
> +        uint avg_wg = *sum_wg / (lsizex * lsizey);
> +        atomic_max(&peak_buf[frame_idx], avg_wg);
> +        atomic_add(&avg_buf[frame_idx], avg_wg);

Similarly this one?  (width/16 * height/16 * 100 * 1023, I think, which might overflow for 8K?)

> +    }
> +
> +    if (scene_frame_num > 0) {
> +        float peak = (float)*max_total_p / (REFERENCE_WHITE * scene_frame_num);
> +        float avg = (float)*avg_total_p / (REFERENCE_WHITE * scene_frame_num);
> +        r.peak = max(1.0f, peak);
> +        r.average = max(0.25f, avg);

fmax()?  (max() is an integer function, not sure what it does to 0.25f.)

> +    }
> +
> +    if (lidx == 0 && lidy == 0 && atomic_add(counter_wg_p, 1) == num_wg - 1) {
> +        *counter_wg_p = 0;
> +        avg_buf[frame_idx] /= num_wg;
> +
> +        if (scene_threshold > 0.0f) {
> +            uint cur_max = peak_buf[frame_idx];
> +            uint cur_avg = avg_buf[frame_idx];
> +            int diff = (int)(scene_frame_num * cur_avg) - (int)*avg_total_p;
> +
> +            if (abs(diff) > scene_frame_num * scene_threshold * REFERENCE_WHITE) {
> +                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
> +                  avg_buf[i] = 0;
> +                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
> +                  peak_buf[i] = 0;
> +                *avg_total_p = *max_total_p = 0;
> +                *scene_frame_num_p = 0;
> +                avg_buf[frame_idx] = cur_avg;
> +                peak_buf[frame_idx] = cur_max;
> +            }
> +        }
> +        uint next = (frame_idx + 1) % (DETECTION_FRAMES + 1);
> +        // add current frame, subtract next frame
> +        *max_total_p += peak_buf[frame_idx] - peak_buf[next];
> +        *avg_total_p += avg_buf[frame_idx] - avg_buf[next];
> +        // reset next frame
> +        peak_buf[next] = avg_buf[next] = 0;
> +        *frame_idx_p = next;
> +        *scene_frame_num_p = min(*scene_frame_num_p + 1, (uint)DETECTION_FRAMES);
> +    }
> +    return r;
> +}
> +
> +__constant const float desat_param = 0.5f;
> +__constant const float dst_peak    = 1.0f;
> +
> +float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
> +    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);

More max(), also below.

> +    // de-saturate
> +    if (desat_param > 0.0f) {
> +        float luma = get_luma_dst(rgb);
> +        float base = 0.18f * dst_peak;

Magic number might want some explaination.

> +        float coeff = max(sig - base, 1e-6f) / max(sig, 1e-6f);
> +        coeff = native_powr(coeff, 10.0f / desat_param);
> +        rgb = mix(rgb, (float3)luma, (float3)coeff);
> +        sig = mix(sig, luma, coeff);
> +    }
> +
> +    float sig_old = sig;
> +    float slope = min(1.0f, sdr_avg / average);
> +    sig *= slope;
> +    peak *= slope;
> +
> +    sig = TONE_FUNC(sig, peak);
> +    rgb *= (sig/sig_old);
> +    return rgb;
> +}
> +// map from source space YUV to destination space RGB
> +float3 map_to_dst_space_from_yuv(float3 yuv) {
> +    float3 c = yuv2lrgb(yuv);
> +    c = ootf(c);
> +    c = lrgb2lrgb(c);
> +    return c;
> +}
> +
> +// convert from rgb to yuv, with possible inverse-ootf
> +float3 convert_to_yuv(float3 c) {
> +    c = inverse_ootf(c);
> +    return lrgb2yuv(c);
> +}
> +
> +__kernel void tonemap(__write_only image2d_t dst1,
> +                      __write_only image2d_t dst2,
> +                      __read_only  image2d_t src1,
> +                      __read_only  image2d_t src2,
> +#ifdef THIRD_PLANE
> +                      __write_only image2d_t dst3,
> +                      __read_only  image2d_t src3,
> +#endif
> +                      global uint *util_buf,
> +                      float peak
> +                      )
> +{
> +    __local uint sum_wg;
> +    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
> +                               CLK_FILTER_NEAREST);
> +    int xi = get_global_id(0);
> +    int yi = get_global_id(1);
> +    // each work item process four pixels
> +    int x = 2 * xi;
> +    int y = 2 * yi;
> +
> +    float y0 = read_imagef(src1, sampler, (int2)(x,     y)).x;
> +    float y1 = read_imagef(src1, sampler, (int2)(x + 1, y)).x;
> +    float y2 = read_imagef(src1, sampler, (int2)(x,     y + 1)).x;
> +    float y3 = read_imagef(src1, sampler, (int2)(x + 1, y + 1)).x;
> +#ifdef THIRD_PLANE
> +    float u = read_imagef(src2, sampler, (int2)(xi, yi)).x;
> +    float v = read_imagef(src3, sampler, (int2)(xi, yi)).x;
> +    float2 uv = (float2)(u, v);
> +#else
> +    float2 uv = read_imagef(src2, sampler, (int2)(xi,     yi)).xy;
> +#endif
> +
> +    float3 c0 = map_to_dst_space_from_yuv((float3)(y0, uv.x, uv.y));
> +    float3 c1 = map_to_dst_space_from_yuv((float3)(y1, uv.x, uv.y));
> +    float3 c2 = map_to_dst_space_from_yuv((float3)(y2, uv.x, uv.y));
> +    float3 c3 = map_to_dst_space_from_yuv((float3)(y3, uv.x, uv.y));
> +
> +    float sig0 = max(c0.x, max(c0.y, c0.z));
> +    float sig1 = max(c1.x, max(c1.y, c1.z));
> +    float sig2 = max(c2.x, max(c2.y, c2.z));
> +    float sig3 = max(c3.x, max(c3.y, c3.z));
> +    float sig = max(sig0, max(sig1, max(sig2, sig3)));
> +
> +    struct detection_result r = detect_peak_avg(util_buf, &sum_wg, sig, peak);
> +
> +    float3 c0_old = c0, c1_old = c1, c2_old = c2;
> +    c0 = map_one_pixel_rgb(c0, r.peak, r.average);
> +    c1 = map_one_pixel_rgb(c1, r.peak, r.average);
> +    c2 = map_one_pixel_rgb(c2, r.peak, r.average);
> +    c3 = map_one_pixel_rgb(c3, r.peak, r.average);
> +
> +    float3 yuv0 = convert_to_yuv(c0);
> +    float3 yuv1 = convert_to_yuv(c1);
> +    float3 yuv2 = convert_to_yuv(c2);
> +    float3 yuv3 = convert_to_yuv(c3);
> +
> +    write_imagef(dst1, (int2)(x, y), (float4)(yuv0.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x+1, y), (float4)(yuv1.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x, y+1), (float4)(yuv2.x, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst1, (int2)(x+1, y+1), (float4)(yuv3.x, 0.0f, 0.0f, 1.0f));
> +#ifdef THIRD_PLANE
> +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, 0.0f, 0.0f, 1.0f));
> +    write_imagef(dst3, (int2)(xi, yi), (float4)(yuv0.z, 0.0f, 0.0f, 1.0f));
> +#else
> +    write_imagef(dst2, (int2)(xi, yi), (float4)(yuv0.y, yuv0.z, 0.0f, 1.0f));
> +#endif
> +}
> diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
> index 4bb9969..c5b3f37 100644
> --- a/libavfilter/opencl_source.h
> +++ b/libavfilter/opencl_source.h
> @@ -21,7 +21,9 @@
>  
>  extern const char *ff_opencl_source_avgblur;
>  extern const char *ff_opencl_source_convolution;
> +extern const char *ff_opencl_source_colorspace_basic;
>  extern const char *ff_opencl_source_overlay;
> +extern const char *ff_opencl_source_tonemap;
>  extern const char *ff_opencl_source_unsharp;
>  
>  #endif /* AVFILTER_OPENCL_SOURCE_H */
> diff --git a/libavfilter/vf_tonemap_opencl.c b/libavfilter/vf_tonemap_opencl.c
> new file mode 100644
> index 0000000..e2311e0
> --- /dev/null
> +++ b/libavfilter/vf_tonemap_opencl.c
> @@ -0,0 +1,560 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include <float.h>
> +
> +#include "libavutil/bprint.h"
> +#include "libavutil/common.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/mastering_display_metadata.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "avfilter.h"
> +#include "internal.h"
> +#include "opencl.h"
> +#include "opencl_source.h"
> +#include "video.h"
> +#include "colorspace_basic.h"
> +
> +#define DETECTION_FRAMES 63
> +#define REFERENCE_WHITE 100.0f
> +
> +enum TonemapAlgorithm {
> +    TONEMAP_NONE,
> +    TONEMAP_LINEAR,
> +    TONEMAP_GAMMA,
> +    TONEMAP_CLIP,
> +    TONEMAP_REINHARD,
> +    TONEMAP_HABLE,
> +    TONEMAP_MOBIUS,
> +    TONEMAP_MAX,
> +};
> +
> +typedef struct TonemapOpenCLContext {
> +    OpenCLFilterContext ocf;
> +
> +    enum AVColorSpace colorspace, colorspace_in, colorspace_out;
> +    enum AVColorTransferCharacteristic trc, trc_in, trc_out;
> +    enum AVColorPrimaries primaries, primaries_in, primaries_out;
> +    enum AVColorRange range, range_in, range_out;
> +
> +    enum TonemapAlgorithm tonemap;
> +    enum AVPixelFormat    format;
> +    double                peak;
> +    double                param;
> +    int                   initialised;
> +    cl_kernel             kernel;
> +    cl_command_queue      command_queue;
> +    cl_mem                util_mem;
> +    DECLARE_ALIGNED(64, int32_t, util_buf)[2 * DETECTION_FRAMES + 7];
> +} TonemapOpenCLContext;
> +
> +const char *yuv_coff[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709] = "rgb2yuv_bt709",
> +    [AVCOL_SPC_BT2020_NCL] = "rgb2yuv_bt2020",
> +};
> +
> +const char *rgb_coff[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709] = "yuv2rgb_bt709",
> +    [AVCOL_SPC_BT2020_NCL] = "yuv2rgb_bt2020",
> +};
> +
> +const char *linearize_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_SMPTE2084] = "eotf_st2084",
> +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_oetf_hlg",
> +};
> +
> +const char *ootf_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_ARIB_STD_B67] = "ootf_hlg",
> +    [AVCOL_TRC_SMPTE2084] = "",
> +};
> +
> +const char *inverse_ootf_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_ARIB_STD_B67] = "inverse_ootf_hlg",
> +    [AVCOL_TRC_SMPTE2084] = "",
> +};
> +
> +const char *delinearize_funcs[AVCOL_TRC_NB] = {
> +    [AVCOL_TRC_BT709]     = "inverse_eotf_bt1886",
> +    [AVCOL_TRC_BT2020_10] = "inverse_eotf_bt1886",
> +};
> +
> +static const struct LumaCoefficients luma_coefficients[AVCOL_SPC_NB] = {
> +    [AVCOL_SPC_BT709]      = { 0.2126, 0.7152, 0.0722 },
> +    [AVCOL_SPC_BT2020_NCL] = { 0.2627, 0.6780, 0.0593 },
> +};
> +
> +struct ColorPrimaries primaries_table[AVCOL_PRI_NB] = {
> +    [AVCOL_PRI_BT709]  = { 0.640, 0.330, 0.300, 0.600, 0.150, 0.060 },
> +    [AVCOL_PRI_BT2020] = { 0.708, 0.292, 0.170, 0.797, 0.131, 0.046 },
> +};
> +
> +struct WhitePoint whitepoint_table[AVCOL_PRI_NB] = {
> +    [AVCOL_PRI_BT709]  = { 0.3127, 0.3290 },
> +    [AVCOL_PRI_BT2020] = { 0.3127, 0.3290 },
> +};
> +
> +const char *tonemap_func[TONEMAP_MAX] = {
> +    [TONEMAP_NONE]     = "direct",
> +    [TONEMAP_LINEAR]   = "linear",
> +    [TONEMAP_GAMMA]    = "gamma",
> +    [TONEMAP_CLIP]     = "clip",
> +    [TONEMAP_REINHARD] = "reinhard",
> +    [TONEMAP_HABLE]    = "hable",
> +    [TONEMAP_MOBIUS]   = "mobius",
> +};
> +
> +static void get_rgb2rgb_matrix(enum AVColorPrimaries in, enum AVColorPrimaries out,
> +                               double rgb2rgb[3][3]) {
> +    double rgb2xyz[3][3], xyz2rgb[3][3];
> +
> +    fill_rgb2xyz_table(&primaries_table[out], &whitepoint_table[out], rgb2xyz);
> +    invert_matrix3x3(rgb2xyz, xyz2rgb);
> +    fill_rgb2xyz_table(&primaries_table[in], &whitepoint_table[in], rgb2xyz);
> +    mul3x3(rgb2rgb, rgb2xyz, xyz2rgb);
> +}
> +
> +#define OPENCL_SOURCE_NB 3
> +// Average light level for SDR signals. This is equal to a signal level of 0.5
> +// under a typical presentation gamma of about 2.0.
> +static const float sdr_avg = 0.25f;
> +static const float scene_threshold = 0.2f;
> +
> +static int tonemap_opencl_init(AVFilterContext *avctx)
> +{
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    int rgb2rgb_passthrough = 1;
> +    double rgb2rgb[3][3];
> +    struct LumaCoefficients luma_src, luma_dst;
> +    cl_int cle;
> +    int err;
> +    AVBPrint header;
> +    const char *opencl_sources[OPENCL_SOURCE_NB];
> +
> +    av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC);
> +
> +    switch(ctx->tonemap) {
> +    case TONEMAP_GAMMA:
> +        if (isnan(ctx->param))
> +            ctx->param = 1.8f;
> +        break;
> +    case TONEMAP_REINHARD:
> +        if (!isnan(ctx->param))
> +            ctx->param = (1.0f - ctx->param) / ctx->param;
> +        break;
> +    case TONEMAP_MOBIUS:
> +        if (isnan(ctx->param))
> +            ctx->param = 0.3f;
> +        break;
> +    }
> +
> +    if (isnan(ctx->param))
> +        ctx->param = 1.0f;
> +
> +    av_bprintf(&header, "__constant const float tone_param = %.4ff;\n",
> +               ctx->param);
> +    av_bprintf(&header, "__constant const float sdr_avg = %.4ff;\n", sdr_avg);
> +    av_bprintf(&header, "__constant const float scene_threshold = %.4ff;\n",
> +               scene_threshold);
> +    av_bprintf(&header, "#define TONE_FUNC %s\n", tonemap_func[ctx->tonemap]);
> +    av_bprintf(&header, "#define DETECTION_FRAMES %d\n", DETECTION_FRAMES);
> +
> +    if (ctx->primaries_out != ctx->primaries_in) {
> +        get_rgb2rgb_matrix(ctx->primaries_in, ctx->primaries_out, rgb2rgb);
> +        rgb2rgb_passthrough = 0;
> +    }
> +    if (ctx->range_in == AVCOL_RANGE_JPEG)
> +        av_bprintf(&header, "#define FULL_RANGE_IN\n");
> +
> +    if (ctx->range_out == AVCOL_RANGE_JPEG)
> +        av_bprintf(&header, "#define FULL_RANGE_OUT\n");
> +
> +    if (rgb2rgb_passthrough)
> +        av_bprintf(&header, "#define RGB2RGB_PASSTHROUGH\n");
> +    else {
> +        av_bprintf(&header, "__constant float rgb2rgb[9] = {\n");
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
> +                   rgb2rgb[0][0], rgb2rgb[0][1], rgb2rgb[0][2]);
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff,\n",
> +                   rgb2rgb[1][0], rgb2rgb[1][1], rgb2rgb[1][2]);
> +        av_bprintf(&header, "    %.4ff, %.4ff, %.4ff};\n",
> +                   rgb2rgb[2][0], rgb2rgb[2][1], rgb2rgb[2][2]);
> +    }
> +
> +    av_bprintf(&header, "#define rgb_matrix %s\n",
> +               rgb_coff[ctx->colorspace_in]);

You need to check ctx->colorspace_in before this point - if it isn't a supported value then the kernel fails to compile.  (And it can go off the end if the user builds with a higher value of AVCOL_SPC_NB.)

Similarly the other function name defines below.

> +    av_bprintf(&header, "#define yuv_matrix %s\n",
> +               yuv_coff[ctx->colorspace_out]);
> +
> +    luma_src = luma_coefficients[ctx->colorspace_in];
> +    luma_dst = luma_coefficients[ctx->colorspace_out];
> +    av_bprintf(&header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n",
> +               luma_src.cr, luma_src.cg, luma_src.cb);
> +    av_bprintf(&header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n",
> +               luma_dst.cr, luma_dst.cg, luma_dst.cb);
> +
> +    av_bprintf(&header, "#define linearize %s\n", linearize_funcs[ctx->trc_in]);
> +    av_bprintf(&header, "#define delinearize %s\n",
> +               delinearize_funcs[ctx->trc_out]);
> +
> +    av_bprintf(&header, "#define ootf_impl %s\n", ootf_funcs[ctx->trc_in]);
> +    av_bprintf(&header, "#define inverse_ootf_impl %s\n",
> +               inverse_ootf_funcs[ctx->trc_in]);
> +
> +
> +    opencl_sources[0] = header.str;
> +    opencl_sources[1] = ff_opencl_source_tonemap;
> +    opencl_sources[2] = ff_opencl_source_colorspace_basic;
> +    err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB);
> +
> +    av_bprint_finalize(&header, NULL);
> +    if (err < 0)
> +        goto fail;
> +
> +    ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
> +                                              ctx->ocf.hwctx->device_id,
> +                                              0, &cle);
> +    if (!ctx->command_queue) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create OpenCL "
> +               "command queue: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->kernel = clCreateKernel(ctx->ocf.program, "tonemap", &cle);
> +    if (!ctx->kernel) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create kernel: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->util_mem = clCreateBuffer(ctx->ocf.hwctx->context,
> +                                   CL_MEM_USE_HOST_PTR |
> +                                   CL_MEM_HOST_NO_ACCESS,
> +                                   sizeof(ctx->util_buf), ctx->util_buf, &cle);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to create util buffer: %d.\n", cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }
> +
> +    ctx->initialised = 1;
> +    return 0;
> +
> +fail:
> +    if (ctx->util_mem)
> +        clReleaseMemObject(ctx->util_mem);
> +    if (ctx->command_queue)
> +        clReleaseCommandQueue(ctx->command_queue);
> +    if (ctx->kernel)
> +        clReleaseKernel(ctx->kernel);
> +    return err;
> +}
> +
> +static int tonemap_opencl_config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *avctx = outlink->src;
> +    TonemapOpenCLContext *s = avctx->priv;
> +    int ret;
> +    s->ocf.output_format = s->format == AV_PIX_FMT_NONE ? AV_PIX_FMT_NV12 : s->format;
> +    ret = ff_opencl_filter_config_output(outlink);
> +    if (ret < 0)
> +        return ret;
> +
> +    return 0;
> +}
> +
> +static int launch_kernel(AVFilterContext *avctx, cl_kernel kernel,
> +                         AVFrame *output, AVFrame *input, float peak) {
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    int err = AVERROR(ENOSYS);
> +    size_t global_work[2];
> +    size_t local_work[2];
> +    cl_int cle;
> +
> +    cle = clSetKernelArg(kernel, 0, sizeof(cl_mem), &output->data[0]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "destination image 1st plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output->data[1]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "destination image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input->data[0]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 1st plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input->data[1]);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 4, sizeof(cl_mem), &ctx->util_mem);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "source image 2nd plane: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    cle = clSetKernelArg(kernel, 5, sizeof(cl_float), &peak);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to set kernel "
> +               "peak luma: %d.\n", cle);
> +        return AVERROR(EINVAL);
> +    }
> +
> +    local_work[0]  = 16;
> +    local_work[1]  = 16;
> +    // Note the work size based on uv plane, as we process a 2x2 quad in one workitem
> +    err = ff_opencl_filter_work_size_from_image(avctx, global_work, output,
> +                                                1, 16);
> +    if (err < 0)
> +        return err;
> +
> +    cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL,
> +                                 global_work, local_work,
> +                                 0, NULL, NULL);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to enqueue kernel: %d.\n",
> +               cle);
> +        return AVERROR(EIO);
> +    }
> +    return 0;
> +}
> +
> +static double determine_signal_peak(AVFrame *in)
> +{
> +    AVFrameSideData *sd = av_frame_get_side_data(in, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
> +    double peak = 0;
> +
> +    if (sd) {
> +        AVContentLightMetadata *clm = (AVContentLightMetadata *)sd->data;
> +        peak = clm->MaxCLL / REFERENCE_WHITE;
> +    }
> +
> +    sd = av_frame_get_side_data(in, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
> +    if (!peak && sd) {
> +        AVMasteringDisplayMetadata *metadata = (AVMasteringDisplayMetadata *)sd->data;
> +        if (metadata->has_luminance)
> +            peak = av_q2d(metadata->max_luminance) / REFERENCE_WHITE;
> +    }
> +
> +    // if not SMPTE2084, we would assume HLG
> +    if (!peak)
> +        peak = in->color_trc == AVCOL_TRC_SMPTE2084 ? 100.0f : 12.0f;
> +
> +    return peak;
> +}
> +
> +static int tonemap_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
> +{
> +    AVFilterContext    *avctx = inlink->dst;
> +    AVFilterLink     *outlink = avctx->outputs[0];
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    AVFrame *output = NULL;
> +    cl_int cle;
> +    int err;
> +    double peak = ctx->peak;
> +
> +    AVHWFramesContext *input_frames_ctx =
> +        (AVHWFramesContext*)input->hw_frames_ctx->data;
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(input->format),
> +           input->width, input->height, input->pts);
> +
> +    if (!input->hw_frames_ctx)
> +        return AVERROR(EINVAL);
> +
> +    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
> +    if (!output) {
> +        err = AVERROR(ENOMEM);
> +        goto fail;
> +    }
> +
> +    err = av_frame_copy_props(output, input);
> +    if (err < 0)
> +        goto fail;
> +
> +    if (!peak)
> +        peak = determine_signal_peak(input);
> +
> +    if (ctx->trc != -1)
> +        output->color_trc = ctx->trc;
> +    if (ctx->primaries != -1)
> +        output->color_primaries = ctx->primaries;
> +    if (ctx->colorspace != -1)
> +        output->colorspace = ctx->colorspace;
> +    if (ctx->range != -1)
> +        output->color_range = ctx->range;
> +
> +    ctx->trc_in = input->color_trc;
> +    ctx->trc_out = output->color_trc;
> +    ctx->colorspace_in = input->colorspace;
> +    ctx->colorspace_out = output->colorspace;
> +    ctx->primaries_in = input->color_primaries;
> +    ctx->primaries_out = output->color_primaries;
> +    ctx->range_in = input->color_range;
> +    ctx->range_out = output->color_range;
> +
> +    if (!ctx->initialised) {
> +        err = tonemap_opencl_init(avctx);
> +        if (err < 0)
> +            goto fail;
> +    }
> +
> +    switch(input_frames_ctx->sw_format) {
> +    case AV_PIX_FMT_P010:
> +        err = launch_kernel(avctx, ctx->kernel, output, input, peak);
> +        if (err < 0) goto fail;
> +        break;
> +    default:
> +        av_log(ctx, AV_LOG_ERROR, "unsupported format in tonemap_opencl.\n");
> +        err = AVERROR(ENOSYS);
> +        goto fail;
> +    }
> +
> +    cle = clFinish(ctx->command_queue);
> +    if (cle != CL_SUCCESS) {
> +        av_log(avctx, AV_LOG_ERROR, "Failed to finish command queue: %d.\n",
> +               cle);
> +        err = AVERROR(EIO);
> +        goto fail;
> +    }

It might be nice to add some debug output here showing the what transformation was actually applied and maybe some of the persistent parameters from util_buf (they would be easier to verify as sensible).

> +
> +    av_frame_free(&input);
> +
> +    av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> +           av_get_pix_fmt_name(output->format),
> +           output->width, output->height, output->pts);
> +
> +    return ff_filter_frame(outlink, output);
> +
> +fail:
> +    clFinish(ctx->command_queue);
> +    av_frame_free(&input);
> +    av_frame_free(&output);
> +    return err;
> +}
> +
> +static av_cold void tonemap_opencl_uninit(AVFilterContext *avctx)
> +{
> +    TonemapOpenCLContext *ctx = avctx->priv;
> +    cl_int cle;
> +
> +    if (ctx->util_mem)
> +        clReleaseMemObject(ctx->util_mem);
> +    if (ctx->kernel) {
> +        cle = clReleaseKernel(ctx->kernel);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "kernel: %d.\n", cle);
> +    }
> +
> +    if (ctx->command_queue) {
> +        cle = clReleaseCommandQueue(ctx->command_queue);
> +        if (cle != CL_SUCCESS)
> +            av_log(avctx, AV_LOG_ERROR, "Failed to release "
> +                   "command queue: %d.\n", cle);
> +    }
> +
> +    ff_opencl_filter_uninit(avctx);
> +}
> +
> +#define OFFSET(x) offsetof(TonemapOpenCLContext, x)
> +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
> +static const AVOption tonemap_opencl_options[] = {
> +    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_NONE}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" },
> +    {     "none",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE},              0, 0, FLAGS, "tonemap" },
> +    {     "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR},            0, 0, FLAGS, "tonemap" },
> +    {     "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA},             0, 0, FLAGS, "tonemap" },
> +    {     "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP},              0, 0, FLAGS, "tonemap" },
> +    {     "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD},          0, 0, FLAGS, "tonemap" },
> +    {     "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE},             0, 0, FLAGS, "tonemap" },
> +    {     "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS},            0, 0, FLAGS, "tonemap" },
> +    { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },
> +    { "t",        "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "transfer" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709},         0, 0, FLAGS, "transfer" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10},     0, 0, FLAGS, "transfer" },
> +    { "matrix", "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
> +    { "m",      "set colorspace matrix", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "matrix" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},         0, 0, FLAGS, "matrix" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL},    0, 0, FLAGS, "matrix" },
> +    { "primaries", "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
> +    { "p",         "set color primaries", OFFSET(primaries), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "primaries" },
> +    {     "bt709",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709},         0, 0, FLAGS, "primaries" },
> +    {     "bt2020",           0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020},        0, 0, FLAGS, "primaries" },
> +    { "range",         "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
> +    { "r",             "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, "range" },
> +    {     "tv",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
> +    {     "pc",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
> +    {     "limited",       0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},         0, 0, FLAGS, "range" },
> +    {     "full",          0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},         0, 0, FLAGS, "range" },
> +    { "format",    "output pixel format", OFFSET(format), AV_OPT_TYPE_INT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_GBRAP12LE, FLAGS, "fmt" },
> +    {     "nv12",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_NV12},          0, 0, FLAGS, "fmt" },
> +    {     "p010",            0,       0,                 AV_OPT_TYPE_CONST, {.i64 = AV_PIX_FMT_P010},          0, 0, FLAGS, "fmt" },

Can you use AV_OPT_TYPE_PIXFMT?

> +    { "peak",      "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS },
> +    { "param",     "tonemap parameter",   OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS },
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(tonemap_opencl);
> +
> +static const AVFilterPad tonemap_opencl_inputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame = &tonemap_opencl_filter_frame,
> +        .config_props = &ff_opencl_filter_config_input,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad tonemap_opencl_outputs[] = {
> +    {
> +        .name         = "default",
> +        .type         = AVMEDIA_TYPE_VIDEO,
> +        .config_props = &tonemap_opencl_config_output,
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_tonemap_opencl = {
> +    .name           = "tonemap_opencl",
> +    .description    = NULL_IF_CONFIG_SMALL("perform HDR to SDR conversion with tonemapping"),
> +    .priv_size      = sizeof(TonemapOpenCLContext),
> +    .priv_class     = &tonemap_opencl_class,
> +    .init           = &ff_opencl_filter_init,
> +    .uninit         = &tonemap_opencl_uninit,
> +    .query_formats  = &ff_opencl_filter_query_formats,
> +    .inputs         = tonemap_opencl_inputs,
> +    .outputs        = tonemap_opencl_outputs,
> +    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
> +};
>