[FFmpeg-devel] [PATCH] avfilter/vf_nlmeans_opencl: making filter independent of bit depth

Sat Apr 3 20:31:48 EEST 2021

On 21/03/2021 23:22, Lucas Clemente Vella wrote:
> This filter originally quantized OpenCL float images fetchs in 256 levels,
> and computed the integral image of squared differences in 32 bit integers.
> 
> This had two consequences:
> 1) it could overflow if the image resolution was big enough (I got overflows
> in a 4K video);
> 2) it dropped precision from bit depths higher than 8 bits.
> 
> Now the integral image is computed with float values in range [0, 1], instead
> of integers in range [0, 255] (then squared), so there is no longer the risk
> of overflow.
> 
> And even with the accumulated floating point error over the integral image,
> the resulting difference between this float implementation and an experimental
> uint64 implementation with 65535 quantization levels is less than 0.08%
> on the worst difference (per component), and less than 0.002% on average. For
> reference, the smallest variation possible on a 10-bit quantization is 0.098%
> of the total intensity. This was tested on a 4K frame from an 10-bit source.

Does the sum actually avoid floating-point error beyond simple rounding?

Given only 23 bits of precision, it looks like there is a danger of losing things by adding a set of small numbers to a large number and the large number being unchanged.

> 
> Signed-off-by: Lucas Clemente Vella <lvella at gmail.com>
> ---
>   libavfilter/opencl/nlmeans.cl   | 31 ++++++++++++++----------------
>   libavfilter/vf_nlmeans_opencl.c | 34 +++++----------------------------
>   2 files changed, 19 insertions(+), 46 deletions(-)
> 
> diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
> index 72bd681fd6..6d78a41e46 100644
> --- a/libavfilter/opencl/nlmeans.cl
> +++ b/libavfilter/opencl/nlmeans.cl
> @@ -20,7 +20,7 @@ const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
>                              CLK_ADDRESS_CLAMP_TO_EDGE   |
>                              CLK_FILTER_NEAREST);
>   
> -kernel void horiz_sum(__global uint4 *integral_img,
> +kernel void horiz_sum(__global float4 *integral_img,
>                         __read_only image2d_t src,
>                         int width,
>                         int height,
> @@ -31,7 +31,7 @@ kernel void horiz_sum(__global uint4 *integral_img,
>       int y = get_global_id(0);
>       int work_size = get_global_size(0);
>   
> -    uint4 sum = (uint4)(0);
> +    float4 sum = 0.0;

Surprise double.

>       float4 s2;
>       for (int i = 0; i < width; i++) {
>           float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
> @@ -39,28 +39,25 @@ kernel void horiz_sum(__global uint4 *integral_img,
>           s2.y = read_imagef(src, sampler, (int2)(i + dx.y, y + dy.y)).x;
>           s2.z = read_imagef(src, sampler, (int2)(i + dx.z, y + dy.z)).x;
>           s2.w = read_imagef(src, sampler, (int2)(i + dx.w, y + dy.w)).x;
> -        sum += convert_uint4((s1 - s2) * (s1 - s2) * 255 * 255);
> +        sum += (s1 - s2) * (s1 - s2);
>           integral_img[y * width + i] = sum;
>       }
>   }
>   
> -kernel void vert_sum(__global uint4 *integral_img,
> -                     __global int *overflow,
> +kernel void vert_sum(__global float4 *integral_img,
>                        int width,
>                        int height)
>   {
>       int x = get_global_id(0);
> -    uint4 sum = 0;
> +    float4 sum = 0;

Please write floats as floats (0.0f), even if the implicit conversion is correct.

>       for (int i = 0; i < height; i++) {
> -        if (any((uint4)UINT_MAX - integral_img[i * width + x] < sum))
> -            atomic_inc(overflow);
>           integral_img[i * width + x] += sum;
>           sum = integral_img[i * width + x];
>       }
>   }
>   
>   kernel void weight_accum(global float *sum, global float *weight,
> -                         global uint4 *integral_img, __read_only image2d_t src,
> +                         global float4 *integral_img, __read_only image2d_t src,
>                            int width, int height, int p, float h,
>                            int4 dx, int4 dy)
>   {
> @@ -75,16 +72,16 @@ kernel void weight_accum(global float *sum, global float *weight,
>       int y = get_global_id(1);
>       int4 xoff = x + dx;
>       int4 yoff = y + dy;
> -    uint4 a = 0, b = 0, c = 0, d = 0;
> -    uint4 src_pix = 0;
> +    float4 a = 0, b = 0, c = 0, d = 0;
> +    float4 src_pix = 0;
>   
>       // out-of-bounding-box?
>       int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
>   
> -    src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
> -    src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
> -    src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
> -    src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
> +    src_pix.x = read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x;
> +    src_pix.y = read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x;
> +    src_pix.z = read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x;
> +    src_pix.w = read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x;
>       if (!oobb) {
>           a = integral_img[(y - p) * width + x - p];
>           b = integral_img[(y + p) * width + x - p];
> @@ -93,7 +90,7 @@ kernel void weight_accum(global float *sum, global float *weight,
>       }
>   
>       float4 patch_diff = convert_float4(d + a - c - b);
> -    float4 w = native_exp(-patch_diff / (h * h));
> +    float4 w = native_exp(-patch_diff * (float4)(255.0f * 255.0f) / (h * h));
>       float w_sum = w.x + w.y + w.z + w.w;
>       weight[y * width + x] += w_sum;
>       sum[y * width + x] += dot(w, convert_float4(src_pix));
> @@ -109,7 +106,7 @@ kernel void average(__write_only image2d_t dst,
>       float w = weight[y * dim.x + x];
>       float s = sum[y * dim.x + x];
>       float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
> -    float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
> +    float r = (s + src_pix) / (1.0f + w);
>       if (x < dim.x && y < dim.y)
>           write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
>   }
> diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
> index e57b5e0873..0b69f3b6c4 100644
> --- a/libavfilter/vf_nlmeans_opencl.c
> +++ b/libavfilter/vf_nlmeans_opencl.c
> @@ -30,11 +30,9 @@
>   #include "opencl_source.h"
>   #include "video.h"
>   
> -// TODO:
> -//      the integral image may overflow 32bit, consider using 64bit
> -
>   static const enum AVPixelFormat supported_formats[] = {
>       AV_PIX_FMT_YUV420P,
> +    AV_PIX_FMT_YUV420P16LE,
>       AV_PIX_FMT_YUV444P,

Is this the only format it has added support for?  Feels like it could do others which in the MSBs.

>       AV_PIX_FMT_GBRP,
>   };
> @@ -59,7 +57,6 @@ typedef struct NLMeansOpenCLContext {
>       cl_mem                integral_img;
>       cl_mem                weight;
>       cl_mem                sum;
> -    cl_mem                overflow; // overflow in integral image?
>       double                sigma;
>       float                 h;
>       int                   chroma_w;
> @@ -129,7 +126,7 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
>                        "average kernel %d.\n", cle);
>   
>       ctx->integral_img = clCreateBuffer(ctx->ocf.hwctx->context, 0,
> -                                       4 * width * height * sizeof(cl_int),
> +                                       4 * width * height * sizeof(cl_float),
>                                          NULL, &cle);
>       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
>                        "integral image %d.\n", cle);
> @@ -144,11 +141,6 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
>       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
>                        "sum buffer %d.\n", cle);
>   
> -    ctx->overflow = clCreateBuffer(ctx->ocf.hwctx->context, 0,
> -                                   sizeof(cl_int), NULL, &cle);
> -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
> -                     "overflow buffer %d.\n", cle);
> -
>       ctx->initialised = 1;
>       return 0;
>   
> @@ -161,7 +153,6 @@ fail:
>       CL_RELEASE_MEMORY(ctx->integral_img);
>       CL_RELEASE_MEMORY(ctx->weight);
>       CL_RELEASE_MEMORY(ctx->sum);
> -    CL_RELEASE_MEMORY(ctx->overflow);
>   
>       CL_RELEASE_QUEUE(ctx->command_queue);
>       return err;
> @@ -239,9 +230,8 @@ static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src,
>           // vertical pass
>           // integral(x, y) = sum(integral(x, v)) for v in [0, y]
>           CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img);
> -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);
> -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);
> -        CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &width);
> +        CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &height);

Not affecting this patch, but a side thought: are these arguments actually needed?  They are used as the work sizes, so might be accessible inside the kernel already.

>           cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
>                                        1, NULL, worksize2, NULL, 0, NULL, NULL);
>           CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
> @@ -293,8 +283,7 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
>       const AVPixFmtDescriptor *desc;
>       enum AVPixelFormat in_format;
>       cl_mem src, dst;
> -    const cl_int zero = 0;
> -    int w, h, err, cle, overflow, p, patch, research;
> +    int w, h, err, cle, p, patch, research;
>   
>       av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
>              av_get_pix_fmt_name(input->format),
> @@ -331,11 +320,6 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
>               goto fail;
>       }
>   
> -    cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
> -                               0, sizeof(cl_int), &zero, 0, NULL, NULL);
> -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow"
> -                     "detection buffer %d.\n", cle);
> -
>       for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
>           src = (cl_mem) input->data[p];
>           dst = (cl_mem) output->data[p];
> @@ -351,17 +335,10 @@ static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
>           if (err < 0)
>               goto fail;
>       }
> -    // overflow occurred?
> -    cle = clEnqueueReadBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
> -                              0, sizeof(cl_int), &overflow, 0, NULL, NULL);
> -    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read overflow: %d.\n", cle);
>   
>       cle = clFinish(ctx->command_queue);
>       CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
>   
> -    if (overflow > 0)
> -        av_log(avctx, AV_LOG_ERROR, "integral image overflow %d\n", overflow);
> -
>       av_frame_free(&input);
>   
>       av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
> @@ -390,7 +367,6 @@ static av_cold void nlmeans_opencl_uninit(AVFilterContext *avctx)
>       CL_RELEASE_MEMORY(ctx->integral_img);
>       CL_RELEASE_MEMORY(ctx->weight);
>       CL_RELEASE_MEMORY(ctx->sum);
> -    CL_RELEASE_MEMORY(ctx->overflow);
>   
>       CL_RELEASE_QUEUE(ctx->command_queue);
>   
> 

Thanks,

- Mark