[FFmpeg-devel] [PATCH] avfilter/vf_nlmeans_opencl: 16-bit depth compatibility
Lucas Clemente Vella
lvella at gmail.com
Mon Mar 22 01:18:20 EET 2021
I do not recommend this patch for inclusion in master.
I used this implementation to quantify the floating pointing error
introduced by my next patch, which I think is better suited for
general usage, because the it is much faster and the error is
very small.
Adds 16-bit depth compatibility to the filter, at the cost
of being twice as slower.
Signed-off-by: Lucas Clemente Vella <lvella at gmail.com>
---
libavfilter/opencl/nlmeans.cl | 28 ++++++++++++++--------------
libavfilter/vf_nlmeans_opencl.c | 6 ++----
2 files changed, 16 insertions(+), 18 deletions(-)
diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
index 72bd681fd6..69e630d9fc 100644
--- a/libavfilter/opencl/nlmeans.cl
+++ b/libavfilter/opencl/nlmeans.cl
@@ -20,7 +20,7 @@ const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST);
-kernel void horiz_sum(__global uint4 *integral_img,
+kernel void horiz_sum(__global ulong4 *integral_img,
__read_only image2d_t src,
int width,
int height,
@@ -31,7 +31,7 @@ kernel void horiz_sum(__global uint4 *integral_img,
int y = get_global_id(0);
int work_size = get_global_size(0);
- uint4 sum = (uint4)(0);
+ ulong4 sum = (ulong4)(0);
float4 s2;
for (int i = 0; i < width; i++) {
float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
@@ -39,20 +39,20 @@ kernel void horiz_sum(__global uint4 *integral_img,
s2.y = read_imagef(src, sampler, (int2)(i + dx.y, y + dy.y)).x;
s2.z = read_imagef(src, sampler, (int2)(i + dx.z, y + dy.z)).x;
s2.w = read_imagef(src, sampler, (int2)(i + dx.w, y + dy.w)).x;
- sum += convert_uint4((s1 - s2) * (s1 - s2) * 255 * 255);
+ sum += convert_ulong4((s1 - s2) * (s1 - s2) * 65535 * 65535);
integral_img[y * width + i] = sum;
}
}
-kernel void vert_sum(__global uint4 *integral_img,
+kernel void vert_sum(__global ulong4 *integral_img,
__global int *overflow,
int width,
int height)
{
int x = get_global_id(0);
- uint4 sum = 0;
+ ulong4 sum = 0;
for (int i = 0; i < height; i++) {
- if (any((uint4)UINT_MAX - integral_img[i * width + x] < sum))
+ if (any((ulong4)ULONG_MAX - integral_img[i * width + x] < sum))
atomic_inc(overflow);
integral_img[i * width + x] += sum;
sum = integral_img[i * width + x];
@@ -60,7 +60,7 @@ kernel void vert_sum(__global uint4 *integral_img,
}
kernel void weight_accum(global float *sum, global float *weight,
- global uint4 *integral_img, __read_only image2d_t src,
+ global ulong4 *integral_img, __read_only image2d_t src,
int width, int height, int p, float h,
int4 dx, int4 dy)
{
@@ -75,16 +75,16 @@ kernel void weight_accum(global float *sum, global float *weight,
int y = get_global_id(1);
int4 xoff = x + dx;
int4 yoff = y + dy;
- uint4 a = 0, b = 0, c = 0, d = 0;
+ ulong4 a = 0, b = 0, c = 0, d = 0;
uint4 src_pix = 0;
// out-of-bounding-box?
int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >= width;
- src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
- src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
- src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
- src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
+ src_pix.x = (int)(65535 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
+ src_pix.y = (int)(65535 * read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x);
+ src_pix.z = (int)(65535 * read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x);
+ src_pix.w = (int)(65535 * read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x);
if (!oobb) {
a = integral_img[(y - p) * width + x - p];
b = integral_img[(y + p) * width + x - p];
@@ -93,7 +93,7 @@ kernel void weight_accum(global float *sum, global float *weight,
}
float4 patch_diff = convert_float4(d + a - c - b);
- float4 w = native_exp(-patch_diff / (h * h));
+ float4 w = native_exp(-patch_diff * (float4)1.5140274644582053e-05 / (h * h));
float w_sum = w.x + w.y + w.z + w.w;
weight[y * width + x] += w_sum;
sum[y * width + x] += dot(w, convert_float4(src_pix));
@@ -109,7 +109,7 @@ kernel void average(__write_only image2d_t dst,
float w = weight[y * dim.x + x];
float s = sum[y * dim.x + x];
float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
- float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
+ float r = (s + src_pix * 65535) / (1.0f + w) / 65535.0f;
if (x < dim.x && y < dim.y)
write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
}
diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
index e57b5e0873..82d08d0f3c 100644
--- a/libavfilter/vf_nlmeans_opencl.c
+++ b/libavfilter/vf_nlmeans_opencl.c
@@ -30,11 +30,9 @@
#include "opencl_source.h"
#include "video.h"
-// TODO:
-// the integral image may overflow 32bit, consider using 64bit
-
static const enum AVPixelFormat supported_formats[] = {
AV_PIX_FMT_YUV420P,
+ AV_PIX_FMT_YUV420P16LE,
AV_PIX_FMT_YUV444P,
AV_PIX_FMT_GBRP,
};
@@ -129,7 +127,7 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int width, int height)
"average kernel %d.\n", cle);
ctx->integral_img = clCreateBuffer(ctx->ocf.hwctx->context, 0,
- 4 * width * height * sizeof(cl_int),
+ 4 * width * height * sizeof(cl_long),
NULL, &cle);
CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
"integral image %d.\n", cle);
--
2.27.0
More information about the ffmpeg-devel
mailing list