[FFmpeg-cvslog] nlmeans_vulkan: parallelize workgroup invocations

Wed Oct 11 18:18:52 EEST 2023

ffmpeg | branch: master | Lynne <dev at lynne.ee> | Fri Sep 15 21:55:59 2023 +0200| [f31d0f11417067a3fc9d53085c32f4ba82b252e4] | committer: Lynne

nlmeans_vulkan: parallelize workgroup invocations

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f31d0f11417067a3fc9d53085c32f4ba82b252e4
---

 libavfilter/Makefile               |   3 +-
 libavfilter/vf_nlmeans_vulkan.c    | 438 +++++++++++++++++++------------------
 libavfilter/vulkan/prefix_sum.comp | 151 -------------
 3 files changed, 224 insertions(+), 368 deletions(-)

diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 9a100cd665..603b532ad0 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -395,8 +395,7 @@ OBJS-$(CONFIG_MULTIPLY_FILTER)               += vf_multiply.o
 OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_negate.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
-OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER)         += vf_nlmeans_vulkan.o vulkan.o vulkan_filter.o \
-                                                vulkan/prefix_sum.o
+OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER)         += vf_nlmeans_vulkan.o vulkan.o vulkan_filter.o
 OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index 9741dd67ac..2b8f97d7d9 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -38,9 +38,10 @@ typedef struct NLMeansVulkanContext {
     VkSampler sampler;
 
     AVBufferPool *integral_buf_pool;
-    AVBufferPool *state_buf_pool;
     AVBufferPool *ws_buf_pool;
 
+    FFVkBuffer xyoffsets_buf;
+
     int pl_weights_rows;
     FFVulkanPipeline pl_weights;
     FFVkSPIRVShader shd_weights;
@@ -66,107 +67,97 @@ typedef struct NLMeansVulkanContext {
 
 extern const char *ff_source_prefix_sum_comp;
 
-static void insert_first(FFVkSPIRVShader *shd, int r, int horiz, int plane, int comp)
+static void insert_first(FFVkSPIRVShader *shd, int r, const char *off, int horiz, int plane, int comp)
 {
-    GLSLF(2,     s1    = texture(input_img[%i], ivec2(x + %i, y + %i))[%i];
-          ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-
-    if (TYPE_ELEMS == 4) {
-        GLSLF(2, s2[0] = texture(input_img[%i], ivec2(x + %i + xoffs[0], y + %i + yoffs[0]))[%i];
-              ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-        GLSLF(2, s2[1] = texture(input_img[%i], ivec2(x + %i + xoffs[1], y + %i + yoffs[1]))[%i];
-              ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-        GLSLF(2, s2[2] = texture(input_img[%i], ivec2(x + %i + xoffs[2], y + %i + yoffs[2]))[%i];
-              ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-        GLSLF(2, s2[3] = texture(input_img[%i], ivec2(x + %i + xoffs[3], y + %i + yoffs[3]))[%i];
-              ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-    } else {
-        for (int i = 0; i < 16; i++) {
-            GLSLF(2, s2[%i][%i] = texture(input_img[%i], ivec2(x + %i + xoffs[%i], y + %i + yoffs[%i]))[%i];
-                  ,i / 4, i % 4, plane, horiz ? r : 0, i, !horiz ? r : 0, i, comp);
-        }
-    }
-
-    GLSLC(2, s2 = (s1 - s2) * (s1 - s2);                                       );
+    GLSLF(4, s1    = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
+          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+
+    GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
+          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+    GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
+          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+    GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
+          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+    GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
+          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+
+    GLSLC(4, s2 = (s1 - s2) * (s1 - s2);                                                    );
 }
 
 static void insert_horizontal_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
 {
-    GLSLF(1, x = int(gl_GlobalInvocationID.x) * %i;                   ,nb_rows);
-    if (!first) {
-        GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
-                                gl_StorageSemanticsBuffer,
-                                gl_SemanticsAcquireRelease |
-                                gl_SemanticsMakeAvailable |
-                                gl_SemanticsMakeVisible);                     );
-    }
-    GLSLF(1, for (y = 0; y < height[%i]; y++) {                               ,plane);
-    GLSLC(2,     offset = uint64_t(int_stride)*y*T_ALIGN;                     );
-    GLSLC(2,     dst = DataBuffer(uint64_t(integral_data) + offset);          );
-    GLSLC(0,                                                                  );
-    if (first) {
-        for (int r = 0; r < nb_rows; r++) {
-            insert_first(shd, r, 1, plane, comp);
-            GLSLF(2, dst.v[x + %i] = s2;                                    ,r);
-            GLSLC(0,                                                          );
-        }
-    }
-    GLSLC(2,     barrier();                                                   );
-    GLSLC(2,     prefix_sum(dst, 1, dst, 1);                                  );
-    GLSLC(1, }                                                                );
-    GLSLC(0,                                                                  );
+    GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i;                           ,nb_rows);
+    if (!first)
+        GLSLC(1, barrier();                                                       );
+    GLSLC(0,                                                                      );
+    GLSLF(1, if (pos.y < height[%i]) {                                            ,plane);
+    GLSLC(2,     #pragma unroll(1)                                                );
+    GLSLF(2,     for (r = 0; r < %i; r++) {                                       ,nb_rows);
+    GLSLC(3,         prefix_sum = DTYPE(0);                                       );
+    GLSLC(3,         offset = uint64_t(int_stride)*(pos.y + r)*T_ALIGN;           );
+    GLSLC(3,         dst = DataBuffer(uint64_t(integral_data) + offset);          );
+    GLSLC(0,                                                                      );
+    GLSLF(3,         for (pos.x = 0; pos.x < width[%i]; pos.x++) {                ,plane);
+    if (first)
+        insert_first(shd, 0, "r", 0, plane, comp);
+    else
+        GLSLC(4,         s2 = dst.v[pos.x];                                       );
+    GLSLC(4,             dst.v[pos.x] = s2 + prefix_sum;                          );
+    GLSLC(4,             prefix_sum += s2;                                        );
+    GLSLC(3,         }                                                            );
+    GLSLC(2,     }                                                                );
+    GLSLC(1, }                                                                    );
+    GLSLC(0,                                                                      );
 }
 
 static void insert_vertical_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
 {
-    GLSLF(1, y = int(gl_GlobalInvocationID.x) * %i;                   ,nb_rows);
-    if (!first) {
-        GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
-                                gl_StorageSemanticsBuffer,
-                                gl_SemanticsAcquireRelease |
-                                gl_SemanticsMakeAvailable |
-                                gl_SemanticsMakeVisible);                     );
-    }
-    GLSLF(1, for (x = 0; x < width[%i]; x++) {                                ,plane);
-    GLSLC(2,     dst = DataBuffer(uint64_t(integral_data) + x*T_ALIGN);       );
-
-    for (int r = 0; r < nb_rows; r++) {
-        if (first) {
-            insert_first(shd, r, 0, plane, comp);
-            GLSLF(2, integral_data.v[(y + %i)*int_stride + x] = s2;         ,r);
-            GLSLC(0,                                                          );
-        }
-    }
-
-    GLSLC(2,     barrier();                                                   );
-    GLSLC(2,     prefix_sum(dst, int_stride, dst, int_stride);                );
-    GLSLC(1, }                                                                );
-    GLSLC(0,                                                                  );
+    GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i;                           ,nb_rows);
+    GLSLC(1, #pragma unroll(1)                                                    );
+    GLSLF(1, for (r = 0; r < %i; r++)                                             ,nb_rows);
+    GLSLC(2,     psum[r] = DTYPE(0);                                              );
+    GLSLC(0,                                                                      );
+    if (!first)
+        GLSLC(1, barrier();                                                       );
+    GLSLC(0,                                                                      );
+    GLSLF(1, if (pos.x < width[%i]) {                                             ,plane);
+    GLSLF(2,     for (pos.y = 0; pos.y < height[%i]; pos.y++) {                   ,plane);
+    GLSLC(3,         offset = uint64_t(int_stride)*pos.y*T_ALIGN;                 );
+    GLSLC(3,         dst = DataBuffer(uint64_t(integral_data) + offset);          );
+    GLSLC(0,                                                                      );
+    GLSLC(3,         #pragma unroll(1)                                            );
+    GLSLF(3,         for (r = 0; r < %i; r++) {                                   ,nb_rows);
+    if (first)
+        insert_first(shd, 0, "r", 1, plane, comp);
+    else
+        GLSLC(4,         s2 = dst.v[pos.x + r];                                   );
+    GLSLC(4,             dst.v[pos.x + r] = s2 + psum[r];                         );
+    GLSLC(4,             psum[r] += s2;                                           );
+    GLSLC(3,         }                                                            );
+    GLSLC(2,     }                                                                );
+    GLSLC(1, }                                                                    );
+    GLSLC(0,                                                                      );
 }
 
 static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
                                 int t, int dst_comp, int plane, int comp)
 {
-    GLSLF(1, p = patch_size[%i];                                     ,dst_comp);
+    GLSLF(1, p = patch_size[%i];                                              ,dst_comp);
     GLSLC(0,                                                                  );
-    GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
-                            gl_StorageSemanticsBuffer,
-                            gl_SemanticsAcquireRelease |
-                            gl_SemanticsMakeAvailable |
-                            gl_SemanticsMakeVisible);                         );
     GLSLC(1, barrier();                                                       );
+    GLSLC(0,                                                                  );
     if (!vert) {
-        GLSLF(1, for (y = 0; y < height[%i]; y++) {                           ,plane);
+        GLSLF(1, for (pos.y = 0; pos.y < height[%i]; pos.y++) {               ,plane);
         GLSLF(2,     if (gl_GlobalInvocationID.x*%i >= width[%i])             ,nb_rows, plane);
         GLSLC(3,         break;                                               );
-        GLSLF(2,     for (r = 0; r < %i; r++) {                       ,nb_rows);
-        GLSLF(3,         x = int(gl_GlobalInvocationID.x) * %i + r;   ,nb_rows);
+        GLSLF(2,     for (r = 0; r < %i; r++) {                               ,nb_rows);
+        GLSLF(3,         pos.x = int(gl_GlobalInvocationID.x) * %i + r;       ,nb_rows);
     } else {
-        GLSLF(1, for (x = 0; x < width[%i]; x++) {                            ,plane);
+        GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) {                ,plane);
         GLSLF(2,     if (gl_GlobalInvocationID.x*%i >= height[%i])            ,nb_rows, plane);
         GLSLC(3,         break;                                               );
-        GLSLF(2,     for (r = 0; r < %i; r++) {                       ,nb_rows);
-        GLSLF(3,         y = int(gl_GlobalInvocationID.x) * %i + r;   ,nb_rows);
+        GLSLF(2,     for (r = 0; r < %i; r++) {                               ,nb_rows);
+        GLSLF(3,         pos.y = int(gl_GlobalInvocationID.x) * %i + r;       ,nb_rows);
     }
     GLSLC(0,                                                                  );
     GLSLC(3,         a = DTYPE(0);                                            );
@@ -174,25 +165,25 @@ static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
     GLSLC(3,         c = DTYPE(0);                                            );
     GLSLC(3,         d = DTYPE(0);                                            );
     GLSLC(0,                                                                  );
-    GLSLC(3,         lt = ((x - p) < 0) || ((y - p) < 0);                     );
+    GLSLC(3,         lt = ((pos.x - p) < 0) || ((pos.y - p) < 0);             );
     GLSLC(0,                                                                  );
     if (TYPE_ELEMS == 4) {
-        GLSLF(3,         src[0] = texture(input_img[%i], ivec2(x + xoffs[0], y + yoffs[0]))[%i];   ,plane, comp);
-        GLSLF(3,         src[1] = texture(input_img[%i], ivec2(x + xoffs[1], y + yoffs[1]))[%i];   ,plane, comp);
-        GLSLF(3,         src[2] = texture(input_img[%i], ivec2(x + xoffs[2], y + yoffs[2]))[%i];   ,plane, comp);
-        GLSLF(3,         src[3] = texture(input_img[%i], ivec2(x + xoffs[3], y + yoffs[3]))[%i];   ,plane, comp);
+        GLSLF(3,         src[0] = texture(input_img[%i], pos + offs[0])[%i];   ,plane, comp);
+        GLSLF(3,         src[1] = texture(input_img[%i], pos + offs[1])[%i];   ,plane, comp);
+        GLSLF(3,         src[2] = texture(input_img[%i], pos + offs[2])[%i];   ,plane, comp);
+        GLSLF(3,         src[3] = texture(input_img[%i], pos + offs[3])[%i];   ,plane, comp);
     } else {
         for (int i = 0; i < 16; i++)
-            GLSLF(3, src[%i][%i] = texture(input_img[%i], ivec2(x + xoffs[%i], y + yoffs[%i]))[%i];
-                  ,i / 4, i % 4, plane, i, i, comp);
+            GLSLF(3, src[%i][%i] = texture(input_img[%i], pos + offs[%i])[%i];
+                  ,i / 4, i % 4, plane, i, comp);
 
     }
     GLSLC(0,                                                                  );
     GLSLC(3,         if (lt == false) {                                       );
-    GLSLC(4,             a = integral_data.v[(y - p)*int_stride + x - p];     );
-    GLSLC(4,             c = integral_data.v[(y - p)*int_stride + x + p];     );
-    GLSLC(4,             b = integral_data.v[(y + p)*int_stride + x - p];     );
-    GLSLC(4,             d = integral_data.v[(y + p)*int_stride + x + p];     );
+    GLSLC(4,             a = integral_data.v[(pos.y - p)*int_stride + pos.x - p];     );
+    GLSLC(4,             c = integral_data.v[(pos.y - p)*int_stride + pos.x + p];     );
+    GLSLC(4,             b = integral_data.v[(pos.y + p)*int_stride + pos.x - p];     );
+    GLSLC(4,             d = integral_data.v[(pos.y + p)*int_stride + pos.x + p];     );
     GLSLC(3,         }                                                        );
     GLSLC(0,                                                                  );
     GLSLC(3,         patch_diff = d + a - b - c;                              );
@@ -212,27 +203,26 @@ static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
     }
     GLSLC(0,                                                                  );
     if (t > 1) {
-        GLSLF(3,         atomicAdd(weights_%i[y*ws_stride[%i] + x], w_sum);   ,dst_comp, dst_comp);
-        GLSLF(3,         atomicAdd(sums_%i[y*ws_stride[%i] + x], sum);        ,dst_comp, dst_comp);
+        GLSLF(3,         atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum);   ,dst_comp, dst_comp);
+        GLSLF(3,         atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum);        ,dst_comp, dst_comp);
     } else {
-        GLSLF(3,         weights_%i[y*ws_stride[%i] + x] += w_sum;            ,dst_comp, dst_comp);
-        GLSLF(3,         sums_%i[y*ws_stride[%i] + x] += sum;                 ,dst_comp, dst_comp);
+        GLSLF(3,         weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum;            ,dst_comp, dst_comp);
+        GLSLF(3,         sums_%i[pos.y*ws_stride[%i] + pos.x] += sum;                 ,dst_comp, dst_comp);
     }
     GLSLC(2,     }                                                            );
     GLSLC(1, }                                                                );
 }
 
 typedef struct HorizontalPushData {
-    VkDeviceAddress integral_data;
-    VkDeviceAddress state_data;
-    int32_t  xoffs[TYPE_ELEMS];
-    int32_t  yoffs[TYPE_ELEMS];
     uint32_t width[4];
     uint32_t height[4];
     uint32_t ws_stride[4];
     int32_t  patch_size[4];
     float    strength[4];
+    VkDeviceAddress integral_base;
+    uint32_t integral_size;
     uint32_t int_stride;
+    uint32_t xyoffs_start;
 } HorizontalPushData;
 
 static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
@@ -249,26 +239,18 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
     FFVulkanDescriptorSetBinding *desc_set;
     int max_dim = FFMAX(width, height);
     uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
-    int max_shm = vkctx->props.properties.limits.maxComputeSharedMemorySize;
     int wg_size, wg_rows;
 
     /* Round the max workgroup size to the previous power of two */
-    max_wg = 1 << (31 - ff_clz(max_wg));
     wg_size = max_wg;
     wg_rows = 1;
 
     if (max_wg > max_dim) {
-        wg_size = max_wg / (max_wg / max_dim);
+        wg_size = max_dim;
     } else if (max_wg < max_dim) {
-        /* First, make it fit */
+        /* Make it fit */
         while (wg_size*wg_rows < max_dim)
             wg_rows++;
-
-        /* Second, make sure there's enough shared memory */
-        while ((wg_size * TYPE_SIZE + TYPE_SIZE + 2*4) > max_shm) {
-            wg_size >>= 1;
-            wg_rows++;
-        }
     }
 
     RET(ff_vk_shader_init(pl, shd, "nlmeans_weights", VK_SHADER_STAGE_COMPUTE_BIT, 0));
@@ -278,33 +260,24 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
     if (t > 1)
         GLSLC(0, #extension GL_EXT_shader_atomic_float : require              );
     GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require                     );
-    GLSLC(0, #pragma use_vulkan_memory_model                                  );
-    GLSLC(0, #extension GL_KHR_memory_scope_semantics : enable                );
     GLSLC(0,                                                                  );
-    GLSLF(0, #define N_ROWS %i                                       ,*nb_rows);
-    GLSLC(0, #define WG_SIZE (gl_WorkGroupSize.x)                             );
-    GLSLF(0, #define LG_WG_SIZE %i                ,ff_log2(shd->local_size[0]));
-    GLSLC(0, #define PARTITION_SIZE (N_ROWS*WG_SIZE)                          );
-    GLSLF(0, #define DTYPE %s                                       ,TYPE_NAME);
-    GLSLF(0, #define T_ALIGN %i                                     ,TYPE_SIZE);
+    GLSLF(0, #define DTYPE %s                                                 ,TYPE_NAME);
+    GLSLF(0, #define T_ALIGN %i                                               ,TYPE_SIZE);
     GLSLC(0,                                                                  );
-    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) coherent buffer DataBuffer {  );
+    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer {  );
     GLSLC(1,     DTYPE v[];                                                   );
     GLSLC(0, };                                                               );
     GLSLC(0,                                                                  );
-    GLSLC(0, layout(buffer_reference) buffer StateData;                       );
-    GLSLC(0,                                                                  );
     GLSLC(0, layout(push_constant, std430) uniform pushConstants {            );
-    GLSLC(1,     coherent DataBuffer integral_data;                           );
-    GLSLC(1,     StateData  state;                                            );
-    GLSLF(1,     uint xoffs[%i];                                   ,TYPE_ELEMS);
-    GLSLF(1,     uint yoffs[%i];                                   ,TYPE_ELEMS);
     GLSLC(1,     uvec4 width;                                                 );
     GLSLC(1,     uvec4 height;                                                );
     GLSLC(1,     uvec4 ws_stride;                                             );
     GLSLC(1,     ivec4 patch_size;                                            );
     GLSLC(1,     vec4 strength;                                               );
+    GLSLC(1,     DataBuffer integral_base;                                    );
+    GLSLC(1,     uint integral_size;                                          );
     GLSLC(1,     uint int_stride;                                             );
+    GLSLC(1,     uint xyoffs_start;                                           );
     GLSLC(0, };                                                               );
     GLSLC(0,                                                                  );
 
@@ -370,42 +343,65 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
     };
     RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1 + 2*desc->nb_components, 0, 0));
 
-    GLSLD(   ff_source_prefix_sum_comp                                        );
-    GLSLC(0,                                                                  );
-    GLSLC(0, void main()                                                      );
-    GLSLC(0, {                                                                );
-    GLSLC(1,     uint64_t offset;                                             );
-    GLSLC(1,     DataBuffer dst;                                              );
-    GLSLC(1,     float s1;                                                    );
-    GLSLC(1,     DTYPE s2;                                                    );
-    GLSLC(1,     int r;                                                       );
-    GLSLC(1,     int x;                                                       );
-    GLSLC(1,     int y;                                                       );
-    GLSLC(1,     int p;                                                       );
-    GLSLC(0,                                                                  );
-    GLSLC(1,     DTYPE a;                                                     );
-    GLSLC(1,     DTYPE b;                                                     );
-    GLSLC(1,     DTYPE c;                                                     );
-    GLSLC(1,     DTYPE d;                                                     );
-    GLSLC(0,                                                                  );
-    GLSLC(1,     DTYPE patch_diff;                                            );
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "xyoffsets_buffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "ivec2 xyoffsets[];",
+        },
+    };
+    RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1, 1, 0));
+
+    GLSLC(0,                                                                     );
+    GLSLC(0, void main()                                                         );
+    GLSLC(0, {                                                                   );
+    GLSLC(1,     uint64_t offset;                                                );
+    GLSLC(1,     DataBuffer dst;                                                 );
+    GLSLC(1,     float s1;                                                       );
+    GLSLC(1,     DTYPE s2;                                                       );
+    GLSLC(1,     DTYPE prefix_sum;                                               );
+    GLSLF(1,     DTYPE psum[%i];                                                 ,*nb_rows);
+    GLSLC(1,     int r;                                                          );
+    GLSLC(1,     ivec2 pos;                                                      );
+    GLSLC(1,     int p;                                                          );
+    GLSLC(0,                                                                     );
+    GLSLC(1,     DataBuffer integral_data;                                       );
+    GLSLF(1,     ivec2 offs[%i];                                                 ,TYPE_ELEMS);
+    GLSLC(0,                                                                     );
+    GLSLC(1,     int invoc_idx = int(gl_WorkGroupID.z);                          );
+
+    GLSLC(1,     offset = uint64_t(integral_size)*invoc_idx;                     );
+    GLSLC(1,     dst = DataBuffer(uint64_t(integral_data) + offset);             );
+
+    GLSLC(1,     integral_data = DataBuffer(uint64_t(integral_base) + offset);   );
+    for (int i = 0; i < TYPE_ELEMS*2; i += 2)
+        GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + 2*%i*invoc_idx + %i];       ,i/2,TYPE_ELEMS,i);
+    GLSLC(0,                                                                     );
+    GLSLC(1,     DTYPE a;                                                        );
+    GLSLC(1,     DTYPE b;                                                        );
+    GLSLC(1,     DTYPE c;                                                        );
+    GLSLC(1,     DTYPE d;                                                        );
+    GLSLC(0,                                                                     );
+    GLSLC(1,     DTYPE patch_diff;                                               );
     if (TYPE_ELEMS == 4) {
-        GLSLC(1, vec4 src;                                                    );
-        GLSLC(1, vec4 w;                                                      );
+        GLSLC(1, vec4 src;                                                       );
+        GLSLC(1, vec4 w;                                                         );
     } else {
-        GLSLC(1, vec4 src[4];                                                 );
-        GLSLC(1, vec4 w[4];                                                   );
+        GLSLC(1, vec4 src[4];                                                    );
+        GLSLC(1, vec4 w[4];                                                      );
     }
-    GLSLC(1,     float w_sum;                                                 );
-    GLSLC(1,     float sum;                                                   );
-    GLSLC(0,                                                                  );
-    GLSLC(1,     bool lt;                                                     );
-    GLSLC(1,     bool gt;                                                     );
-    GLSLC(0,                                                                  );
+    GLSLC(1,     float w_sum;                                                    );
+    GLSLC(1,     float sum;                                                      );
+    GLSLC(0,                                                                     );
+    GLSLC(1,     bool lt;                                                        );
+    GLSLC(1,     bool gt;                                                        );
+    GLSLC(0,                                                                     );
 
     for (int i = 0; i < desc->nb_components; i++) {
         int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8);
-        if (width > height) {
+        if (width >= height) {
             insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
             insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
             insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane, off);
@@ -416,7 +412,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
         }
     }
 
-    GLSLC(0, }                                                                );
+    GLSLC(0, }                                                                   );
 
     RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
     RET(ff_vk_shader_create(vkctx, shd, spv_data, spv_len, "main"));
@@ -584,6 +580,8 @@ static av_cold int init_filter(AVFilterContext *ctx)
     FFVulkanContext *vkctx = &s->vkctx;
     const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
     FFVkSPIRVCompiler *spv;
+    int *offsets_buf;
+    int offsets_dispatched = 0, nb_dispatches = 0;
 
     const AVPixFmtDescriptor *desc;
     desc = av_pix_fmt_desc_get(vkctx->output_format);
@@ -634,6 +632,20 @@ static av_cold int init_filter(AVFilterContext *ctx)
         }
     }
 
+    RET(ff_vk_create_buf(&s->vkctx, &s->xyoffsets_buf, 2*s->nb_offsets*sizeof(int32_t), NULL, NULL,
+                         VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
+                         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    RET(ff_vk_map_buffer(&s->vkctx, &s->xyoffsets_buf, (uint8_t **)&offsets_buf, 0));
+
+    for (int i = 0; i < 2*s->nb_offsets; i += 2) {
+        offsets_buf[i + 0] = s->xoffsets[i >> 1];
+        offsets_buf[i + 1] = s->yoffsets[i >> 1];
+    }
+
+    RET(ff_vk_unmap_buffer(&s->vkctx, &s->xyoffsets_buf, 1));
+
     s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS));
     if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) {
         av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds, "
@@ -641,11 +653,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
         s->opts.t = 1;
     }
 
-    if (!vkctx->feats_12.vulkanMemoryModel) {
-        av_log(ctx, AV_LOG_ERROR, "Device doesn't support the Vulkan memory model!");
-        return AVERROR(EINVAL);;
-    }
-
     spv = ff_vk_spirv_init();
     if (!spv) {
         av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
@@ -663,8 +670,19 @@ static av_cold int init_filter(AVFilterContext *ctx)
     RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler,
                               spv, desc, planes));
 
-    av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches, %i parallel\n",
-           s->nb_offsets, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS) + 1, s->opts.t);
+    RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, NULL, 1, 0, 0,
+                                    s->xyoffsets_buf.address, s->xyoffsets_buf.size,
+                                    VK_FORMAT_UNDEFINED));
+
+    do {
+        int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
+        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
+        offsets_dispatched += wg_invoc * TYPE_ELEMS;
+        nb_dispatches++;
+    } while (offsets_dispatched < s->nb_offsets);
+
+    av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches\n",
+           s->nb_offsets, nb_dispatches);
 
     s->initialized = 1;
 
@@ -736,18 +754,16 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     int plane_widths[4];
     int plane_heights[4];
 
+    int offsets_dispatched = 0;
+
     /* Integral */
-    AVBufferRef *state_buf;
-    FFVkBuffer *state_vk;
-    AVBufferRef *integral_buf;
+    AVBufferRef *integral_buf = NULL;
     FFVkBuffer *integral_vk;
     uint32_t int_stride;
     size_t int_size;
-    size_t state_size;
-    int t_offset = 0;
 
     /* Weights/sums */
-    AVBufferRef *ws_buf;
+    AVBufferRef *ws_buf = NULL;
     FFVkBuffer *ws_vk;
     VkDeviceAddress weights_addr[4];
     VkDeviceAddress sums_addr[4];
@@ -773,7 +789,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     /* Integral image */
     int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows;
     int_size = int_stride * int_stride * TYPE_SIZE;
-    state_size = int_stride * 3 *TYPE_SIZE;
 
     /* Plane dimensions */
     for (int i = 0; i < desc->nb_components; i++) {
@@ -798,16 +813,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
         return err;
     integral_vk = (FFVkBuffer *)integral_buf->data;
 
-    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->state_buf_pool, &state_buf,
-                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
-                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
-                                  NULL,
-                                  s->opts.t * state_size,
-                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
-    if (err < 0)
-        return err;
-    state_vk = (FFVkBuffer *)state_buf->data;
-
     err = ff_vk_get_pooled_buffer(&s->vkctx, &s->ws_buf_pool, &ws_buf,
                                   VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                   VK_BUFFER_USAGE_TRANSFER_DST_BIT |
@@ -844,9 +849,12 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     RET(ff_vk_exec_add_dep_frame(vkctx, exec, out,
                                  VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                  VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+
     RET(ff_vk_exec_add_dep_buf(vkctx, exec, &integral_buf, 1, 0));
-    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &state_buf,    1, 0));
+    integral_buf = NULL;
+
     RET(ff_vk_exec_add_dep_buf(vkctx, exec, &ws_buf,       1, 0));
+    ws_buf = NULL;
 
     /* Input frame prep */
     RET(ff_vk_create_imageviews(vkctx, exec, in_views, in));
@@ -869,6 +877,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
 
+    nb_buf_bar = 0;
     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
         .srcStageMask = ws_vk->stage,
@@ -881,6 +890,19 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
         .size = ws_vk->size,
         .offset = 0,
     };
+    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+        .srcStageMask = integral_vk->stage,
+        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .srcAccessMask = integral_vk->access,
+        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .buffer = integral_vk->buf,
+        .size = integral_vk->size,
+        .offset = 0,
+    };
 
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
             .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
@@ -891,10 +913,13 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
         });
     ws_vk->stage = buf_bar[0].dstStageMask;
     ws_vk->access = buf_bar[0].dstAccessMask;
+    integral_vk->stage = buf_bar[1].dstStageMask;
+    integral_vk->access = buf_bar[1].dstAccessMask;
 
-    /* Weights/sums buffer zeroing */
+    /* Buffer zeroing */
     vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
 
+    nb_buf_bar = 0;
     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
         .srcStageMask = ws_vk->stage,
@@ -948,29 +973,22 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     /* Weights pipeline */
     ff_vk_exec_bind_pipeline(vkctx, exec, &s->pl_weights);
 
-    for (int i = 0; i < s->nb_offsets; i += TYPE_ELEMS) {
-        int *xoffs = s->xoffsets + i;
-        int *yoffs = s->yoffsets + i;
+    do {
+        int wg_invoc;
         HorizontalPushData pd = {
-            integral_vk->address + t_offset*int_size,
-            state_vk->address + t_offset*state_size,
-            { 0 },
-            { 0 },
             { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
             { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
             { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
             { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
             { s->strength[0], s->strength[1], s->strength[2], s->strength[2], },
+            integral_vk->address,
+            int_size,
             int_stride,
+            offsets_dispatched * 2,
         };
 
-        memcpy(pd.xoffs, xoffs, sizeof(pd.xoffs));
-        memcpy(pd.yoffs, yoffs, sizeof(pd.yoffs));
-
-        /* Put a barrier once we run out of parallelism buffers */
-        if (!t_offset) {
+        if (offsets_dispatched) {
             nb_buf_bar = 0;
-            /* Buffer prep/sync */
             buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
                 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                 .srcStageMask = integral_vk->stage,
@@ -984,39 +1002,28 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
                 .size = integral_vk->size,
                 .offset = 0,
             };
-            buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-                .srcStageMask = state_vk->stage,
-                .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                .srcAccessMask = state_vk->access,
-                .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                .buffer = state_vk->buf,
-                .size = state_vk->size,
-                .offset = 0,
-            };
 
             vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
                     .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
                     .pBufferMemoryBarriers = buf_bar,
                     .bufferMemoryBarrierCount = nb_buf_bar,
                 });
-            integral_vk->stage = buf_bar[0].dstStageMask;
-            integral_vk->access = buf_bar[0].dstAccessMask;
-            state_vk->stage = buf_bar[1].dstStageMask;
-            state_vk->access = buf_bar[1].dstAccessMask;
+            integral_vk->stage = buf_bar[0].dstStageMask; /* [0]: the only barrier in this batch */
+            integral_vk->access = buf_bar[0].dstAccessMask;
         }
-        t_offset = (t_offset + 1) % s->opts.t;
 
         /* Push data */
         ff_vk_update_push_exec(vkctx, exec, &s->pl_weights, VK_SHADER_STAGE_COMPUTE_BIT,
                                0, sizeof(pd), &pd);
 
+        wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
+        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
+
         /* End of horizontal pass */
-        vk->CmdDispatch(exec->buf, 1, 1, 1);
-    }
+        vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);
+
+        offsets_dispatched += wg_invoc * TYPE_ELEMS;
+    } while (offsets_dispatched < s->nb_offsets);
 
     RET(denoise_pass(s, exec, ws_vk, ws_stride));
 
@@ -1033,6 +1040,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     return ff_filter_frame(outlink, out);
 
 fail:
+    av_buffer_unref(&integral_buf);
+    av_buffer_unref(&ws_buf);
     av_frame_free(&in);
     av_frame_free(&out);
     return err;
@@ -1051,7 +1060,6 @@ static void nlmeans_vulkan_uninit(AVFilterContext *avctx)
     ff_vk_shader_free(vkctx, &s->shd_denoise);
 
     av_buffer_pool_uninit(&s->integral_buf_pool);
-    av_buffer_pool_uninit(&s->state_buf_pool);
     av_buffer_pool_uninit(&s->ws_buf_pool);
 
     if (s->sampler)
diff --git a/libavfilter/vulkan/prefix_sum.comp b/libavfilter/vulkan/prefix_sum.comp
deleted file mode 100644
index 9147cd82fb..0000000000
--- a/libavfilter/vulkan/prefix_sum.comp
+++ /dev/null
@@ -1,151 +0,0 @@
-#extension GL_EXT_buffer_reference : require
-#extension GL_EXT_buffer_reference2 : require
-
-#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
-#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
-
-// These correspond to X, A, P respectively in the prefix sum paper.
-#define FLAG_NOT_READY       0u
-#define FLAG_AGGREGATE_READY 1u
-#define FLAG_PREFIX_READY    2u
-
-layout(buffer_reference, buffer_reference_align = T_ALIGN) nonprivate buffer StateData {
-    DTYPE aggregate;
-    DTYPE prefix;
-    uint flag;
-};
-
-shared DTYPE sh_scratch[WG_SIZE];
-shared DTYPE sh_prefix;
-shared uint  sh_part_ix;
-shared uint  sh_flag;
-
-void prefix_sum(DataBuffer dst, uint dst_stride, DataBuffer src, uint src_stride)
-{
-    DTYPE local[N_ROWS];
-    // Determine partition to process by atomic counter (described in Section 4.4 of prefix sum paper).
-    if (gl_GlobalInvocationID.x == 0)
-          sh_part_ix = gl_WorkGroupID.x;
-//        sh_part_ix = atomicAdd(part_counter, 1);
-
-    barrier();
-    uint part_ix = sh_part_ix;
-
-    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
-
-    // TODO: gate buffer read? (evaluate whether shader check or CPU-side padding is better)
-    local[0] = src.v[ix*src_stride];
-    for (uint i = 1; i < N_ROWS; i++)
-        local[i] = local[i - 1] + src.v[(ix + i)*src_stride];
-
-    DTYPE agg = local[N_ROWS - 1];
-    sh_scratch[gl_LocalInvocationID.x] = agg;
-    for (uint i = 0; i < LG_WG_SIZE; i++) {
-        barrier();
-        if (gl_LocalInvocationID.x >= (1u << i))
-            agg += sh_scratch[gl_LocalInvocationID.x - (1u << i)];
-        barrier();
-
-        sh_scratch[gl_LocalInvocationID.x] = agg;
-    }
-
-    // Publish aggregate for this partition
-    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-        state[part_ix].aggregate = agg;
-        if (part_ix == 0)
-            state[0].prefix = agg;
-    }
-
-    // Write flag with release semantics
-    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-        uint flag = part_ix == 0 ? FLAG_PREFIX_READY : FLAG_AGGREGATE_READY;
-        atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE);
-    }
-
-    DTYPE exclusive = DTYPE(0);
-    if (part_ix != 0) {
-        // step 4 of paper: decoupled lookback
-        uint look_back_ix = part_ix - 1;
-
-        DTYPE their_agg;
-        uint their_ix = 0;
-        while (true) {
-            // Read flag with acquire semantics.
-            if (gl_LocalInvocationID.x == WG_SIZE - 1)
-                sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE);
-
-            // The flag load is done only in the last thread. However, because the
-            // translation of memoryBarrierBuffer to Metal requires uniform control
-            // flow, we broadcast it to all threads.
-            barrier();
-
-            uint flag = sh_flag;
-            barrier();
-
-            if (flag == FLAG_PREFIX_READY) {
-                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                    DTYPE their_prefix = state[look_back_ix].prefix;
-                    exclusive = their_prefix + exclusive;
-                }
-                break;
-            } else if (flag == FLAG_AGGREGATE_READY) {
-                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                    their_agg = state[look_back_ix].aggregate;
-                    exclusive = their_agg + exclusive;
-                }
-                look_back_ix--;
-                their_ix = 0;
-                continue;
-            } // else spins
-
-            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-                // Unfortunately there's no guarantee of forward progress of other
-                // workgroups, so compute a bit of the aggregate before trying again.
-                // In the worst case, spinning stops when the aggregate is complete.
-                DTYPE m = src.v[(look_back_ix * PARTITION_SIZE + their_ix)*src_stride];
-                if (their_ix == 0)
-                    their_agg = m;
-                else
-                    their_agg += m;
-
-                their_ix++;
-                if (their_ix == PARTITION_SIZE) {
-                    exclusive = their_agg + exclusive;
-                    if (look_back_ix == 0) {
-                        sh_flag = FLAG_PREFIX_READY;
-                    } else {
-                        look_back_ix--;
-                        their_ix = 0;
-                    }
-                }
-            }
-            barrier();
-            flag = sh_flag;
-            barrier();
-            if (flag == FLAG_PREFIX_READY)
-                break;
-        }
-
-        // step 5 of paper: compute inclusive prefix
-        if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-            DTYPE inclusive_prefix = exclusive + agg;
-            sh_prefix = exclusive;
-            state[part_ix].prefix = inclusive_prefix;
-        }
-
-        if (gl_LocalInvocationID.x == WG_SIZE - 1)
-            atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE);
-    }
-
-    barrier();
-    if (part_ix != 0)
-        exclusive = sh_prefix;
-
-    DTYPE row = exclusive;
-    if (gl_LocalInvocationID.x > 0)
-        row += sh_scratch[gl_LocalInvocationID.x - 1];
-
-    // note - may overwrite
-    for (uint i = 0; i < N_ROWS; i++)
-        dst.v[(ix + i)*dst_stride] = row + local[i];
-}