xarray-contrib · brendancol · Jul 2, 2026
diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv
@@ -5,7 +5,7 @@ bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation).
 classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU."
 contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
-convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
+convolution,2026-07-02,SAFE,compute-bound,1,3615,_convolve_2d_numpy used prange w/o parallel=True -> ran serial (~7-10x slower); fixed via parallel=True + threading.Lock (macOS SIGABRT hazard #3141); cuda kernel 40 regs OK; dask ~20 tasks/chunk
 corridor,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 cost_distance,2026-06-15,RISKY,memory-bound,1,3342,"Perf sweep 2026-06-15. HIGH: bounded map_overlap branch in _cost_distance_dask gated on full dims (pad>=height/width) not chunk size; pad>chunk collapses to single chunk (#880-class OOM, verified npartitions=1 at chunks=10/pad=96). Fixed: compare pad vs max chunk dim, route to iterative when pad>=chunk (matches GPU path L484). dask+cupy path already correct. Register count 37 (no pressure). nanmin().compute() L478/L1149 intentional scalar. iterative tile_cache full-dataset materialization is documented MemoryError-guarded design (#1118). All 56 tests pass incl GPU."
 curvature,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,

diff --git a/benchmarks/benchmarks/convolution.py b/benchmarks/benchmarks/convolution.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+from xrspatial.convolution import circle_kernel, convolve_2d
+
+from .common import get_xr_dataarray
+
+
+class Convolve2d:  # benchmark: times convolve_2d per (nx, kernelsize, type)
+    params = ([300, 1000, 3000], [(5, 5), (25, 25)], ["numpy", "cupy", "dask"])
+    param_names = ("nx", "kernelsize", "type")
+
+    def setup(self, nx, kernelsize, type):
+        ny = nx // 2  # first shape entry is half of nx
+        self.agg = get_xr_dataarray((ny, nx), type)
+        kernel_h, kernel_w = kernelsize
+        self.kernel = np.ones((kernel_h, kernel_w), dtype=np.float64)
+
+    def time_convolve_2d(self, nx, kernelsize, type):
+        # Pass .data: convolve_2d takes the backing array, not the DataArray.
+        convolve_2d(self.agg.data, self.kernel)
+
+
+class CircleKernel:  # benchmark: times circle_kernel for each radius in params
+    params = ([3, 25, 100],)
+    param_names = ("radius",)
+
+    def time_circle_kernel(self, radius):
+        circle_kernel(1, 1, radius)  # only radius varies
diff --git a/xrspatial/convolution.py b/xrspatial/convolution.py
@@ -1,4 +1,5 @@
 import re
+import threading
 from functools import partial
 
 import numpy as np
@@ -344,7 +345,16 @@ def custom_kernel(kernel):
     return kernel
 
 
-@jit(nopython=True, nogil=True)
+# Numba parallel=True kernels must not be launched concurrently from multiple
+# Python threads: the 'workqueue' threading layer (Numba's fallback when TBB
+# and OpenMP are absent) is not threadsafe and aborts the process (SIGABRT on
+# macOS) when two host threads enter a parallel region at once.
+# _convolve_2d_dask_numpy calls the kernel per chunk under dask's threaded
+# scheduler, so launches take this lock; same fix as terrain/reproject (#3141).
+_PARALLEL_KERNEL_LOCK = threading.Lock()
+
+
+@jit(nopython=True, nogil=True, parallel=True)
 def _convolve_2d_numpy(data, kernel):
     # apply kernel to data image.
     # Caller must ensure data is a float type (float32 or float64).
@@ -374,14 +384,23 @@ def _convolve_2d_numpy(data, kernel):
     return out
 
 
+def _convolve_2d_numpy_locked(data, kernel):
+    # Run _convolve_2d_numpy while holding _PARALLEL_KERNEL_LOCK so that only
+    # one host thread launches the parallel=True kernel at a time; see the
+    # comment on the lock. A single numpy call takes the lock uncontended and
+    # still runs across all cores; concurrent dask chunk calls run one by one.
+    with _PARALLEL_KERNEL_LOCK:
+        return _convolve_2d_numpy(data, kernel)
+
+
 def _convolve_2d_numpy_boundary(data, kernel, boundary='nan'):
     data = data.astype(_promote_float(data.dtype))
     if boundary == 'nan':
-        return _convolve_2d_numpy(data, kernel)
+        return _convolve_2d_numpy_locked(data, kernel)
     pad_h = kernel.shape[0] // 2
     pad_w = kernel.shape[1] // 2
     padded = _pad_array(data, (pad_h, pad_w), boundary)
-    result = _convolve_2d_numpy(padded, kernel)
+    result = _convolve_2d_numpy_locked(padded, kernel)
     r0 = pad_h if pad_h else None
     r1 = -pad_h if pad_h else None
     c0 = pad_w if pad_w else None
@@ -393,7 +412,7 @@ def _convolve_2d_dask_numpy(data, kernel, boundary='nan'):
     data = data.astype(_promote_float(data.dtype))
     pad_h = kernel.shape[0] // 2
     pad_w = kernel.shape[1] // 2
-    _func = partial(_convolve_2d_numpy, kernel=kernel)
+    _func = partial(_convolve_2d_numpy_locked, kernel=kernel)
     out = data.map_overlap(_func,
                            depth=(pad_h, pad_w),
                            boundary=_boundary_to_dask(boundary),