Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/sweep-performance-state.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation).
classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU."
contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
convolution,2026-07-02,SAFE,compute-bound,1,3615,_convolve_2d_numpy used prange w/o parallel=True -> ran serial (~7-10x slow); fixed via parallel=True + threading.Lock (macOS SIGABRT hazard #3141); cuda kernel 40 regs OK; dask ~20 tasks/chunk
corridor,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
cost_distance,2026-06-15,RISKY,memory-bound,1,3342,"Perf sweep 2026-06-15. HIGH: bounded map_overlap branch in _cost_distance_dask gated on full dims (pad>=height/width) not chunk size; pad>chunk collapses to single chunk (#880-class OOM, verified npartitions=1 at chunks=10/pad=96). Fixed: compare pad vs max chunk dim, route to iterative when pad>=chunk (matches GPU path L484). dask+cupy path already correct. Register count 37 (no pressure). nanmin().compute() L478/L1149 intentional scalar. iterative tile_cache full-dataset materialization is documented MemoryError-guarded design (#1118). All 56 tests pass incl GPU."
curvature,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
Expand Down
28 changes: 28 additions & 0 deletions benchmarks/benchmarks/convolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import numpy as np

from xrspatial.convolution import circle_kernel, convolve_2d

from .common import get_xr_dataarray


class Convolve2d:
    """asv benchmark: time ``convolve_2d`` over raster width, kernel shape and backend."""

    # Parameter grid, in the order given by ``param_names``.
    params = ([300, 1000, 3000], [(5, 5), (25, 25)], ["numpy", "cupy", "dask"])
    param_names = ("nx", "kernelsize", "type")

    def setup(self, nx, kernelsize, type):
        """Prepare the input raster and an all-ones float64 kernel.

        The shape handed to ``get_xr_dataarray`` is ``(nx // 2, nx)`` and the
        kernel shape is the ``kernelsize`` pair.
        """
        self.agg = get_xr_dataarray((nx // 2, nx), type)
        rows, cols = kernelsize
        self.kernel = np.ones((rows, cols), dtype=np.float64)

    def time_convolve_2d(self, nx, kernelsize, type):
        """Time a single ``convolve_2d`` call on the prepared inputs."""
        # convolve_2d is given the array held by the DataArray, not the
        # DataArray wrapper itself.
        # NOTE(review): the result is discarded without compute/sync; for the
        # dask backend this presumably times graph construction only -- confirm
        # that is the intended measurement.
        backing = self.agg.data
        convolve_2d(backing, self.kernel)


class CircleKernel:
    """asv benchmark: time ``circle_kernel`` for a range of radii."""

    # Single parameter axis: the radius passed as the third argument.
    params = ([3, 25, 100],)
    param_names = ("radius",)

    def time_circle_kernel(self, radius):
        """Time one ``circle_kernel`` call with both leading arguments set to 1."""
        unit = 1
        circle_kernel(unit, unit, radius)
27 changes: 23 additions & 4 deletions xrspatial/convolution.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import threading
from functools import partial

import numpy as np
Expand Down Expand Up @@ -344,7 +345,16 @@ def custom_kernel(kernel):
return kernel


@jit(nopython=True, nogil=True)
# Numba parallel=True kernels must not be launched concurrently from multiple
# Python threads: Numba's built-in 'workqueue' threading layer (the one used
# when neither TBB nor OpenMP is available) is not threadsafe and aborts the
# process (SIGABRT on macOS) when two host threads enter a parallel region at
# once. _convolve_2d_dask_numpy calls the kernel per chunk under dask's
# threaded scheduler, so the kernel launch is serialized behind this lock.
# Same hazard and fix as the terrain and reproject kernels (#3141).
_PARALLEL_KERNEL_LOCK = threading.Lock()


@jit(nopython=True, nogil=True, parallel=True)
def _convolve_2d_numpy(data, kernel):
# apply kernel to data image.
# Caller must ensure data is a float type (float32 or float64).
Expand Down Expand Up @@ -374,14 +384,23 @@ def _convolve_2d_numpy(data, kernel):
return out


def _convolve_2d_numpy_locked(data, kernel):
    """Run ``_convolve_2d_numpy`` while holding ``_PARALLEL_KERNEL_LOCK``.

    The lock keeps launches of the parallel=True kernel from overlapping
    across host threads (see the comment on ``_PARALLEL_KERNEL_LOCK``). A lone
    numpy call acquires it uncontended and still uses all cores; concurrent
    dask chunk calls go through one at a time, each internally parallel.
    """
    with _PARALLEL_KERNEL_LOCK:
        convolved = _convolve_2d_numpy(data, kernel)
        return convolved


def _convolve_2d_numpy_boundary(data, kernel, boundary='nan'):
data = data.astype(_promote_float(data.dtype))
if boundary == 'nan':
return _convolve_2d_numpy(data, kernel)
return _convolve_2d_numpy_locked(data, kernel)
pad_h = kernel.shape[0] // 2
pad_w = kernel.shape[1] // 2
padded = _pad_array(data, (pad_h, pad_w), boundary)
result = _convolve_2d_numpy(padded, kernel)
result = _convolve_2d_numpy_locked(padded, kernel)
r0 = pad_h if pad_h else None
r1 = -pad_h if pad_h else None
c0 = pad_w if pad_w else None
Expand All @@ -393,7 +412,7 @@ def _convolve_2d_dask_numpy(data, kernel, boundary='nan'):
data = data.astype(_promote_float(data.dtype))
pad_h = kernel.shape[0] // 2
pad_w = kernel.shape[1] // 2
_func = partial(_convolve_2d_numpy, kernel=kernel)
_func = partial(_convolve_2d_numpy_locked, kernel=kernel)
out = data.map_overlap(_func,
depth=(pad_h, pad_w),
boundary=_boundary_to_dask(boundary),
Expand Down
Loading