import numpy as np from scipy.spatial import cKDTree, distance def _ensure_spacing(coord, spacing, p_norm, max_out): """Returns a subset of coord where a minimum spacing is guaranteed. Parameters ---------- coord : ndarray The coordinates of the considered points. spacing : float the maximum allowed spacing between the points. p_norm : float Which Minkowski p-norm to use. Should be in the range [1, inf]. A finite large p may cause a ValueError if overflow can occur. ``inf`` corresponds to the Chebyshev distance and 2 to the Euclidean distance. max_out: int If not None, at most the first ``max_out`` candidates are returned. Returns ------- output : ndarray A subset of coord where a minimum spacing is guaranteed. """ # Use KDtree to find the peaks that are too close to each other tree = cKDTree(coord) indices = tree.query_ball_point(coord, r=spacing, p=p_norm) rejected_peaks_indices = set() naccepted = 0 for idx, candidates in enumerate(indices): if idx not in rejected_peaks_indices: # keep current point and the points at exactly spacing from it candidates.remove(idx) dist = distance.cdist([coord[idx]], coord[candidates], distance.minkowski, p=p_norm).reshape(-1) candidates = [c for c, d in zip(candidates, dist) if d < spacing] # candidates.remove(keep) rejected_peaks_indices.update(candidates) naccepted += 1 if max_out is not None and naccepted >= max_out: break # Remove the peaks that are too close to each other output = np.delete(coord, tuple(rejected_peaks_indices), axis=0) if max_out is not None: output = output[:max_out] return output def ensure_spacing(coords, spacing=1, p_norm=np.inf, min_split_size=50, max_out=None, *, max_split_size=2000): """Returns a subset of coord where a minimum spacing is guaranteed. Parameters ---------- coords : array_like The coordinates of the considered points. spacing : float the maximum allowed spacing between the points. p_norm : float Which Minkowski p-norm to use. Should be in the range [1, inf]. A finite large p may cause a ValueError if overflow can occur. ``inf`` corresponds to the Chebyshev distance and 2 to the Euclidean distance. min_split_size : int Minimum split size used to process ``coords`` by batch to save memory. If None, the memory saving strategy is not applied. max_out : int If not None, only the first ``max_out`` candidates are returned. max_split_size : int Maximum split size used to process ``coords`` by batch to save memory. This number was decided by profiling with a large number of points. Too small a number results in too much looping in Python instead of C, slowing down the process, while too large a number results in large memory allocations, slowdowns, and, potentially, in the process being killed -- see gh-6010. See benchmark results `here `_. Returns ------- output : array_like A subset of coord where a minimum spacing is guaranteed. """ output = coords if len(coords): coords = np.atleast_2d(coords) if min_split_size is None: batch_list = [coords] else: coord_count = len(coords) split_idx = [min_split_size] split_size = min_split_size while coord_count - split_idx[-1] > max_split_size: split_size *= 2 split_idx.append(split_idx[-1] + min(split_size, max_split_size)) batch_list = np.array_split(coords, split_idx) output = np.zeros((0, coords.shape[1]), dtype=coords.dtype) for batch in batch_list: output = _ensure_spacing(np.vstack([output, batch]), spacing, p_norm, max_out) if max_out is not None and len(output) >= max_out: break return output