# -*- coding: utf-8 -*-
"""
HDBSCAN: Hierarchical Density-Based Spatial Clustering
of Applications with Noise
"""

import numpy as np

from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.metrics import pairwise_distances
from scipy.sparse import issparse
from sklearn.neighbors import KDTree, BallTree
from joblib import Memory
from warnings import warn
from sklearn.utils import check_array
from joblib.parallel import cpu_count

from scipy.sparse import csgraph

from ._hdbscan_linkage import (
    single_linkage,
    mst_linkage_core,
    mst_linkage_core_vector,
    label,
)
from ._hdbscan_tree import (
    condense_tree,
    compute_stability,
    get_clusters,
    outlier_scores,
)
from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability

from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm
from .dist_metrics import DistanceMetric

from .plots import CondensedTree, SingleLinkageTree, MinimumSpanningTree
from .prediction import PredictionData

KDTREE_VALID_METRICS = [
    "euclidean",
    "l2",
    "minkowski",
    "p",
    "manhattan",
    "cityblock",
    "l1",
    "chebyshev",
    "infinity",
]
BALLTREE_VALID_METRICS = KDTREE_VALID_METRICS + [
    "braycurtis",
    "canberra",
    "dice",
    "hamming",
    "haversine",
    "jaccard",
    "mahalanobis",
    "rogerstanimoto",
    "russellrao",
    "seuclidean",
    "sokalmichener",
    "sokalsneath",
]
FAST_METRICS = KDTREE_VALID_METRICS + BALLTREE_VALID_METRICS + ["cosine", "arccos"]

# Author: Leland McInnes
#         Steve Astels
#         John Healy
#
# License: BSD 3 clause
from numpy import isclose


def _tree_to_labels(
    X,
    single_linkage_tree,
    min_cluster_size=10,
    cluster_selection_method="eom",
    allow_single_cluster=False,
    match_reference_implementation=False,
    cluster_selection_epsilon=0.0,
    max_cluster_size=0,
):
    """Converts a pretrained tree and cluster size into a
    set of labels and probabilities.
    """
    condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
    stability_dict = compute_stability(condensed_tree)
    labels, probabilities, stabilities = get_clusters(
        condensed_tree,
        stability_dict,
        cluster_selection_method,
        allow_single_cluster,
        match_reference_implementation,
        cluster_selection_epsilon,
        max_cluster_size,
    )

    return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)


def _hdbscan_generic(
    X,
    min_samples=5,
    alpha=1.0,
    metric="minkowski",
    p=2,
    leaf_size=None,
    gen_min_span_tree=False,
    **kwargs
):
    if metric == "minkowski":
        distance_matrix = pairwise_distances(X, metric=metric, p=p)
    elif metric == "arccos":
        distance_matrix = pairwise_distances(X, metric="cosine", **kwargs)
    elif metric == "precomputed":
        # Treating this case explicitly, instead of letting
        # sklearn.metrics.pairwise_distances handle it,
        # enables the usage of numpy.inf in the distance
        # matrix to indicate missing distance information.
        # TODO: Check if copying is necessary
        distance_matrix = X.copy()
    else:
        distance_matrix = pairwise_distances(X, metric=metric, **kwargs)

    if issparse(distance_matrix):
        # raise TypeError('Sparse distance matrices not yet supported')
        return _hdbscan_sparse_distance_matrix(
            distance_matrix,
            min_samples,
            alpha,
            metric,
            p,
            leaf_size,
            gen_min_span_tree,
            **kwargs
        )

    mutual_reachability_ = mutual_reachability(distance_matrix, min_samples, alpha)

    min_spanning_tree = mst_linkage_core(mutual_reachability_)

    # Warn if the MST couldn't be constructed around the missing distances
    if np.isinf(min_spanning_tree.T[2]).any():
        warn(
            "The minimum spanning tree contains edge weights with value "
            "infinity. Potentially, you are missing too many distances "
            "in the initial distance matrix for the given neighborhood "
            "size.",
            UserWarning,
        )

    # mst_linkage_core does not generate a full minimal spanning tree
    # If a tree is required then we must build the edges from the information
    # returned by mst_linkage_core (i.e. just the order of points to be merged)
    if gen_min_span_tree:
        result_min_span_tree = min_spanning_tree.copy()
        for index, row in enumerate(result_min_span_tree[1:], 1):
            candidates = np.where(isclose(mutual_reachability_[int(row[1])], row[2]))[0]
            candidates = np.intersect1d(
                candidates, min_spanning_tree[:index, :2].astype(int)
            )
            candidates = candidates[candidates != row[1]]
            assert len(candidates) > 0
            row[0] = candidates[0]
    else:
        result_min_span_tree = None

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, result_min_span_tree


def _hdbscan_sparse_distance_matrix(
    X,
    min_samples=5,
    alpha=1.0,
    metric="minkowski",
    p=2,
    leaf_size=40,
    gen_min_span_tree=False,
    **kwargs
):
    assert issparse(X)
    # Check for connected component on X
    if csgraph.connected_components(X, directed=False, return_labels=False) > 1:
        raise ValueError(
            "Sparse distance matrix has multiple connected "
            "components!\nThat is, there exist groups of points "
            "that are completely disjoint -- there are no distance "
            "relations connecting them\n"
            "Run hdbscan on each component."
        )

    lil_matrix = X.tolil()

    # Compute sparse mutual reachability graph
    # if max_dist > 0, max distance to use when the reachability is infinite
    max_dist = kwargs.get("max_dist", 0.0)
    mutual_reachability_ = sparse_mutual_reachability(
        lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha
    )
    # Check connected component on mutual reachability.
    # If there is more than one component, then even though the distance
    # matrix X has a single component, there exist points with fewer than
    # `min_samples` neighbors, which disconnects the mutual reachability graph.
    if (
        csgraph.connected_components(
            mutual_reachability_, directed=False, return_labels=False
        )
        > 1
    ):
        raise ValueError(
            (
                "There exist points with fewer than %s neighbors. "
                "Ensure your distance matrix has non-zero values for "
                "at least `min_samples`=%s neighbors for each point "
                "(i.e. K-nn graph), or specify a `max_dist` to use "
                "when distances are missing."
            )
            % (min_samples, min_samples)
        )

    # Compute the minimum spanning tree for the sparse graph
    sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_)

    # Convert the graph to scipy cluster array format
    nonzeros = sparse_min_spanning_tree.nonzero()
    nonzero_vals = sparse_min_spanning_tree[nonzeros]
    min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :][0]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None


def _hdbscan_prims_kdtree(
    X,
    min_samples=5,
    alpha=1.0,
    metric="minkowski",
    p=2,
    leaf_size=40,
    gen_min_span_tree=False,
    **kwargs
):
    if X.dtype != np.float64:
        X = X.astype(np.float64)

    # The Cython routines used require contiguous arrays
    if not X.flags["C_CONTIGUOUS"]:
        X = np.array(X, dtype=np.double, order="C")

    tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

    # TO DO: Deal with p for minkowski appropriately
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(
        X, k=min_samples + 1, dualtree=True, breadth_first=True
    )[0][:, -1].copy(order="C")

    # Mutual reachability distance is implicit in mst_linkage_core_vector
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None


def _hdbscan_prims_balltree(
    X,
    min_samples=5,
    alpha=1.0,
    metric="minkowski",
    p=2,
    leaf_size=40,
    gen_min_span_tree=False,
    **kwargs
):
    if X.dtype != np.float64:
        X = X.astype(np.float64)

    # The Cython routines used require contiguous arrays
    if not X.flags["C_CONTIGUOUS"]:
        X = np.array(X, dtype=np.double, order="C")

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(
        X, k=min_samples + 1, dualtree=True, breadth_first=True
    )[0][:, -1].copy(order="C")

    # Mutual reachability distance is implicit in mst_linkage_core_vector
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None


def _hdbscan_boruvka_kdtree(
    X,
    min_samples=5,
    alpha=1.0,
    metric="minkowski",
    p=2,
    leaf_size=40,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
    core_dist_n_jobs=4,
    **kwargs
):
    if leaf_size < 3:
        leaf_size = 3

    if core_dist_n_jobs < 1:
        core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)

    if X.dtype != np.float64:
        X = X.astype(np.float64)

    tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    alg = KDTreeBoruvkaAlgorithm(
        tree,
        min_samples,
        metric=metric,
        leaf_size=leaf_size // 3,
        approx_min_span_tree=approx_min_span_tree,
        n_jobs=core_dist_n_jobs,
        **kwargs
    )
    min_spanning_tree = alg.spanning_tree()

    # Sort edges of the min_spanning_tree by weight
    row_order = np.argsort(min_spanning_tree.T[2])
    min_spanning_tree = min_spanning_tree[row_order, :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None


def _hdbscan_boruvka_balltree(
    X,
    min_samples=5,
    alpha=1.0,
    metric="minkowski",
    p=2,
    leaf_size=40,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
    core_dist_n_jobs=4,
    **kwargs
):
    if leaf_size < 3:
        leaf_size = 3

    if core_dist_n_jobs < 1:
        core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)

    if X.dtype != np.float64:
        X = X.astype(np.float64)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    alg = BallTreeBoruvkaAlgorithm(
        tree,
        min_samples,
        metric=metric,
        leaf_size=leaf_size // 3,
        approx_min_span_tree=approx_min_span_tree,
        n_jobs=core_dist_n_jobs,
        **kwargs
    )
    min_spanning_tree = alg.spanning_tree()

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None


def check_precomputed_distance_matrix(X):
    """Perform check_array(X) after removing infinite values (numpy.inf)
    from the given distance matrix."""
    tmp = X.copy()
    tmp[np.isinf(tmp)] = 1
    check_array(tmp)


def remap_condensed_tree(tree, internal_to_raw, outliers):
    """
    Takes an internal condensed_tree structure and adds back in a set of points
    that were initially detected as non-finite and returns that new tree.
    These points will all be split off from the maximal node at lambda zero and
    considered noise points.

    Parameters
    ----------
    tree: condensed_tree
    internal_to_raw: dict
        a mapping from internal integer index to the raw integer index
    outliers: list
        the raw indices of the non-finite points
    """
    finite_count = len(internal_to_raw)

    outlier_count = len(outliers)
    for i, (parent, child, lambda_val, child_size) in enumerate(tree):
        if child < finite_count:
            child = internal_to_raw[child]
        else:
            child = child + outlier_count
        tree[i] = (parent + outlier_count, child, lambda_val, child_size)

    outlier_list = []
    root = tree[0][0]  # Should I check to be sure this is the minimal lambda?
    for outlier in outliers:
        outlier_list.append((root, outlier, 0, 1))

    outlier_tree = np.array(
        outlier_list,
        dtype=[
            ("parent", np.intp),
            ("child", np.intp),
            ("lambda_val", float),
            ("child_size", np.intp),
        ],
    )
    tree = np.append(outlier_tree, tree)
    return tree


def remap_single_linkage_tree(tree, internal_to_raw, outliers):
    """
    Takes an internal single_linkage_tree structure and adds back in a set of points
    that were initially detected as non-finite and returns that new tree.
    These points will all be merged into the final node at np.inf distance and
    considered noise points.
    Parameters
    ----------
    tree: single_linkage_tree
    internal_to_raw: dict
        a mapping from internal integer index to the raw integer index
    outliers: list
        the raw indices of the non-finite points
    """
    finite_count = len(internal_to_raw)
    outlier_count = len(outliers)
    for i, (left, right, distance, size) in enumerate(tree):
        if left < finite_count:
            tree[i, 0] = internal_to_raw[left]
        else:
            tree[i, 0] = left + outlier_count
        if right < finite_count:
            tree[i, 1] = internal_to_raw[right]
        else:
            tree[i, 1] = right + outlier_count

    outlier_tree = np.zeros((len(outliers), 4))
    last_cluster_id = tree[tree.shape[0] - 1][0:2].max()
    last_cluster_size = tree[tree.shape[0] - 1][3]
    for i, outlier in enumerate(outliers):
        outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1)
        last_cluster_id += 1
        last_cluster_size += 1
    tree = np.vstack([tree, outlier_tree])
    return tree


def is_finite(matrix):
    """Returns true only if all the values of a ndarray or sparse matrix are finite"""
    if issparse(matrix):
        return np.all(np.isfinite(matrix.tocoo().data))
    else:
        return np.all(np.isfinite(matrix))


def get_finite_row_indices(matrix):
    """Returns the indices of the purely finite rows of a
    sparse matrix or dense ndarray
    """
    if issparse(matrix):
        row_indices = np.array(
            [i for i, row in enumerate(matrix.tolil().data) if np.all(np.isfinite(row))]
        )
    else:
        row_indices = np.where(np.isfinite(matrix).sum(axis=1) == matrix.shape[1])[0]
    return row_indices


def hdbscan(
    X,
    min_cluster_size=5,
    min_samples=None,
    alpha=1.0,
    cluster_selection_epsilon=0.0,
    max_cluster_size=0,
    metric="minkowski",
    p=2,
    leaf_size=40,
    algorithm="best",
    memory=Memory(None, verbose=0),
    approx_min_span_tree=True,
    gen_min_span_tree=False,
    core_dist_n_jobs=4,
    cluster_selection_method="eom",
    allow_single_cluster=False,
    match_reference_implementation=False,
    **kwargs
):
    """Perform HDBSCAN clustering from a vector array or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    min_cluster_size : int, optional (default=5)
        The minimum number of samples in a group for that group to be
        considered a cluster; groupings smaller than this size will be left
        as noise.

    min_samples : int, optional (default=None)
        The number of samples in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        Defaults to the ``min_cluster_size``.

    cluster_selection_epsilon : float, optional (default=0.0)
        A distance threshold. Clusters below this value will be merged.
        See [3]_ for more information. Note that this should not be used
        if we want to predict the cluster labels for new points in the
        future (e.g. using approximate_predict), as the approximate_predict
        function is not aware of this argument.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.
        See [2]_ for more information.

    max_cluster_size : int, optional (default=0)
        A limit to the size of clusters returned by the eom algorithm.
        Has no effect when using leaf clustering (where clusters are
        usually small regardless) and can also be overridden in rare
        cases by a high value for cluster_selection_epsilon. Note that
        this should not be used if we want to predict the cluster labels
        for new points in the future (e.g. using approximate_predict), as
        the approximate_predict function is not aware of this argument.
    metric : string or callable, optional (default='minkowski')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional (default=2)
        p value to use if using the minkowski metric.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest neighbour queries.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : instance of joblib.Memory or string, optional
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional (default=True)
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you can set
        this to False; in general it should be left at the default True.

    gen_min_span_tree : bool, optional (default=False)
        Whether to generate the minimum spanning tree for later analysis.

    core_dist_n_jobs : int, optional (default=4)
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.

    cluster_selection_method : string, optional (default='eom')
        The method used to select clusters from the condensed tree. The
        standard approach for HDBSCAN* is to use an Excess of Mass algorithm
        to find the most persistent clusters. Alternatively you can instead
        select the clusters at the leaves of the tree -- this provides the
        most fine grained and homogeneous clusters. Options are:
            * ``eom``
            * ``leaf``

    allow_single_cluster : bool, optional (default=False)
        By default HDBSCAN* will not produce a single cluster; setting this
        to True will override that behaviour and allow single-cluster
        results in the case that you feel this is a valid result for your
        dataset.

    match_reference_implementation : bool, optional (default=False)
        There exist some interpretational differences between this
        HDBSCAN* implementation and the original authors reference
        implementation in Java. This can result in very minor differences
        in clustering results. Setting this flag to True will, at some
        performance cost, ensure that the clustering results match the
        reference implementation.

    **kwargs : optional
        Arguments passed to the distance metric

    Returns
    -------
    labels : ndarray, shape (n_samples, )
        Cluster labels for each point. Noisy samples are given the label -1.

    probabilities : ndarray, shape (n_samples, )
        Cluster membership strengths for each point. Noisy samples are
        assigned 0.

    cluster_persistence : array, shape (n_clusters, )
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster.
        These scores can be used to gauge the relative coherence of the
        clusters output by the algorithm.

    condensed_tree : record array
        The condensed cluster hierarchy used to generate clusters.

    single_linkage_tree : ndarray, shape (n_samples - 1, 4)
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    min_spanning_tree : ndarray, shape (n_samples - 1, 3)
        The minimum spanning tree as an edgelist. If gen_min_span_tree was
        False this will be None.

    References
    ----------
    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
        Density-based clustering based on hierarchical density estimates.
        In Pacific-Asia Conference on Knowledge Discovery and Data Mining
        (pp. 160-172). Springer Berlin Heidelberg.

    .. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
        cluster tree. In Advances in Neural Information Processing Systems
        (pp. 343-351).

    .. [3] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical
        Density-based Cluster Selection. arXiv preprint 1911.02282.
    """
    if min_samples is None:
        min_samples = min_cluster_size

    if not np.issubdtype(type(min_samples), np.integer) or not np.issubdtype(
        type(min_cluster_size), np.integer
    ):
        raise ValueError("Min samples and min cluster size must be integers!")

    if min_samples <= 0 or min_cluster_size <= 0:
        raise ValueError(
            "Min samples and Min cluster size must be positive" " integers"
        )

    if min_cluster_size == 1:
        raise ValueError("Min cluster size must be greater than one")

    if np.issubdtype(type(cluster_selection_epsilon), np.integer):
        cluster_selection_epsilon = float(cluster_selection_epsilon)

    if type(cluster_selection_epsilon) is not float or cluster_selection_epsilon < 0.0:
        raise ValueError("Epsilon must be a float value greater than or equal to 0!")

    if not isinstance(alpha, float) or alpha <= 0.0:
        raise ValueError("Alpha must be a positive float value greater than" " 0!")

    if leaf_size < 1:
        raise ValueError("Leaf size must be greater than 0!")

    if metric == "minkowski":
        if p is None:
            raise TypeError("Minkowski metric given but no p value supplied!")
        if p < 0:
            raise ValueError(
                "Minkowski metric with negative p value is not" " defined!"
            )

    if match_reference_implementation:
        min_samples = min_samples - 1
        min_cluster_size = min_cluster_size + 1
        approx_min_span_tree = False

    if cluster_selection_method not in ("eom", "leaf"):
        raise ValueError(
            "Invalid Cluster Selection Method: %s\n"
            'Should be one of: "eom", "leaf"\n' % cluster_selection_method
        )

    # Checks input and converts to an nd-array where possible
    if metric != "precomputed" or issparse(X):
        X = check_array(X, accept_sparse="csr", force_all_finite=False)
    else:
        # Only non-sparse, precomputed distance matrices are handled here
        # and thereby allowed to contain numpy.inf for missing distances
        check_precomputed_distance_matrix(X)

    # Python 2 and 3 compliant string_type checking
    if isinstance(memory, str):
        memory = Memory(memory, verbose=0)

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)
    if min_samples == 0:
        min_samples = 1

    if algorithm != "best":
        if metric != "precomputed" and issparse(X) and algorithm != "generic":
            raise ValueError("Sparse data matrices only support algorithm 'generic'.")

        if algorithm == "generic":
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_generic
            )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs)
        elif algorithm == "prims_kdtree":
            if metric not in KDTREE_VALID_METRICS:
                raise ValueError("Cannot use Prim's with KDTree for this" " metric!")
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_prims_kdtree
            )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs)
        elif algorithm == "prims_balltree":
            if metric not in BALLTREE_VALID_METRICS:
                raise ValueError("Cannot use Prim's with BallTree for this" " metric!")
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_prims_balltree
            )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs)
        elif algorithm == "boruvka_kdtree":
            if metric not in KDTREE_VALID_METRICS:
                raise ValueError("Cannot use Boruvka with KDTree for this" " metric!")
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_boruvka_kdtree
            )(
                X,
                min_samples,
                alpha,
                metric,
                p,
                leaf_size,
                approx_min_span_tree,
                gen_min_span_tree,
                core_dist_n_jobs,
                **kwargs
            )
        elif algorithm == "boruvka_balltree":
            if metric not in BALLTREE_VALID_METRICS:
                raise ValueError("Cannot use Boruvka with BallTree for this" " metric!")
            if (X.shape[0] // leaf_size) > 16000:
                warn(
                    "A large dataset size and small leaf_size may induce excessive "
                    "memory usage. If you are running out of memory consider "
                    "increasing the ``leaf_size`` parameter."
                )
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_boruvka_balltree
            )(
                X,
                min_samples,
                alpha,
                metric,
                p,
                leaf_size,
                approx_min_span_tree,
                gen_min_span_tree,
                core_dist_n_jobs,
                **kwargs
            )
        else:
            raise TypeError("Unknown algorithm type %s specified" % algorithm)
    else:
        if issparse(X) or metric not in FAST_METRICS:
            # We can't do much with sparse matrices ...
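            # Fall back to the generic algorithm, which builds the full
            # mutual reachability graph directly from pairwise distances --
            # the only strategy available without a KDTree/BallTree index.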
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_generic
            )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs)
        elif metric in KDTREE_VALID_METRICS:
            # TO DO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree, result_min_span_tree) = memory.cache(
                    _hdbscan_prims_kdtree
                )(
                    X,
                    min_samples,
                    alpha,
                    metric,
                    p,
                    leaf_size,
                    gen_min_span_tree,
                    **kwargs
                )
            else:
                (single_linkage_tree, result_min_span_tree) = memory.cache(
                    _hdbscan_boruvka_kdtree
                )(
                    X,
                    min_samples,
                    alpha,
                    metric,
                    p,
                    leaf_size,
                    approx_min_span_tree,
                    gen_min_span_tree,
                    core_dist_n_jobs,
                    **kwargs
                )
        else:  # Metric is a valid BallTree metric
            # TO DO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree, result_min_span_tree) = memory.cache(
                    _hdbscan_prims_balltree
                )(
                    X,
                    min_samples,
                    alpha,
                    metric,
                    p,
                    leaf_size,
                    gen_min_span_tree,
                    **kwargs
                )
            else:
                (single_linkage_tree, result_min_span_tree) = memory.cache(
                    _hdbscan_boruvka_balltree
                )(
                    X,
                    min_samples,
                    alpha,
                    metric,
                    p,
                    leaf_size,
                    approx_min_span_tree,
                    gen_min_span_tree,
                    core_dist_n_jobs,
                    **kwargs
                )

    return (
        _tree_to_labels(
            X,
            single_linkage_tree,
            min_cluster_size,
            cluster_selection_method,
            allow_single_cluster,
            match_reference_implementation,
            cluster_selection_epsilon,
            max_cluster_size,
        )
        + (result_min_span_tree,)
    )


# Inherits from sklearn
class HDBSCAN(BaseEstimator, ClusterMixin):
    """Perform HDBSCAN clustering from vector array or distance matrix.

    HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications
    with Noise. Performs DBSCAN over varying epsilon values and integrates
    the result to find a clustering that gives the best stability over
    epsilon. This allows HDBSCAN to find clusters of varying densities
    (unlike DBSCAN), and be more robust to parameter selection.

    Parameters
    ----------
    min_cluster_size : int, optional (default=5)
        The minimum size of clusters; single linkage splits that contain
        fewer points than this will be considered points "falling out" of a
        cluster rather than a cluster splitting into two new clusters.

    min_samples : int, optional (default=None)
        The number of samples in a neighbourhood for a point to be
        considered a core point.

    metric : string, or callable, optional (default='euclidean')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional (default=None)
        p value to use if using the minkowski metric.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.
        See [3]_ for more information.

    cluster_selection_epsilon : float, optional (default=0.0)
        A distance threshold. Clusters below this value will be merged.
        See [5]_ for more information.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better.
        Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    leaf_size : int, optional (default=40)
        If using a space tree algorithm (kdtree, or balltree) the number
        of points in a leaf node of the tree. This does not alter the
        resulting clustering, but may have an effect on the runtime
        of the algorithm.

    memory : instance of joblib.Memory or string, optional
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional (default=True)
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you can set
        this to False; in general it should be left at the default True.

    gen_min_span_tree : bool, optional (default=False)
        Whether to generate the minimum spanning tree with regard
        to mutual reachability distance for later analysis.

    core_dist_n_jobs : int, optional (default=4)
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.

    cluster_selection_method : string, optional (default='eom')
        The method used to select clusters from the condensed tree. The
        standard approach for HDBSCAN* is to use an Excess of Mass algorithm
        to find the most persistent clusters. Alternatively you can instead
        select the clusters at the leaves of the tree -- this provides the
        most fine grained and homogeneous clusters. Options are:
            * ``eom``
            * ``leaf``

    allow_single_cluster : bool, optional (default=False)
        By default HDBSCAN* will not produce a single cluster; setting this
        to True will override that behaviour and allow single-cluster
        results in the case that you feel this is a valid result for your
        dataset.

    prediction_data : boolean, optional (default=False)
        Whether to generate extra cached data for predicting labels or
        membership vectors for new unseen points later. If you wish to
        persist the clustering object for later re-use you probably want
        to set this to True.

    match_reference_implementation : bool, optional (default=False)
        There exist some interpretational differences between this
        HDBSCAN* implementation and the original authors reference
        implementation in Java. This can result in very minor differences
        in clustering results. Setting this flag to True will, at some
        performance cost, ensure that the clustering results match the
        reference implementation.

    **kwargs : optional
        Arguments passed to the distance metric

    Attributes
    ----------
    labels_ : ndarray, shape (n_samples, )
        Cluster labels for each point in the dataset given to fit().
        Noisy samples are given the label -1.

    probabilities_ : ndarray, shape (n_samples, )
        The strength with which each sample is a member of its assigned
        cluster. Noise points have probability zero; points in clusters
        have values assigned proportional to the degree that they
        persist as part of the cluster.

    cluster_persistence_ : ndarray, shape (n_clusters, )
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be used to gauge the relative coherence of the clusters
        output by the algorithm.
    condensed_tree_ : CondensedTree object
        The condensed tree produced by HDBSCAN. The object has methods
        for converting to pandas, networkx, and plotting.

    single_linkage_tree_ : SingleLinkageTree object
        The single linkage tree produced by HDBSCAN. The object has methods
        for converting to pandas, networkx, and plotting.

    minimum_spanning_tree_ : MinimumSpanningTree object
        The minimum spanning tree of the mutual reachability graph generated
        by HDBSCAN. Note that this is not generated by default and will only
        be available if `gen_min_span_tree` was set to True on object creation.
        Even then in some optimized cases a tree may not be generated.

    outlier_scores_ : ndarray, shape (n_samples, )
        Outlier scores for clustered points; the larger the score the more
        outlier-like the point. Useful as an outlier detection technique.
        Based on the GLOSH algorithm by Campello, Moulavi, Zimek and Sander.

    prediction_data_ : PredictionData object
        Cached data used for predicting the cluster labels of new or
        unseen points. Necessary only if you are using functions from
        ``hdbscan.prediction`` (see
        :func:`~hdbscan.prediction.approximate_predict`,
        :func:`~hdbscan.prediction.membership_vector`,
        and :func:`~hdbscan.prediction.all_points_membership_vectors`).

    exemplars_ : list
        A list of exemplar points for clusters. Since HDBSCAN supports
        arbitrary shapes for clusters we cannot provide a single cluster
        exemplar per cluster. Instead a list is returned with each element
        of the list being a numpy array of exemplar points for a cluster --
        these points are the "most representative" points of the cluster.

    relative_validity_ : float
        A fast approximation of the Density Based Cluster Validity (DBCV)
        score [4]_. The only difference, and the source of the speedup,
        comes from the fact that this relative_validity_ is computed using
        the mutual-reachability minimum spanning tree, i.e.
        minimum_spanning_tree_, instead of the all-points minimum spanning
        tree used in the reference. This score might not be an objective
        measure of the goodness of clustering. It may only be used to
        compare results across different choices of hyper-parameters, and
        is therefore only a relative score.

    References
    ----------
    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
        Density-based clustering based on hierarchical density estimates.
        In Pacific-Asia Conference on Knowledge Discovery and Data Mining
        (pp. 160-172). Springer Berlin Heidelberg.

    .. [2] Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. (2015).
        Hierarchical density estimates for data clustering, visualization,
        and outlier detection. ACM Transactions on Knowledge Discovery
        from Data (TKDD), 10(1), 5.

    .. [3] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
        cluster tree. In Advances in Neural Information Processing Systems
        (pp. 343-351).

    .. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and
        Sander, J., 2014. Density-Based Clustering Validation. In SDM
        (pp. 839-847).

    .. [5] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical
        Density-based Cluster Selection. arXiv preprint 1911.02282.
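
    Examples
    --------
    A minimal usage sketch on synthetic data (``make_blobs`` is used purely
    for illustration; exact cluster labels depend on the generated data):

    >>> from sklearn.datasets import make_blobs
    >>> from hdbscan import HDBSCAN
    >>> data, _ = make_blobs(n_samples=1000, random_state=10)
    >>> clusterer = HDBSCAN(min_cluster_size=10).fit(data)
    >>> clusterer.labels_.shape
    (1000,)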
""" def __init__( self, min_cluster_size=5, min_samples=None, cluster_selection_epsilon=0.0, max_cluster_size=0, metric="euclidean", alpha=1.0, p=None, algorithm="best", leaf_size=40, memory=Memory(None, verbose=0), approx_min_span_tree=True, gen_min_span_tree=False, core_dist_n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, prediction_data=False, match_reference_implementation=False, **kwargs ): self.min_cluster_size = min_cluster_size self.min_samples = min_samples self.alpha = alpha self.max_cluster_size = max_cluster_size self.cluster_selection_epsilon = cluster_selection_epsilon self.metric = metric self.p = p self.algorithm = algorithm self.leaf_size = leaf_size self.memory = memory self.approx_min_span_tree = approx_min_span_tree self.gen_min_span_tree = gen_min_span_tree self.core_dist_n_jobs = core_dist_n_jobs self.cluster_selection_method = cluster_selection_method self.allow_single_cluster = allow_single_cluster self.match_reference_implementation = match_reference_implementation self.prediction_data = prediction_data self._metric_kwargs = kwargs self._condensed_tree = None self._single_linkage_tree = None self._min_spanning_tree = None self._raw_data = None self._outlier_scores = None self._prediction_data = None self._relative_validity = None def fit(self, X, y=None): """Perform HDBSCAN clustering from features or distance matrix. Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. Returns ------- self : object Returns self """ if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. # Rows with these values X = check_array(X, accept_sparse="csr", force_all_finite=False) self._raw_data = X self._all_finite = is_finite(X) if ~self._all_finite: # Pass only the purely finite indices into hdbscan # We will later assign all non-finite points to the background -1 cluster finite_index = get_finite_row_indices(X) clean_data = X[finite_index] internal_to_raw = { x: y for x, y in zip(range(len(finite_index)), finite_index) } outliers = list(set(range(X.shape[0])) - set(finite_index)) else: clean_data = X elif issparse(X): # Handle sparse precomputed distance matrices separately X = check_array(X, accept_sparse="csr") clean_data = X else: # Only non-sparse, precomputed distance matrices are allowed # to have numpy.inf values indicating missing distances check_precomputed_distance_matrix(X) clean_data = X kwargs = self.get_params() # prediction data only applies to the persistent model, so remove # it from the keyword args we pass on the the function kwargs.pop("prediction_data", None) kwargs.update(self._metric_kwargs) ( self.labels_, self.probabilities_, self.cluster_persistence_, self._condensed_tree, self._single_linkage_tree, self._min_spanning_tree, ) = hdbscan(clean_data, **kwargs) if self.metric != "precomputed" and not self._all_finite: # remap indices to align with original data in the case of non-finite entries. 
            self._condensed_tree = remap_condensed_tree(
                self._condensed_tree, internal_to_raw, outliers
            )
            self._single_linkage_tree = remap_single_linkage_tree(
                self._single_linkage_tree, internal_to_raw, outliers
            )
            new_labels = np.full(X.shape[0], -1)
            new_labels[finite_index] = self.labels_
            self.labels_ = new_labels

            new_probabilities = np.zeros(X.shape[0])
            new_probabilities[finite_index] = self.probabilities_
            self.probabilities_ = new_probabilities

        if self.prediction_data:
            self.generate_prediction_data()

        return self

    def fit_predict(self, X, y=None):
        """Performs clustering on X and returns cluster labels.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        Returns
        -------
        y : ndarray, shape (n_samples, )
            cluster labels
        """
        self.fit(X)
        return self.labels_

    def generate_prediction_data(self):
        """
        Create data that caches intermediate results used for predicting
        the label of new/unseen points. This data is only useful if
        you are intending to use functions from ``hdbscan.prediction``.
        """

        if self.metric in FAST_METRICS:
            min_samples = self.min_samples or self.min_cluster_size
            if self.metric in KDTREE_VALID_METRICS:
                tree_type = "kdtree"
            elif self.metric in BALLTREE_VALID_METRICS:
                tree_type = "balltree"
            else:
                warn(
                    "Metric {} not supported for prediction data!".format(self.metric)
                )
                return

            self._prediction_data = PredictionData(
                self._raw_data,
                self.condensed_tree_,
                min_samples,
                tree_type=tree_type,
                metric=self.metric,
                **self._metric_kwargs
            )
        else:
            warn(
                "Cannot generate prediction data for non-vector "
                "space inputs -- access to the source data rather "
                "than mere distances is required!"
            )

    def weighted_cluster_centroid(self, cluster_id):
        """Provide an approximate representative point for a given cluster.
        Note that this technique assumes a euclidean metric for speed of
        computation. For more general metrics use the
        ``weighted_cluster_medoid`` method, which is slower but can work
        with the metric the model trained with.

        Parameters
        ----------
        cluster_id: int
            The id of the cluster to compute a centroid for.

        Returns
        -------
        centroid: array of shape (n_features,)
            A representative centroid for cluster ``cluster_id``.
        """
        if not hasattr(self, "labels_"):
            raise AttributeError("Model has not been fit to data")

        if cluster_id == -1:
            raise ValueError(
                "Cannot calculate weighted centroid for -1 cluster "
                "since it is a noise cluster"
            )

        mask = self.labels_ == cluster_id
        cluster_data = self._raw_data[mask]
        cluster_membership_strengths = self.probabilities_[mask]

        return np.average(cluster_data, weights=cluster_membership_strengths, axis=0)

    def weighted_cluster_medoid(self, cluster_id):
        """Provide an approximate representative point for a given cluster.
        Note that this technique can be very slow and memory intensive for
        large clusters. For faster results use the
        ``weighted_cluster_centroid`` method, which is faster but assumes a
        euclidean metric.

        Parameters
        ----------
        cluster_id: int
            The id of the cluster to compute a medoid for.

        Returns
        -------
        medoid: array of shape (n_features,)
            A representative medoid for cluster ``cluster_id``.
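
        Examples
        --------
        A hedged sketch; ``data`` stands in for whatever array the model
        was fit on, and cluster ``0`` is assumed to exist in the result:

        >>> clusterer = HDBSCAN(min_cluster_size=15).fit(data)  # doctest: +SKIP
        >>> medoid = clusterer.weighted_cluster_medoid(0)  # doctest: +SKIP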
""" if not hasattr(self, "labels_"): raise AttributeError("Model has not been fit to data") if cluster_id == -1: raise ValueError( "Cannot calculate weighted centroid for -1 cluster " "since it is a noise cluster" ) mask = self.labels_ == cluster_id cluster_data = self._raw_data[mask] cluster_membership_strengths = self.probabilities_[mask] dist_mat = pairwise_distances( cluster_data, metric=self.metric, **self._metric_kwargs ) dist_mat = dist_mat * cluster_membership_strengths medoid_index = np.argmin(dist_mat.sum(axis=1)) return cluster_data[medoid_index] def dbscan_clustering(self, cut_distance, min_cluster_size=5): """Return clustering that would be equivalent to running DBSCAN* for a particular cut_distance (or epsilon) DBSCAN* can be thought of as DBSCAN without the border points. As such these results may differ slightly from sklearns implementation of dbscan in the non-core points. This can also be thought of as a flat clustering derived from constant height cut through the single linkage tree. This represents the result of selecting a cut value for robust single linkage clustering. The `min_cluster_size` allows the flat clustering to declare noise points (and cluster smaller than `min_cluster_size`). Parameters ---------- cut_distance : float The mutual reachability distance cut value to use to generate a flat clustering. min_cluster_size : int, optional Clusters smaller than this value with be called 'noise' and remain unclustered in the resulting flat clustering. Returns ------- labels : array [n_samples] An array of cluster labels, one per datapoint. Unclustered points are assigned the label -1. """ return self.single_linkage_tree_.get_clusters( cut_distance=cut_distance, min_cluster_size=min_cluster_size, ) @property def prediction_data_(self): if self._prediction_data is None: raise AttributeError("No prediction data was generated") else: return self._prediction_data @property def outlier_scores_(self): if self._outlier_scores is not None: return self._outlier_scores else: if self._condensed_tree is not None: self._outlier_scores = outlier_scores(self._condensed_tree) return self._outlier_scores else: raise AttributeError( "No condensed tree was generated; try running fit first." ) @property def condensed_tree_(self): if self._condensed_tree is not None: return CondensedTree( self._condensed_tree, self.cluster_selection_method, self.allow_single_cluster, ) else: raise AttributeError( "No condensed tree was generated; try running fit first." ) @property def single_linkage_tree_(self): if self._single_linkage_tree is not None: return SingleLinkageTree(self._single_linkage_tree) else: raise AttributeError( "No single linkage tree was generated; try running fit" " first." ) @property def minimum_spanning_tree_(self): if self._min_spanning_tree is not None: if self._raw_data is not None: return MinimumSpanningTree(self._min_spanning_tree, self._raw_data) else: warn( "No raw data is available; this may be due to using" " a precomputed metric matrix. No minimum spanning" " tree will be provided without raw data." ) return None else: raise AttributeError( "No minimum spanning tree was generated." "This may be due to optimized algorithm variations that skip" " explicit generation of the spanning tree." 
) @property def exemplars_(self): if self._prediction_data is not None: return self._prediction_data.exemplars elif self.metric in FAST_METRICS: self.generate_prediction_data() return self._prediction_data.exemplars else: raise AttributeError( "Currently exemplars require the use of vector input data" "with a suitable metric. This will likely change in the " "future, but for now no exemplars can be provided" ) @property def relative_validity_(self): if self._relative_validity is not None: return self._relative_validity if not self.gen_min_span_tree: raise AttributeError( "Minimum spanning tree not present. " + "Either HDBSCAN object was created with " + "gen_min_span_tree=False or the tree was " + "not generated in spite of it owing to " + "internal optimization criteria." ) return labels = self.labels_ sizes = np.bincount(labels + 1) noise_size = sizes[0] cluster_size = sizes[1:] total = noise_size + np.sum(cluster_size) num_clusters = len(cluster_size) DSC = np.zeros(num_clusters) min_outlier_sep = np.inf # only required if num_clusters = 1 correction_const = 2 # only required if num_clusters = 1 # Unltimately, for each Ci, we only require the # minimum of DSPC(Ci, Cj) over all Cj != Ci. # So let's call this value DSPC_wrt(Ci), i.e. # density separation 'with respect to' Ci. DSPC_wrt = np.ones(num_clusters) * np.inf max_distance = 0 mst_df = self.minimum_spanning_tree_.to_pandas() for edge in mst_df.iterrows(): label1 = labels[int(edge[1]["from"])] label2 = labels[int(edge[1]["to"])] length = edge[1]["distance"] max_distance = max(max_distance, length) if label1 == -1 and label2 == -1: continue elif label1 == -1 or label2 == -1: # If exactly one of the points is noise min_outlier_sep = min(min_outlier_sep, length) continue if label1 == label2: # Set the density sparseness of the cluster # to the sparsest value seen so far. DSC[label1] = max(length, DSC[label1]) else: # Check whether density separations with # respect to each of these clusters can # be reduced. DSPC_wrt[label1] = min(length, DSPC_wrt[label1]) DSPC_wrt[label2] = min(length, DSPC_wrt[label2]) # In case min_outlier_sep is still np.inf, we assign a new value to it. # This only makes sense if num_clusters = 1 since it has turned out # that the MR-MST has no edges between a noise point and a core point. min_outlier_sep = max_distance if min_outlier_sep == np.inf else min_outlier_sep # DSPC_wrt[Ci] might be infinite if the connected component for Ci is # an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the # MR-MST might contain an edge with one point in Cj and ther other one # in Ck. Here, we replace the infinite density separation of Ci by # another large enough value. # # TODO: Think of a better yet efficient way to handle this. correction = correction_const * ( max_distance if num_clusters > 1 else min_outlier_sep ) DSPC_wrt[np.where(DSPC_wrt == np.inf)] = correction V_index = [ (DSPC_wrt[i] - DSC[i]) / max(DSPC_wrt[i], DSC[i]) for i in range(num_clusters) ] score = np.sum( [(cluster_size[i] * V_index[i]) / total for i in range(num_clusters)] ) self._relative_validity = score return self._relative_validity
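

# A minimal smoke-test sketch of the two public entry points above, assuming
# scikit-learn is available for synthetic data. Nothing here is required by
# the library itself; it only illustrates the functional and estimator APIs.
if __name__ == "__main__":
    from sklearn.datasets import make_blobs

    demo_data, _ = make_blobs(n_samples=500, centers=3, random_state=42)

    # Functional API: returns labels, probabilities, persistence scores,
    # the condensed and single-linkage trees, and (optionally) the MST.
    labels, probabilities, persistence, ctree, sltree, mst = hdbscan(
        demo_data, min_cluster_size=10, gen_min_span_tree=True
    )
    print("functional API found", labels.max() + 1, "clusters")

    # Estimator API mirrors scikit-learn conventions; relative_validity_
    # requires gen_min_span_tree=True so the MR-MST is retained.
    model = HDBSCAN(min_cluster_size=10, gen_min_span_tree=True).fit(demo_data)
    print("estimator API relative validity:", model.relative_validity_)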