diff --git a/hdbscan/_hdbscan_tree.pyx b/hdbscan/_hdbscan_tree.pyx index 84ffde61..5338e1f7 100644 --- a/hdbscan/_hdbscan_tree.pyx +++ b/hdbscan/_hdbscan_tree.pyx @@ -705,6 +705,9 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, stabilities : ndarray (n_clusters,) The cluster coherence strengths of each cluster. + + selected_clusters : ndarray (n_clusters,) + The ids of the selected clusters. """ cdef list node_list cdef np.ndarray cluster_tree @@ -803,4 +806,4 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, probs = get_probabilities(tree, reverse_cluster_map, labels) stabilities = get_stability_scores(labels, clusters, stability, max_lambda) - return (labels, probs, stabilities) + return (labels, probs, stabilities, np.array(sorted(clusters))) diff --git a/hdbscan/flat.py b/hdbscan/flat.py index e5912266..03627198 100644 --- a/hdbscan/flat.py +++ b/hdbscan/flat.py @@ -184,7 +184,8 @@ def HDBSCAN_flat(X, n_clusters=None, new_clusterer.probabilities_, new_clusterer.cluster_persistence_, new_clusterer._condensed_tree, - new_clusterer._single_linkage_tree) = output + new_clusterer._single_linkage_tree, + new_clusterer._selected_clusters) = output # PredictionData attached to HDBSCAN should also change. # A function re_init is defined in this module to handle this. 
diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py index 60dddd05..4e4deb1e 100644 --- a/hdbscan/hdbscan_.py +++ b/hdbscan/hdbscan_.py @@ -62,7 +62,7 @@ def _tree_to_labels( """ condensed_tree = condense_tree(single_linkage_tree, min_cluster_size) stability_dict = compute_stability(condensed_tree) - labels, probabilities, stabilities = get_clusters( + labels, probabilities, stabilities, selected_clusters = get_clusters( condensed_tree, stability_dict, cluster_selection_method, @@ -72,7 +72,8 @@ def _tree_to_labels( max_cluster_size, ) - return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree) + return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree, + selected_clusters) def _hdbscan_generic( @@ -1130,6 +1131,7 @@ def __init__( self._outlier_scores = None self._prediction_data = None self._relative_validity = None + self._selected_clusters = None def fit(self, X, y=None): """Perform HDBSCAN clustering from features or distance matrix. @@ -1186,6 +1188,7 @@ def fit(self, X, y=None): self.cluster_persistence_, self._condensed_tree, self._single_linkage_tree, + self._selected_clusters, self._min_spanning_tree, ) = hdbscan(clean_data, **kwargs) @@ -1248,6 +1251,7 @@ def generate_prediction_data(self): self._prediction_data = PredictionData( self._raw_data, self.condensed_tree_, + self._selected_clusters, min_samples, tree_type=tree_type, metric=self.metric, diff --git a/hdbscan/prediction.py b/hdbscan/prediction.py index 166975f9..55696765 100644 --- a/hdbscan/prediction.py +++ b/hdbscan/prediction.py @@ -95,7 +95,7 @@ def _recurse_leaf_dfs(self, current_node): return sum( [recurse_leaf_dfs(self.cluster_tree, child) for child in children], []) - def __init__(self, data, condensed_tree, min_samples, + def __init__(self, data, condensed_tree, selected_clusters, min_samples, tree_type='kdtree', metric='euclidean', **kwargs): self.raw_data = data.astype(np.float64) self.tree = self._tree_type_map[tree_type](self.raw_data, 
@@ -103,7 +103,6 @@ def __init__(self, data, condensed_tree, min_samples, self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1] self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) - selected_clusters = sorted(condensed_tree._select_clusters()) # raw_condensed_tree = condensed_tree.to_numpy() raw_condensed_tree = condensed_tree._raw_tree diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py index 9e3faa34..857c469d 100644 --- a/hdbscan/tests/test_hdbscan.py +++ b/hdbscan/tests/test_hdbscan.py @@ -144,7 +144,7 @@ def test_hdbscan_distance_matrix(): D = distance.squareform(distance.pdist(X)) D /= np.max(D) - labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed") + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(D, metric="precomputed") # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise assert n_clusters_1 == n_clusters @@ -167,7 +167,7 @@ def test_hdbscan_sparse_distance_matrix(): D = sparse.csr_matrix(D) D.eliminate_zeros() - labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed") + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(D, metric="precomputed") # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise assert n_clusters_1 == n_clusters @@ -178,7 +178,7 @@ def test_hdbscan_sparse_distance_matrix(): def test_hdbscan_feature_vector(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X) + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(X) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -191,7 +191,9 @@ def test_hdbscan_feature_vector(): def test_hdbscan_prims_kdtree(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_kdtree") + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( + X, algorithm="prims_kdtree" + ) n_clusters_1 = len(set(labels)) - 
int(-1 in labels) assert n_clusters_1 == n_clusters @@ -203,7 +205,9 @@ def test_hdbscan_prims_kdtree(): def test_hdbscan_prims_balltree(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_balltree") + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( + X, algorithm="prims_balltree" + ) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -215,7 +219,9 @@ def test_hdbscan_prims_balltree(): def test_hdbscan_boruvka_kdtree(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="boruvka_kdtree") + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( + X, algorithm="boruvka_kdtree" + ) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -229,7 +235,9 @@ def test_hdbscan_boruvka_kdtree(): def test_hdbscan_boruvka_balltree(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="boruvka_balltree") + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( + X, algorithm="boruvka_balltree" + ) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -243,7 +251,7 @@ def test_hdbscan_boruvka_balltree(): def test_hdbscan_generic(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="generic") + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(X, algorithm="generic") n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -261,7 +269,7 @@ def test_hdbscan_generic(): def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) # H, y = shuffle(X, y, random_state=7) H = StandardScaler().fit_transform(H) - labels, p, persist, ctree, ltree, mtree = hdbscan(H) + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(H) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -275,7 +283,7 @@ def test_hdbscan_high_dimensional(): def test_hdbscan_best_balltree_metric(): - labels, p, persist, ctree, 
ltree, mtree = hdbscan( + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( X, metric="seuclidean", V=np.ones(X.shape[1]) ) n_clusters_1 = len(set(labels)) - int(-1 in labels) @@ -287,7 +295,9 @@ def test_hdbscan_best_balltree_metric(): def test_hdbscan_no_clusters(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, min_cluster_size=len(X) + 1) + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( + X, min_cluster_size=len(X) + 1 + ) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == 0 @@ -298,7 +308,7 @@ def test_hdbscan_no_clusters(): def test_hdbscan_min_cluster_size(): for min_cluster_size in range(2, len(X) + 1, 1): - labels, p, persist, ctree, ltree, mtree = hdbscan( + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( X, min_cluster_size=min_cluster_size ) true_labels = [label for label in labels if label != -1] @@ -315,7 +325,7 @@ def test_hdbscan_callable_metric(): # metric is the function reference, not the string key. 
metric = distance.euclidean - labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric) + labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(X, metric=metric) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -333,8 +343,10 @@ def test_hdbscan_boruvka_kdtree_matches(): data = generate_noisy_data() - labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic") - labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan( + labels_prims, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( + data, algorithm="generic" + ) + labels_boruvka, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( data, algorithm="boruvka_kdtree" ) @@ -354,8 +366,10 @@ def test_hdbscan_boruvka_balltree_matches(): data = generate_noisy_data() - labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic") - labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan( + labels_prims, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( + data, algorithm="generic" + ) + labels_boruvka, p, persist, ctree, ltree, selclstrs, mtree = hdbscan( data, algorithm="boruvka_balltree" )