Skip to content

Commit

Permalink
Fix prediction data not honoring cluster_selection_epsilon
Browse files (browse the repository at this point in the history)
  • Loading branch information
n9Mtq4 committed Mar 22, 2023
1 parent e55f957 commit 809c35a
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 23 deletions.
5 changes: 4 additions & 1 deletion hdbscan/_hdbscan_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,9 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
stabilities : ndarray (n_clusters,)
The cluster coherence strengths of each cluster.
selected_clusters : ndarray (n_clusters,)
The ids of the selected clusters, sorted in ascending order.
"""
cdef list node_list
cdef np.ndarray cluster_tree
Expand Down Expand Up @@ -803,4 +806,4 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
probs = get_probabilities(tree, reverse_cluster_map, labels)
stabilities = get_stability_scores(labels, clusters, stability, max_lambda)

return (labels, probs, stabilities)
return (labels, probs, stabilities, np.array(sorted(clusters)))
3 changes: 2 additions & 1 deletion hdbscan/flat.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ def HDBSCAN_flat(X, n_clusters=None,
new_clusterer.probabilities_,
new_clusterer.cluster_persistence_,
new_clusterer._condensed_tree,
new_clusterer._single_linkage_tree) = output
new_clusterer._single_linkage_tree,
new_clusterer._selected_clusters) = output

# PredictionData attached to HDBSCAN should also change.
# A function re_init is defined in this module to handle this.
Expand Down
8 changes: 6 additions & 2 deletions hdbscan/hdbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _tree_to_labels(
"""
condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
stability_dict = compute_stability(condensed_tree)
labels, probabilities, stabilities = get_clusters(
labels, probabilities, stabilities, selected_clusters = get_clusters(
condensed_tree,
stability_dict,
cluster_selection_method,
Expand All @@ -72,7 +72,8 @@ def _tree_to_labels(
max_cluster_size,
)

return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)
return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree,
selected_clusters)


def _hdbscan_generic(
Expand Down Expand Up @@ -1130,6 +1131,7 @@ def __init__(
self._outlier_scores = None
self._prediction_data = None
self._relative_validity = None
self._selected_clusters = None

def fit(self, X, y=None):
"""Perform HDBSCAN clustering from features or distance matrix.
Expand Down Expand Up @@ -1186,6 +1188,7 @@ def fit(self, X, y=None):
self.cluster_persistence_,
self._condensed_tree,
self._single_linkage_tree,
self._selected_clusters,
self._min_spanning_tree,
) = hdbscan(clean_data, **kwargs)

Expand Down Expand Up @@ -1248,6 +1251,7 @@ def generate_prediction_data(self):
self._prediction_data = PredictionData(
self._raw_data,
self.condensed_tree_,
self._selected_clusters,
min_samples,
tree_type=tree_type,
metric=self.metric,
Expand Down
3 changes: 1 addition & 2 deletions hdbscan/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,14 @@ def _recurse_leaf_dfs(self, current_node):
return sum(
[recurse_leaf_dfs(self.cluster_tree, child) for child in children], [])

def __init__(self, data, condensed_tree, min_samples,
def __init__(self, data, condensed_tree, selected_clusters, min_samples,
tree_type='kdtree', metric='euclidean', **kwargs):
self.raw_data = data.astype(np.float64)
self.tree = self._tree_type_map[tree_type](self.raw_data,
metric=metric, **kwargs)
self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1]
self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)

selected_clusters = sorted(condensed_tree._select_clusters())
# raw_condensed_tree = condensed_tree.to_numpy()
raw_condensed_tree = condensed_tree._raw_tree

Expand Down
48 changes: 31 additions & 17 deletions hdbscan/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def test_hdbscan_distance_matrix():
D = distance.squareform(distance.pdist(X))
D /= np.max(D)

labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed")
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(D, metric="precomputed")
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise
assert n_clusters_1 == n_clusters
Expand All @@ -167,7 +167,7 @@ def test_hdbscan_sparse_distance_matrix():
D = sparse.csr_matrix(D)
D.eliminate_zeros()

labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed")
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(D, metric="precomputed")
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise
assert n_clusters_1 == n_clusters
Expand All @@ -178,7 +178,7 @@ def test_hdbscan_sparse_distance_matrix():


def test_hdbscan_feature_vector():
labels, p, persist, ctree, ltree, mtree = hdbscan(X)
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(X)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters

Expand All @@ -191,7 +191,9 @@ def test_hdbscan_feature_vector():


def test_hdbscan_prims_kdtree():
labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_kdtree")
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
X, algorithm="prims_kdtree"
)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters

Expand All @@ -203,7 +205,9 @@ def test_hdbscan_prims_kdtree():


def test_hdbscan_prims_balltree():
labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_balltree")
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
X, algorithm="prims_balltree"
)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters

Expand All @@ -215,7 +219,9 @@ def test_hdbscan_prims_balltree():


def test_hdbscan_boruvka_kdtree():
labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="boruvka_kdtree")
labels, p, persist, ctree, ltree, selclstrs, mtree, = hdbscan(
X, algorithm="boruvka_kdtree"
)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters

Expand All @@ -229,7 +235,9 @@ def test_hdbscan_boruvka_kdtree():


def test_hdbscan_boruvka_balltree():
labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="boruvka_balltree")
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
X, algorithm="boruvka_balltree"
)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters

Expand All @@ -243,7 +251,7 @@ def test_hdbscan_boruvka_balltree():


def test_hdbscan_generic():
labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="generic")
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(X, algorithm="generic")
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters

Expand All @@ -261,7 +269,7 @@ def test_hdbscan_high_dimensional():
H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
# H, y = shuffle(X, y, random_state=7)
H = StandardScaler().fit_transform(H)
labels, p, persist, ctree, ltree, mtree = hdbscan(H)
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(H)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters

Expand All @@ -275,7 +283,7 @@ def test_hdbscan_high_dimensional():


def test_hdbscan_best_balltree_metric():
labels, p, persist, ctree, ltree, mtree = hdbscan(
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
X, metric="seuclidean", V=np.ones(X.shape[1])
)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
Expand All @@ -287,7 +295,9 @@ def test_hdbscan_best_balltree_metric():


def test_hdbscan_no_clusters():
labels, p, persist, ctree, ltree, mtree = hdbscan(X, min_cluster_size=len(X) + 1)
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
X, min_cluster_size=len(X) + 1
)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == 0

Expand All @@ -298,7 +308,7 @@ def test_hdbscan_no_clusters():

def test_hdbscan_min_cluster_size():
for min_cluster_size in range(2, len(X) + 1, 1):
labels, p, persist, ctree, ltree, mtree = hdbscan(
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
X, min_cluster_size=min_cluster_size
)
true_labels = [label for label in labels if label != -1]
Expand All @@ -315,7 +325,7 @@ def test_hdbscan_callable_metric():
# metric is the function reference, not the string key.
metric = distance.euclidean

labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric)
labels, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(X, metric=metric)
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters

Expand All @@ -333,8 +343,10 @@ def test_hdbscan_boruvka_kdtree_matches():

data = generate_noisy_data()

labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic")
labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
labels_prims, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
data, algorithm="generic"
)
labels_boruvka, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
data, algorithm="boruvka_kdtree"
)

Expand All @@ -354,8 +366,10 @@ def test_hdbscan_boruvka_balltree_matches():

data = generate_noisy_data()

labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic")
labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
labels_prims, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
data, algorithm="generic"
)
labels_boruvka, p, persist, ctree, ltree, selclstrs, mtree = hdbscan(
data, algorithm="boruvka_balltree"
)

Expand Down

0 comments on commit 809c35a

Please sign in to comment.