# standardCanopy.py
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
# source: https://gist.github.com/gdbassett/528d816d035f2deaaca1
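# T1 = Removal radius: points closer than T1 to a canopy center are dropped from the pool of candidate centers
# T2 = Inclusion radius: points closer than T2 to a canopy center are listed as members of that canopy
# T1 < T2 gives overlapping canopies (members lying between T1 and T2 of a center stay eligible to seed their own canopy)
# T1 > T2 can leave points which reside in no canopy (they are dropped from the candidate pool without being added as members)
# T1 == T2 (positive) puts every point in at least one canopy, but a point can still lie within T2 of more than one center
# Distance metric can be any from here: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
# Points are identified by their row number in X; this version of the function has no filemap parameter for replacing row numbers with names.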
def myStandardCanopy(X, T1, T2, distance_metric='euclidean'):
    """Group the rows of X into canopies using two distance thresholds.

    A candidate row is taken from the pool and becomes a canopy center.
    Every row whose distance to that center is strictly below ``T2`` is
    recorded as a member of the canopy, and every row whose distance is
    strictly below ``T1`` is removed from the pool of future centers. This
    repeats until the pool is empty.

    Membership is evaluated against all rows, not only the remaining
    candidates, so a row may appear in several canopies.

    Args:
        X: Samples, one per row, in any form accepted by
            ``pairwise_distances`` (assumed to yield a square
            row-by-row distance matrix).
        T1: Removal radius; rows closer than this to a center can no longer
            become centers themselves.
        T2: Inclusion radius; rows closer than this to a center are listed
            in that center's canopy.
        distance_metric: Metric name forwarded to ``pairwise_distances``.

    Returns:
        A dict keyed by consecutive integers starting at 0. Each value is a
        dict with ``"c"`` (row index of the canopy center) and ``"points"``
        (list of row indices belonging to the canopy).

    Side effects:
        Prints the number of canopies found.
    """
    canopies = {}
    distances = pairwise_distances(X, metric=distance_metric)
    # Size the candidate pool from the distance matrix instead of X.shape so
    # that array-likes without a .shape attribute (e.g. a list of lists)
    # work as well.
    candidates = set(range(distances.shape[0]))
    while candidates:
        # pop() also removes the center itself, so the loop always makes
        # progress even when T1 is too small to remove anything else.
        center = candidates.pop()
        to_center = distances[center]
        canopies[len(canopies)] = {
            "c": center,
            "points": list(np.flatnonzero(to_center < T2)),
        }
        # Shrink the pool in place rather than rebuilding the set each pass.
        candidates.difference_update(np.flatnonzero(to_center < T1).tolist())
    print('Standard canopy found %d clusters' % (len(canopies)))
    return canopies