-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
80 lines (62 loc) · 3 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from data_loader import DataLoader
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from collections import Counter
from reduce_dimensionality import reduce_dims
def plot_cluster_centers(centers):
    """Plots the generated cluster centers in a 2x2 grid.

    Args:
        centers (array-like): Cluster centers produced by clustering; each
            center is a flattened 15x15 image (225 features). At most the
            first four centers are displayed.
    """
    fig, axs = plt.subplots(2, 2)
    # Pair each subplot with a center. zip stops at the shorter sequence,
    # so fewer than 4 centers no longer raises an IndexError (the extra
    # axes are simply left blank).
    for i, (ax, center) in enumerate(zip(axs.flat, centers)):
        ax.imshow(center.reshape(15, 15), cmap='Greys')
        ax.set_title('Center {}'.format(i + 1))
    # Hide x labels and tick labels for top plots and y ticks for right plots.
    for ax in axs.flat:
        ax.label_outer()
    plt.show()
def create_data_for_classifier(all_positives, cluster_labels, positive_label):
    """Builds a one-vs-rest binary dataset from cluster assignments.

    Examples whose cluster label equals positive_label are relabeled 1
    (positives) and all others 0 (negatives); the combined data is then
    reduced to 6 features with PCA via reduce_dims.

    Args:
        all_positives (list): All the positive training examples (only volcanoes)
        cluster_labels (list): The labels created from clustering
        positive_label (number): The cluster label to be treated as positive

    Returns:
        data_pca (list): The PCA-transformed data with 6 features
        labels (tuple): The binary labels, positives first then negatives,
            in the same order as data_pca's rows
    """
    data_positives = []
    data_negatives = []
    # Examples in the positive cluster get label 1, the rest get 0.
    # BUG FIX: iterate over the all_positives parameter instead of the
    # module-level X_train_volcanoes global, which silently ignored the
    # argument callers passed in.
    for training_ex, label in zip(all_positives, cluster_labels):
        if label == positive_label:
            data_positives.append((training_ex, 1))
        else:
            data_negatives.append((training_ex, 0))
    # Positives are placed before negatives; zip(*...) splits the pairs
    # back into parallel data/label sequences.
    data, labels = zip(*(data_positives + data_negatives))
    # NOTE(review): SEED is a module-level name defined only inside the
    # __main__ block below — calling this function before that assignment
    # raises NameError. Kept as-is to preserve the existing interface.
    data_pca = reduce_dims('pca', data, labels, 6, SEED)
    print('New data:', data_pca.shape)
    return data_pca, labels
if __name__ == '__main__':
    # Shared random seed; also read (as a global) by create_data_for_classifier.
    SEED = 10
    data = DataLoader()
    # Only the positive (volcano) examples are clustered.
    X_train_volcanoes, y_train_volcanoes = data.get_training_set_positives()
    print('Class distribution before clustering:', Counter(y_train_volcanoes))
    # CONSISTENCY FIX: use SEED instead of the hard-coded literal 10 so the
    # seed is defined in exactly one place (same value, identical behavior).
    kmeans = KMeans(n_clusters=4, random_state=SEED).fit(X_train_volcanoes)
    cluster_labels = kmeans.labels_
    cluster_centers = kmeans.cluster_centers_
    print('Class distribution after clustering:', Counter(cluster_labels))
    print('Cluster centers:', len(cluster_centers))
    plot_cluster_centers(cluster_centers)
    # Build one one-vs-rest dataset per cluster (clusters 0-3).
    classifier_data_A, labels_A = create_data_for_classifier(X_train_volcanoes, cluster_labels, 0)
    classifier_data_B, labels_B = create_data_for_classifier(X_train_volcanoes, cluster_labels, 1)
    classifier_data_C, labels_C = create_data_for_classifier(X_train_volcanoes, cluster_labels, 2)
    classifier_data_D, labels_D = create_data_for_classifier(X_train_volcanoes, cluster_labels, 3)