-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
80 lines (62 loc) · 3 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from data_loader import DataLoader
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from collections import Counter
from reduce_dimensionality import reduce_dims
def plot_cluster_centers(centers):
    """Plots the generated cluster centers in a 2x2 grid.

    Args:
        centers (array-like): Cluster centers produced by clustering; each
            center is a flattened 15x15 image (225 features). At most the
            first four centers are displayed.
    """
    fig, axs = plt.subplots(2, 2)
    # Pair each subplot with a center. zip stops at the shorter sequence,
    # so fewer than 4 centers no longer raises an IndexError (the extra
    # axes are simply left blank).
    for i, (ax, center) in enumerate(zip(axs.flat, centers)):
        ax.imshow(center.reshape(15, 15), cmap='Greys')
        ax.set_title('Center {}'.format(i + 1))
    # Hide x labels and tick labels for top plots and y ticks for right plots.
    for ax in axs.flat:
        ax.label_outer()
    plt.show()
def create_data_for_classifier(all_positives, cluster_labels, positive_label):
    """Builds a one-vs-rest binary dataset from cluster assignments.

    Examples whose cluster label equals positive_label are relabeled 1
    (positives) and all others 0 (negatives); the combined data is then
    reduced to 6 features with PCA via reduce_dims.

    Args:
        all_positives (list): All the positive training examples (only volcanoes)
        cluster_labels (list): The labels created from clustering
        positive_label (number): The cluster label to be treated as positive

    Returns:
        data_pca (list): The PCA-transformed data with 6 features
        labels (tuple): The binary labels, positives first then negatives,
            in the same order as data_pca's rows
    """
    data_positives = []
    data_negatives = []
    # Examples in the positive cluster get label 1, the rest get 0.
    # BUG FIX: iterate over the all_positives parameter instead of the
    # module-level X_train_volcanoes global, which silently ignored the
    # argument callers passed in.
    for training_ex, label in zip(all_positives, cluster_labels):
        if label == positive_label:
            data_positives.append((training_ex, 1))
        else:
            data_negatives.append((training_ex, 0))
    # Positives are placed before negatives; zip(*...) splits the pairs
    # back into parallel data/label sequences.
    data, labels = zip(*(data_positives + data_negatives))
    # NOTE(review): SEED is a module-level name defined only inside the
    # __main__ block below — calling this function before that assignment
    # raises NameError. Kept as-is to preserve the existing interface.
    data_pca = reduce_dims('pca', data, labels, 6, SEED)
    print('New data:', data_pca.shape)
    return data_pca, labels
if __name__ == '__main__':
    # Shared random seed; also read (as a global) by create_data_for_classifier.
    SEED = 10
    data = DataLoader()
    # Only the positive (volcano) examples are clustered.
    X_train_volcanoes, y_train_volcanoes = data.get_training_set_positives()
    print('Class distribution before clustering:', Counter(y_train_volcanoes))
    # CONSISTENCY FIX: use SEED instead of the hard-coded literal 10 so the
    # seed is defined in exactly one place (same value, identical behavior).
    kmeans = KMeans(n_clusters=4, random_state=SEED).fit(X_train_volcanoes)
    cluster_labels = kmeans.labels_
    cluster_centers = kmeans.cluster_centers_
    print('Class distribution after clustering:', Counter(cluster_labels))
    print('Cluster centers:', len(cluster_centers))
    plot_cluster_centers(cluster_centers)
    # Build one one-vs-rest dataset per cluster (clusters 0-3).
    classifier_data_A, labels_A = create_data_for_classifier(X_train_volcanoes, cluster_labels, 0)
    classifier_data_B, labels_B = create_data_for_classifier(X_train_volcanoes, cluster_labels, 1)
    classifier_data_C, labels_C = create_data_for_classifier(X_train_volcanoes, cluster_labels, 2)
    classifier_data_D, labels_D = create_data_for_classifier(X_train_volcanoes, cluster_labels, 3)