forked from eriklindernoren/ML-From-Scratch
-
Notifications
You must be signed in to change notification settings - Fork 4
/
k_nearest_neighbors.py
executable file
·81 lines (69 loc) · 2.67 KB
/
k_nearest_neighbors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import sys
import os
import math
import numpy as np
import matplotlib.pyplot as plt
#from sklearn.datasets import load_iris
# Import helper functions
dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, dir_path + "/../utils")
from data_manipulation import train_test_split, normalize
from data_operation import euclidean_distance, accuracy_score
sys.path.insert(0, dir_path + "/../unsupervised_learning/")
from principal_component_analysis import PCA
from data_manipulation import load_iris_dataset
class KNN(object):
    """K-nearest-neighbors classifier based on a majority vote.

    Parameters
    ----------
    k : int
        Number of closest training samples that vote on each prediction.
    distance : callable, optional
        Function ``distance(a, b)`` returning a number for two samples.
        When omitted, the ``euclidean_distance`` helper imported by this
        module is used.
    """

    def __init__(self, k=5, distance=None):
        self.k = k
        self.distance = distance

    def _get_vote(self, neighbors, classes):
        """Return the class that occurs most often among ``neighbors``.

        ``neighbors`` is a sequence of ``(distance, label)`` pairs; only the
        label at index 1 is inspected. Ties are resolved in favour of the
        class that appears first in ``classes``. Returns ``None`` when no
        neighbor carries a label contained in ``classes``.
        """
        votes = [sample[1] for sample in neighbors]
        max_count = 0
        label = None
        for c in classes:
            count = sum(1 for vote in votes if vote == c)
            # Strict comparison keeps the earliest class when counts tie.
            if count > max_count:
                max_count = count
                label = c
        return label

    def predict(self, X_test, X_train, y_train):
        """Predict a label for every sample in ``X_test``.

        Parameters
        ----------
        X_test : sequence of samples to classify.
        X_train : sequence of observed samples.
        y_train : labels of ``X_train``, indexable by position.

        Returns
        -------
        numpy.ndarray holding one predicted label per test sample.

        Raises
        ------
        ValueError
            If ``X_train`` contains no samples.
        """
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one sample")

        # Resolve the metric lazily so the default helper is only looked up
        # when no custom distance function was supplied.
        measure = getattr(self, "distance", None)
        if measure is None:
            measure = euclidean_distance

        classes = np.unique(y_train)
        y_pred = []
        for test_sample in X_test:
            # Distances live in their own float array. Packing them into one
            # array together with the labels would coerce them to strings
            # whenever the labels are strings, which makes the sort below
            # lexicographic ("10.0" < "2.0") instead of numeric.
            distances = np.array(
                [measure(test_sample, observed) for observed in X_train],
                dtype=float)
            # Indices of the k training samples with the smallest distance
            nearest = distances.argsort()[:self.k]
            k_nearest_neighbors = [(distances[j], y_train[j]) for j in nearest]
            # Majority vote among the k neighbors decides the prediction
            y_pred.append(self._get_vote(k_nearest_neighbors, classes))
        return np.array(y_pred)
def main():
    """Run a KNN demo on the iris CSV and plot the predictions.

    Loads the dataset through ``load_iris_dataset``, passes the features
    through ``normalize``, splits them with ``train_test_split`` (33% test),
    classifies the test split with ``KNN(k=3)``, prints the value returned
    by ``accuracy_score`` and hands the test samples plus predictions to
    ``PCA.plot_in_2d``.
    """
    data = load_iris_dataset(dir_path + r"/../data/iris.csv")
    X = data['X']
    y = data['target']
    X = normalize(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    clf = KNN(k=3)
    y_pred = clf.predict(X_test, X_train, y_train)

    # A single pre-formatted argument prints the same text on Python 2 and 3;
    # the former print statement was a syntax error on Python 3.
    print("Accuracy score: %s" % (accuracy_score(y_test, y_pred),))

    # Reduce dimensions to 2d using pca and plot the results
    pca = PCA()
    pca.plot_in_2d(X_test, y_pred)


# Only run the demo when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()