k_NN.py
import math
from itertools import groupby
from dataset import Dataset
def k_NN(dataset, x, k, metric, attributes_weights=None, distance_weight=None):
    """Classify a row using the k nearest neighbours algorithm.
    Args:
        dataset (Dataset): dataset used for classification
        x (list): row to be classified (without the target attribute)
        k (int): number of neighbours, must be greater than 0
        metric (callable): metric used to calculate the distance
        attributes_weights (list): weights for the attributes
        distance_weight (list): weights for the distances, one per neighbour
    Returns:
        object: predicted class; its type depends on the target value type
        None: if k or the weights are invalid
    """
    # Check k
    if k < 1:
        return None
    # If attribute weights are not specified
    if attributes_weights is None:
        # Fill with 1s; don't count the target attribute
        attributes_weights = [1 for i in range(dataset.get_columns_number() - 1)]
    # If attributes_weights are specified, check that they have the right
    # dimension (the target attribute is not counted)
    elif len(attributes_weights) != dataset.get_columns_number() - 1:
        return None
    # If distance weights are not specified
    if distance_weight is None:
        # Fill with 1s
        distance_weight = [1 for i in range(k)]
    # If distance weights are specified, check that there are exactly k of them
    elif len(distance_weight) != k:
        return None
    # Compute the distance from x to every row in the dataset
    output = []
    for index, row in enumerate(dataset.data):
        # Make a copy of the row
        value = row.copy()
        # Remove the target element
        value.pop(dataset.target)
        # Calculate the distance
        output.append((distance(x, value, attributes_weights, metric), index))
    # Sort by distance ascending (nearest first)
    output.sort()
    # Take the first k elements as the neighbours
    neighbours = output[:k]
    # Get the class of each neighbour (neighbours are ordered nearest first)
    classes = [dataset.data[index][dataset.target] for _, index in neighbours]
    # Voting: sum the distance weights of the neighbours voting for each class
    voting = [(sum(weight for _, weight in group), key)
              for key, group in groupby(sorted(zip(classes, distance_weight)),
                                        key=lambda pair: pair[0])]
    # Get the class with the largest weighted vote
    cl = sorted(voting)[-1]
    # Return the class
    return cl[1]
def distance(x, y, weight, metric):
    """Calculate the distance between two points
    using the given metric and a weight for each dimension
    Args:
        x (list): first point
        y (list): second point
        weight (list): weights determining how much each dimension
            influences the overall distance
        metric (callable): metric used to calculate the distance
    Returns:
        float: the distance, or -1 if the dimensions do not match
    """
    # If weights are not specified
    if weight is None:
        weight = [1 for i in range(len(x))]
    # Check that the coordinates have the same length (dimension)
    if len(x) != len(y) or len(x) != len(weight):
        return -1
    # Calculate the distance
    return metric(x, y, weight)
def euclidean(x, y, w):
    """Calculate the Euclidean distance
    Args:
        x (list): first point
        y (list): second point
        w (list): weights determining how much each dimension
            influences the distance
    """
    return math.sqrt(sum(
        math.pow(x[i] - y[i], 2) * w[i]
        for i in range(len(x))
    ))
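# Worked example (illustrative): with unit weights,
# euclidean([0.0, 0.0], [3.0, 4.0], [1.0, 1.0]) = sqrt(3**2 * 1 + 4**2 * 1) = 5.0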
# TODO implement other metrics
def manhattan(x, y, w):
    """Calculate the Manhattan distance
    Args:
        x (list): first point
        y (list): second point
        w (list): weights determining how much each dimension
            influences the distance
    """
    return sum(
        math.fabs(x[i] - y[i]) * w[i]
        for i in range(len(x)))
def camberra(x, y, w):
    """Calculate the Canberra distance
    Args:
        x (list): first point
        y (list): second point
        w (list): weights determining how much each dimension
            influences the distance
    """
    return sum(
        math.fabs(x[i] - y[i]) / (math.fabs(x[i]) + math.fabs(y[i])) * w[i]
        for i in range(len(x)))
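# Worked example (illustrative): camberra([1.0, 2.0], [3.0, 2.0], [1.0, 1.0])
# = |1 - 3| / (|1| + |3|) + |2 - 2| / (|2| + |2|) = 2/4 + 0 = 0.5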
def chebychev(x, y, w):
    """Calculate the Chebyshev distance
    Args:
        x (list): first point
        y (list): second point
        w (list): weights determining how much each dimension
            influences the distance
    """
    return max(
        math.fabs(x[i] - y[i]) * w[i]
        for i in range(len(x)))
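# Minimal usage sketch (illustrative, not part of the original module).
# The Dataset constructor is not shown in this file, so a stand-in object
# exposing only the attributes k_NN touches (data, target,
# get_columns_number) is used; the rows below are made-up toy values.
if __name__ == "__main__":
    from types import SimpleNamespace

    # Two-feature rows; the last column (index 2) holds the target class
    demo = SimpleNamespace(
        data=[
            [1.0, 1.0, "A"],
            [1.2, 0.9, "A"],
            [5.0, 5.1, "B"],
            [4.8, 5.3, "B"],
        ],
        target=2,
        get_columns_number=lambda: 3,
    )
    # Classify a new point using its 3 nearest neighbours and the Euclidean metric
    print(k_NN(demo, [1.1, 1.0], 3, euclidean))  # expected output: A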