# decision_tree.py (forked from eriklindernoren/ML-From-Scratch)
from __future__ import division, print_function
import numpy as np
import sys
import os

# Import helper functions
dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, dir_path + "/../utils")
from data_manipulation import divide_on_feature, train_test_split, load_iris_dataset
from data_operation import calculate_entropy, accuracy_score
sys.path.insert(0, dir_path + "/../unsupervised_learning/")
from principal_component_analysis import PCA
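
# NOTE: divide_on_feature (from the utils package) is assumed to send a
# sample to the first subset when its feature value is >= threshold for
# numerical features, or == threshold for categorical features, mirroring
# the branching rule in DecisionTree.classify_sample below.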


# Class that represents a decision node or leaf in the decision tree
class DecisionNode():
    def __init__(self, feature_i=None, threshold=None,
                 label=None, true_branch=None, false_branch=None):
        self.feature_i = feature_i      # Index of the feature that is tested
        self.threshold = threshold      # Threshold value for the feature
        self.label = label              # Label if the node is a leaf in the tree
        self.true_branch = true_branch  # 'Left' subtree
        self.false_branch = false_branch  # 'Right' subtree


class DecisionTree():
    def __init__(self, min_samples_split=2, min_gain=1e-7,
                 max_depth=float("inf")):
        self.root = None  # Root node of the decision tree
        # Minimum number of samples required to split a node further
        self.min_samples_split = min_samples_split
        # Minimum information gain required to split a node further
        self.min_gain = min_gain
        # Maximum depth the tree is allowed to grow to
        self.max_depth = max_depth
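
    # Splits are chosen greedily to maximize the information gain
    #     IG(y) = H(y) - p * H(y1) - (1 - p) * H(y2)
    # where H is the Shannon entropy (calculate_entropy) and p is the
    # fraction of samples that fall into the first subset.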

    def fit(self, X, y):
        # Build the tree recursively, starting from depth 0
        self.root = self._build_tree(X, y, current_depth=0)

    def _build_tree(self, X, y, current_depth=0):
        # Entropy of the label values before any split
        entropy = calculate_entropy(y)

        highest_info_gain = 0
        best_criteria = None  # Feature index and threshold of the best split
        best_sets = None      # Subsets of the data produced by the best split

        # Add y as the last column of X so that both are split together
        X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)
        n_samples, n_features = np.shape(X)
        if n_samples >= self.min_samples_split:
            # Calculate the information gain for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the information gain
                for threshold in unique_values:
                    Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
                    # If either subset is empty the split is useless, so skip
                    # the information gain calculation
                    if len(Xy_1) > 0 and len(Xy_2) > 0:
                        # Information gain: entropy before the split minus the
                        # sample-weighted entropy of the two subsets
                        p = len(Xy_1) / n_samples
                        y1 = Xy_1[:, -1]
                        y2 = Xy_2[:, -1]
                        info_gain = entropy - p * calculate_entropy(y1) - \
                            (1 - p) * calculate_entropy(y2)

                        # If this threshold resulted in a higher information
                        # gain than previously recorded, save the threshold
                        # value and the feature index
                        if info_gain > highest_info_gain:
                            highest_info_gain = info_gain
                            best_criteria = {
                                "feature_i": feature_i, "threshold": threshold}
                            best_sets = {
                                "left_branch": Xy_1, "right_branch": Xy_2}

        # If the best split clears the minimum gain and we have not reached
        # the maximum depth, build the tree deeper
        if current_depth < self.max_depth and highest_info_gain > self.min_gain:
            # X: all columns but the last, y: the last column
            leftX, leftY = best_sets["left_branch"][:, :-1], best_sets["left_branch"][:, -1]
            rightX, rightY = best_sets["right_branch"][:, :-1], best_sets["right_branch"][:, -1]
            true_branch = self._build_tree(leftX, leftY, current_depth + 1)
            false_branch = self._build_tree(rightX, rightY, current_depth + 1)
            return DecisionNode(feature_i=best_criteria["feature_i"],
                                threshold=best_criteria["threshold"],
                                true_branch=true_branch,
                                false_branch=false_branch)

        # No split produced enough information gain, so this is a leaf:
        # label it with the most common class among the remaining samples
        most_common = None
        max_count = 0
        for label in np.unique(y):
            count = len(y[y == label])
            if count > max_count:
                most_common = label
                max_count = count
        return DecisionNode(label=most_common)

    # Do a recursive search down the tree and label the data sample by the
    # value of the leaf that we end up at
    def classify_sample(self, x, tree=None):
        if tree is None:
            tree = self.root

        # If we have reached a leaf => return its label
        if tree.label is not None:
            return tree.label

        # Value of the feature that this node tests
        feature_value = x[tree.feature_i]

        # Determine whether we follow the true or the false branch
        branch = tree.false_branch
        if isinstance(feature_value, (int, float, np.integer, np.floating)):
            # Numerical feature: compare against the threshold
            if feature_value >= tree.threshold:
                branch = tree.true_branch
        elif feature_value == tree.threshold:
            # Categorical feature: test for equality
            branch = tree.true_branch

        # Continue the search in the chosen subtree
        return self.classify_sample(x, branch)

    # Classify the samples one by one and return the list of predicted labels
    def predict(self, X):
        y_pred = []
        for x in X:
            y_pred.append(self.classify_sample(x))
        return y_pred

    def print_tree(self, tree=None, indent=" "):
        if not tree:
            tree = self.root

        # If we're at a leaf => print the label
        if tree.label is not None:
            print(tree.label)
        # Otherwise go deeper down the tree
        else:
            # Print the test at this node
            print("%s:%s? " % (tree.feature_i, tree.threshold))
            # Print the true scenario
            print("%sT->" % indent, end=' ')
            self.print_tree(tree.true_branch, indent + indent)
            # Print the false scenario
            print("%sF->" % indent, end=' ')
            self.print_tree(tree.false_branch, indent + indent)
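

# Minimal usage sketch (illustrative only; assumes numpy arrays as input
# and hashable label values):
#
#   X = np.array([[2.7, 1.0], [1.3, 0.5], [3.6, 1.4], [0.9, 0.2]])
#   y = np.array([1, 0, 1, 0])
#   tree = DecisionTree(min_samples_split=2)
#   tree.fit(X, y)
#   tree.predict(X)  # -> predictions matching y once the tree separates the classes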


def main():
    # Load the Iris dataset and split it into training and test sets
    data = load_iris_dataset(dir_path + r"/../data/iris.csv")
    X = data['X']
    y = data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # Fit the tree and evaluate it on the held-out test set
    clf = DecisionTree()
    clf.fit(X_train, y_train)
    # clf.print_tree()
    y_pred = clf.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))

    # Project the test data to two dimensions with PCA and plot the predictions
    pca = PCA()
    pca.plot_in_2d(X_test, y_pred)

if __name__ == "__main__":
main()