decision_tree.py
from util import *
import numpy as np  # the helpers below use numpy directly

def H(labels):
    '''return the entropy (in bits) of a 1-D array of class labels'''
    total = len(labels)
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / total
    return -sum(probs * np.log2(probs))
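
# Sanity check (illustrative): two equally frequent classes carry one bit of
# entropy, e.g. H(np.array([0, 0, 1, 1])) == 1.0, while a pure set such as
# H(np.array([1, 1, 1])) == 0.0.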

def gain(dataset, left_data, right_data):
    '''return the information gain of splitting dataset into left_data and
    right_data; entropy is computed on the label column (the last column)'''
    len_all = len(dataset)
    len_left = len(left_data)
    len_right = len(right_data)
    remainder = (H(left_data[:, -1]) * len_left + H(right_data[:, -1]) * len_right) / len_all
    return H(dataset[:, -1]) - remainder
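
# Illustrative example: if the parent labels are [0, 0, 1, 1] (H = 1 bit) and a
# split separates them into pure halves [0, 0] and [1, 1] (H = 0 each), the
# remainder is 0 and the gain is the full 1 bit.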

def attribute_gain(dataset, att_index):
    '''return (gain, value, left_data, right_data) for the best threshold on
    the att_index-th attribute, chosen by information gain'''
    # best gain, best value, left_data, right_data
    best_attribute_split = (-float('inf'), 0, [], [])
    sorted_data = dataset[dataset[:, att_index].argsort()]  # sort rows by this attribute
    attribute_values = np.unique(dataset[:, att_index])
    # try each unique value as a threshold; skip the minimum, which would leave
    # the left partition empty
    for val in attribute_values[1:]:
        left_data = sorted_data[sorted_data[:, att_index] < val]
        right_data = sorted_data[sorted_data[:, att_index] >= val]
        att_gain = gain(sorted_data, left_data, right_data)
        if att_gain > best_attribute_split[0]:
            best_attribute_split = (att_gain, val, left_data, right_data)
    return best_attribute_split

def find_split(dataset):
    '''return (attribute index, value, left_data, right_data) for the best split'''
    best_gain = -float('inf')
    # best attribute, best value, left_data, right_data
    best_split = (0, 0, [], [])
    # loop over every attribute (every column except the label) to find the best split
    for att_i in range(len(dataset[0]) - 1):
        att_gain, value, left_data, right_data = attribute_gain(dataset, att_i)
        if att_gain > best_gain:
            best_gain = att_gain
            best_split = (att_i, value, left_data, right_data)
    return best_split

def decision_tree_learning(dataset, depth=0):
    '''return (root node of the decision tree, depth of its deepest leaf)'''
    # base case: every row carries the same label, so emit a leaf node
    if len(np.unique(dataset[:, -1])) == 1:
        return {'label': int(dataset[0, -1]), 'is_checked': False}, depth
    else:
        # find the attribute and value that best split the dataset
        attribute, value, left_data, right_data = find_split(dataset)
        # recursively call the function on the left and right partitions
        left_node, left_depth = decision_tree_learning(left_data, depth + 1)
        right_node, right_depth = decision_tree_learning(right_data, depth + 1)
        root = {'attribute': attribute, 'value': value, 'left': left_node, 'right': right_node}
        return root, max(left_depth, right_depth)
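
# Minimal usage sketch (the toy dataset below is made up; it only assumes the
# row format the functions above expect: numeric features followed by an
# integer class label in the last column).
if __name__ == '__main__':
    toy = np.array([[1.0, 0],
                    [2.0, 0],
                    [8.0, 1],
                    [9.0, 1]])
    tree, depth = decision_tree_learning(toy)
    # one split on attribute 0 at value 8.0 separates the classes, so depth == 1
    print('depth:', depth)
    print('tree :', tree)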