-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tree_Learning_Requirements.py
150 lines (124 loc) · 6.87 KB
/
Tree_Learning_Requirements.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import Score_Measure
import numpy as np
import random
class Tree_Learning_Requirements:
    """Hold the attribute metadata and callbacks needed to pick a split for a tree node.

    For a node, a number of attributes is drawn at random, one random split is
    generated per drawn attribute, the per-class counts produced by every
    candidate split are obtained through ``aggregator_func``, and the candidate
    with the highest purity score is selected.

    All randomness comes from the module-level ``random`` generator, which is
    seeded once in ``__init__``.
    """

    def __init__(self, global_seed, attribute_range, attribute_info, num_target_classes,
                 aggregator_func, parties_update_func, attribute_percentage,
                 included_parties_indices, Secure_Aggregation_SMC):
        """Store the configuration and seed the global ``random`` generator.

        Args:
            global_seed: seed passed to ``random.seed``.
            attribute_range: per-attribute range; indexed ``[0]``/``[1]`` as
                (min, max) for non-categorical attributes and passed to
                ``random.choice`` as the sequence of categories otherwise.
            attribute_info: per-attribute type marker; an attribute is treated
                as categorical when its marker equals ``["categorical"]``.
            num_target_classes: forwarded unchanged to ``aggregator_func``.
            aggregator_func: callable invoked as
                ``aggregator_func(node_id, branch, random_state,
                num_target_classes, num_criteria=k)``; must return two objects
                that support ``[i, :]`` indexing (class counts of the "true"
                and "false" side of candidate ``i``).
            parties_update_func: callable invoked as
                ``parties_update_func(best_criterion, node_id, branch)`` when a
                useful split has been found.
            attribute_percentage: fraction of the attributes drawn per node.
            included_parties_indices: stored as-is; not used in this class.
            Secure_Aggregation_SMC: stored as-is; not used in this class.
        """
        self.attribute_range = attribute_range
        self.attribute_info = attribute_info
        self.num_target_classes = num_target_classes
        self.aggregator_func = aggregator_func
        self.parties_update_func = parties_update_func
        self.global_seed = global_seed
        self.global_random_state = None
        # Seeds the process-wide generator: every later random.* call in this
        # class depends on this seed.
        random.seed(global_seed)
        # Counters: calls made to aggregator_func / parties_update_func.
        self.num_transactions = 0
        self.num_updates = 0
        self.att_perc = attribute_percentage
        self.included_parties_indices = included_parties_indices
        self.Secure_Aggregation_SMC = Secure_Aggregation_SMC

    # _Tree Learning Requirements
    class Criterion:
        """A single split rule used to divide a dataset into two sets."""

        def __init__(self, attribute_type, attribute_index, point_or_category):
            """Remember the attribute type marker, its index and the split value."""
            self.attribute_type = attribute_type
            self.attribute_index = attribute_index
            self.point_or_category = point_or_category

        def check(self, sampel):
            """Return whether ``sampel`` falls on the "true" side of this rule.

            Categorical attributes (type marker equal to ``["categorical"]``):
            true when the sample's value equals the stored category.
            Any other attribute type: true when the sample's value is greater
            than or equal to the stored split point.
            """
            value = sampel[self.attribute_index]
            if self.attribute_type == ["categorical"]:
                return value == self.point_or_category
            return value >= self.point_or_category

    def Pick_a_random_split(self, attribute_indices):
        """Draw one random split value for each attribute index given.

        Args:
            attribute_indices: indices into ``attribute_info`` /
                ``attribute_range``; duplicates are allowed and each entry
                gets its own draw.

        Returns:
            A list, parallel to ``attribute_indices``, holding a randomly
            chosen category for categorical attributes and a uniform random
            number within ``[range_min, range_max]`` for the others.
        """
        split_list = []
        for index_of_the_attribute in attribute_indices:
            attribute_range = self.attribute_range[index_of_the_attribute]
            if self.attribute_info[index_of_the_attribute] == ["categorical"]:
                # attribute_range lists the available categories: pick one.
                rnd_split = random.choice(attribute_range)
            else:
                # attribute_range holds (min, max): pick a point in between.
                rnd_split = random.uniform(attribute_range[0], attribute_range[1])
            split_list.append(rnd_split)
        return split_list

    @staticmethod
    def _stop_split(classes_vec, n_min):
        """Return True when a node holds fewer than ``n_min`` samples or fewer than two classes."""
        if sum(classes_vec) < n_min:
            return True
        return np.count_nonzero(np.array(classes_vec)) < 2

    def find_best_attribute(self, impurity_measure='entropy', node_id=0, branch=None):
        """Pick the best of several randomly generated splits for a node.

        Args:
            impurity_measure: forwarded to ``Score_Measure.purity``.
            node_id: id of the node being split; forwarded to both callbacks.
            branch: forwarded to both callbacks.

        Returns:
            A tuple ``(best_value, best_criterion, classes_for_samples, node_id)``:
            * ``best_value`` - highest purity score found, ``0`` if none.
            * ``best_criterion`` - the winning ``Criterion``, or ``None``.
            * ``classes_for_samples`` - an empty list when a split was found;
              otherwise the class counts of the node (row 0 of the "true"
              counts plus row 0 of the "false" counts).
            * ``node_id`` - the given id, incremented by one when a split was
              found.
        """
        best_value = 0  # best purity score seen so far
        best_criterion = None  # criterion that produced best_value
        classes_for_samples = []

        # "n_min" of the ERT algorithm: do not split nodes smaller than this.
        n_min = 5

        # Capture the generator state BEFORE any draw; it is handed to
        # aggregator_func below. NOTE(review): presumably so the other side can
        # replay the same draws - confirm against the aggregator implementation.
        random_func_state = random.getstate()

        # Number of attributes to draw for this node. The builtin int is used
        # because the np.int alias no longer exists in NumPy >= 1.24.
        num_att = len(self.attribute_info)
        num_rnd_att = int(round(self.att_perc * num_att))

        # Draw attribute indices (with replacement), then one split per index.
        # The order of these random.* calls is kept exactly as-is.
        rand_attribute_index = [random.randint(0, num_att - 1) for _ in range(num_rnd_att)]
        split_list = self.Pick_a_random_split(rand_attribute_index)

        # One candidate criterion per (attribute, split) pair.
        criterion_list = [
            self.Criterion(self.attribute_info[attribute_index], attribute_index, rnd_split)
            for attribute_index, rnd_split in zip(rand_attribute_index, split_list)
        ]

        # Class counts on each side of every candidate split.
        true_set_clasess, false_set_clasess = self.aggregator_func(
            node_id, branch, random_func_state,
            self.num_target_classes, num_criteria=num_rnd_att)
        self.num_transactions += 1

        for i, criterion in enumerate(criterion_list):
            true_counts = true_set_clasess[i, :]
            false_counts = false_set_clasess[i, :]
            # A split that leaves one side empty separates nothing.
            if sum(true_counts) == 0 or sum(false_counts) == 0:
                continue
            if self._stop_split(true_counts + false_counts, n_min):
                continue
            purity_val = Score_Measure.purity(true_counts, false_counts, impurity_measure)
            if purity_val > best_value:
                best_value, best_criterion = purity_val, criterion

        if best_value > 0:
            self.parties_update_func(best_criterion, node_id, branch)
            node_id += 1
            self.num_updates += 1
        else:
            # No useful split: report the class counts of the node itself.
            # NOTE(review): this reads row 0, so it presumably fails when zero
            # attributes were drawn (att_perc rounding to 0) - confirm.
            classes_for_samples = true_set_clasess[0, :] + false_set_clasess[0, :]
        return best_value, best_criterion, classes_for_samples, node_id