-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
124 lines (95 loc) · 3.55 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import numpy as np
import pandas as pd
def split_domain(value_domain, split_value):
    '''
    Split an attribute's domain into a left and a right sub-domain.

    Args:
        value_domain (list|dict):
            Continuous: {"min": x, "max": y};
            Categorical: [c1, c2, ...].
            The argument itself is never modified.
        split_value:
            Categorical: the category to separate from the rest.
            Continuous: a pair (low, high); the actual split point is drawn
            uniformly at random from that interval.

    Returns (tuple):
        Continuous: ({"min": min, "max": s}, {"min": s, "max": max}), where s
            is the sampled split point;
        Categorical: ([split_value], a new list without the split value).

    Raises:
        ValueError: if a categorical split_value is not in value_domain.
    '''
    if isinstance(value_domain, list):  # categorical
        # list.remove() works in place and returns None, so remove from a
        # copy and return that copy instead of the call's result.
        remaining = list(value_domain)
        remaining.remove(split_value)
        # Wrap the value itself; list(split_value) would split a string
        # into characters and fail on non-iterable categories.
        return [split_value], remaining

    # continuous
    # Two independent copies: sharing one dict would make both halves equal.
    left = value_domain.copy()
    right = value_domain.copy()
    # generate a random value in the confidence interval as the split value
    split_point = np.random.uniform(split_value[0], split_value[1], 1)[0]
    left["max"] = split_point
    right["min"] = split_point
    return left, right
def true_support_filter(df:pd.DataFrame, attr_info:dict):
    '''
    Restrict a sample to the true support of every listed attribute.

    Each entry of attr_info is applied in turn through filter_sample, so a
    row survives only if it passes every attribute's domain check.

    Args:
        df (DataFrame):
            A DataFrame containing attributes and labels.
        attr_info (dict):
            Maps each attribute name to its domain:
            Continuous: {attr1: {"min": x, "max": y}, attr2: ...};
            Categorical: {attr1: [c1, c2, ...], attr2: ...}.

    Returns:
        df (DataFrame):
            The DataFrame after all attribute filters have been applied.
    '''
    filtered = df
    for name in attr_info:
        filtered = filter_sample(filtered, name, attr_info[name])
    return filtered
def filter_sample(df:pd.DataFrame, attr, values_domain):
    '''
    Keep only the rows whose value for one attribute lies in its domain.

    Args:
        df (DataFrame):
            A DataFrame containing attributes and labels.
        attr:
            The column (attribute) to test.
        values_domain (list|dict):
            Continuous: {"min": x, "max": y}, both bounds inclusive;
            Categorical: [c1, c2, ...], the allowed values.

    Returns:
        df (DataFrame):
            The rows of df that satisfy the domain, with all columns kept
            (also when df has no rows).
    '''
    column = df[attr]
    if isinstance(values_domain, dict):  # continuous
        lower = values_domain["min"]
        upper = values_domain["max"]
        mask = (column >= lower) & (column <= upper)
    else:  # categorical
        # Force a boolean dtype: on an empty column apply() does not yield a
        # boolean Series, and indexing a DataFrame with a non-boolean Series
        # is treated as column selection rather than as a row mask.
        mask = column.apply(lambda x: x in values_domain).astype(bool)
    return df[mask]
def binary_search(na: np.ndarray, criteria: float):
    '''
    Find the smallest prefix length whose sum reaches the criteria.

    Both the criteria and the prefix sums are rounded to 2 decimals before
    they are compared, so floating-point drift in the running sum (for
    example 0.7999999999999999 instead of 0.8) does not move the result.

    Args:
        na (ndarray):
            An array of proportions summing to 1. The values are assumed to
            be non-negative so that the prefix sums are non-decreasing.
        criteria (float):
            A number smaller than or equal to 1.

    Returns:
        split (int):
            The smallest i >= 1 such that round(sum(na[:i]), 2) is greater
            than or equal to round(criteria, 2). If no prefix reaches the
            criteria, len(na) is returned; an empty array gives 0.
    '''
    criteria = np.round(criteria, 2)
    length = len(na)
    # prefix[k] is the rounded sum of na[:k + 1]; computing it once avoids
    # re-summing a prefix at every step of the search.
    prefix = np.round(np.cumsum(na), 2)
    # Leftmost k with prefix[k] >= criteria (bisection on a sorted array).
    first = int(np.searchsorted(prefix, criteria, side="left"))
    # Clamp so an unreachable criteria ends the search instead of looping.
    return min(first + 1, length)
if __name__ == "__main__":
    # Smoke test: search for the prefix length that reaches the full total
    # of a small proportions array and print it.
    proportions = np.array([0.1, 0.2, 0.1, 0.3, 0.1, 0.2])
    result = binary_search(proportions, 1)
    print(result)