-
Notifications
You must be signed in to change notification settings - Fork 0
/
stability.py
70 lines (54 loc) · 2.58 KB
/
stability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import numpy as np
import pandas as pd
from collections import Counter
from scipy.stats import t
def first_stability(attribute_count:dict, num:int):
    '''
    Pick the attribute selected most often and report its first-level stability.

    Args:
        attribute_count (dict):
            Maps each attribute/value to the number of times it was selected,
            for example {attr1: 5, attr2: 3, ...}.
        num (int):
            The number of Monte Carlo simulations; used as the denominator
            of the proportion.

    Returns:
        best_attr:
            The key with the largest count. When several keys share the
            largest count, the one that comes first in the mapping wins.
        n_k (float):
            The first-level stability: the winner's count divided by num,
            rounded to two decimals (a proportion, not a raw count).
    '''
    # max() keeps the first maximal item it meets, so ties resolve to the
    # earliest key in the mapping's iteration order.
    best_attr, top_count = max(attribute_count.items(), key=lambda pair: pair[1])
    share = round(top_count / num, 2)
    return best_attr, share
def second_stability(Xs, categorical:bool, confidence:float=None):
    '''
    Find the most frequent value / the range of the values and its second level stability.

    Args:
        Xs (list or Series):
            The record of the values selected in each iteration.
        categorical (bool):
            Whether the values in Xs are categorical or continuous.
        confidence (float):
            Confidence level of the interval, as a fraction in [0, 1]
            (e.g. 0.95). Only used, and then required, for a continuous
            attribute.

    Returns:
        If categorical, the same pair as first_stability: the most frequent
        value and its proportion among len(Xs) records.
        If continuous, a dictionary with the keys:
            "mean":       arithmetic mean of Xs.
            "std":        population standard deviation (ddof=0).
            "var":        population variance (ddof=0).
            "cv":         coefficient of variation, std / mean.
            "c_interval": (lower, upper) two-sided Student-t confidence
                          interval for the mean.

    Raises:
        ValueError: If the attribute is continuous and confidence is None
            or lies outside [0, 1].
    '''
    if categorical:
        # Same computation as the first level: the most common value and
        # its share of all recorded selections.
        return first_stability(Counter(Xs), len(Xs))

    if confidence is None:
        raise ValueError("confidence is needed for calculating the confidence interval of a continuous attribute")
    if not 0 <= confidence <= 1:
        # Outside this range t.ppf yields NaN, which would silently produce
        # a meaningless interval (typical cause: passing 95 instead of 0.95).
        raise ValueError("confidence must be a fraction between 0 and 1")

    n = len(Xs)
    mean = np.mean(Xs)
    std = np.std(Xs)
    var = np.var(Xs)
    cv = std / mean  # coefficient of variation

    dof = n - 1  # degrees of freedom of the t distribution
    t_crit = np.abs(t.ppf((1 - confidence) / 2, dof))
    # A t-interval with n-1 degrees of freedom must be built on the sample
    # standard deviation (ddof=1); the population value reported under
    # "std" would make the interval too narrow.
    margin = t_crit * np.std(Xs, ddof=1) / np.sqrt(n)
    interval = (mean - margin, mean + margin)
    return {"mean": mean, "std": std, "var": var, "cv": cv, "c_interval": interval}