-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeature_selection_univariate_testing.py
62 lines (40 loc) · 1.82 KB
/
feature_selection_univariate_testing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# import packages
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
# Load the sample dataset that the templates below read from
my_df = pd.read_csv(filepath_or_buffer="feature_selection_sample_data.csv")
# Regression Template
# Scores every input variable against the "output" column with f_regression,
# tabulates the p-value and F-score per variable, and keeps only the variables
# that clear both thresholds defined below.

# Split the frame into inputs (everything except "output") and the target
X = my_df.drop(columns=["output"])
y = my_df["output"]

# k="all" keeps every column, so the selector is used purely to compute the
# per-variable statistics rather than to drop anything itself
feature_selector = SelectKBest(score_func=f_regression, k="all")
fit = feature_selector.fit(X, y)

# One single-column frame per statistic, aligned by position with X's columns
input_variable_names = pd.DataFrame(X.columns)
p_values = pd.DataFrame(fit.pvalues_)
scores = pd.DataFrame(fit.scores_)

# Stitch the three frames side by side and order by ascending p-value
summary_stats = pd.concat(
    [input_variable_names, p_values, scores],
    axis="columns",
)
summary_stats.columns = ["input_variable", "p_value", "f_score"]
summary_stats = summary_stats.sort_values(by="p_value")

# A variable is kept only when it passes BOTH cut-offs
p_value_threshold = 0.05
score_threshold = 5

has_high_score = summary_stats["f_score"] >= score_threshold
has_low_p_value = summary_stats["p_value"] <= p_value_threshold
selected_variables = summary_stats.loc[
    has_high_score & has_low_p_value, "input_variable"
].tolist()

# Reduced input frame containing just the selected columns
X_new = X.loc[:, selected_variables]
# Classification Template
# Same workflow as the regression template, but each input variable is scored
# against "output" with the chi-squared statistic instead of an F-test.
# NOTE: this section rebinds the same variable names as the regression
# template, so after running both only the classification results remain.
# NOTE(review): per the scikit-learn docs chi2 expects non-negative feature
# values -- confirm the input columns satisfy that before using this section.
from sklearn.feature_selection import SelectKBest, chi2

# Inputs are every column except the target column "output"
X = my_df.drop(columns=["output"])
y = my_df["output"]

# k="all" retains every column; only the computed statistics are of interest
feature_selector = SelectKBest(score_func=chi2, k="all")
fit = feature_selector.fit(X, y)

# Collect the statistics and the matching column names as single-column frames
p_values = pd.DataFrame(fit.pvalues_)
scores = pd.DataFrame(fit.scores_)
input_variable_names = pd.DataFrame(X.columns)

# Build the per-variable summary table, most significant variables first
summary_stats = pd.concat(
    [input_variable_names, p_values, scores],
    axis="columns",
)
summary_stats.columns = ["input_variable", "p_value", "chi2_score"]
summary_stats = summary_stats.sort_values(by="p_value")

# Cut-offs: a variable must satisfy both to be selected
p_value_threshold = 0.05
score_threshold = 5

passes_thresholds = (
    (summary_stats["chi2_score"] >= score_threshold)
    & (summary_stats["p_value"] <= p_value_threshold)
)
selected_variables = summary_stats.loc[passes_thresholds, "input_variable"].tolist()

# Input frame restricted to the selected variables
X_new = X.loc[:, selected_variables]