import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, roc_auc_score
def auc_curve_plot(clf, X, y):
    '''
    This function accepts a fitted classifier, a feature set, and a target set,
    and plots the ROC curve and the precision-recall curve.

    Example
    ----------
    auc_curve_plot(clf=svc, X=X_validate, y=y_validate)

    Parameters
    ----------
    clf : the classification model already fit on X_train, y_train
    X : X_validate or X_test
    y : y_validate or y_test

    Returns
    -------
    A visualization of the ROC and precision-recall curves for the given classification model.
    '''
    # Set visualization defaults
    sns.set_context('talk')
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
    # Plot the ROC curve
    RocCurveDisplay.from_estimator(clf, X, y, ax=ax1)
    ax1.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
    # Plot the precision-recall curve
    PrecisionRecallDisplay.from_estimator(clf, X, y, ax=ax2)
def auc_score_proba(clf, X, y):
    '''
    This function accepts a classification model that can estimate probabilities,
    a feature set X, and a target set y, and returns a dataframe of predicted
    probabilities for a binary class along with the AUC score.

    Parameters
    ----------
    clf : the classification algorithm after fitting on X_train, y_train
    X : X_train, X_validate, or X_test
    y : y_train, y_validate, or y_test

    Returns
    ----------
    1. A dataframe containing the probability estimates
    2. The AUC score
    '''
    y_proba = clf.predict_proba(X)
    y_proba = pd.DataFrame(y_proba, columns=['p_0', 'p_1'])
    score = roc_auc_score(y, y_proba['p_1'])
    return y_proba, score
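# Illustrative usage sketch (not part of the original module): `svc`, `X_validate`,
# and `y_validate` are assumed to exist, and the classifier must expose predict_proba
# for auc_score_proba to work.
#
#   auc_curve_plot(clf=svc, X=X_validate, y=y_validate)
#   probabilities, auc = auc_score_proba(svc, X_validate, y_validate)
#   print(f"Validate AUC: {auc:.3f}")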
###################################### Evaluation Functions #########################################
def train_model(names, classifiers, X_set, y_set):
    '''
    This function fits a list of classifiers to the training set.
    Returns a dataframe with model name and train AUC score,
    along with the list of fitted models.
    '''
    rows = []
    models = []
    for name, clf in zip(names, classifiers):
        # Set up a progress indicator
        print(f"Currently running on model {name}")
        # Fit on the train dataset
        clf = clf.fit(X_set, y_set)
        y_proba, score = auc_score_proba(clf, X_set, y_set)
        rows.append({"Algo": name, "dataset": "train", "AUC score": score})
        models.append(clf)
    metrics = pd.DataFrame(rows)
    metrics = metrics.sort_values(by="AUC score", ascending=False)
    metrics.reset_index(drop=True, inplace=True)
    return metrics, models
def validate_model(names, classifiers, X_set, y_set):
    '''
    This function calculates the AUC score using the validation set.
    Returns a dataframe with model name and validation AUC score.
    '''
    rows = []
    for name, clf in zip(names, classifiers):
        # Set up a progress indicator
        print(f"Currently running on model {name}")
        y_proba, score = auc_score_proba(clf, X_set, y_set)
        rows.append({"Algo": name, "dataset": "validate", "AUC score": score})
    metrics = pd.DataFrame(rows)
    metrics = metrics.sort_values(by="AUC score", ascending=False)
    metrics.reset_index(drop=True, inplace=True)
    return metrics
def test_model(names, classifiers, X_set, y_set):
    '''
    This function calculates the AUC score using the test set.
    Returns a dataframe with model name and test AUC score.
    '''
    rows = []
    for name, clf in zip(names, classifiers):
        # Set up a progress indicator
        print(f"Currently running on model {name}")
        y_proba, score = auc_score_proba(clf, X_set, y_set)
        rows.append({"Algo": name, "dataset": "test", "AUC score": score})
    metrics = pd.DataFrame(rows)
    metrics = metrics.sort_values(by="AUC score", ascending=False)
    metrics.reset_index(drop=True, inplace=True)
    return metrics
def model_multiple_algos(names, classifiers, X_train, y_train, X_validate, y_validate, X_test, y_test):
    '''
    This function accepts a list of classifier names, a list of classifiers,
    and the feature and target sets for train, validate, and test,
    and returns the AUC scores.
    The order of the names should match the order of the classifiers.

    Parameters
    ----------
    names : a list of the names of the classifiers that will be tested
    classifiers : a list of classifier objects
    X_train : features in the train dataset
    y_train : target variable in the train dataset
    X_validate : features in the validate dataset
    y_validate : target variable in the validate dataset
    X_test : features in the test dataset
    y_test : target variable in the test dataset

    Example
    ----------
    names = ["Logistic Regression", "Decision Tree"]
    classifiers = [LogisticRegression(), DecisionTreeClassifier(max_depth=3)]
    All the datasets should be ready for modeling.

    Returns
    ----------
    A dataframe of AUC scores associated with each classification algorithm and the dataset it was scored on.
    '''
    rows = []
    for name, clf in zip(names, classifiers):
        # Set up a progress indicator
        print(f"Currently running on model {name}")
        # Fit on the train dataset
        clf = clf.fit(X_train, y_train)
        # Compute the AUC score on train
        y_proba, score = auc_score_proba(clf, X_train, y_train)
        rows.append({"Algo": name, "dataset": "train", "AUC score": score})
        # Compute the AUC score on validate
        y_proba, score = auc_score_proba(clf, X_validate, y_validate)
        rows.append({"Algo": name, "dataset": "validate", "AUC score": score})
        # Compute the AUC score on test
        y_proba, score = auc_score_proba(clf, X_test, y_test)
        rows.append({"Algo": name, "dataset": "test", "AUC score": score})
        # Report completion for this model
        print(f"{name} has completed")
    metrics = pd.DataFrame(rows)
    return metrics
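# Minimal smoke-test sketch (not part of the original module): runs the full
# model_multiple_algos workflow on synthetic data. The feature names, split
# sizes, and chosen classifiers are illustrative assumptions.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    # Build a small synthetic binary classification dataset
    X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
    X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
    y = pd.Series(y, name="target")

    # 60/20/20 train/validate/test split
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_validate, X_test, y_validate, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    names = ["Logistic Regression", "Decision Tree"]
    classifiers = [LogisticRegression(max_iter=1000), DecisionTreeClassifier(max_depth=3)]

    results = model_multiple_algos(names, classifiers,
                                   X_train, y_train,
                                   X_validate, y_validate,
                                   X_test, y_test)
    print(results)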