import json
import joblib
import os
import numpy as np
import pandas as pd
from tabular_data import load_airbnb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# preprocessing steps
np.random.seed(5)  # fix the NumPy seed so the splits and model fits are reproducible
dataset = pd.read_csv('tabular_data/clean_tabular_data.csv')
features, label = load_airbnb(dataset, 'Category')
# encode the string categories as integers; encoder.classes_ maps codes back to names
encoder = LabelEncoder()
label_encoded = encoder.fit_transform(label)
X_train, X_test, y_train, y_test = train_test_split(features, label_encoded, test_size=0.3)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5)
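# the two splits above give a 70/15/15 train/validation/test partition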
print(f'Number of samples in dataset: {len(features)}')
print(
    'Number of samples in: '
    f'training: {y_train.shape[0]}, '
    f'validation: {y_validation.shape[0]}, '
    f'testing: {y_test.shape[0]}'
)
# normalize the features; fit the scaler on the training set only to avoid data leakage
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)
# get baseline classification model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
y_validation_pred = model.predict(X_validation_scaled)
y_test_pred = model.predict(X_test_scaled)
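# To inspect predictions as category names rather than integer codes, the fitted
# encoder can invert the mapping (a quick illustrative check):
print(encoder.inverse_transform(y_test_pred[:5]))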
# evaluate model using accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
validation_acc = accuracy_score(y_validation, y_validation_pred)
print(
    f'baseline_train_acc: {round(train_acc, 4)}, '
    f'baseline_test_acc: {round(test_acc, 4)}, '
    f'baseline_val_acc: {round(validation_acc, 4)}'
)
# evaluate model using precision
train_pres = precision_score(y_train, y_train_pred, average='macro')
test_pres = precision_score(y_test, y_test_pred, average='macro')
validation_pres = precision_score(y_validation, y_validation_pred, average='macro')
print(
    f'baseline_train_pres: {round(train_pres, 4)}, '
    f'baseline_test_pres: {round(test_pres, 4)}, '
    f'baseline_val_pres: {round(validation_pres, 4)}'
)
# evaluate model using F1 score
train_f1 = f1_score(y_train, y_train_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')
validation_f1 = f1_score(y_validation, y_validation_pred, average='macro')
print(
    f'baseline_train_f1: {round(train_f1, 4)}, '
    f'baseline_test_f1: {round(test_f1, 4)}, '
    f'baseline_val_f1: {round(validation_f1, 4)}'
)
# evaluate model using recall
train_recall = recall_score(y_train, y_train_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
validation_recall = recall_score(y_validation, y_validation_pred, average='macro')
print(
    f'baseline_train_recall: {round(train_recall, 4)}, '
    f'baseline_test_recall: {round(test_recall, 4)}, '
    f'baseline_val_recall: {round(validation_recall, 4)}'
)
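# As a compact alternative to the separate metric calls above, scikit-learn's
# classification_report summarises per-class precision, recall and F1 in one
# table (a sketch; encoder.classes_ supplies the readable category names):
from sklearn.metrics import classification_report
print(classification_report(
    y_test, y_test_pred,
    labels=np.arange(len(encoder.classes_)),
    target_names=encoder.classes_,
    zero_division=0
))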
def tune_classification_model_hyperparameters(model_class, X_train_scaled, y_train,
                                              X_validation_scaled, y_validation,
                                              X_test_scaled, y_test, parameter_grid):
    '''
    Tunes the hyperparameters for a specified classification model class.

    Parameters
    ----------
    model_class: str
        The name of the classification model class to be tuned.
    X_train_scaled:
        The normalized training set for the features.
    y_train:
        The training set for the label.
    X_validation_scaled:
        The normalized validation set for the features.
    y_validation:
        The validation set for the label.
    X_test_scaled:
        The normalized testing set for the features.
    y_test:
        The testing set for the label.
    parameter_grid: dict
        A dictionary of specified hyperparameters corresponding to the model_class.

    Returns
    -------
    best_model_data: list
        A list containing the best model, its best parameters and its performance metrics.
    '''
    np.random.seed(5)
    models = {
        'LogisticRegression': LogisticRegression,
        'RandomForestClassifier': RandomForestClassifier,
        'DecisionTreeClassifier': DecisionTreeClassifier,
        'GradientBoostingClassifier': GradientBoostingClassifier
    }
    # perform grid search to find the best hyperparameters
    model = models[model_class]()
    grid_search = GridSearchCV(estimator=model, param_grid=parameter_grid, scoring='accuracy')
    grid_search.fit(X_train_scaled, y_train)
    best_model = models[model_class](**grid_search.best_params_)
    best_model.fit(X_train_scaled, y_train)
    # use the best model from the grid search to predict the validation & test sets
    y_validation_pred = best_model.predict(X_validation_scaled)
    y_test_pred = best_model.predict(X_test_scaled)
    # determine performance metrics
    validation_acc = accuracy_score(y_validation, y_validation_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    validation_f1 = f1_score(y_validation, y_validation_pred, average='macro')
    test_f1 = f1_score(y_test, y_test_pred, average='macro')
    metrics = {
        'validation_acc': validation_acc,
        'test_acc': test_acc,
        'validation_f1': validation_f1,
        'test_f1': test_f1
    }
    best_model_data = [best_model, grid_search.best_params_, metrics]
    print(best_model_data)
    return best_model_data
def save_model(best_model_data, model_name):
    '''
    Saves the best model for each model_class.

    Parameters
    ----------
    best_model_data: list
        A list containing the best model, its best parameters and its performance metrics.
    model_name: str
        The name of the model_class being saved.
    '''
    model, hyperparameters, metrics = best_model_data
    model_folder = 'models/classification/' + model_name
    os.makedirs(model_folder, exist_ok=True)
    joblib.dump(model, f'{model_folder}/model.joblib')
    with open(f'{model_folder}/hyperparameters.json', 'w') as file:
        json.dump(hyperparameters, file)
    with open(f'{model_folder}/metrics.json', 'w') as file:
        json.dump(metrics, file)
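# The inverse of save_model, sketched here for convenience. load_model is a
# hypothetical helper (not part of the original pipeline); it assumes the
# folder layout written by save_model above.
def load_model(model_name, models_directory='models/classification'):
    '''
    Loads a saved model along with its hyperparameters and metrics.
    '''
    model_folder = os.path.join(models_directory, model_name)
    model = joblib.load(f'{model_folder}/model.joblib')
    with open(f'{model_folder}/hyperparameters.json') as file:
        hyperparameters = json.load(file)
    with open(f'{model_folder}/metrics.json') as file:
        metrics = json.load(file)
    return model, hyperparameters, metrics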
def evaluate_all_models():
    '''
    Evaluates all chosen model_classes and saves the best model for each class.
    '''
    np.random.seed(5)
    lr_model = tune_classification_model_hyperparameters(
        'LogisticRegression', X_train_scaled, y_train,
        X_validation_scaled, y_validation, X_test_scaled, y_test,
        parameter_grid={
            'C': [0.001, 0.01, 0.1, 1, 10],
            'penalty': ['l2'],
            'max_iter': [100, 500, 1000],
            'multi_class': ['ovr'],
            'solver': ['lbfgs']
        })
    save_model(lr_model, 'LogisticRegression')
    rfc_model = tune_classification_model_hyperparameters(
        'RandomForestClassifier', X_train_scaled, y_train,
        X_validation_scaled, y_validation, X_test_scaled, y_test,
        parameter_grid={
            'n_estimators': [50, 75, 100],
            'criterion': ['gini', 'entropy'],
            'max_depth': [30, 40, 50],
            'min_samples_split': [2, 0.1, 0.2],
            'min_samples_leaf': [1, 2, 3]
        })
    save_model(rfc_model, 'RandomForestClassifier')
    gbc_model = tune_classification_model_hyperparameters(
        'GradientBoostingClassifier', X_train_scaled, y_train,
        X_validation_scaled, y_validation, X_test_scaled, y_test,
        parameter_grid={
            'n_estimators': [25, 50, 100],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': [1, 3, 5],
            'max_features': [1, 2, 3]
        })
    save_model(gbc_model, 'GradientBoostingClassifier')
    dtc_model = tune_classification_model_hyperparameters(
        'DecisionTreeClassifier', X_train_scaled, y_train,
        X_validation_scaled, y_validation, X_test_scaled, y_test,
        parameter_grid={
            'max_depth': [10, 20, 30],
            'min_samples_split': [2, 4, 0.2, 0.4],
            'min_samples_leaf': [1, 3, 5],
            'max_features': [4, 6, 8]
        })
    save_model(dtc_model, 'DecisionTreeClassifier')
def find_best_model(models_directory):
    '''
    Finds the best model from a selection of the best model in each model_class.

    Parameters
    ----------
    models_directory: str
        Directory containing the saved best model for each model_class.

    Returns
    -------
    best_model: str
        The name of the best model_class.
    '''
    best_model = None
    best_acc = -float('inf')
    best_f1 = -float('inf')
    for model_name in os.listdir(models_directory):
        metrics_path = os.path.join(models_directory, model_name, 'metrics.json')
        with open(metrics_path) as f:
            metrics = json.load(f)
        val_acc = metrics['validation_acc']
        val_f1 = metrics['validation_f1']
        # keep a candidate only if it improves on both validation metrics
        if val_acc > best_acc and val_f1 > best_f1:
            best_model = model_name
            best_acc = val_acc
            best_f1 = val_f1
    return best_model
if __name__ == '__main__':
    evaluate_all_models()
    best_model = find_best_model('models/classification')
    print(best_model)
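    # Optional follow-up (a sketch using the load_model helper sketched above):
    # reload the selected model and confirm its accuracy on the held-out test set.
    if best_model is not None:
        model, _, _ = load_model(best_model)
        test_acc = accuracy_score(y_test, model.predict(X_test_scaled))
        print(f'{best_model} test accuracy: {round(test_acc, 4)}')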