resampling_methods.py
# -*- coding: utf-8 -*-
"""Examples of various resampling approaches.

O'Reilly E-book page:
https://learning.oreilly.com/library/view/building-machine-learning/9781484244708/html/463852_1_En_24_Chapter.xhtml
"""
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Seed the shuffled splits so that results are reproducible across runs
RANDOM_SEED = 42


class Resampler:
    """A set of techniques that involve selecting a subset of the available
    dataset, training on that subset, and using the remainder of the data to
    evaluate the trained model."""

    def resample_data(self, data):
        """Apply and compare two resampling algorithms.

        :param data: tuple - a tuple containing the data and the targets.
        :return: None
        """
        X, y = data
        print()
        self.apply_kfold_cross_validation(X, y)
        print('=' * 50)
        self.apply_loocv(X, y)

    @staticmethod
    def apply_kfold_cross_validation(X, y):
        """Provide train/test indices to split the data into train/test sets
        by partitioning the dataset into k consecutive folds.

        :param X: Numpy array - features
        :param y: Numpy array - labels
        :return: None
        """
        print('Calculating KFold accuracy ... ')
        # Initialize KFold to shuffle the data before splitting
        kfold = KFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)
        # Create the model
        knn_clf = KNeighborsClassifier(n_neighbors=3)
        # Fit the model using cross validation
        cv_result = cross_val_score(knn_clf, X, y, cv=kfold)
        # Report the mean accuracy and its standard deviation across the folds
        print('KFold Accuracy: {:.2f} (+/- {:.2f})'.format(
            cv_result.mean() * 100.0, cv_result.std() * 100.0))
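
    # Illustrative sketch (an addition, not part of the book example): the
    # helper below prints the raw train/test index splits that KFold yields on
    # a toy 2-D feature matrix, to make the folding behaviour visible.
    @staticmethod
    def show_kfold_splits():
        """Print the index splits KFold produces for six toy samples."""
        import numpy as np
        # Without shuffling, KFold assigns consecutive blocks of indices to
        # the test fold: [0 1], [2 3], [4 5] for six samples and n_splits=3.
        X_toy = np.arange(6).reshape(-1, 1)
        for train_idx, test_idx in KFold(n_splits=3).split(X_toy):
            print('train:', train_idx, 'test:', test_idx)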

    @staticmethod
    def apply_loocv(X, y):
        """In leave-one-out cross validation (LOOCV) a single example is
        assigned to the test set and the model is trained on the remainder of
        the dataset. This process is repeated until every example in the
        dataset has been used to evaluate the model.

        :param X: Numpy array - features
        :param y: Numpy array - labels
        :return: None
        """
        print('Calculating LOOCV accuracy ... ')
        # Initialize LOOCV
        loocv = LeaveOneOut()
        # Create the model
        knn_clf = KNeighborsClassifier(n_neighbors=3)
        # Fit the model using cross validation
        cv_result = cross_val_score(knn_clf, X, y, cv=loocv)
        # Report the mean accuracy and its standard deviation across the folds
        print('LOOCV Accuracy: {:.2f} (+/- {:.2f})'.format(
            cv_result.mean() * 100.0, cv_result.std() * 100.0))
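
    # Illustrative sketch (an addition, not part of the book example):
    # LeaveOneOut behaves like KFold(n_splits=n_samples). Each split tests on
    # exactly one example, so every fold score is 0 or 1 and the mean over all
    # folds is the overall accuracy.
    @staticmethod
    def show_loocv_splits():
        """Print the index splits LeaveOneOut produces for three toy samples."""
        import numpy as np
        X_toy = np.arange(3).reshape(-1, 1)
        for train_idx, test_idx in LeaveOneOut().split(X_toy):
            print('train:', train_idx, 'test:', test_idx)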


if __name__ == "__main__":
    # Get some sample data from the sklearn datasets. Setting return_X_y to
    # True constrains the output to a tuple containing only the data and the
    # targets.
    flower_data = datasets.load_iris(return_X_y=True)
    resampler = Resampler()
    resampler.resample_data(flower_data)
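
    # Usage of the illustrative helpers added above (hypothetical additions,
    # not part of the original script): print the raw index splits so the
    # difference between the two schemes is visible.
    print('=' * 50)
    resampler.show_kfold_splits()
    print('=' * 50)
    resampler.show_loocv_splits()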