"""cv_scratch.py: K-fold cross-validation from scratch on MNIST digits 3 vs 8."""
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, log_loss
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def preprocess_data(train, test, divider):
    '''
    Preprocesses the MNIST dataset for binary classification
    Inputs:
        train: pandas dataframe representing the training dataset
        test: pandas dataframe representing the testing dataset
        divider: scaling factor used to rescale the grayscale pixel values
    Outputs:
        mnist_train: preprocessed training dataset
        xmnist_test: preprocessed testing data without the labels
        ymnist_test: preprocessed testing data labels
    '''
    # Filters out data labeled as 3 or 8 for binary classification
    # (.copy() avoids pandas' SettingWithCopyWarning on the assignments below)
    mnist_train = train[(train['label'] == 3) | (train['label'] == 8)].copy()
    scaler_train = preprocessing.StandardScaler().fit(mnist_train.iloc[:, 1:])
    mnist_train.iloc[:, 1:] = scaler_train.transform(mnist_train.iloc[:, 1:])
    mnist_test = test[(test['label'] == 3) | (test['label'] == 8)].copy()
    scaler_test = preprocessing.StandardScaler().fit(mnist_test.iloc[:, 1:])
    mnist_test.iloc[:, 1:] = scaler_test.transform(mnist_test.iloc[:, 1:])
    xmnist_test = mnist_test.iloc[:, 1:] / divider
    ymnist_test = mnist_test.iloc[:, 0]
    return mnist_train, xmnist_test, ymnist_test
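
# Illustrative sketch (not part of the original script): a tiny synthetic run of
# preprocess_data on fake data with the same layout as the MNIST CSVs (a 'label'
# column followed by pixel columns). The column names and sizes here are
# assumptions chosen only for the demo; call _demo_preprocess() to try it.
def _demo_preprocess():
    rng = np.random.default_rng(0)
    cols = ['label'] + ['pix%d' % i for i in range(4)]
    fake = pd.DataFrame(
        np.column_stack([rng.choice([3, 8], size=10),
                         rng.integers(0, 256, size=(10, 4))]).astype(float),
        columns=cols)
    train_df, xtest, ytest = preprocess_data(fake, fake, 255)
    # Each pixel column of the training set is now standardized (mean ~0)
    print(train_df.iloc[:, 1:].mean().round(2).tolist())
    print(xtest.shape, ytest.unique())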
def shuffle_split_data(data, numfolds):
    '''
    Shuffles and splits data into K folds at random
    Inputs:
        data: the data to split
        numfolds: represents K, the number of folds to split into
    Outputs:
        folds: a list in which the object at each index represents one fold
    '''
    i = 0
    numrows = data.shape[0]
    # Number of data points in each fold
    chunksize = numrows // numfolds
    folds = []
    # Randomly shuffles the data before splitting into folds
    shuffled = shuffle(data)
    while i < numrows:
        # Last fold takes all remaining rows, so it may be larger than chunksize
        if len(folds) == numfolds - 1:
            folds.append(shuffled.tail(numrows - i))
            break
        folds.append(shuffled[i: i + chunksize])
        i += chunksize
    return folds
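
# Cross-check sketch (an addition, not part of the original script): sklearn's
# KFold with shuffle=True produces the same kind of random partition, so it can
# be used to sanity-check shuffle_split_data's fold sizes.
def _check_fold_sizes(data, numfolds):
    from sklearn.model_selection import KFold
    ours = [len(f) for f in shuffle_split_data(data, numfolds)]
    ref = [len(test_idx) for _, test_idx in
           KFold(n_splits=numfolds, shuffle=True).split(data)]
    # KFold spreads the remainder across the first folds, while this script
    # gives all leftover rows to the last fold; totals should still match
    print(ours, ref, sum(ours) == sum(ref))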
def k_fold_cv(mlmethod, numfolds, lambdas, train_folds, xtest, ytest, divider):
    '''
    K-fold cross validation implemented from scratch
    Inputs:
        mlmethod: string representing the type of ML method to apply CV on. Must
                  be "logreg" or "linsvm"
        numfolds: represents K, the number of folds to split into
        lambdas: the lambda values used in tuning the regularization hyperparameter
        train_folds: the training data split into K folds
        xtest: the testing data without the classification
        ytest: classification of the testing data
        divider: scaling factor used to rescale the grayscale pixel values
    Outputs:
        best_lambda: the lambda with the lowest mean CV error
        mean_cv_err: mean CV error across folds, one entry per lambda
        mean_train_err: mean training error across folds, one entry per lambda
        mean_test_err: mean test error across folds, one entry per lambda
    '''
    cv_err = np.zeros((numfolds, len(lambdas)))
    train_err = np.zeros((numfolds, len(lambdas)))
    test_err = np.zeros((numfolds, len(lambdas)))
    for k in range(numfolds):
        # Creates the training set from every fold except fold k, which is
        # held out as the validation set for this iteration
        xtrain = pd.DataFrame()
        for i in range(numfolds):
            if i != k:
                xtrain = pd.concat([xtrain, train_folds[i]])
        ytrain = xtrain.iloc[:, 0]
        xtrain = xtrain.iloc[:, 1:] / divider
        ycv = train_folds[k].iloc[:, 0]
        xcv = train_folds[k].iloc[:, 1:] / divider
        for j in range(len(lambdas)):
            # Note: scikit-learn's C is the inverse of the regularization
            # strength, so larger C means weaker regularization
            if mlmethod == "logreg":
                # Logistic Regression, scored with log loss
                clf = LogisticRegression(C=lambdas[j], random_state=0, solver='sag', tol=5e-3, max_iter=1000, n_jobs=-1).fit(xtrain, ytrain)
                # Converts the coefficient matrix to sparse format
                clf = clf.sparsify()
                cv_err[k, j] = log_loss(ycv, clf.predict_proba(xcv))
                train_err[k, j] = log_loss(ytrain, clf.predict_proba(xtrain))
                test_err[k, j] = log_loss(ytest, clf.predict_proba(xtest))
            elif mlmethod == "linsvm":
                # Linear SVM, scored with misclassification rate
                clf = LinearSVC(dual=False, random_state=0, C=lambdas[j], tol=1e-2, max_iter=1000).fit(xtrain, ytrain)
                cv_err[k, j] = 1 - accuracy_score(ycv, clf.predict(xcv))
                train_err[k, j] = 1 - accuracy_score(ytrain, clf.predict(xtrain))
                test_err[k, j] = 1 - accuracy_score(ytest, clf.predict(xtest))
    # Averages across folds first, then picks the lambda with the lowest mean CV error
    mean_cv_err = np.mean(cv_err, axis=0)
    mean_train_err = np.mean(train_err, axis=0)
    mean_test_err = np.mean(test_err, axis=0)
    best_lambda = lambdas[np.argmin(mean_cv_err)]
    return best_lambda, mean_cv_err, mean_train_err, mean_test_err
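
# Library cross-check (an illustrative sketch, not part of the original script):
# the same lambda sweep for the linear SVM can be reproduced with scikit-learn's
# GridSearchCV, which is handy for validating the from-scratch loop above. The
# argument names below are assumptions for the demo.
def _sklearn_cross_check(xtrain_full, ytrain_full, lambdas, numfolds):
    from sklearn.model_selection import GridSearchCV
    grid = GridSearchCV(
        LinearSVC(dual=False, random_state=0, tol=1e-2, max_iter=1000),
        param_grid={'C': lambdas},
        scoring='accuracy',
        cv=numfolds)
    grid.fit(xtrain_full, ytrain_full)
    # grid.best_params_['C'] should roughly match best_lambda from k_fold_cv
    return grid.best_params_['C'], 1 - grid.cv_results_['mean_test_score']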
K = 5
DIVIDER = 255.0
mnist_train = pd.read_csv("MNIST/mnist_train.csv")
mnist_test = pd.read_csv("MNIST/mnist_test.csv")
mnist_train, xmnist_test, ymnist_test = preprocess_data(mnist_train, mnist_test, DIVIDER)
mnist_train_folds = shuffle_split_data(mnist_train, K)

# Logistic Regression sweep (note: its results are overwritten by the SVM sweep
# below, so the plots at the end show the linear SVM run)
lambdas = np.linspace(1e0, 1e2, num=100)
best_lambda, mean_cv_err, mean_train_err, mean_test_err = k_fold_cv("logreg", K, lambdas, mnist_train_folds, xmnist_test, ymnist_test, DIVIDER)

# Linear SVM sweep
lambdas = np.array([1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2])
best_lambda, mean_cv_err, mean_train_err, mean_test_err = k_fold_cv("linsvm", K, lambdas, mnist_train_folds, xmnist_test, ymnist_test, DIVIDER)
# Plots Error Curves for the last run (Linear SVM)
print("Lambdas: ", lambdas)
print("Mean CV Error: ", mean_cv_err)
print("Mean Test Error: ", mean_test_err)
print("Best Lambda: ", best_lambda)
plt.plot(lambdas, mean_cv_err, color='r', label='CV Error')
plt.plot(lambdas, mean_train_err, color='g', label='Training Error')
plt.plot(lambdas, mean_test_err, color='b', label='Test Error')
plt.xlabel("Lambda (Hyperparameter) Value")
plt.ylabel("Misclassification Error")
plt.title("Error Curves vs. Hyperparameter Choice for Linear SVM")
plt.legend()
svmfig = plt.gcf()
plt.show()
svmfig.savefig('SVMErrorCurves.png')
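
# Optional variant (a sketch, not in the original script): because the SVM
# lambdas span five orders of magnitude (1e-3 to 1e2), a log-scaled x-axis
# usually makes the error curves much easier to read. The figure filename here
# is an assumption.
def _plot_log_scale(lambdas, mean_cv_err, mean_train_err, mean_test_err):
    fig, ax = plt.subplots()
    ax.semilogx(lambdas, mean_cv_err, color='r', label='CV Error')
    ax.semilogx(lambdas, mean_train_err, color='g', label='Training Error')
    ax.semilogx(lambdas, mean_test_err, color='b', label='Test Error')
    ax.set_xlabel("Lambda (Hyperparameter) Value, log scale")
    ax.set_ylabel("Misclassification Error")
    ax.legend()
    fig.savefig('SVMErrorCurvesLog.png')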