-
Notifications
You must be signed in to change notification settings - Fork 0
/
shift_adapt.py
335 lines (261 loc) · 15.3 KB
/
shift_adapt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
from label_shift_adaptation import analyze_val_data, update_probs
import matplotlib.pyplot as plt
# Install a blanket 'ignore' filter so no warnings are shown while the script runs.
warnings.filterwarnings('ignore')
def LoadData(path, Train=False):
    """Read one dataset CSV and split it into features and the Severity label.

    Args:
        path: Location of the CSV file. It must contain every feature column
            listed below plus a ``Severity`` column; other columns are not read.
        Train: When true, only the features and labels are returned.

    Returns:
        ``(X, Y)`` when ``Train`` is true. Otherwise
        ``(X, Y, true_labels, n_rows)``, where ``true_labels`` is the number
        of rows per Severity value ordered by label and ``n_rows`` is
        ``len(Y)``.
    """
    feature_columns = [
        'Distance(mi)', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
        'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)',
        'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
        'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
        'Traffic_Signal', 'Turning_Loop',
    ]
    target_column = 'Severity'

    # Only the modelled columns are parsed from the file.
    frame = pd.read_csv(path, usecols=feature_columns + [target_column])

    # Index with the explicit list so X keeps this column order.
    X = frame[feature_columns]
    Y = frame[target_column]

    if Train:
        return X, Y

    # Per-class row counts, ordered by class label rather than by frequency.
    class_counts = Y.value_counts().sort_index()
    return X, Y, class_counts, len(Y)
def TrainModel(X, Y, classifier):
    """Fit ``classifier`` on ``(X, Y)`` and return that same classifier object.

    The return value of ``fit`` itself is ignored; the caller always gets back
    the object it passed in.
    """
    fitted = classifier
    fitted.fit(X, Y)
    return fitted
def Predict(X, Y, classifier, prob=False):
    """Run ``classifier`` on ``X`` and return its predictions.

    Only the quantity that is actually returned gets computed: class
    probabilities are requested from the classifier only when ``prob`` is
    true, and accuracy is scored only when ``prob`` is false.

    Args:
        X: Feature table exposing ``.values``; the underlying array (not the
            table itself) is what is passed to the classifier.
        Y: True labels for ``X``. Only used when ``prob`` is false.
        classifier: Fitted estimator providing ``predict`` and, when ``prob``
            is true, ``predict_proba``.
        prob: Selects which pair is returned.

    Returns:
        ``(Y_predict, Y_prob)`` when ``prob`` is true; otherwise
        ``(accuracy, Y_predict)`` where ``accuracy`` is a percentage rounded
        to two decimal places.
    """
    features = X.values
    Y_predict = classifier.predict(features)

    if prob:
        # Probabilities are only needed on this path, so the potentially
        # expensive predict_proba call is skipped for accuracy-only callers.
        return Y_predict, classifier.predict_proba(features)

    accuracy = round(accuracy_score(Y, Y_predict) * 100, 2)
    return accuracy, Y_predict
def Train(X_train, Y_train):
    """Fit every classifier used in the experiment on the training split.

    Each fitted model is printed together with its parameters.

    Returns:
        The fitted models as a list in this fixed order: most-frequent dummy,
        stratified dummy, random forest, Gaussian process, 3-nearest-neighbour,
        9-nearest-neighbour. Other functions in this file index into the list
        by position, so the order matters.
    """
    # Declared in the order the returned list must have.
    estimators = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        RandomForestClassifier(random_state=0, min_samples_split=10, n_estimators=100),
        GaussianProcessClassifier(random_state=0),
        KNeighborsClassifier(n_neighbors=3),
        KNeighborsClassifier(n_neighbors=9),
    ]

    classifiers = [TrainModel(X_train, Y_train, estimator) for estimator in estimators]

    for fitted in classifiers:
        print(fitted, fitted.get_params())
    return classifiers
def ConvertAccuracyTable(val_accuracies, t1_accuracies, t2_accuracies, t3_accuracies):
    """Assemble the per-dataset accuracies of all six models into one table.

    Each argument holds one accuracy per model, in the same order as the
    classifier names listed below.

    Returns:
        A DataFrame with a ``Classifier`` column followed by one accuracy
        column per dataset.
    """
    columns = {
        'Classifier': [
            'Dummy Classifier with Most-Frequent',
            'Dummy Classifier with Stratified',
            'Random Forest',
            'GaussianProcess',
            '3-NearestNeighbor',
            '9-NearestNeighbor',
        ],
        'Val-TX': val_accuracies,
        'Test1-TX': t1_accuracies,
        'Test2-FL': t2_accuracies,
        'Test3-FL': t3_accuracies,
    }
    return pd.DataFrame(columns)
def ConvertAccuracyTableBBSC(t1_accuracies, t2_accuracies, t3_accuracies):
    """Assemble the corrected test-set accuracies of the four real models.

    Each argument holds one accuracy per model, in the same order as the
    classifier names listed below (the dummy baselines are not included).

    Returns:
        A DataFrame with a ``Classifier`` column followed by one accuracy
        column per test set.
    """
    columns = {
        'Classifier': [
            'Random Forest',
            'GaussianProcess',
            '3-NearestNeighbor',
            '9-NearestNeighbor',
        ],
        'Test1-TX': t1_accuracies,
        'Test2-FL': t2_accuracies,
        'Test3-FL': t3_accuracies,
    }
    return pd.DataFrame(columns)
def PredictAccuracies(classifiers, X_val, Y_val, X_t1, Y_t1, X_t2, Y_t2, X_t3, Y_t3):
    """Score every classifier on the validation set and the three test sets.

    For every classifier except the first two in ``classifiers``, the
    validation labels, validation predictions and each test set's predictions
    are also passed to ``analyze_val_data`` to obtain adaptation weights.

    Args:
        classifiers: Fitted models. The first two entries are scored but get
            no weights (in this file they are the dummy baselines).
        X_val, Y_val: Validation features and labels.
        X_t1, Y_t1, X_t2, Y_t2, X_t3, Y_t3: Features and labels of the three
            test sets.

    Returns:
        ``(val_accuracies, t1_accuracies, t2_accuracies, t3_accuracies,
        weights_adapted)``. The four lists hold one accuracy per classifier in
        input order. ``weights_adapted`` maps ``str(model)`` to a flat list of
        floats: the values returned by ``analyze_val_data`` for test set 1,
        then test set 2, then test set 3.
    """
    val_accuracies = []
    t1_accuracies = []
    t2_accuracies = []
    t3_accuracies = []
    weights_adapted = {}

    for position, model in enumerate(classifiers):
        accuracy, Y_val_predict_var = Predict(X_val, Y_val, model)
        val_accuracies.append(accuracy)

        accuracy, Y_t1_predict_var = Predict(X_t1, Y_t1, model)
        t1_accuracies.append(accuracy)

        accuracy, Y_t2_predict_var = Predict(X_t2, Y_t2, model)
        t2_accuracies.append(accuracy)

        accuracy, Y_t3_predict_var = Predict(X_t3, Y_t3, model)
        t3_accuracies.append(accuracy)

        # The first two classifiers never get adaptation weights.
        if position > 1:
            model_weights = []
            for test_predictions in (Y_t1_predict_var, Y_t2_predict_var, Y_t3_predict_var):
                model_weights.extend(
                    float(val)
                    for val in analyze_val_data(Y_val, Y_val_predict_var, test_predictions)
                )
            # Keyed by the model's string form; other code looks weights up
            # by that exact string.
            weights_adapted[str(model)] = model_weights

    return val_accuracies, t1_accuracies, t2_accuracies, t3_accuracies, weights_adapted
def ConvertWeightsToTable(weights):
    """Turn the flat per-model weight lists into a printable table.

    Args:
        weights: Mapping from model name to a flat list of weights, read as
            three consecutive groups of four values (one group per test set).

    Returns:
        A DataFrame with one row per model: a ``Model`` column plus one column
        per test set, each cell holding that group's values formatted as
        strings with two decimal places.
    """
    test_columns = ('Test1-TX', 'Test2-FL', 'Test3-FL')
    group_size = 4
    rows = []

    for model, values in weights.items():
        row = {'Model': model}
        for position, column in enumerate(test_columns):
            start = position * group_size
            group = values[start:start + group_size]
            row[column] = list(map("{:.2f}".format, group))
        rows.append(row)

    return pd.DataFrame(rows)
def SeparateWights(weights):
    """Split each model's flat weight list into one slice per test set.

    Args:
        weights: Mapping keyed by the exact model strings used below; each
            value is a flat sequence whose items 0-3, 4-7 and 8-11 belong to
            test sets 1, 2 and 3 respectively.

    Returns:
        ``(RF_weights, GP_weights, ThreeNN_weights, NineNN_weights)``, each a
        dict with keys ``'T1'``, ``'T2'`` and ``'T3'``.

    Raises:
        KeyError: If one of the expected model strings is missing.
    """
    def _per_test_set(model_key):
        # One lookup per model, then three fixed-width slices.
        flat = weights[model_key]
        return {'T1': flat[:4], 'T2': flat[4:8], 'T3': flat[8:12]}

    RF_weights = _per_test_set('RandomForestClassifier(min_samples_split=10, random_state=0)')
    GP_weights = _per_test_set('GaussianProcessClassifier(random_state=0)')
    ThreeNN_weights = _per_test_set('KNeighborsClassifier(n_neighbors=3)')
    NineNN_weights = _per_test_set('KNeighborsClassifier(n_neighbors=9)')

    return RF_weights, GP_weights, ThreeNN_weights, NineNN_weights
def UniqueClasses(Y):
    """Return the distinct values of ``Y`` as a sorted plain Python list."""
    distinct = np.unique(Y)
    return distinct.tolist()
def PredictAccuraciesBBSC(classifiers, X_val, Y_val, X_t1, Y_t1, X_t2, Y_t2, X_t3, Y_t3, weights):
    """Re-score the non-dummy classifiers after correcting their predictions.

    For each of the four classifiers that follow the first two entries of
    ``classifiers``, and for each test set, the predicted labels and
    probabilities are passed through ``update_probs`` together with that
    model's weights for that test set. Accuracy is then computed on the
    updated labels. Confusion matrices of the updated predictions are printed
    for the second of those classifiers (all test sets) and the third (test
    set 1 only).

    Args:
        classifiers: Fitted models; the first two entries are skipped and the
            next four are treated as RF, GP, 3-NN and 9-NN in that order.
        X_val, Y_val: Unused here; kept so the signature stays the same.
        X_t1, Y_t1, X_t2, Y_t2, X_t3, Y_t3: Features and labels of the three
            test sets.
        weights: Weight mapping in the form accepted by ``SeparateWights``.

    Returns:
        ``(t1_acc_BBSC, t2_acc_BBSC, t3_acc_BBSC)``: one list per test set,
        each holding the percentage accuracy (two decimals) for RF, GP, 3-NN
        and 9-NN in that order.
    """
    # Skip the two leading dummy classifiers.
    models = classifiers[2:]
    model_slots = range(4)  # RF, GP, 3-NN, 9-NN
    test_sets = (('T1', X_t1, Y_t1), ('T2', X_t2, Y_t2), ('T3', X_t3, Y_t3))

    # Phase 1: raw (labels, probabilities) for every model on every test set.
    raw_outputs = []
    for slot in model_slots:
        raw_outputs.append([Predict(X, Y, models[slot], True) for _, X, Y in test_sets])

    # The class list is taken from test set 1 only and reused for all test sets.
    unique_classes = UniqueClasses(Y_t1)
    weights_by_model = SeparateWights(weights)

    # Phase 2: corrected labels, indexed as corrected[model slot][test set].
    corrected = []
    for slot in model_slots:
        per_set = []
        for set_index, (tag, _, _) in enumerate(test_sets):
            labels, probabilities = raw_outputs[slot][set_index]
            new_labels, _new_probabilities = update_probs(
                unique_classes, weights_by_model[slot][tag], labels, probabilities
            )
            per_set.append(new_labels)
        corrected.append(per_set)

    # Confusion matrices for the model in slot 1 (labelled "GB" in the output).
    gp_t1, gp_t2, gp_t3 = [
        confusion_matrix(Y, corrected[1][set_index])
        for set_index, (_, _, Y) in enumerate(test_sets)
    ]
    print("T1 set conf matrix for GB:\n", gp_t1)
    print("T2 set conf matrix for GB:\n", gp_t2)
    print("T3 set conf matrix for GB:\n", gp_t3)

    # Confusion matrix for the 3-NN model on test set 1 only.
    print("T1 set conf matrix for 3-NN:\n", confusion_matrix(Y_t1, corrected[2][0]))

    def _percent(y_true, y_pred):
        # Accuracy as a percentage with two decimal places.
        return round(accuracy_score(y_true, y_pred) * 100, 2)

    t1_acc_BBSC, t2_acc_BBSC, t3_acc_BBSC = [
        [_percent(Y, corrected[slot][set_index]) for slot in model_slots]
        for set_index, (_, _, Y) in enumerate(test_sets)
    ]
    return t1_acc_BBSC, t2_acc_BBSC, t3_acc_BBSC
def Normalize(label, count):
    """Divide ``label`` by ``count`` and return the quotient."""
    return label / count
def TrueLabelsBarPlot(val, t1, t2, t3, val_len, t1_len, t2_len, t3_len):
    """Plot the normalized class distribution of the four datasets as grouped bars.

    Args:
        val, t1, t2, t3: Per-class counts for the validation set and the three
            test sets.
        val_len, t1_len, t2_len, t3_len: Value each dataset's counts are
            divided by.

    The chart is written to ``BarPlot.png`` in the current working directory;
    nothing is returned.
    """
    count_and_size = ((val, val_len), (t1, t1_len), (t2, t2_len), (t3, t3_len))
    shares = [Normalize(counts, size) for counts, size in count_and_size]

    # One column per dataset, one row per class.
    normalized_df = pd.concat(shares, axis=1)
    normalized_df.columns = ['Validation', 'Test 1', 'Test 2', 'Test 3']

    ax = normalized_df.plot.bar(figsize=(10, 6))
    ax.set_title('True Class Label Distribution (Normalized)')
    ax.set_xlabel('Classes')
    ax.set_ylabel('Counts')
    ax.legend(title='Dataset', bbox_to_anchor=(1, 1))

    # Save the current figure, i.e. the one created by the plot call above.
    plt.savefig('BarPlot.png')
if __name__ == "__main__":
    # --- Data -------------------------------------------------------------
    # The training split only needs features and labels.
    X_train, Y_train = LoadData('/Users/wadoodalam/ML Models Adapting to Shifts/train-TX.csv', Train=True)
    # The evaluation splits also return per-class counts and the row count.
    X_val, Y_val, true_label_val, val_len = LoadData('/Users/wadoodalam/ML Models Adapting to Shifts/val-TX.csv')
    X_t1, Y_t1, true_label_t1, t1_len = LoadData('/Users/wadoodalam/ML Models Adapting to Shifts/test1-TX.csv')
    X_t2, Y_t2, true_label_t2, t2_len = LoadData('/Users/wadoodalam/ML Models Adapting to Shifts/test2-FL.csv')
    X_t3, Y_t3, true_label_t3, t3_len = LoadData('/Users/wadoodalam/ML Models Adapting to Shifts/test3-FL.csv')

    # --- True label distribution plot ---------------------------------------
    TrueLabelsBarPlot(
        true_label_val, true_label_t1, true_label_t2, true_label_t3,
        val_len, t1_len, t2_len, t3_len,
    )

    # --- Models -----------------------------------------------------------
    classifiers = Train(X_train, Y_train)

    # Accuracies without correction, plus the adaptation weights.
    val_accuracies, t1_accuracies, t2_accuracies, t3_accuracies, weights = PredictAccuracies(
        classifiers, X_val, Y_val, X_t1, Y_t1, X_t2, Y_t2, X_t3, Y_t3
    )
    # Accuracies after the BBSC correction.
    t1_acc_BBSC, t2_acc_BBSC, t3_acc_BBSC = PredictAccuraciesBBSC(
        classifiers, X_val, Y_val, X_t1, Y_t1, X_t2, Y_t2, X_t3, Y_t3, weights
    )

    # --- Reports ----------------------------------------------------------
    df_accuracy = ConvertAccuracyTable(val_accuracies, t1_accuracies, t2_accuracies, t3_accuracies)
    df_weight = ConvertWeightsToTable(weights)
    df_accuracy_BBSC = ConvertAccuracyTableBBSC(t1_acc_BBSC, t2_acc_BBSC, t3_acc_BBSC)

    # Write the three tables as CSV files into the current working directory.
    df_accuracy.to_csv('accuracy.csv')
    df_weight.to_csv('weights.csv')
    df_accuracy_BBSC.to_csv('accuracy_BBSC.csv')