# Train_Predict_Light.py
import sys

from functions import *
# The 'macosx' backend is only available on macOS; requesting it on another
# platform fails when matplotlib tries to load it. Elsewhere, keep whatever
# backend matplotlib selects by default.
# NOTE(review): `mpl` is presumably matplotlib re-exported by `functions` - confirm.
if sys.platform == "darwin":
    mpl.use('macosx')
# Names of the dataset columns, in order.
columns_list = [
    "Timestamp",
    "IMSI",
    "slice_id",
    "slice_prb",
    "scheduling_policy",
    # Downlink metrics
    "dl_mcs",
    "dl_n_samples",
    "dl_buffer [bytes]",
    "tx_brate downlink [Mbps]",
    "tx_pkts downlink",
    "dl_cqi",
    # Uplink metrics
    "ul_mcs",
    "ul_n_samples",
    "ul_buffer [bytes]",
    "rx_brate uplink [Mbps]",
    "rx_pkts uplink",
    "rx_errors uplink (%)",
    "ul_sinr",
    # PRB accounting
    "sum_requested_prbs",
    "sum_granted_prbs",
]

# Pickle file used for each dataset split.
dataset_filenames = {
    "training": "Data/dataset_restart_training.pkl",
    "testing": "Data/dataset_restart_testing.pkl",
    "complete": "Data/dataset_restart_complete.pkl",
}

# Shared random_state, so that splits are reproducible.
rs = 42

# Set to True to perform hyper-parameter tuning + prediction via supervised learning.
supervised = False

# Set to True to use unsupervised learning.
unsupervised = False

# Number of clusters: None to tune k, otherwise an int greater than 1 to
# perform clustering with that k.
k = None

# If k is None, set to True to evaluate the benefit of PCA on the tuning of k.
pca_flag = False
# Main execution logic
def main():
    """Run the training / prediction workflow on the training dataset.

    Steps performed, in order:

    1. Load the dataset named by ``dataset_filenames["training"]`` and, if the
       local ``is_split_dataset_active`` flag is set, split and save it.
    2. Parse the ``Timestamp`` column as datetimes (unparseable values are
       coerced rather than raising).
    3. Build train / validation / test partitions with ``slice_id`` as the
       target and normalize them.
    4. Run the sections enabled by the module-level flags ``supervised``,
       ``unsupervised`` (with ``k``) and ``pca_flag``.

    Several sections are intentionally left as placeholders (marked
    "Put ... Logic Here"); the scaffolding variables defined for them are kept
    even though nothing reads them yet.
    """
    # Configurations
    dataset_filename = dataset_filenames["training"]
    is_split_dataset_active = False
    extract_rate = 0.05

    # Load dataset
    dataset = load_dataset(dataset_filename)

    # Split dataset if flag is active
    if is_split_dataset_active:
        split_and_save_dataset(dataset, extract_rate, dataset_filenames)

    # Ensure the Timestamp column is in datetime format
    dataset['Timestamp'] = pd.to_datetime(dataset['Timestamp'], errors='coerce')

    # CLASSIFICATION PIPELINE
    cross_val_k = 3
    # Comment above and use below to train/test on full dataset

    # Prepare Data for Training and Validation Evaluation.
    # First hold out 20% of the rows as the test set, then hold out 20% of the
    # remainder as the validation set; both splits share the seed `rs`.
    X, X_test, y, y_test = train_test_split(
        dataset.drop(["Timestamp", "IMSI", "slice_id"], axis=1),
        dataset.loc[:, 'slice_id'],
        test_size=0.2,
        random_state=rs,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=rs,  # 0.25
    )

    # NOTE(review): normalize_dataset presumably derives the scaling statistics
    # from its first argument and applies them to both - confirm in `functions`.
    X_train_norm, X_val_norm, stats_val = normalize_dataset(X_train, X_val)
    X_norm, X_test_norm, stats_test = normalize_dataset(X, X_test)

    if supervised:
        ### SUPERVISED
        # Select Classifiers
        classifiers = {
            "Linear Regression": RidgeClassifier(solver='svd'),
        }

        # Select Hyper-Parameters
        params = {'Linear Regression': []}  # choose parameters to optimize

        # Example: fit each (ridge) classifier on the normalized train+validation
        # data and predict the normalized test set.
        for clf_name, clf in classifiers.items():
            print('Train classifier on training set...')
            clf.fit(X_norm, y)

            print('Perform prediction on test set...')
            output = clf.predict(X_test_norm)
            save_predictions_supervised(clf_name, output)

        for clf_name, clf in classifiers.items():
            print(10 * '-')
            # Validation
            ######
            # Put Validation Logic Here
            ######

            # Testing
            ######
            # Put Testing Logic Here
            ######

            # save predictions: save_predictions_supervised(clf_name, output)

    if unsupervised:
        ### UNSUPERVISED
        if k is None:
            # Use function kmeans_helbow to select best k
            min_cl_km = 2
            max_cl_km = 8

            # Check how Silhouette Score varies with k with function kmeans_silhouette
            # k_silhouette =
            # print(f"Best K Silhouette: {k_silhouette}") # extract k with best sil coeff
        else:
            # Once that k is tuned, use it to perform clustering and generate labels on Test Set
            ######
            # Put Clustering Logic Here
            ######

            # save_predictions_supervised(k, output)

            # A branch containing only comments is a syntax error, so keep an
            # explicit no-op until the clustering logic is filled in.
            pass

    # NOTE(review): the nesting of this section was reconstructed from a source
    # whose indentation was lost; it is treated here as a section parallel to
    # the supervised / unsupervised ones - confirm against the original layout.
    if pca_flag:
        ### PERFORM PCA to check how clusters look like in PC plane
        # Defining the number of principal components to generate
        n = min(X_norm.shape[0], X_norm.shape[1])  # get maximum n of components accepted by scikit.PCA

        # Finding principal components for the data (seeded with the shared `rs`)
        pca = PCA(n_components=n, random_state=rs)
        X_norm_pca = pd.DataFrame(pca.fit_transform(X_norm))

        # Get percentages of variance explained by each principal component
        # exp_var =

        # Visualize the Cumulative Sum of Explained Variance
        plt.figure(figsize=(10, 10))

        # find the least number of components that can explain more than x% variance
        xvar = 90

        # Make a scatter plot of 1st vs 2nd components

        # Make a scatter plot of 1st vs 2nd components, where data points are labelled according to the associated cluster
        # NB: also the centroids of the produced clustering configuration can be projected on the Principal plane
        # applying the function transform() to the trained pca algorithm, giving as input the centroids

        plt.show(block=False)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()