#Projeto.py
#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
'''
---------- Data Importing ----------
'''
#Reading the first dataset
data_1 = pd.read_csv('md_raw_dataset.csv',sep = ';')
#Showing the dataset
print(data_1)
#Reading the second dataset
data_2 = pd.read_csv('md_target_dataset.csv', sep = ';')
#Showing the dataset
print(data_2)
#Renaming the column name of the second dataset
data_2 = data_2.rename(columns={'groups':'groups_target'})
#Concatenating the two datasets
data = pd.concat([data_1,data_2], axis = 1)
#Display more columns in the print function
pd.set_option('display.max_columns', 50)
#Getting the shape of the final dataset
print(data.shape)
#Showing the first 5 rows
print(data.head())
#Showing the last 5 rows
print(data.tail())
'''
The second dataset has 3 fewer rows than the first, so the last 3 rows of the
merged dataset are missing the target value; these rows will be removed.
'''
data = data.iloc[:-3]
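#A quick sanity check (a small addition, assuming the target column is named
#'target' as it is used later in the script): after trimming, no missing
#target values should remain.
assert data['target'].isnull().sum() == 0, 'Unexpected missing target values.'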
'''
---------- Data Treatment ----------
'''
#Printing information of the type of the data and missing values
print(data.info())
'''
Some columns contain date information, so they are collected in a list
to make it easier to convert them to the correct type.
'''
date_columns = ['when','expected_start','start_process','start_subprocess1','start_critical_subprocess1','predicted_process_end','process_end','subprocess1_end','reported_on_tower','opened']
'''
The 'opened' column contains date information, but the first 3 rows hold
plain numeric values instead. As the correct dates are unknown, those values
will be replaced with zero.
'''
data['opened'] = data['opened'].replace(['44021.58091','44021.6737','44021.70867'],'0')
#Converting the date columns to the correct type
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')
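#Since errors='coerce' silently turns unparseable entries into NaT, a small
#diagnostic (not part of the original flow) shows how many values were lost:
print(data[date_columns].isna().sum())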
#Getting the numerical columns
numerical_columns = data.select_dtypes(['float64','int64']).columns
#Getting the categorical columns
categorical_columns = data.select_dtypes('object').columns
#Getting the number of unique values in the categorical columns
for col in categorical_columns:
    print(col, ':', data[col].nunique(), 'unique values.')
'''
As the result shows, the column 'etherium_before_start' has too many unique
values to be categorical, so it is probably numerical and pandas could not
infer the correct type because of some problematic entries.
'''
#Searching for the values that aren't numerical
wrong_data_index = []
for i in range(len(data['etherium_before_start'])):
    try:
        float(data['etherium_before_start'][i])
    except ValueError:
        wrong_data_index.append(i)
#Printing the indexes of the problematic rows
print(wrong_data_index)
#As we can see, only one row has a non-numerical value, so let's print it.
print(data['etherium_before_start'][7829])
#As we don't know the correct value for this row, it will be replaced with zero.
data['etherium_before_start'] = data['etherium_before_start'].replace('21/12/2020 12:11',0)
#Converting this column to numerical type
data['etherium_before_start'] = data['etherium_before_start'].apply(float)
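#A more concise alternative (a sketch, not the original approach): pd.to_numeric
#with errors='coerce' converts the column in one vectorized step, turning any
#non-numeric entry into NaN, which can then be filled with zero as above.
#data['etherium_before_start'] = pd.to_numeric(data['etherium_before_start'], errors='coerce').fillna(0)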
#Redefining the numerical_columns variable due to this change
numerical_columns = data.select_dtypes(['float64','int64']).columns
#Redefining the categorical_columns variable due to this change
categorical_columns = data.select_dtypes('object').columns
#Checking if these 3 columns are representing all the columns of the dataset
print(len(numerical_columns)+len(categorical_columns)+len(date_columns))
#Looking for null values
print(data.isnull().sum())
#Observing the distribution of the data through descriptive statistics
print(data.describe())
'''
As there is a considerable number of null values and the affected variables
carry meaningful information, the best option is to replace the null values
with the mean of each variable, which keeps them close to realistic values.
'''
for col in numerical_columns:
    data[col] = data[col].fillna(data[col].mean())
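#An equivalent, reusable alternative (a sketch using scikit-learn, not part of
#the original flow): SimpleImputer learns the column means and can be applied
#to new data later, instead of recomputing statistics by hand.
#from sklearn.impute import SimpleImputer
#data[numerical_columns] = SimpleImputer(strategy='mean').fit_transform(data[numerical_columns])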
#Checking if all the null values in the numerical columns were handled.
print(data[numerical_columns].isnull().sum())
#Checking if there are missing values in the categorical columns
print(data[categorical_columns].isnull().sum())
#Checking if there are missing values in the date columns
print(data[date_columns].isnull().sum())
'''
As shown, there are null values in the date columns, but since these columns
won't be used by the machine learning algorithm and the correct values are
unknown, there is no need to fix them.
'''
#Printing the unique values in each categorical column and their frequencies.
for col in categorical_columns:
    print(data[col].value_counts())
    print()
'''
As we can see, some values contain strange information, probably caused by
typing mistakes, so they will be corrected.
'''
#Correcting the values
data['super_hero_group'] = data['super_hero_group'].replace('₢','C')
data['crystal_supergroup'] = data['crystal_supergroup'].replace('1ª','1')
data['Cycle'] = data['Cycle'].replace('33','3ª')
'''
---------- EDA ----------
'''
#Setting the style of the plots
sns.set_style('darkgrid')
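#The savefig calls below write into the subfolders 'boxplot', 'scatterplot',
#'categorical', 'boxenplot' and 'histogram'; creating them up front avoids a
#FileNotFoundError (a small defensive addition, assuming relative paths):
import os
for folder in ['boxplot', 'scatterplot', 'categorical', 'boxenplot', 'histogram']:
    os.makedirs(folder, exist_ok=True)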
#Looking for outliers in the numerical columns with boxplots and saving them as images
for col in numerical_columns:
    ax = sns.boxplot(data[col])
    #plt.show()
    figure = ax.get_figure()
    #Replacing ':' with '_' so the column name is a valid file name
    name = '_'.join(col.split(sep=':'))
    figure.savefig('boxplot/' + name + '.png')
    plt.close()
'''
The data contains outliers in some columns, but as we don't know whether they
are mistakes or real values, the best action is to keep them in the data.
'''
#Getting descriptive statistics
print(data.describe())
#Plotting the relationship between the numerical variables and the target
#(the last numerical column is the target itself, so it is excluded)
for col in numerical_columns[:-1]:
    plt.figure(figsize = (15,5))
    ax = sns.scatterplot(x = data[col], y = data['target'])
    #plt.show()
    figure = ax.get_figure()
    name = '_'.join(col.split(sep=':'))
    figure.savefig('scatterplot/' + name + '.png')
    plt.close()
#Plotting the correlation between the numerical variables
plt.figure(figsize = (20,20))
ax = sns.heatmap(data[numerical_columns].corr(), annot = True)
#plt.show()
figure = ax.get_figure()
figure.savefig('Correlation.png')
plt.close()
#Plotting the categorical variables and their frequencies
for col in categorical_columns:
    if data[col].nunique() <= 10:
        total = len(data[col])
        plt.figure(figsize = (15,5))
        ax = sns.countplot(x = data[col], order = data[col].value_counts().index)
        #Annotating each bar with its relative frequency
        for p in ax.patches:
            height = p.get_height()
            ax.text(x = p.get_x() + (p.get_width()/2), y = height * 1.01, s = '{:.2f}%'.format(height/total*100), ha = 'center')
        #plt.show()
        figure = ax.get_figure()
        figure.savefig('categorical/' + col + '.png')
        plt.close()
#Plotting the relationship between the categorical variables and the target with boxenplots
for col in categorical_columns:
    if data[col].nunique() <= 10:
        plt.figure(figsize = (15,5))
        ax = sns.boxenplot(x = data['target'], y = data[col])
        #plt.show()
        figure = ax.get_figure()
        figure.savefig('boxenplot/' + col + '.png')
        plt.close()
#Plotting the relationship between the categorical variables and the target with histograms
for col in categorical_columns:
    if data[col].nunique() <= 10:
        plt.figure(figsize = (15,5))
        ax = sns.histplot(data = data, x = 'target', hue = col, kde = True)
        #plt.show()
        figure = ax.get_figure()
        figure.savefig('histogram/' + col + '.png')
        plt.close()
#Re-checking the categorical column values after the corrections
for col in categorical_columns:
    print(data[col].value_counts())
    print()
'''
---------- Feature Engineering ----------
'''
#Creating the x and y variables
#As the date columns don't carry useful information for the ML algorithm,
#they will be dropped from the x variable
x = data.drop(date_columns, axis = 1).drop('target', axis = 1)
y = data['target']
#Fitting the target encoder on the categorical columns
cat_enc = TargetEncoder(cols = categorical_columns).fit(x, y)
#Encoding the categorical variables
x_encoded = cat_enc.transform(x)
#Checking the x encoded variables
print(x_encoded)
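'''
Note: fitting the TargetEncoder on the full dataset before cross-validation can
leak target information across folds. A leakage-free sketch (an alternative,
not the original approach) wraps the encoder and the model in a Pipeline so the
encoding is refitted inside each training fold:
'''
#from sklearn.pipeline import make_pipeline
#pipe = make_pipeline(TargetEncoder(cols = categorical_columns), RandomForestRegressor())
#rmse_cv = cross_val_score(pipe, x, y, cv = 10, scoring = 'neg_root_mean_squared_error')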
'''
---------- Machine Learning ----------
'''
'''
As requested by the challenge, one machine learning algorithm has to be chosen.
I will choose Random Forest because it is a very strong algorithm that uses
ensemble techniques to improve performance.
'''
#Instantiating the ML model
model = RandomForestRegressor()
#Validating the model using cross-validation
'''
As a metric has to be chosen, I picked RMSE because it is a solid metric with
the same unit as the target variable, so it is easier to interpret.
'''
rmse = cross_val_score(model,x_encoded,y,cv = 10, scoring = 'neg_root_mean_squared_error' )
#Printing the mean value of RMSE
print(np.abs(rmse.mean()))
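#Relating the error to the scale of the target supports the conclusion below
#(a small addition, using the mean of y as the reference scale):
print('Relative error: {:.1%}'.format(np.abs(rmse.mean()) / y.mean()))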
'''
With a mean Root Mean Squared Error of approximately 23.7 and a mean target
value of 99.7, it is possible to conclude that the algorithm has a relative
error of roughly 24% when predicting new values.
'''
#Training the model
model_1 = RandomForestRegressor()
model_1.fit(x_encoded,y)
#Getting the most important variables to predict the target variable
importance = pd.DataFrame({'Variables': x_encoded.columns, 'Importance': model_1.feature_importances_})
#Showing the top 10 important variables
print(importance.sort_values('Importance', ascending = False).head(10))
#Plotting the importance
plt.figure(figsize = (15,8))
ax = sns.barplot(y = importance['Variables'], x = importance['Importance'], order = importance.sort_values('Importance', ascending = False)['Variables'])
#plt.show()
figure = ax.get_figure()
figure.savefig('feature_importancy.png')
plt.close()
#Predicting new values
#y_pred = model_1.predict([[VALUES]])
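#A hedged illustration (the row below is hypothetical; the first row of x is
#reused purely as a stand-in for unseen data). New observations must go through
#the same fitted encoder before being passed to the model:
#new_row = x.iloc[[0]]
#y_pred = model_1.predict(cat_enc.transform(new_row))
#print(y_pred)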