# =======================================================================================================
# Table of Contents START
# =======================================================================================================
'''
1. Orientation
2. Imports
3. acquire
4. prepare
5. wrangle
6. split
7. scale
8. sample_dataframe
9. remove_outliers_tukey
10. find_outliers_tukey
11. find_outliers_sigma
12. drop_nullpct
13. check_nulls
'''
# =======================================================================================================
# Table of Contents END
# Table of Contents TO Orientation
# Orientation START
# =======================================================================================================
'''
The purpose of this file is to create functions for both the acquire and prepare phases of the
data science pipeline, commonly referred to as 'wrangling' the data.
'''
# =======================================================================================================
# Orientation END
# Orientation TO Imports
# Imports START
# =======================================================================================================
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
# =======================================================================================================
# Imports END
# Imports TO acquire
# acquire START
# =======================================================================================================
def acquire():
'''
Obtains the vanilla version of the mass_shooters dataframe
INPUT:
NONE
OUTPUT:
mass_shooters = pandas dataframe
'''
    # Placeholder: acquisition logic to be implemented
    print('Acquiring mass_shooters data...')
# =======================================================================================================
# acquire END
# acquire TO prepare
# prepare START
# =======================================================================================================
def prepare():
'''
Takes in the vanilla mass_shooters dataframe and returns a cleaned version that is ready
for exploration and further analysis
INPUT:
NONE
OUTPUT:
    .csv = ONLY IF FILE NONEXISTENT
prepped_mass_shooters = pandas dataframe of the prepared mass_shooters dataframe
'''
    if os.path.exists('mass_shooters.csv'):
        # Placeholder: preparation logic for when the csv already exists
        print('Preparing mass_shooters data from existing csv...')
    else:
        # Placeholder: preparation logic for when the csv must be created
        print('Preparing mass_shooters data from scratch...')
# =======================================================================================================
# prepare END
# prepare TO wrangle
# wrangle START
# =======================================================================================================
def wrangle():
'''
    Acquires, prepares, and splits the mass_shooters dataframe for use, caching the prepared
    data to a csv along the way.
INPUT:
NONE
OUTPUT:
    .csv = ONLY IF FILE NONEXISTENT
train = pandas dataframe of training set for mass_shooter data
validate = pandas dataframe of validation set for mass_shooter data
test = pandas dataframe of testing set for mass_shooter data
'''
if os.path.exists('mass_shooters.csv'):
mass_shooters = pd.read_csv('mass_shooters.csv', index_col=0)
train, validate, test = split(mass_shooters, stratify='shooter_volatility')
return train, validate, test
else:
mass_shooters = prepare()
mass_shooters.to_csv('mass_shooters.csv')
train, validate, test = split(mass_shooters, stratify='shooter_volatility')
return train, validate, test
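# Hypothetical usage sketch (comments only, not executed on import); assumes prepare()
# returns the prepped dataframe containing the 'shooter_volatility' column used for
# stratification:
#
#   import wrangle
#   train, validate, test = wrangle.wrangle()
#   print(train.shape, validate.shape, test.shape)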
# =======================================================================================================
# wrangle END
# wrangle TO split
# split START
# =======================================================================================================
def split(df, stratify=None):
'''
    Takes a dataframe and splits the data into train, validate, and test datasets
    INPUT:
    df = pandas dataframe to be split
    stratify = Column name to stratify the splits on (None for no stratification)
OUTPUT:
train = pandas dataframe with 70% of original dataframe
validate = pandas dataframe with 20% of original dataframe
test = pandas dataframe with 10% of original dataframe
'''
    # Stratify on the given column only if one is provided; 0.9 * 0.778 ~ 0.70,
    # so the final split is roughly 70% train / 20% validate / 10% test
    strat_col = df[stratify] if stratify else None
    train_val, test = train_test_split(df, train_size=0.9, random_state=1349, stratify=strat_col)
    strat_col = train_val[stratify] if stratify else None
    train, validate = train_test_split(train_val, train_size=0.778, random_state=1349, stratify=strat_col)
return train, validate, test
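# Usage sketch (comments only, not executed on import); 'mass_shooters' and the
# 'shooter_volatility' column come from the prepared data described above:
#
#   train, validate, test = split(mass_shooters, stratify='shooter_volatility')
#   # yields roughly 70% / 20% / 10% of the rows, stratified on shooter_volatility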
# =======================================================================================================
# split END
# split TO scale
# scale START
# =======================================================================================================
def scale(train, validate, test, cols, scaler):
'''
Takes in a train, validate, test dataframe and returns the dataframes scaled with the scaler
of your choice
INPUT:
train = pandas dataframe that is meant for training your machine learning model
validate = pandas dataframe that is meant for validating your machine learning model
test = pandas dataframe that is meant for testing your machine learning model
cols = List of column names that you want to be scaled
scaler = Scaler that you want to scale columns with like 'MinMaxScaler()', 'StandardScaler()', etc.
OUTPUT:
new_train = pandas dataframe of scaled version of inputted train dataframe
new_validate = pandas dataframe of scaled version of inputted validate dataframe
new_test = pandas dataframe of scaled version of inputted test dataframe
'''
    new_train = train.copy()
    new_validate = validate.copy()
    new_test = test.copy()
    # Fit the scaler on the training data only, then apply that same fit to the
    # validate and test sets so all three are scaled consistently (no leakage)
    scaler.fit(new_train[cols])
    new_train[cols] = scaler.transform(new_train[cols])
    new_validate[cols] = scaler.transform(new_validate[cols])
    new_test[cols] = scaler.transform(new_test[cols])
return new_train, new_validate, new_test
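# Usage sketch (comments only, not executed on import); MinMaxScaler comes from
# sklearn.preprocessing, and 'age' / 'income' are hypothetical numeric column names:
#
#   from sklearn.preprocessing import MinMaxScaler
#   cols_to_scale = ['age', 'income']
#   train_s, validate_s, test_s = scale(train, validate, test, cols_to_scale, MinMaxScaler())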
# =======================================================================================================
# scale END
# scale TO sample_dataframe
# sample_dataframe START
# =======================================================================================================
def sample_dataframe(train, validate, test):
'''
    Takes train, validate, test dataframes and reduces each to a proportional sample, using the
    ratio 1000/len(train) so that the train sample has no more than 1000 rows.
INPUT:
train = Split dataframe for training
validate = Split dataframe for validation
test = Split dataframe for testing
OUTPUT:
train_sample = Reduced size of original split dataframe of no more than 1000 rows
validate_sample = Reduced size of original split dataframe of no more than 1000 rows
test_sample = Reduced size of original split dataframe of no more than 1000 rows
'''
    # Cap the ratio at 1 so sampling never requests more rows than a dataframe contains
    ratio = min(1.0, 1000 / len(train))
    train_samples = int(ratio * len(train))
    validate_samples = int(ratio * len(validate))
    test_samples = int(ratio * len(test))
train_sample = train.sample(train_samples)
validate_sample = validate.sample(validate_samples)
test_sample = test.sample(test_samples)
return train_sample, validate_sample, test_sample
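# Usage sketch (comments only, not executed on import); the splits come from split()
# or wrangle() above:
#
#   train_sample, validate_sample, test_sample = sample_dataframe(train, validate, test)
#   # train_sample has at most ~1000 rows; validate/test are reduced by the same ratio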
# =======================================================================================================
# sample_dataframe END
# sample_dataframe TO remove_outliers_tukey
# remove_outliers_tukey START
# =======================================================================================================
def remove_outliers_tukey(df, col_list, k=1.5):
'''
Remove outliers from a dataframe based on a list of columns using the tukey method and then
returns a single dataframe with the outliers removed
INPUT:
df = pandas dataframe
col_list = List of columns that you want outliers removed
k = Defines range for fences, default/normal is 1.5, 3 is more extreme outliers
OUTPUT:
df = pandas dataframe with outliers removed
'''
col_qs = {}
for col in col_list:
        col_qs[col] = df[col].quantile([0.25, 0.75])
for col in col_list:
iqr = col_qs[col][0.75] - col_qs[col][0.25]
lower_fence = col_qs[col][0.25] - (k*iqr)
upper_fence = col_qs[col][0.75] + (k*iqr)
df = df[(df[col] > lower_fence) & (df[col] < upper_fence)]
return df
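# Usage sketch (comments only, not executed on import); 'age' and 'income' are
# hypothetical numeric column names:
#
#   clean_df = remove_outliers_tukey(df, ['age', 'income'], k=1.5)
#   # k=3 keeps more rows, removing only the more extreme outliers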
# =======================================================================================================
# remove_outliers_tukey END
# remove_outliers_tukey TO find_outliers_tukey
# find_outliers_tukey START
# =======================================================================================================
def find_outliers_tukey(df, col_list, k=1.5):
'''
    Finds outliers in a dataframe for a list of columns using the Tukey (IQR fence) method and
    prints all of the values identified as outliers
INPUT:
df = pandas dataframe
    col_list = List of columns that you want checked for outliers
k = Defines range for fences, default/normal is 1.5, 3 is more extreme outliers
OUTPUT:
NONE
'''
for col in col_list:
lower_vals = []
upper_vals = []
q1 = df[col].quantile(0.25)
q3 = df[col].quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - iqr * k
upper_fence = q3 + iqr * k
for val in df[col]:
if val < lower_fence:
lower_vals.append(val)
elif val > upper_fence:
upper_vals.append(val)
print(f'\033[35m =========={col} // k={k}==========\033[0m')
print(f'\033[32mValues < {lower_fence:.2f}: {len(lower_vals)} Values\033[0m')
print(lower_vals)
print(f'\n\033[32mValues > {upper_fence:.2f}: {len(upper_vals)} Values\033[0m')
print(upper_vals)
print(f'\n')
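# Usage sketch (comments only, not executed on import); prints the fences and the
# outlier values per column ('age' and 'income' are hypothetical numeric columns):
#
#   find_outliers_tukey(df, ['age', 'income'], k=1.5)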
# =======================================================================================================
# find_outliers_tukey END
# find_outliers_tukey TO find_outliers_sigma
# find_outliers_sigma START
# =======================================================================================================
def find_outliers_sigma(df, col_list, sigma=2):
'''
    Finds outliers in a dataframe for a list of columns using a z-score (sigma) threshold and
    prints all of the values identified as outliers
INPUT:
df = pandas dataframe
    col_list = List of columns that you want checked for outliers
    sigma = Minimum absolute z-score for a value to be flagged as an outlier
OUTPUT:
NONE
'''
for col in col_list:
mean = df[col].mean()
std = df[col].std()
z_scores = ((df[col] - mean) / std)
outliers = df[col][z_scores.abs() >= sigma]
print(f'\033[35m =========={col} // sigma={sigma}==========\033[0m')
print(f'\033[32mMEAN:\033[0m {mean:.2f}')
print(f'\033[32mSTD:\033[0m {std:.2f}')
print(f'\033[32mOutliers:\033[0m {len(outliers)}')
print(outliers)
print(f'\n')
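# Usage sketch (comments only, not executed on import); sigma=3 would flag only values
# at least three standard deviations from the mean ('age' is a hypothetical column):
#
#   find_outliers_sigma(df, ['age'], sigma=3)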
# =======================================================================================================
# find_outliers_sigma END
# find_outliers_sigma TO drop_nullpct
# drop_nullpct START
# =======================================================================================================
def drop_nullpct(df, percent_cutoff):
'''
    Takes in a dataframe and a null-percentage cutoff, drops every column whose null percentage
    exceeds the cutoff, and returns the new dataframe plus a dictionary of the dropped columns and their percentages
INPUT:
df = pandas dataframe
percent_cutoff = Null percent cutoff amount
OUTPUT:
new_df = pandas dataframe with dropped columns
drop_null_pct_dict = dict of column names dropped and pcts
'''
drop_null_pct_dict = {
'column_name' : [],
'percent_null' : []
}
for col in df:
pct = df[col].isna().sum() / df.shape[0]
        if pct > percent_cutoff:
df = df.drop(columns=col)
drop_null_pct_dict['column_name'].append(col)
drop_null_pct_dict['percent_null'].append(pct)
new_df = df
return new_df, drop_null_pct_dict
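# Usage sketch (comments only, not executed on import); drops any column that is more
# than 20% null and reports what was dropped:
#
#   df_reduced, dropped_info = drop_nullpct(df, 0.20)
#   print(dropped_info)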
# =======================================================================================================
# drop_nullpct END
# drop_nullpct TO check_nulls
# check_nulls START
# =======================================================================================================
def check_nulls(df):
'''
    Takes a dataframe and returns a list of columns that have at least one null value
INPUT:
df = pandas dataframe
OUTPUT:
has_nulls = List of column names with at least one null
'''
has_nulls = []
for col in df:
nulls = df[col].isna().sum()
if nulls > 0:
has_nulls.append(col)
return has_nulls
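# Usage sketch (comments only, not executed on import):
#
#   cols_with_nulls = check_nulls(df)
#   print(cols_with_nulls)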
# =======================================================================================================
# check_nulls END
# =======================================================================================================