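"""BN_Parsing.py

Parse a selected column of a CSV file on a delimiter and one-hot encode the
resulting tokens, optionally using parallel processing.
"""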
import pandas as pd
from joblib import delayed, Parallel
import multiprocessing
from file_handling import *
from selection import *
from functions import *
import time
import operator
from sklearn.feature_extraction.text import CountVectorizer
import csv


def main():
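    """Interactively select a CSV file and column, parse and encode it, and write the results."""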
print("Program: Parsing")
print("Release: 1.12.0")
print("Date: 2020-06-26")
print("Author: Brian Neely")
print()
print()
    print("This program takes a given CSV, parses a selected column on a given delimiter,"
          " and one-hot encodes the parsed values.")
    print("Processing time grows rapidly with the number of encoding categories and rows.")
print()
print()
# Find input file
file_in = select_file_in()
# Set output file
file_out = select_file_out_csv(file_in)
    # Ask for the CSV delimiter
    delimiter = input("Enter delimiter: ")
    # Open input csv using the unknown encoder function
    data = open_unknown_csv(file_in, delimiter)
# Create an empty output file
open(file_out, 'a').close()
# Create Column Header List
headers = list(data.columns.values)
# Select Column
column = column_selection(headers, "parsing")
    # Ask whether to export the list of parsed words
    export_parsed_list = False
    if y_n_question("Export list of parsed words (y/n): "):
        # Set flag for exporting the parse list
        export_parsed_list = True
        # Select second file out
        file_out_parse_list = select_file_out_csv(file_out)
print()
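    # Fast path: whitespace tokenization via CountVectorizer; otherwise parse on a custom delimiter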
if y_n_question("Split data using spaces? Note: This will speed up processing time significantly. (y/n):"):
# Start Timer
start_time = time.time()
# Parse Data
data_out, new_headers = vectorize_text(data, column)
# Print Time
print("Parsing completed in " + str(round(time.time() - start_time, 2)) + " s")
    else:
        # Select the parsing delimiter
        delimiter = input("Enter delimiter for parsing: ")
        while delimiter == "":
            delimiter = input("No delimiter entered! Enter delimiter for parsing: ")
print()
print("Processing File: " + file_in)
        # Get string to append to each encoded column name
        encode_concate = input("Append string to encoded column name: ")
        # Use parallel processing
        parallel = True
        data_out, new_headers = parse_and_encode_data(data, column, delimiter, encode_concate, parallel)
# Write CSV
print("Writing CSV File...")
data_out.to_csv(file_out, index=False)
print("Wrote CSV File!")
print()
# If parse list, write CSV
if export_parsed_list:
# Write list
with open(file_out_parse_list, 'w') as write_file:
writer = csv.writer(write_file, dialect='excel')
writer.writerow(new_headers)
print("Encoding Completed on column: [" + column + "]")
print("File written to: " + file_out)
input("Press Enter to close...")


def vectorize_text(data, column):
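    """Vectorize a text column with CountVectorizer and append the token-count columns to the data."""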
print()
print("Creating Vectorizer...")
# Set vectorizer from CountVectorizer
vectorizer = CountVectorizer()
print("Vectorizer Created!")
    # Fill NaN so the vectorizer does not fail on missing values
    print()
    print("Filling NAs...")
    data[column] = data[column].fillna("empty_text")
    print("NAs Filled!")
# Create sparse matrix of parsed text
print()
print("Creating Sparse Matrix of Vectorized Data...")
X = vectorizer.fit_transform(data[column])
print("Sparse matrix created!")
# Convert sparse matrix to DataFrame
print()
print("Converting sparse matrix to dense...")
parsed = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())
print("Conversion completed!")
    # Look for new columns that match columns in the original data and append _parsed to the new ones
    duplicate_found = True
    print()
    print("Looking for new columns that match the original data...")
while duplicate_found:
duplicate_found = False
# Create list of columns in original dataset
original_columns = list(data.columns.values)
# Create list of columns in new dataset
new_columns = list(parsed.columns.values)
# Create intersection of the two lists
intersection = list_common(new_columns, original_columns)
# If there is an intersection, rename in new columns
if len(intersection) > 0:
for i in intersection:
old_column_name = str(i)
new_column_name = str(i) + "_parsed"
# Rename column in parsed
print("Duplicate column found: {" + str(i) + "} Renaming new column to {" + new_column_name + "}.")
parsed.rename(columns={old_column_name: new_column_name}, inplace=True)
# Set flag for duplicate
duplicate_found = True
print("No duplicates left!")
print()
print("Creating list of new columns...")
# Get new headers
new_headers = list(parsed.columns.values)
print("List created!")
# Append original dataset to parsed dataset
print()
print("Appending new matrix to original data...")
data_out = pd.concat([data, parsed], axis=1, sort=False)
print("Append complete!")
# Return parsed data
return data_out, new_headers


def parse_and_encode_data(data, column, delimiter, encode_concate, parallel=False):
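    """Split a text column on a delimiter and one-hot encode each unique token, optionally in parallel."""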
    # Strip the delimiter if it sits in the first character position
    column_list = []
    for i in data[column]:
        # Skip NaN values (floats); empty rows are handled separately below
        if type(i) != float:
            if i.find(delimiter) == 0:
                i = i[len(delimiter):]
            column_list.append(i)
    # Parse array
    print("Parsing column: [" + str(column) + "]...")
    parsed_array = [i.split(delimiter) for i in column_list]
    print("Column: [" + str(column) + "] Parsed!")
print()
    # Flatten the array of token lists into a single list
    parse_list = list()
    for row in parsed_array:
        parse_list.extend(row)
    # Lowercase parse_list
    parse_list_lower = [i.lower() for i in parse_list]
# Dedupe list
print("Removing Duplicates for Parsed Field...")
deduped_list = dedupe_list(parse_list_lower)
# Remove None for Deduped List
deduped_list = [x for x in deduped_list if x is not None]
print("Duplicates Removed!")
print()
# Add _encoder string
if encode_concate == "":
deduped_list_concat = deduped_list
else:
deduped_list_concat = list()
for i in deduped_list:
deduped_list_concat.append(i + encode_concate)
print("Number of Unique words: " + str(len(deduped_list)))
print()
    # Split rows that are empty (NaN) in the parse column from filled rows
    empty_list = data[column].isnull()
    data_empty = data[empty_list].copy()
    data_filled = data[~empty_list]
    # Encode in parallel or single-threaded
    if parallel:
# *****Find optimum split*****
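        # Benchmark candidate split counts on a small sample and keep the fastest;
        # the winner is reused to chunk the full dataset for the parallel workers.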
if len(data_filled) > 8192:
# Create sample dataset
data_filled_sample = data_filled.head(1024)
# List of number of splits to test
num_splits_test = [8, 16, 32, 64, 128, 256, 512]
# Test splits
            old_time = float("inf")
            time_dict = dict()
for splits in num_splits_test:
# Start Time
start_time = time.time()
# Print statement
print("Testing " + str(splits) + " splits")
# Split Data
data_split = split_data(data_filled_sample, splits)
# Test Speed
Parallel(n_jobs=-1)(delayed(encoding_data)
(par_index + 1, len(data_split), i, column, deduped_list_concat, encode_concate)
for par_index, i in enumerate(data_split))
# Record time and print results
time_dict[splits] = time.time() - start_time
print(str(splits) + " Splits: " + str(round(time_dict[splits], 2)) + " s")
# If time is increasing, stop and use current. More data tends to benefit from a slight increase in
# number of splits.
if time_dict[splits] > old_time:
break
else:
old_time = time_dict[splits]
# Lookup the optimum split
optimum_num_splits = min(time_dict.items(), key=operator.itemgetter(1))[0]
else:
optimum_num_splits = 16
# Split data for optimum number of splits
data_split = split_data(data_filled, optimum_num_splits)
# Parse Data using parallel process
start_time = time.time()
print("Encoding Parsed Data on full dataset...")
        data_split_parsed = Parallel(n_jobs=-1)(
            delayed(encoding_data)(par_index + 1, len(data_split), i, column, deduped_list_concat, encode_concate)
            for par_index, i in enumerate(data_split))
print("Encoding Complete!")
print()
# Union split data frames
data_encoded = pd.concat(data_split_parsed)
else:
# Start timer
start_time = time.time()
# Single Thread
data_encoded = encoding_data(1, 1, data_filled, column, deduped_list_concat, encode_concate)
# Bring back data_empty with 0's in new columns
for i in deduped_list_concat:
data_empty[i] = 0
# Union dataframes encoded and empty
data_out = pd.concat([data_encoded, data_empty])
# End time
print("Parsing completed in " + str(round(time.time() - start_time, 2)) + " s")
    # Rename any new encoded column that matches a column in the original data
    original_columns = list(data.columns.values)
    for i in deduped_list_concat:
        if i in original_columns:
            print("Duplicate column found: {" + str(i) + "} Renaming new column to {" + str(i) + "_parsed}.")
            data_out.rename(columns={i: i + "_parsed"}, inplace=True)
# Get original columns
headers_original = list(data.columns.values)
# Get output headers
headers_new = list(data_out.columns.values)
# Add print statement and timer
print("Extracting new columns added...")
start_time = time.time()
# Look for differences between original headers and new
new_headers = list_diff(headers_original, headers_new)
# Print time and results
print(str(len(new_headers)) + " new columns found in " + str(round(time.time() - start_time, 2)) + " s")
# Return data
return data_out, new_headers


def encoding_data(par_index, par_len, data, column, deduped_list, encode_concate):
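    """One-hot encode each token in deduped_list as a new column on a chunk of the data."""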
    # Work on a copy to avoid pandas chained-assignment warnings on DataFrame slices
    data = data.copy()
    for i in deduped_list:
        # Initialize the new indicator column with 0
        data[i] = 0
        # Flag rows whose text contains the token (with the encode_concate suffix stripped)
        data.loc[data[column].str.find(i.replace(encode_concate, "")) != -1, i] = 1
    print("Completed: " + str(par_index) + " out of " + str(par_len))
    return data


if __name__ == '__main__':
multiprocessing.freeze_support()
main()