# ____________________________________________________________________________________________________________________________________________________________________ #
# Importing libraries
# ____________________________________________________________________________________________________________________________________________________________________ #
print("... Début du chargement des bibliothèques ...")
import json # JSON processing
import csv # CSV processing
import pandas as pd # Data processing
from google.cloud import bigquery # BigQuery connection management
from google.cloud.exceptions import NotFound # Error management
import os # Environmental management
print("... Bibliothèques chargées ...")
# ____________________________________________________________________________________________________________________________________________________________________ #
# Variables
# ____________________________________________________________________________________________________________________________________________________________________ #
import credentials
project_id = credentials.project_id # BigQuery project identifier
dataset_id = credentials.dataset_id # BigQuery dataset identifier
table_name = credentials.table_name # Destination table name
key_path = credentials.ospath # Service-account key file path
key_path = os.path.abspath(key_path) # Resolve to an absolute path
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path # Picked up automatically by the BigQuery client
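# The `credentials` module above is assumed to be a local, untracked Python file.
# A minimal sketch of what it might contain (all values below are hypothetical):
#   project_id = "my-gcp-project"
#   dataset_id = "my_dataset"
#   table_name = "tickets"
#   ospath = "service_account_key.json" # Relative path to the service-account key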
# ____________________________________________________________________________________________________________________________________________________________________ #
# Functions
# ____________________________________________________________________________________________________________________________________________________________________ #
# Check dataset existence _________________________________________________________________________________________________
def check_dataset(dataset_ref, client):
try:
client.get_dataset(dataset_ref)
print(f"Dataset {dataset_id} found.")
    except NotFound:
        print(f"Dataset {dataset_id} not found.")
        sys.exit(1) # Abort: nothing can be loaded without the dataset
# Check table existence ___________________________________________________________________________________________________
def check_table(table_ref, client):
try:
client.get_table(table_ref)
print(f"Table {table_name} found.")
    except NotFound:
        print(f"Table {table_name} not found; the load job below expects it to exist.")
# CSV to JSON converter function _________________________________________________________________________________________
def clean_and_convert_csv_to_json(csv_Inputfile_path, json_file_path, delimiter=','):
cleaned_rows = []
with open(csv_Inputfile_path, 'r', encoding='utf-8') as csv_file:
# Read CSV file
csv_reader = csv.reader(csv_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL)
# Keep header
headers = next(csv_reader)
headers = [header.strip() for header in headers]
        # Read the remaining rows and strip whitespace from each value
        for row in csv_reader:
            # zip pairs each header with its value and tolerates short rows, unlike positional indexing
            cleaned_row = {header: value.strip() for header, value in zip(headers, row)}
            cleaned_rows.append(cleaned_row)
    # Write the cleaned data
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(cleaned_rows, json_file, indent=4)
    print(f"Data cleaned and converted from {csv_Inputfile_path} to {json_file_path}")
# JSON to NDJSON conversion function ____________________________________________________________________________________
def convert_json_to_ndjson(input_file_path, output_file_path):
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)
    # Check that the top-level structure is a list
    if not isinstance(data, list):
        raise ValueError("The input JSON file must contain a list of JSON objects")
    # Write the NDJSON file (newline-delimited JSON: one object per line)
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for record in data:
            json_record = json.dumps(record)
            outfile.write(json_record + '\n')
    print(f"Data converted and saved to {output_file_path}")
# Data filtering function _______________________________________________________________________________________________
def data_filtering(input_file_path, filtered_output_file_path, column_mapping):
    # Read the NDJSON file and keep only the mapped columns, renamed for BigQuery
    filtered_data = []
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            try:
                record = json.loads(line.strip())
            except json.JSONDecodeError as e:
                print(f"JSON decoding error, line skipped: {e}")
                continue
            filtered_record = {bigquery_col: record.get(file_col, None) for file_col, bigquery_col in column_mapping.items()}
            filtered_data.append(filtered_record)
    # Write the filtered data
    with open(filtered_output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(filtered_data, outfile, indent=4)
    print(f"Data filtered and saved to {filtered_output_file_path}")
# Data correction - dates format __________________________________________________________________________________________
def date_correction(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            data = json.loads(line)
            if 'created_at' in data:
                # Append ":00" (seconds) when the timestamp stops at the minutes
                if len(data['created_at']) == 16:
                    data['created_at'] += ":00"
            json.dump(data, f_out)
            f_out.write('\n')
    print(f"Dates corrected and saved to {output_file}")
# Data correction - organization name _____________________________________________________________________________________
def fill_organization_name(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            data = json.loads(line)
            # Check whether organization_name is empty
            if not data.get('organization_name'):
                # Fall back to organization_id to fill organization_name
                data['organization_name'] = data.get('organization_id')
            json.dump(data, f_out)
            f_out.write('\n')
    print(f"Organization names corrected and saved to {output_file}")
# Data correction - tags format ___________________________________________________________________________________________
def convert_tags_to_list(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            data = json.loads(line)
            # Check that tags exists and is a string
            if 'tags' in data and isinstance(data['tags'], str):
                # Split the whitespace-separated string into a list
                data['tags'] = data['tags'].split()
            json.dump(data, f_out)
            f_out.write('\n')
    print(f"Tags corrected and saved to {output_file}")
# Data correction - Lowercase _____________________________________________________________________________________________
def transform_fields_to_lowercase(input_path, output_path, fields_to_lowercase):
    with open(input_path, 'r', encoding='utf-8') as f_input, open(output_path, 'w', encoding='utf-8') as f_output:
        for line in f_input:
            data = json.loads(line)
            for field in fields_to_lowercase:
                # Guard against null or non-string values before lowercasing
                if field in data and isinstance(data[field], str):
                    data[field] = data[field].lower()
            f_output.write(json.dumps(data) + '\n')
    print(f"Data lowercased and saved to {output_path}")
# Data correction - Boolean _______________________________________________________________________________________________
def transform_to_boolean(input_path, output_path, fields):
    with open(input_path, 'r', encoding='utf-8') as f_input, open(output_path, 'w', encoding='utf-8') as f_output:
        for line in f_input:
            entry = json.loads(line)
            for field in fields:
                # Guard against null or non-string values
                if field in entry and isinstance(entry[field], str):
                    value = entry[field].lower()
                    if value == 'yes':
                        entry[field] = True
                    elif value == 'no':
                        entry[field] = False
            f_output.write(json.dumps(entry) + '\n')
    print(f"Boolean data corrected and saved to {output_path}")
# Data correction - Underscore ____________________________________________________________________________________________
def transform_to_underscore(input_path, output_path, fields):
    with open(input_path, 'r', encoding='utf-8') as f_input, open(output_path, 'w', encoding='utf-8') as f_output:
        for line in f_input:
            entry = json.loads(line)
            for field in fields:
                if field in entry and isinstance(entry[field], str):
                    entry[field] = entry[field].replace(' ', '_').lower()
            f_output.write(json.dumps(entry) + '\n')
    print(f"Data corrected with underscores and saved to {output_path}")
# Data correction - Null __________________________________________________________________________________________________
def replace_dashes_with_null(input_path, output_path, fields):
    with open(input_path, 'r', encoding='utf-8') as f_input, open(output_path, 'w', encoding='utf-8') as f_output:
        for line in f_input:
            entry = json.loads(line)
            for field in fields:
                if field in entry and entry[field] == "-":
                    entry[field] = None
            f_output.write(json.dumps(entry) + '\n')
    print(f"Null values corrected and saved to {output_path}")
# Data correction - Satisfaction rating ___________________________________________________________________________________
def transform_satisfaction_rating(input_path, output_path, fields):
    with open(input_path, 'r', encoding='utf-8') as f_input, open(output_path, 'w', encoding='utf-8') as f_output:
        for line in f_input:
            entry = json.loads(line)
            for field in fields:
                if field in entry and isinstance(entry[field], str):
                    value = entry[field].lower()
                    # 'offered', 'good' and 'bad' pass through unchanged
                    if value == 'not offered':
                        value = 'unoffered'
                    entry[field] = f"{{score={value}}}"
            f_output.write(json.dumps(entry) + '\n')
    print(f"Satisfaction data corrected and saved to {output_path}")
# ____________________________________________________________________________________________________________________________________________________________________ #
# Executions
# ____________________________________________________________________________________________________________________________________________________________________ #
print("... Starting data transformation ...")
# Clean and convert CSV to JSON ________________________________________________________________________________________
csv_Inputfile_path = 'Data/input.csv' # Input CSV file path
json_file_path = 'Data/JSON_data.json' # Path of the input data converted to JSON
clean_and_convert_csv_to_json(csv_Inputfile_path, json_file_path) # Function call - CSV to JSON
# JSON to NDJSON conversion ____________________________________________________________________________________________
input_file_path = 'Data/JSON_data.json' # Path of the input data converted to JSON
output_file_path = 'Data/NDJSON_data.json' # Newline-delimited JSON path
convert_json_to_ndjson(input_file_path, output_file_path) # Function call - JSON to NDJSON (ND = newline-delimited)
# Data filtering _______________________________________________________________________________________________________
column_mapping = {
"Summation column": "brand_id",
"Categories [list]": "categories",
"Created at": "created_at",
"Group": "group",
"JIRA URL [txt]": "jira_url",
"Level [list]": "level",
"Organization": "organization_id",
"Organization Name": "organization_name",
"POD [list]": "pod",
"Priority": "priority",
"Product feedback status [list]": "product_feedback_status",
"Satisfaction Score": "satisfaction_rating",
"Status": "status",
"Tags": "tags",
"Ticket type": "type",
"Type (custom) [list]": "type_custom",
"Id": "ticket_id",
"Ticket Form ID": "ticket_form_id",
"Assignee": "assignee",
"Severity [list]": "severity",
"Need partner training [flag]": "NT_partner",
"Need customer training [flag]": "NT_customer"
}
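# Each key above is a column header from the CSV export; each value is the target
# BigQuery column. For example, a record {"Id": "42", "Status": "Open"} is rewritten
# by data_filtering as {"ticket_id": "42", "status": "Open"}; headers missing from a
# record come through as null.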
input_file_path = 'Data/NDJSON_data.json' # Newline-delimited data file path
filtered_output_file_path = 'Data/FILTERED_data.json' # Filtered data path
data_filtering(input_file_path, filtered_output_file_path, column_mapping) # Function call - Data filtering
# JSON to NDJSON conversion ____________________________________________________________________________________________
input_file_path = 'Data/FILTERED_data.json' # Filtered data
output_file_path = 'Data/NDJSON_data2.json' # Filtered newline-delimited JSON path
convert_json_to_ndjson(input_file_path, output_file_path) # Function call - JSON to NDJSON
# Date format correction _______________________________________________________________________________________________
input_file = 'Data/NDJSON_data2.json' # Filtered newline-delimited JSON path
output_file = 'Data/DateCorrection.json' # JSON with date correction
date_correction(input_file, output_file) # Function call - Date format correction
# Organization_name correction _________________________________________________________________________________________
input_file = 'Data/DateCorrection.json' # JSON with date correction
output_file = 'Data/OrgaName.json' # JSON with organization_name correction
fill_organization_name(input_file, output_file) # Function call - Organization_name correction
# Correct tag format ___________________________________________________________________________________________________
input_file = 'Data/OrgaName.json' # JSON with organization_name correction
output_file = 'Data/Tag.json' # JSON with tags correction - Output
convert_tags_to_list(input_file, output_file) # Function call - Correct tag format
# Lower case Correction ________________________________________________________________________________________________
input_path = 'Data/Tag.json'
output_path = 'Data/Lowercase.json'
fields_to_lowercase = ['priority', 'status', 'type', 'severity']
transform_fields_to_lowercase(input_path, output_path, fields_to_lowercase)
# Underscore Correction ________________________________________________________________________________________________
input_path = 'Data/Lowercase.json'
output_path = 'Data/Underscore.json'
fields_to_underscore = ['level', 'type_custom']
transform_to_underscore(input_path, output_path, fields_to_underscore)
# Satisfaction rating Correction _______________________________________________________________________________________
input_path = 'Data/Underscore.json'
output_path = 'Data/SatisRating.json'
fields_satisfaction = ['satisfaction_rating']
transform_satisfaction_rating(input_path, output_path, fields_satisfaction)
# Boolean Correction ___________________________________________________________________________________________________
input_path = 'Data/SatisRating.json'
output_path = 'Data/Boolean.json'
fields_boolean = ['NT_partner','NT_customer']
transform_to_boolean(input_path, output_path, fields_boolean)
# Null value Correction ________________________________________________________________________________________________
input_path = 'Data/Boolean.json'
output_path = 'Data/Output.json'
fields_with_dashes = ['product_feedback_status', 'pod', 'jira_url', 'categories', 'level', 'type_custom', 'severity', 'NT_partner', 'NT_customer', 'group', 'priority']
replace_dashes_with_null(input_path, output_path, fields_with_dashes)
print("... Transformation complete ...")
# BigQuery client initialization via the service-account key ___________________________________________________________
client = bigquery.Client(project=project_id) # Instantiating the BigQuery client
# Verification Dataset & Table _________________________________________________________________________________________
dataset_ref = client.dataset(dataset_id) # Dataset definition
table_ref = dataset_ref.table(table_name) # Table definition
check_dataset(dataset_ref, client) # Check dataset existence
check_table(table_ref, client) # Check table existence
# Load job configuration ______________________________________________________________________________________________
print("... Confirguration ...")
job_config = bigquery.LoadJobConfig() # Instantiate the load job configuration
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON # Declare the data format
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND # Append to the existing table contents
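# Note: WRITE_APPEND assumes the destination table already exists (see check_table above).
# If it may be missing, one option is schema autodetection, which lets the load job
# create the table itself:
#   job_config.autodetect = True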
print("... Configuration terminé ...")
# Loading JSON data into the BigQuery table ___________________________________________________________________________
json_file_path = 'Data/Output.json'
print("... Début du chargement des données ...")
with open(json_file_path, "rb") as source_file: # rb = read binary => better compatibility and integrity of data
    load_job = client.load_table_from_file(source_file, table_ref, job_config=job_config) # Push the data to BigQuery
    load_job.result() # Wait for the load job to finish
print("... Data load complete ...")
print(f"The data from {json_file_path} was successfully loaded into {table_name}")
client.close() # Close connection