-
Notifications
You must be signed in to change notification settings - Fork 1
/
convertor.py
243 lines (200 loc) · 8.02 KB
/
convertor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""Data Files Convertor
This script allows the user to convert different data types.
This tool accepts comma separated value files (.csv) as well as apache parquet
(.parquet) files. It is assumed that the first row of the csv file contains the
column names.
This script requires that `pandas`, `pyarrow`, `argparse` and `pathlib` be installed within the Python
environment you are running this script in.
This file can also be imported as a module and contains the following
functions:
* convert_csv_to_parquet - convert csv to parquet and save to file
* convert_parquet_to_csv - convert parquet to csv and save to file
* get_parquet_schema - returns schema of parquet file
* get_filename_with_suffix - returns filename string with an added suffix and a changed extension
* is_file_ext_correct - returns True if filename has correct file extension and prints message otherwise
* print_success_message - prints message of successful conversion with elapsed time
* construct_argument_parser - constructs the argument parser
* main - the main function of the script
"""
# import the necessary packages
import pandas as pd
import pyarrow
import argparse
import pathlib
import time
# define functions for convert
def convert_csv_to_parquet(csv_path: str, parquet_path: str, delimiter=',') -> bool:
    """Convert csv to parquet and save to file

    Parameters
    ----------
    csv_path: str
        The file name of the csv
    parquet_path: str
        The file name of the parquet
    delimiter: str, optional
        Delimiter to use in parsing engine (default is ',')

    Returns
    -------
    bool
        True if the parquet file was written, False if reading the csv or
        writing the parquet failed (the error is printed, not raised)
    """
    try:
        df = pd.read_csv(csv_path, sep=delimiter)
        df.to_parquet(parquet_path)
    except Exception as e:
        # Best-effort on purpose: report the problem instead of aborting,
        # but tell the caller that nothing was converted.
        print(e)
        return False
    return True
def convert_parquet_to_csv(parquet_path: str, csv_path: str, delimiter=',') -> bool:
    """Convert parquet to csv and save to file

    Parameters
    ----------
    parquet_path: str
        The file name of the parquet
    csv_path: str
        The file name of the csv
    delimiter: str, optional
        Field separator written to the csv file (default is ',')

    Returns
    -------
    bool
        True if the csv file was written, False if reading the parquet or
        writing the csv failed (the error is printed, not raised)
    """
    try:
        df = pd.read_parquet(parquet_path)
        # index=False keeps the DataFrame index out of the csv output.
        df.to_csv(csv_path, sep=delimiter, index=False)
    except Exception as e:
        # Best-effort on purpose: report the problem instead of aborting,
        # but tell the caller that nothing was converted.
        print(e)
        return False
    return True
def get_parquet_schema(parquet_path: str) -> "pyarrow.Schema | None":
    """Get schema of parquet file

    Parameters
    ----------
    parquet_path: str
        The file name of the parquet

    Returns
    -------
    pyarrow.Schema or None
        The schema of the table built from the file's data (printing it
        gives a readable listing), or None if the file could not be read
        (the error is printed, not raised)
    """
    try:
        df = pd.read_parquet(parquet_path)
        return pyarrow.Table.from_pandas(df=df).schema
    except Exception as e:
        # Returning None keeps a read failure from surfacing as an unrelated
        # UnboundLocalError on a never-assigned local.
        print(e)
        return None
# define functions for working with filenames
def get_filename_with_suffix(filename: str, suffix: str, extension: str) -> str:
    """Add suffix for filename and change extension

    Only the final path component is used: any directory part and the last
    extension of `filename` are dropped before the new name is built.

    Parameters
    ----------
    filename: str
        The file name string
    suffix: str
        The suffix which should be added at the end of filename
    extension: str
        New extension of filename, without the leading dot

    Returns
    -------
    str
        filename string with added suffix and new file extension,
        formatted as ``<stem>_<suffix>.<extension>``

    Raises
    ------
    TypeError
        If `filename` is not a string or path-like object
    """
    # No try/except here: swallowing the error would only leave the result
    # undefined and hide the real cause from the caller.
    stem = pathlib.Path(filename).stem
    return f'{stem}_{suffix}.{extension}'
def is_file_ext_correct(parameter: str, filename: str, extension: str) -> bool:
    """Returns True if filename has correct file extension and prints message otherwise

    Parameters
    ----------
    parameter: str
        The name of parameter used to print in error message
    filename: str
        The file name string
    extension: str
        File extension without dot (.) used to compare with filename

    Returns
    -------
    bool
        A flag telling whether the given filename has the expected extension
        (the comparison is case-sensitive)
    """
    try:
        actual_suffix = pathlib.Path(filename).suffix
    except TypeError:
        # A value that is not a path at all cannot have the right extension.
        actual_suffix = None

    # Compared directly rather than with `assert`, which is removed when
    # Python runs with -O and would make every file look valid.
    if actual_suffix == f'.{extension}':
        return True

    print(f'Wrong argument for --{parameter}. You must specify *.{extension} file for input')
    return False
# define other functions
def print_success_message(inputFilename: str, outputFilename: str, time_start: float):
    """Report a finished conversion together with the time it took

    Parameters
    ----------
    inputFilename : str
        Name of the file that was read
    outputFilename : str
        Name of the file that was produced
    time_start : float
        `time.perf_counter()` reading taken when the work began; the elapsed
        time is measured from this value up to the moment of the call
    """
    seconds_taken = time.perf_counter() - time_start
    report = f'Successfully converted from {inputFilename} to {outputFilename} in {seconds_taken:.4f} secs'
    print(report)
def construct_argument_parser() -> dict:
    """Build the command-line parser and parse the current arguments

    The module docstring is used as the program description shown by --help.

    Returns
    -------
    dict
        Mapping of option name (`csv2parquet`, `parquet2csv`, `get_schema`,
        `output`, `delimiter`) to the value given on the command line;
        options that were not passed are None, except `delimiter`, which
        defaults to ','
    """
    # (short flag, long flag, extra keyword arguments) for every option,
    # listed in the order they should appear in the help output.
    option_table = (
        ("-cp", "--csv2parquet",
         {"help": "Convert csv to parquet. Set input csv filename string (example: data.csv)"}),
        ("-pc", "--parquet2csv",
         {"help": "Convert parquet to csv. Set input parquet filename string (example: data.parquet)"}),
        ("-s", "--get_schema",
         {"help": "Get schema of parquet file. Set input parquet filename string (example: data.parquet)"}),
        ("-o", "--output",
         {"help": "Set output file name without extension (example: newfile)"}),
        ("-d", "--delimiter",
         {"default": ",", "help": "Set delimiter for csv file (default: ,)"}),
    )

    parser = argparse.ArgumentParser(description=__doc__)
    for short_flag, long_flag, extra in option_table:
        parser.add_argument(short_flag, long_flag, type=str, **extra)

    parsed = parser.parse_args()
    return vars(parsed)
def _run_conversion(converter, option: str, args: dict, input_ext: str,
                    output_ext: str, time_start: float):
    """Validate the input file, pick the output name and run one conversion

    Parameters
    ----------
    converter : callable
        Function called as ``converter(input, output[, delimiter=...])``
    option : str
        Name of the command-line option holding the input filename
    args : dict
        Parsed command-line arguments
    input_ext : str
        Extension (without dot) the input file must have
    output_ext : str
        Extension (without dot) given to the output file
    time_start : float
        Start of time countdown used for calculating elapsed time
    """
    inputFilename = args[option]
    if not is_file_ext_correct(option, inputFilename, input_ext):
        return

    # An explicit --output wins; otherwise derive the name from the input.
    if args['output']:
        outputFilename = args['output'] + '.' + output_ext
    else:
        outputFilename = get_filename_with_suffix(inputFilename, 'converted', output_ext)

    # Forward the delimiter only when one is set, so an empty value falls
    # back to the converter's own default.
    extra = {'delimiter': args['delimiter']} if args['delimiter'] else {}
    outcome = converter(inputFilename, outputFilename, **extra)

    # Only an explicit False counts as a failure; a converter that returns
    # nothing is treated as having succeeded.
    if outcome is not False:
        print_success_message(inputFilename, outputFilename, time_start)


def main():
    """Parse the command line and run the requested action

    Exactly one action is performed, chosen in this order of precedence:
    --csv2parquet, --parquet2csv, --get_schema. If none of them is given,
    a short usage hint is printed instead.
    """
    # save start time for calculating elapsed time (includes argument parsing)
    time_start = time.perf_counter()
    args = construct_argument_parser()

    if args['csv2parquet']:
        _run_conversion(convert_csv_to_parquet, 'csv2parquet', args,
                        'csv', 'parquet', time_start)
    elif args['parquet2csv']:
        _run_conversion(convert_parquet_to_csv, 'parquet2csv', args,
                        'parquet', 'csv', time_start)
    elif args['get_schema']:
        if is_file_ext_correct('get_schema', args['get_schema'], 'parquet'):
            schema = get_parquet_schema(args['get_schema'])
            # Print only when a schema was actually obtained.
            if schema is not None:
                print(schema)
    else:
        # no action option was passed
        print('Please, pass one of the necessary arguments for convertion:\n--csv2parquet\n--parquet2csv\n--get_schema '
              '\n(example: --csv2parquet data.csv) '
              '\n\nType --help for description of parameters.')


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()