Skip to content
This repository has been archived by the owner on Sep 27, 2023. It is now read-only.

Commit

Permalink
Add xlsx-workbook2csv-table converter (#1191)
Browse files Browse the repository at this point in the history
  • Loading branch information
dchassin authored Aug 1, 2022
1 parent ceb9fe1 commit 0ebaf45
Show file tree
Hide file tree
Showing 6 changed files with 523 additions and 19 deletions.
4 changes: 4 additions & 0 deletions converters/Makefile.mk
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ dist_pkgdata_DATA += converters/json2zip.py
dist_pkgdata_DATA += converters/xls2csv.py
dist_pkgdata_DATA += converters/xls-spida2csv-geodata.py

# xlsx -> csv
dist_pkgdata_DATA += converters/xlsx2csv.py
dist_pkgdata_DATA += converters/xlsx-workbook2csv-table.py

#
# SUPPORT MODULES
#
Expand Down
222 changes: 222 additions & 0 deletions converters/xlsx-workbook2csv-table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
"""Convert XLS worksheets to CSV tables
SYNOPSIS
Shell:
$ gridlabd convert WORKBOOK.xls TABLE.csv -f xlsx-workbook -t csv-table [OPTIONS ...]
GLM:
#convert WORKBOOK.xls TABLE.csv -f xlsx-workbook -t csv-table [OPTIONS ...]
DESCRIPTION
This converter extracts worksheets in an XLS workbook to CSV tables. If more
than one worksheet is found and more than one worksheet matches table.pattern
option, then the name TABLE is used as a root and the worksheet name is appended
to the CSV filename when saving each worksheet.
The read.converters option may be used to specify converters for individual columns
of worksheets. The converters are specified in a JSON file as a dict where the
original column name is specified and a gridlabd data type is associated, i.e.,
"gridlabd.double", "gridlabd.complex", "gridlabd.int64", "gridlabd.int32",
"gridlabd.int16", or "gridlabd.timestamp". You may also specify python or numpy
data types, e.g., "float" or "numpy.float64", etc. Failsafe converters are provided
by "failsafe.float" by "failsafe.int". These return invalid.float and invalid.int,
respectively, if the data conversion fails.
OPTIONS:
table.fixnames {<regex>,<bool>} (default False)
Removes characters matching the pattern. If True is used the pattern
[^A-Za-z0-9] is used.
table.pattern <regex> (default None)
Saves only worksheets when names that match the pattern. None saves
all worksheets found. Only used when more than one worksheet is found.
invalid.float <str> (default 'nan')
invalid.int <str> (default '-1')
glm.class <str> (default None)
Class to use when outputing the GLM file. If None is used, no GLM
file is output.
read options (see pandas read_excel options):
read.sheet_name <str> (default None)
read.header <int> (default 0)
read.names <str>[,...] (default None)
read.index_col <int>[,...] (default None)
read.usecols <int>[,...] (default None)
read.skiprows <int> (default 0)
read.nrows <int> (default None)
read.parse_dates <bool> (default False)
read.engine <str> (default openpyxl)
read.converters <file> (default None)
write options (see pandas dataframe to_csv options):
write.header <bool> (default True)
write.index <bool> (default False)
"""
import os, sys
import pandas as pd
import warnings
import re
import json
import numpy

class Xlsx2csvConverter(Exception):
pass

default_options = {
"table" : {
"fixnames" : False,
"pattern" : None,
},
"glm" : {
"class" : None,
},
"datetime" : {
"read" : "%Y-%m-%d %H:%M:%S %Z",
},
"invalid" : {
"float" : 'nan',
"int" : '-1',
},
"read" : {
"sheet_name" : None,
"header" : 0,
"names" : None,
"index_col" : None,
"usecols" : None,
"skiprows" : 0,
"nrows" : None,
"parse_dates" : False,
"engine" : "openpyxl",
"converters" : None, # TODO
},
"write" : {
"header" : True,
"index" : False,
}
}

excel_types = {
"float64" : "double",
"int64" : "int64",
"datetime64[ns]" : "timestamp",
}

def convert(input_file, output_file, options={}):

# check input
if not input_file.endswith(".xlsx"):
raise Xlsx2csvConverter(f"input file '{input_file}' must have xlsx extension")

# check output
if not output_file.endswith(".csv"):
raise Xlsx2csvConverter(f"output file '{output_file}' must have csv extension")

# check options
for key,value in options.items():
spec = key.split(".")
if len(spec) != 2 or not spec[1] in default_options[spec[0]].keys():
raise Xlsx2csvConverter(f"option '{key}={value}' is not valid")
if value == None:
value = ["True"]
elif not type(value) is str:
value = [str(value)]
elif "," in value:
value = value.split(",")
else:
value = [value]

# retype data
def autotype(x):
if x.lower() == "true":
return True
if x.lower() == "false":
return False
if x.lower() == "none":
return None
try:
return int(x)
except:
pass
try:
return float(x)
except:
pass
return x
default_options[spec[0]][spec[1]] = [autotype(x) for x in value]

# collapse non-str lists of len 1
if len(default_options[spec[0]][spec[1]]) == 1 and not type(default_options[spec[0]][spec[1]]) is str:
default_options[spec[0]][spec[1]] = default_options[spec[0]][spec[1]][0]

# setup converters if needed
class failsafe:
def float(x):
try:
return float(x)
except:
return float(default_options["invalid"]["float"])
def int(x):
try:
return int(x)
except:
return int(default_options["invalid"]["int"])

if default_options["read"]["converters"]:
with open(default_options["read"]["converters"],"r") as fh:
import gridlabd
default_options["read"]["converters"] = {}
for key,value in json.load(fh).items():
default_options["read"]["converters"][key] = eval(value)

# read input
with warnings.catch_warnings(record=True):
warnings.simplefilter("always")
book = pd.read_excel(input_file,**default_options["read"])

# fix names
def typeof(df,col):
try:
return excel_types[str(df.dtypes[col])]
except:
return "string"

def writecsv(book,file,**options):
fixed = []
if options["table"]["fixnames"]:
if type(options["table"]["fixnames"]) is str:
fixpattern = options["table"]["fixnames"]
else:
fixpattern = r'[^a-zA-Z0-9_]'
book.columns = [re.sub(re.compile(fixpattern),'',x) for x in book.columns]
book.to_csv(file,**options["write"])
if options["glm"]["class"]:
glmname = file[:-4]+".glm"
with open(glmname,"w") as glm:
glm.write(f"class {options['glm']['class']} {{\n")
for name in book.columns:
glm.write(f"\t{typeof(book,name)} {name};\n")
glm.write("}\n")

# write output
if len(book.keys()) > 1:
if options["table"]["pattern"]:
pattern = re.complex(options["table"]["pattern"])
else:
pattern = None
for name,sheet in book.items():
if pattern == None or re.match(pattern,name):
file = output_file.replace(".csv","_"+name+".csv")
writecsv(sheet,file,**default_options)
else:
name = list(book.keys())[0]
writecsv(book[name],output_file,**default_options)
91 changes: 91 additions & 0 deletions converters/xlsx2csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import json
import os
import sys, getopt
from datetime import datetime
import importlib, copy
from importlib import util

config = {
"input" : "xlsx",
"output" : "csv",
"from" : ["workbook"],
"type" : ["table"],
}

def help():
print('Syntax:')
print(f'{config["input"]}2{config["output"]}.py -i|--ifile <input-file>[,<input-file>[,...]] -o|--ofile <output-file> [options ...]')
print(f' -c|--config : [OPTIONAL] display converter configuration')
print(f' -i|--ifile : [REQUIRED] {config["input"]} input file name')
print(f' -o|--ofile : [REQUIRED] {config["output"]} output file name')
print(f' -f|--from : [REQUIRED] input {config["input"]} data type')
print(f' -t|--type : [REQUIRED] output {config["output"]} data type')
print(f' -r|--read : [OPTIONAL] set converter read option')
print(f' -w|--write : [OPTIONAL] set converter write option')

def error(msg):
print(f'ERROR [{config["input"]}2{config["output"]}]: {msg}')
sys.exit(1)

input_file = None
input_type = None
output_file = None
output_type = None
options = {}

opts, args = getopt.getopt(sys.argv[1:],"hci:o:f:t:r:w:",["help","config","ifile=","ofile=","from=","type=","read=","write="])

if not opts :
help()
sys.exit(1)

for opt, arg in opts:
if opt in ("-h","--help"):
help()
sys.exit(0)
elif opt in ("-c","--config"):
print(json.dumps(config))
sys.exit(0)
elif opt in ("-i", "--ifile"):
input_file = arg.strip()
elif opt in ("-o", "--ofile"):
output_file = arg.strip()
elif opt in ("-f","--from"):
input_type = arg.strip()
elif opt in ("-t","--type"):
output_type = arg.strip()
elif opt in ("-r","--read"):
spec = arg.split("=")
if len(spec) == 1:
options[f"read.{arg}"] = True
elif len(spec) == 2:
options[f"read.{spec[0]}"] = spec[1]
else:
options[f"read.{spec[0]}"] = "=".join(spec[1:])
elif opt in ("-w","--write"):
spec = arg.split("=")
if len(spec) == 1:
options[f"write.{arg}"] = True
elif len(spec) == 2:
options[f"write.{spec[0]}"] = spec[1]
else:
options[f"write.{spec[0]}"] = "=".join(spec[1:])
else:
error(f"{opt}={arg} is not a valid option");

if input_file == None:
error("missing input file name")
elif output_file == None:
error("missing output file name")
elif input_type == None:
error("missing input data type")
elif output_type == None:
error("missing output data type")

modname = sys.argv[0].replace(f'{config["input"]}2{config["output"]}.py',f'{config["input"]}-{input_type}2{config["output"]}-{output_type}.py')
if os.path.exists(modname):
modspec = util.spec_from_file_location(output_type, f"{modname}.py")
mod = importlib.import_module(f'{config["input"]}-{input_type}2{config["output"]}-{output_type}')
mod.convert(input_file,output_file,options)
else:
error(f"{modname} not found")
11 changes: 10 additions & 1 deletion docs/Module/Python.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,16 @@ Units:
~~~
>>> gridlabd.convert_unit(string,to)
>>> gridlabd.convert_unit(real,from,to)
~~~
Converters:
~~~
>>> gridlabd.bool(string)
>>> gridlabd.int16(string)
>>> gridlabd.int32(string)
>>> gridlabd.int64(string)
>>> gridlabd.timestamp(string)
>>> gridlabd.double(string)
>>> gridlabd.complex(string)
~~~

# Description
Expand Down
Loading

0 comments on commit 0ebaf45

Please sign in to comment.