Add xlsx-workbook2csv-table converter (#1191)

arras-energy · Aug 1, 2022 · 0ebaf45 · 0ebaf45
1 parent ceb9fe1
commit 0ebaf45
Show file tree

Hide file tree

Showing 6 changed files with 523 additions and 19 deletions.
diff --git a/converters/Makefile.mk b/converters/Makefile.mk
@@ -77,6 +77,10 @@ dist_pkgdata_DATA += converters/json2zip.py
 dist_pkgdata_DATA += converters/xls2csv.py
 dist_pkgdata_DATA += converters/xls-spida2csv-geodata.py
 
+# xlsx -> csv
+dist_pkgdata_DATA += converters/xlsx2csv.py
+dist_pkgdata_DATA += converters/xlsx-workbook2csv-table.py
+
 #
 # SUPPORT MODULES
 #

diff --git a/converters/xlsx-workbook2csv-table.py b/converters/xlsx-workbook2csv-table.py
@@ -0,0 +1,222 @@
+"""Convert XLSX worksheets to CSV tables
+
+SYNOPSIS
+
+	Shell:
+		$ gridlabd convert WORKBOOK.xlsx TABLE.csv -f xlsx-workbook -t csv-table [OPTIONS ...]
+
+	GLM:
+		#convert WORKBOOK.xlsx TABLE.csv -f xlsx-workbook -t csv-table [OPTIONS ...]
+
+DESCRIPTION
+
+	This converter extracts worksheets in an XLSX workbook to CSV tables. If more
+	than one worksheet is found, then the name TABLE is used as a root and the name
+	of each worksheet that matches the table.pattern option is appended to the CSV
+	filename when saving that worksheet.
+
+	The read.converters option may be used to specify converters for individual columns
+	of worksheets.  The converters are specified in a JSON file as a dict where the 
+	original column name is specified and a gridlabd data type is associated, i.e.,
+	"gridlabd.double", "gridlabd.complex", "gridlabd.int64", "gridlabd.int32", 
+	"gridlabd.int16", or "gridlabd.timestamp". You may also specify python or numpy
+	data types, e.g., "float" or "numpy.float64", etc.  Failsafe converters are provided
+	by "failsafe.float" and "failsafe.int". These return invalid.float and invalid.int,
+	respectively, if the data conversion fails.
+
+OPTIONS:
+
+	table.fixnames {<regex>,<bool>} (default False)
+
+		Removes characters matching the pattern from the column names. If True
+		is used the pattern [^a-zA-Z0-9_] is used.
+
+	table.pattern <regex> (default None)
+
+		Saves only worksheets whose names match the pattern. None saves
+		all worksheets found. Only used when more than one worksheet is found.
+
+	invalid.float <str> (default 'nan')
+	invalid.int <str> (default '-1')
+
+	glm.class <str> (default None)
+
+		Class to use when outputting the GLM file.  If None is used, no GLM
+		file is output.
+
+	read options (see pandas read_excel options):
+
+		read.sheet_name <str> (default None)
+		read.header <int> (default 0)
+		read.names <str>[,...] (default None)
+		read.index_col <int>[,...] (default None)
+		read.usecols <int>[,...] (default None)
+		read.skiprows <int> (default 0)
+		read.nrows <int> (default None)
+		read.parse_dates <bool> (default False)
+		read.engine <str> (default openpyxl)
+		read.converters <file> (default None)
+
+	write options (see pandas dataframe to_csv options):
+
+		write.header <bool> (default True)
+		write.index <bool> (default False)
+"""
+import os, sys
+import pandas as pd 
+import warnings
+import re
+import json
+import numpy
+
+class Xlsx2csvConverter(Exception):
+	"""Error raised by the converter for invalid file names or options"""
+
+# Converter options grouped by category.  User options are given to convert()
+# as "<group>.<name>" keys and must match an entry below; convert() stores the
+# user-supplied values into this dict in place of the defaults.
+default_options = {
+	# worksheet selection and column name cleanup
+	"table" : {
+		"fixnames" : False,
+		"pattern" : None,
+	},
+	# optional GLM class file written next to each CSV file
+	"glm" : {
+		"class" : None,
+	},
+	# NOTE(review): this group is not referenced by the code in this file
+	"datetime" : {
+		"read" : "%Y-%m-%d %H:%M:%S %Z",
+	},
+	# values returned by the failsafe converters when a conversion fails
+	"invalid" : {
+		"float" : 'nan',
+		"int" : '-1',
+	},
+	# keyword arguments passed to pandas.read_excel()
+	"read" : {
+		"sheet_name" : None,
+		"header" : 0,
+		"names" : None,
+		"index_col" : None,
+		"usecols" : None,
+		"skiprows" : 0,
+		"nrows" : None,
+		"parse_dates" : False,
+		"engine" : "openpyxl",
+		"converters" : None, # TODO
+	},
+	# keyword arguments passed to DataFrame.to_csv()
+	"write" : {
+		"header" : True,
+		"index" : False,
+	}
+}
+
+# Maps the name of a pandas column dtype to the property type written to the
+# GLM class file; convert() writes "string" for any dtype not listed here.
+excel_types = {
+	"float64" : "double",
+	"int64" : "int64",
+	"datetime64[ns]" : "timestamp",
+}
+
+def convert(input_file, output_file, options=None):
+	"""Convert the worksheets of an XLSX workbook to CSV tables
+
+	ARGUMENTS
+
+		input_file (str)   name of the workbook to read, must end with ".xlsx"
+
+		output_file (str)  name of the CSV file to write, must end with ".csv";
+		                   when more than one worksheet is read the worksheet
+		                   name is appended to the root of this name
+
+		options (dict)     converter options keyed by "<group>.<name>", where the
+		                   group and name must exist in default_options; string
+		                   values are split on commas and retyped to bool, None,
+		                   int, or float when possible
+
+	RAISES
+
+		Xlsx2csvConverter  a file name has the wrong extension, an option is
+		                   not valid, or the workbook contains no worksheet
+
+	The defaults in default_options are copied on every call, so options given
+	to one call do not carry over to the next one.
+	"""
+	import copy
+
+	# check input
+	if not input_file.endswith(".xlsx"):
+		raise Xlsx2csvConverter(f"input file '{input_file}' must have xlsx extension")
+
+	# check output
+	if not output_file.endswith(".csv"):
+		raise Xlsx2csvConverter(f"output file '{output_file}' must have csv extension")
+
+	# private copy of the defaults so the module-level dict is never modified
+	settings = copy.deepcopy(default_options)
+
+	# retype data
+	def autotype(x):
+		keywords = {"true":True, "false":False, "none":None}
+		if x.lower() in keywords:
+			return keywords[x.lower()]
+		for cast in (int,float):
+			try:
+				return cast(x)
+			except ValueError:
+				pass
+		return x
+
+	# check options
+	for key,value in (options or {}).items():
+		spec = key.split(".")
+		# an unknown group is reported the same way as an unknown name
+		if len(spec) != 2 or spec[0] not in settings or spec[1] not in settings[spec[0]]:
+			raise Xlsx2csvConverter(f"option '{key}={value}' is not valid")
+		if value is None:
+			value = ["True"]
+		elif not isinstance(value,str):
+			value = [str(value)]
+		elif "," in value:
+			value = value.split(",")
+		else:
+			value = [value]
+		value = [autotype(x) for x in value]
+
+		# collapse lists of len 1 to the single value
+		settings[spec[0]][spec[1]] = value[0] if len(value) == 1 else value
+
+	# setup converters if needed
+	class failsafe:
+		# The method names intentionally mirror the builtins; inside the method
+		# bodies float/int still resolve to the builtins because class scope is
+		# not searched from within a function.
+		@staticmethod
+		def float(x):
+			try:
+				return float(x)
+			except Exception:
+				return float(settings["invalid"]["float"])
+		@staticmethod
+		def int(x):
+			try:
+				return int(x)
+			except Exception:
+				return int(settings["invalid"]["int"])
+
+	if settings["read"]["converters"]:
+		with open(settings["read"]["converters"],"r") as fh:
+			import gridlabd
+			specs = json.load(fh)
+		# WARNING: eval() executes whatever expression the converters file
+		# contains, so only converters files from trusted sources may be used.
+		# The names "failsafe" and "gridlabd" are provided explicitly; module
+		# globals such as numpy remain available as well.
+		namespace = {"failsafe":failsafe, "gridlabd":gridlabd}
+		converters = {}
+		for column,expression in specs.items():
+			converters[column] = eval(expression,globals(),namespace)
+		settings["read"]["converters"] = converters
+
+	# read input
+	with warnings.catch_warnings(record=True):
+		warnings.simplefilter("always")
+		book = pd.read_excel(input_file,**settings["read"])
+
+	# read_excel returns a single DataFrame instead of a dict of worksheets
+	# when read.sheet_name selects exactly one worksheet
+	if isinstance(book,pd.DataFrame):
+		book = {settings["read"]["sheet_name"] : book}
+	if not book:
+		raise Xlsx2csvConverter(f"input file '{input_file}' contains no worksheets")
+
+	# GLM type of a column
+	def typeof(df,col):
+		try:
+			return excel_types[str(df.dtypes[col])]
+		except (KeyError,TypeError):
+			return "string"
+
+	def writecsv(sheet,file):
+		# fix names
+		fixnames = settings["table"]["fixnames"]
+		if fixnames:
+			fixpattern = re.compile(fixnames if isinstance(fixnames,str) else r'[^a-zA-Z0-9_]')
+			# column names are not always strings, e.g., when read.header is None
+			sheet.columns = [fixpattern.sub('',str(x)) for x in sheet.columns]
+		sheet.to_csv(file,**settings["write"])
+		if settings["glm"]["class"]:
+			glmname = file[:-4]+".glm"
+			with open(glmname,"w") as glm:
+				glm.write(f"class {settings['glm']['class']} {{\n")
+				for name in sheet.columns:
+					glm.write(f"\t{typeof(sheet,name)} {name};\n")
+				glm.write("}\n")
+
+	# write output
+	if len(book) > 1:
+		if settings["table"]["pattern"]:
+			pattern = re.compile(str(settings["table"]["pattern"]))
+		else:
+			pattern = None
+		# only the trailing extension is replaced, not every ".csv" in the path
+		root = output_file[:-len(".csv")]
+		for name,sheet in book.items():
+			if pattern is None or pattern.match(str(name)):
+				writecsv(sheet,f"{root}_{name}.csv")
+	else:
+		writecsv(next(iter(book.values())),output_file)
diff --git a/converters/xlsx2csv.py b/converters/xlsx2csv.py
@@ -0,0 +1,91 @@
+import json 
+import os 
+import sys, getopt
+from datetime import datetime 
+import importlib, copy
+from importlib import util
+
+# Converter description: printed as JSON by the -c|--config option; the
+# "input" and "output" entries are also used to build the converter file names.
+config = {
+    "input" : "xlsx",
+    "output" : "csv",
+    "from" : ["workbook"],
+    "type" : ["table"],
+    }
+
+def help():
+    """Print the command line syntax of the converter to stdout"""
+    src = config["input"]
+    dst = config["output"]
+    usage = [
+        'Syntax:',
+        f'{src}2{dst}.py -i|--ifile <input-file>[,<input-file>[,...]] -o|--ofile <output-file> [options ...]',
+        '  -c|--config    : [OPTIONAL] display converter configuration',
+        f'  -i|--ifile     : [REQUIRED] {src} input file name',
+        f'  -o|--ofile     : [REQUIRED] {dst} output file name',
+        f'  -f|--from      : [REQUIRED] input {src} data type',
+        f'  -t|--type      : [REQUIRED] output {dst} data type',
+        '  -r|--read      : [OPTIONAL] set converter read option',
+        '  -w|--write     : [OPTIONAL] set converter write option',
+    ]
+    print("\n".join(usage))
+
+def error(msg):
+    """Print an error message tagged with the converter name and exit with code 1"""
+    tag = f'{config["input"]}2{config["output"]}'
+    print(f'ERROR    [{tag}]: {msg}')
+    sys.exit(1)
+
+# command line state; options collects "read.<name>" and "write.<name>" entries
+input_file = None
+input_type = None
+output_file = None
+output_type = None
+options = {}
+
+# an unknown or incomplete flag is reported as a converter error, not a traceback
+try:
+    opts, args = getopt.getopt(sys.argv[1:],"hci:o:f:t:r:w:",["help","config","ifile=","ofile=","from=","type=","read=","write="])
+except getopt.GetoptError as err:
+    error(err)
+
+if not opts:
+    help()
+    sys.exit(1)
+
+for opt, arg in opts:
+    if opt in ("-h","--help"):
+        help()
+        sys.exit(0)
+    elif opt in ("-c","--config"):
+        print(json.dumps(config))
+        sys.exit(0)
+    elif opt in ("-i", "--ifile"):
+        input_file = arg.strip()
+    elif opt in ("-o", "--ofile"):
+        output_file = arg.strip()
+    elif opt in ("-f","--from"):
+        input_type = arg.strip()
+    elif opt in ("-t","--type"):
+        output_type = arg.strip()
+    elif opt in ("-r","--read","-w","--write"):
+        # "<name>" alone sets the option to True; "<name>=<value>" keeps
+        # everything after the first "=" as the value
+        group = "read" if opt in ("-r","--read") else "write"
+        name, assigned, value = arg.partition("=")
+        options[f"{group}.{name}"] = value if assigned else True
+    else:
+        error(f"{opt}={arg} is not a valid option")
+
+if input_file is None:
+    error("missing input file name")
+elif output_file is None:
+    error("missing output file name")
+elif input_type is None:
+    error("missing input data type")
+elif output_type is None:
+    error("missing output data type")
+
+# the type-specific converter is expected in the same folder as this script
+converter = f'{config["input"]}-{input_type}2{config["output"]}-{output_type}'
+modname = os.path.join(os.path.dirname(sys.argv[0]),f"{converter}.py")
+if os.path.exists(modname):
+    # load the converter from the file that was just found rather than by
+    # name, so the result does not depend on the contents of sys.path
+    modspec = util.spec_from_file_location(converter, modname)
+    mod = util.module_from_spec(modspec)
+    modspec.loader.exec_module(mod)
+    mod.convert(input_file,output_file,options)
+else:
+    error(f"{modname} not found")
diff --git a/docs/Module/Python.md b/docs/Module/Python.md
@@ -66,7 +66,16 @@ Units:
 ~~~
   >>> gridlabd.convert_unit(string,to)
   >>> gridlabd.convert_unit(real,from,to)
-
+~~~
+Converters:
+~~~
+  >>> gridlabd.bool(string)
+  >>> gridlabd.int16(string)
+  >>> gridlabd.int32(string)
+  >>> gridlabd.int64(string)
+  >>> gridlabd.timestamp(string)
+  >>> gridlabd.double(string)
+  >>> gridlabd.complex(string)
 ~~~
 
 # Description