process_data.py
import os
import sys
import argparse

# Add the project root directory to sys.path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
if project_root not in sys.path:
    print(project_root)
    sys.path.insert(0, project_root)

from src.linking.generate_links import generate_links
from src.run_parsers import parse_datasets
from src.loadprofiles import generate_loadprofiles
from src.generate_metadata import generate_metadata
from src.generate_consumption_data import generate_consumption_data
from src.database_reset import reset_database
from scripts.generate_training_data import generate_training_data
from src.remove_devices import remove_devices
from pathlib import Path
import gc

from configs import pipeline_config as config
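# The configuration module imported above is expected to define the values read
# later in this script. A minimal sketch with purely illustrative example values
# (only the attribute names are taken from this script):
#
#   RAW_DATA_PATH = "data/raw"                    # folder with the raw datasets
#   PARSED_DATA_PATH = "data/parsed"              # output folder for the parsed pickle files
#   LOADPROFILES_PATH = "data/loadprofiles"       # output folder for the loadprofiles
#   METADATA_PATH = "data/metadata"               # folder containing the dataset metadata
#   GENERATED_METADATA_PATH = "data/generated"    # output folder for the generated metadata
#   CONSUMPTION_DATA_PATH = "data/consumption"    # output folder for the consumption data
#   MODEL_PATH = "models/pretrained"              # folder containing the pretrained model
#   LABELS_PATH = "models/labels.pkl"             # path to the labels
#   PREDICTED_APPLIANCES_PATH = "data/predicted"  # output folder for the predicted appliances
#   KNOWLEDGE_GRAPH_ENDPOINT = "http://localhost:7200/repositories/energy"  # SPARQL endpoint
#   POSTGRES_URL = "postgresql://user:pass@localhost:5432/energy"           # database connection URL
#   DATASETS = ["REFIT", "ECO"]                   # default datasets when neither CLI flag is passed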
if __name__ == "__main__":
    # Get the --sample and --full options from the command line:
    # if --sample is passed only the sample data is used,
    # if --full is passed the full data is used.
    parser = argparse.ArgumentParser(description='Process data for the energy knowledge graph')
    parser.add_argument('--sample', action='store_true', help='Use sample data')
    parser.add_argument('--full', action='store_true', help='Use full data')
    args = parser.parse_args()
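    # Example invocations (a sketch; adjust the path to wherever this script is
    # kept in the repository):
    #   python process_data.py --sample   # process only the sample datasets
    #   python process_data.py --full     # process the full list of datasets
    #   python process_data.py            # fall back to the datasets listed in the config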
    if args.sample:
        datasets = [
            "REFIT",
            "ECO",
            "HES",
            "UKDALE",
            "HUE",
            "LERTA",
            "UCIML",
            "DEKN",
            "SUST1",
            "SUST2",
            "HEART",
            "ENERTALK",
            "DEDDIAG",
            "IDEAL",
        ]
    elif args.full:
        datasets = [
            "REFIT",
            "ECO",
            "HES",
            "UKDALE",
            "HUE",
            "LERTA",
            "UCIML",
            "DRED",
            "REDD",
            "IAWE",
            "DEKN",
            "SUST1",
            "SUST2",
            "HEART",
            "ENERTALK",
            "DEDDIAG",
            "IDEAL",
            "ECDUY",
            "PRECON",
            "EEUD",
        ]
    else:
        datasets = config.DATASETS
    # path to the raw data folder
    raw_data_path: Path = Path(config.RAW_DATA_PATH).resolve()

    # path to the folder to save the parsed data
    parsed_data_path: Path = Path(config.PARSED_DATA_PATH).resolve()
    if not parsed_data_path.exists():
        parsed_data_path.mkdir()

    # path to the folder to save the loadprofiles
    loadprofiles_path: Path = Path(config.LOADPROFILES_PATH).resolve()
    if not loadprofiles_path.exists():
        loadprofiles_path.mkdir()

    # path to the folder containing the metadata
    metadata_path: Path = Path(config.METADATA_PATH).resolve()

    # path to the folder to save the generated metadata
    generated_metadata_path: Path = Path(config.GENERATED_METADATA_PATH).resolve()
    if not generated_metadata_path.exists():
        generated_metadata_path.mkdir()

    # path to the folder to save the consumption data
    consumption_data_path: Path = Path(config.CONSUMPTION_DATA_PATH).resolve()
    if not consumption_data_path.exists():
        consumption_data_path.mkdir()

    # path to the folder containing the pretrained model
    model_path: Path = Path(config.MODEL_PATH).resolve()

    # path to the labels
    labels_path: Path = Path(config.LABELS_PATH).resolve()

    # path to the folder to save the predicted appliances
    predicted_appliances_path: Path = Path(config.PREDICTED_APPLIANCES_PATH).resolve()
    if not predicted_appliances_path.exists():
        predicted_appliances_path.mkdir()

    # endpoint of the knowledge graph where the data will be inserted
    knowledge_graph_endpoint = config.KNOWLEDGE_GRAPH_ENDPOINT
    # pipeline steps to run
    steps = config.STEPS

    # imported lazily so that TensorFlow does not need to be installed
    # when the "predict-devices" step is not used
    if "predict-devices" in steps:
        from src.label_datasets import get_predicted_appliances
    if "add-predicted-devices" in steps:
        from src.add_predicted_devices import add_predicted_devices

    # datasets on which to predict appliances
    predict_datasets = config.PREDICT_DATASETS
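    # PREDICT_DATASETS is assumed to be a list of dataset names like the ones
    # above, e.g. (illustrative only):
    #   PREDICT_DATASETS = ["LERTA", "HUE"]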
# static dictionary to store the descriptions of the steps to be printed to the console
step_descriptions = {
"parse" : "Parsing the raw datasets and storing them as pickle files.....",
"loadprofiles" : "Generating loadprofiles from the parsed datasets.....",
"metadata" : "Generating metadata from the parsed datasets.....",
"consumption-data" : "Generating household and appliance average daily and on/off consumption data.....",
"db-reset" : "Resetting the database if it's not empty and populating it with the generated data.....",
"generate-links" : "Generating links between our knowledge graph and WikiData and DBpedia.....",
"predict-devices" : "Predicting devices for the specified unlablled datasets and storing them as a pickle file.....",
"add-predicted-devices" : "Adding the predicted devices to the knowledge graph....."
}
    # maps each step name to the function that executes it
    functions = {
        "parse": lambda: parse_datasets(raw_data_path, parsed_data_path, datasets),
        "loadprofiles": lambda: generate_loadprofiles(parsed_data_path, loadprofiles_path, datasets),
        "metadata": lambda: generate_metadata(metadata_path, generated_metadata_path, datasets),
        "consumption-data": lambda: generate_consumption_data(parsed_data_path, consumption_data_path, datasets),
        "db-reset": lambda: reset_database(generated_metadata_path / "residential_metadata.parquet",
                                           loadprofiles_path / "merged_loadprofiles.pkl",
                                           consumption_data_path / "consumption_data.pkl", datasets,
                                           config.POSTGRES_URL),
        "generate-links": lambda: generate_links(knowledge_graph_endpoint),
        "predict-devices": lambda: (
            get_predicted_appliances(parsed_data_path, model_path, labels_path, predicted_appliances_path,
                                     predict_datasets)),
        "add-predicted-devices": lambda: add_predicted_devices(predicted_appliances_path,
                                                               graph_endpoint=knowledge_graph_endpoint)
    }
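    # config.STEPS selects which of the steps above run and in what order; an
    # illustrative full-pipeline value (any subset of the keys above is assumed to work):
    #   STEPS = ["parse", "loadprofiles", "metadata", "consumption-data",
    #            "db-reset", "generate-links", "predict-devices", "add-predicted-devices"]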
    # run the selected steps in order and free memory between them
    for step in steps:
        print("********************************************************************************************\n",
              step_descriptions[step])
        functions[step]()
        gc.collect()