-
Notifications
You must be signed in to change notification settings - Fork 1
/
label_datasets.py
183 lines (146 loc) · 6.1 KB
/
label_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import argparse
import os
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
import configs.model_config as model_config
# min-max normalization Xmin=0
def normalize(X, max_value):
    """
    Scale the data by its largest value (min-max normalization with a minimum of 0)
    ## Parameters
    `X` : Numpy array containing the data; it is iterated along its first axis
    `max_value` : lower bound for the scaling factor; it is raised to the largest
    value found in `X` when the data exceeds it
    ## Returns
    `X` : `X` divided by the scaling factor, or `X` itself (unchanged) when the
    scaling factor is 0
    """
    # Fold starting from `max_value` so that ties keep the caller's value and the
    # comparison order is the same as a manual running-maximum loop.
    scale = max([max_value, *(np.max(chunk) for chunk in X)])
    # Avoid dividing by zero when there is no positive value to scale by.
    return X if scale == 0 else X / scale
def process_data(df: pd.DataFrame, time_window, upper_bound: pd.Timedelta,
                 max_gap: pd.Timedelta) -> tuple[np.ndarray, float]:
    """
    Resample the data to `model_config.SAMPLING_RATE`, fill gaps with the nearest value
    (at most 4 samples away, everything else becomes 0), clamp negative values to 0 and
    split the result into windows of `time_window` rows.
    A window is skipped if it contains a gap of `max_gap` or more, if it contains more
    than 15 gaps larger than `upper_bound`, or if its largest value is below 5
    (the device is considered always off).
    ## Parameters
    `df` : time-indexed data to process (the caller's object is not modified)
    `time_window` : size of the window in rows
    `upper_bound` : gap size; a window with more than 15 gaps larger than this is skipped
    `max_gap` : gap size; a window with a gap of this size or more is skipped
    ## Returns
    `windows` : Numpy array of the accepted windows (empty if none was accepted)
    `max_value` : largest value found in any accepted window, 0 if none was accepted
    """
    # `Resampler.nearest` is the non-deprecated spelling of
    # `Resampler.fillna(method="nearest", ...)`.
    df = df.resample(model_config.SAMPLING_RATE).nearest(limit=4)
    # Samples further than `limit` from any reading stay NaN after the nearest fill;
    # treat them as 0. Negative readings are clamped to 0 as well.
    df = df.fillna(0).clip(lower=0)

    windows = []
    # Running maximum over all accepted windows. Starting at 0 also keeps the return
    # value defined when every window is skipped.
    max_value = 0
    # NOTE(review): the step is `time_window + 1`, so one row is skipped between
    # consecutive windows, and a final window ending exactly on the last row is not
    # produced. Kept as-is - confirm this matches the preprocessing used for training.
    for start in range(0, len(df) - time_window, time_window + 1):
        window = df.iloc[start:start + time_window]

        # NOTE(review): the gaps are measured on the resampled index, which is expected
        # to be regularly spaced - confirm these checks can still trigger.
        time_diffs = window.index.to_series().diff().dropna()
        # if there is a gap of max_gap or more skip the window
        if (time_diffs >= max_gap).any():
            continue
        # if there are more than 15 gaps larger than upper_bound skip the window
        if (time_diffs > upper_bound).sum() > 15:
            continue

        window_values = window.to_numpy()
        window_max = np.max(window_values)
        # skip if the device is always off
        if window_max < 5:
            continue

        windows.append(window_values)
        max_value = max(max_value, window_max)
    return np.array(windows), max_value
def preprocess_dataset(data_path: Path):
    """
    Load a pickled dataset and turn the aggregate data of every household into windows
    ## Parameters
    `data_path` : Path to the pickled dataset
    ## Returns
    `household_windows` : Dictionary containing the windows for each household
    `max_value` : largest of the max values reported by `process_data` for the
    households (0 if there are none), used for normalization
    """
    # NOTE: the dataset is unpickled - only load files from a trusted source.
    households = pd.read_pickle(data_path)

    windows_by_household = {}
    overall_max = 0
    for household in tqdm(households):
        household_result = process_data(
            households[household]["aggregate"],
            model_config.WINDOW_SIZE,
            model_config.UPPER_BOUND,
            model_config.MAX_GAP,
        )
        household_windows, household_max = household_result
        overall_max = max(overall_max, household_max)
        windows_by_household[household] = household_windows
    return windows_by_household, overall_max
def predict_appilances(windows: np.ndarray, models: list, max_value: float) -> np.ndarray:
    """
    Predict the appliances for the given windows using an ensemble of models
    ## Parameters
    `windows` : Numpy array containing the windows
    `models` : non-empty list of models to use for prediction; each one must
    provide a `predict` method
    `max_value` : max value in the data used for normalization
    ## Returns
    `y_pred_tf` : Numpy array of 0/1 flags, 1 where the averaged prediction is
    above `model_config.THRESHOLD`
    ## Raises
    `ValueError` : if `models` is empty
    """
    # Fail early with a clear message instead of an obscure axis error from
    # averaging an empty array of predictions.
    if not models:
        raise ValueError("At least one model is required to predict appliances")

    windows = normalize(windows, max_value)

    # predict for each model
    predictions = np.array([model.predict(windows) for model in models])
    # average the predictions of the models
    predictions_models = np.mean(predictions, axis=0)
    # average over the first remaining axis
    # (presumably the windows of the household - TODO confirm the model output shape)
    predictions_houses = np.mean(predictions_models, axis=0)

    # threshold the predictions
    return np.where(predictions_houses > model_config.THRESHOLD, 1, 0)
def get_labels(data: dict, model_path: Path, label_path: Path, max_value: float) -> dict:
    """
    Get the appliances for the given households
    ## Parameters
    `data` : Dictionary containing the windows for each household
    `model_path` : Path to the folder containing the `model` folder with the saved models
    `label_path` : Path to the pickled labels
    `max_value` : max value in the data used for normalization
    ## Returns
    `devices` : Dictionary containing the predicted appliances for each household;
    a household without any windows gets an empty array
    """
    # load labels
    # NOTE: the labels are unpickled - only load files from a trusted source.
    labels = np.array(pd.read_pickle(label_path))

    # load models, skipping init files and jupyter notebook checkpoints;
    # sorted so the models are always loaded in the same order
    model_dir = Path(model_path) / "model"
    models = [
        tf.keras.models.load_model(model_dir / name)
        for name in sorted(os.listdir(model_dir))
        if "init" not in name and "ipynb" not in name
    ]

    devices = {}
    for house, windows in data.items():
        # A household whose windows were all filtered out has nothing to predict on;
        # report no appliances instead of passing an empty batch to the models.
        if len(windows) == 0:
            devices[house] = labels[:0]
            continue
        devices[house] = labels[predict_appilances(windows, models, max_value) == 1]
    return devices
def get_predicted_appliances(data_path: Path, model_path: Path, label_path: Path, save_path: Path,
                             datasets: list[str]) -> None:
    """
    Label unlabeled datasets utilizing InceptionTime model.
    The predicted devices of all datasets are merged into one dictionary and saved
    as `predicted_devices.pkl` in `save_path` (the folder is created if needed).
    ## Parameters
    `data_path` : Path to the parsed data, containing one `<dataset>.pkl` file per dataset
    `model_path` : Path to the pretrained model folder
    `label_path` : Path to the labels
    `save_path` : Path to the save folder to save the predicted devices
    `datasets` : List of datasets to generate labels for, example: 'IDEAL' will generate only for IDEAL
    ## Raises
    `FileNotFoundError` : if the file of one of the datasets does not exist
    """
    data_path = Path(data_path)
    save_path = Path(save_path)

    # Validate every dataset before doing any expensive work, and with a real
    # exception: an `assert` is removed when Python runs with `-O`.
    dataset_files = [data_path / f"{dataset}.pkl" for dataset in datasets]
    for dataset, dataset_file in zip(datasets, dataset_files):
        if not dataset_file.exists():
            raise FileNotFoundError(f"Dataset {dataset} does not exist")

    # NOTE: results are merged by household key, so a key that appears in more
    # than one dataset keeps only the labels of the last dataset.
    household_labels = {}
    for dataset_file in dataset_files:
        household_windows, max_value = preprocess_dataset(dataset_file)
        labels = get_labels(household_windows, model_path, label_path, max_value)
        household_labels.update(labels)

    # save with pickle
    save_path.mkdir(parents=True, exist_ok=True)
    with open(save_path / "predicted_devices.pkl", "wb") as handle:
        pickle.dump(household_labels, handle, protocol=pickle.HIGHEST_PROTOCOL)