-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_compilation.py
124 lines (109 loc) · 5.95 KB
/
data_compilation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# This script compiles all the data.
# It demonstrates how the data set is generated; no raw data will be released.
import glob
import os
import warnings
import pandas as pd
import smogn
from tqdm import trange, tqdm
# Show every column when a frame is printed.
pd.set_option('display.max_columns', None)

# Directory listings of the raw energy and climate exports.
energy_files = os.listdir('./Energy raw data set 20200101-20201211')
climate_files = os.listdir('./Climate raw data set 20200101-20201211')

# Read every energy export (UTF-16, tab-separated, header in the first row).
energy = [
    pd.read_csv(
        './Energy raw data set 20200101-20201211/{}'.format(file_name),
        encoding='utf16',
        header=0,
        sep='\t',
    )
    for file_name in tqdm(energy_files)
]

# Stack the exports, discard the columns that are not used later on and
# shorten the remaining column names.
unused_energy_columns = [
    'Location Path', 'Location Description', 'Total (kWh)', 'Light (kWh)',
    'Socket (kWh)', 'Water Heater (kWh)', 'Mixed Usage (kWh)',
    'Time (day of week)',
]
short_energy_names = {
    'Time (date)': 'Date',
    'Time (hour)': 'Hour',
    'Location': 'Location',
    'AC (kWh)': 'AC',
}
energy_total = (
    pd.concat(energy)
    .drop(columns=unused_energy_columns)
    .rename(columns=short_energy_names)
)

# Glue date and hour together, parse the result, and store the timestamp as a
# string in the new 'Time' column.
date_and_hour = energy_total['Date'].apply(str) + energy_total['Hour'].apply(str)
energy_total['Time'] = pd.to_datetime(
    date_and_hour, format='%Y-%m-%d%H:%M').apply(str)
def _read_climate_tables(paths, dropped_columns):
    """Concatenate the CSV files in *paths* and discard *dropped_columns*."""
    tables = [pd.read_csv(path, index_col=None) for path in paths]
    return pd.concat(tables).drop(columns=dropped_columns)


# Locate the climate exports by file-name prefix.
irr = glob.glob('./Climate raw data set 20200101-20201211/Irr*')
pre = glob.glob('./Climate raw data set 20200101-20201211/Pre*')
Rel = glob.glob('./Climate raw data set 20200101-20201211/Rel*')
Tem = glob.glob('./Climate raw data set 20200101-20201211/Tem*')

# One frame per prefix; the list of dropped columns differs per prefix
# (the 'Pre*' files additionally drop 'Hour', the 'Tem*' files keep 'Details').
I = _read_climate_tables(
    irr, ['Source', 'Height', 'Status', 'Method ID', 'Details'])
P = _read_climate_tables(
    pre, ['Source', 'Height', 'Status', 'Method ID', 'Details', 'Hour'])
H = _read_climate_tables(
    Rel, ['Source', 'Height', 'Status', 'Method ID', 'Details'])
T = _read_climate_tables(
    Tem, ['Source', 'Height', 'Status', 'Method ID'])

# Parse each frame's 'Time' column and write it back as a string, in place.
for climate_table in (I, P, H, T):
    parsed_time = pd.to_datetime(
        climate_table['Time'].apply(str), format='%Y/%m/%d %H:%M:%S')
    climate_table['Time'] = parsed_time.apply(str)
# Join the energy readings with each climate frame in turn on the string
# 'Time' column. pd.merge defaults to an inner join, so only timestamps present
# in every frame survive.
data = energy_total
for climate_table in (I, P, H, T):
    data = pd.merge(data, climate_table, on='Time')

# Give the climate columns readable names.
data = data.rename(
    columns={'w/m2': 'Irradiance', 'mm': 'Precipitation', '%': 'Humidity', 'Degree Celsius': 'Temperature'})

# Forward-fill missing values down the rows. DataFrame.ffill is used because
# fillna(method='pad') is deprecated in recent pandas releases.
# NOTE(review): the fill runs before the frame is sorted by Location/Time, so a
# gap may be filled from a neighbouring row of a different location - confirm
# that this is intended.
data = data.ffill(axis=0)

# Locations that are removed from the compiled data set.
drop_list = ['10/F Public', '10/F Rooms', '3/F Public', '3/F Rooms', '2/F', '4/F', '5/F', '6/F Public', '6/F Rooms',
             '7/F', '8/F Public', '8/F Rooms', '9/F Public', '9/F Rooms', 'ST302', 'ST602', 'ST802', 'ST902',
             'Warden Flat']
# A single membership mask replaces one full-frame filter per listed location.
data = data[~data['Location'].isin(drop_list)]

data.to_csv('2020_data_initial.csv', index=False)
def _previous_usage_features(frame):
    """Return the lagged AC features for *frame*.

    *frame* must already be sorted by 'Location' and then 'Time' and must
    contain the columns 'Location' and 'AC'. Lags are taken within each
    Location, so the first rows of one location never pick up the last
    readings of the location sorted before it.

    The returned DataFrame shares the index of *frame* and has the columns:

    * 'Prev_1hr'    - the previous row's AC was positive.
    * 'Prev_2hrs'   - the AC of both of the two previous rows was positive.
    * 'Prev_1hr_AC' - AC of the previous row.
    * 'Prev_3hr_AC' - summed AC of the three previous rows.
    * 'Prev_5hr_AC' - summed AC of the five previous rows.

    Rows without enough history in their location get placeholder values:
    the two boolean flags are False for the first two rows of a location and
    the three sums are 0 for the first five rows of a location.

    NOTE(review): lags count rows, not clock hours; if a location is missing
    some timestamps, "previous hour" means "previous available reading" -
    confirm that this is acceptable.
    """
    rooms = frame.groupby('Location', sort=False)
    # Number of earlier rows that exist for the same location.
    history = rooms.cumcount()
    ac_1, ac_2, ac_3, ac_4, ac_5 = (rooms['AC'].shift(k) for k in range(1, 6))

    has_two = history >= 2
    has_five = history >= 5
    return pd.DataFrame({
        'Prev_2hrs': (ac_1 > 0) & (ac_2 > 0) & has_two,
        'Prev_1hr': (ac_1 > 0) & has_two,
        'Prev_1hr_AC': ac_1.where(has_five, 0),
        'Prev_3hr_AC': (ac_1 + ac_2 + ac_3).where(has_five, 0),
        'Prev_5hr_AC': (ac_1 + ac_2 + ac_3 + ac_4 + ac_5).where(has_five, 0),
    })


# Order the readings per location chronologically and renumber the rows.
data = data.sort_values(by=['Location', 'Time']).reset_index(drop=True)

print("Checking Merged Data:")
print(data.head(10))
print("Number of data: {}".format(len(data)))
print(data.isnull().any())

print("Start Generating Previous Data")
previous = _previous_usage_features(data)
# Insert in this sequence so the resulting column order is:
# <first column>, Prev_1hr_AC, Prev_3hr_AC, Prev_5hr_AC, Prev_1hr, Prev_2hrs, ...
for position, column in ((1, 'Prev_2hrs'), (1, 'Prev_1hr'), (1, 'Prev_1hr_AC'),
                         (2, 'Prev_3hr_AC'), (3, 'Prev_5hr_AC')):
    data.insert(position, column, previous[column])

data.to_csv('2020_data_compiled.csv', index=True)

print("Checking Generated Previous Data:")
print(data.head(10))
print("Number of data: {}".format(len(data)))
print(data.isnull().any())
# =============================================================================================================
# This code is concatenated from the original smogn_generation.py
# It uses the SMOGN algorithm to generate additional synthetic samples to mitigate the imbalance problem
# =============================================================================================================
# Silence all warnings and make pandas print every column and every row
# whenever a dataframe is printed.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load the compiled data, keep only rows with a positive AC electricity
# consumption and drop the time columns, which are not needed here.
# NOTE(review): this reads "summer_data_compiled.csv", while the compilation
# step earlier in this file writes "2020_data_compiled.csv" - confirm where the
# summer file comes from.
data = pd.read_csv("summer_data_compiled.csv", index_col=0)
data = data[data.AC > 0].drop(['Time', 'Date', 'Hour'], axis=1).reset_index(drop=True)

# Create the output directory for the per-room SMOGN results.
# exist_ok avoids the separate "exists?" check and its race window.
log_folder_name = "SMOGN_processed"
os.makedirs('./{}'.format(log_folder_name), exist_ok=True)

# Four rooms have low quality data and are excluded manually.
excluded_rooms = (309, 312, 917, 1001)

# Go through the rooms one by one and write a SMOGN-resampled CSV for each.
for room in tqdm(data['Location'].unique()):
    if room in excluded_rooms:
        continue

    # Extract this room's rows, drop the now-constant Location column and
    # forward-fill any gaps. DataFrame.ffill is used because
    # fillna(method='pad') is deprecated in recent pandas releases.
    room_data = data[data.Location == room].drop(['Location'], axis=1).reset_index(drop=True).ffill()

    # Skip rooms with fewer than 500 rows.
    # NOTE(review): `room <= 812` also skips every room numbered 812 or lower,
    # which makes the explicit exclusion of 309 and 312 redundant - presumably
    # left over from resuming an interrupted run; confirm before relying on it.
    if len(room_data) < 500 or room <= 812:
        continue

    # Run SMOGN with 'AC' as the target and store the result for this room.
    room_data_smogn = smogn.smoter(data=room_data, y='AC', rel_coef=0.1)
    room_data_smogn.to_csv('./{}/{}.csv'.format(log_folder_name, room))

    # Target / feature split of the resampled data; not used further in the
    # visible part of this file.
    y = room_data_smogn['AC']
    X = room_data_smogn.drop(['AC'], axis=1)