-
Notifications
You must be signed in to change notification settings - Fork 0
/
EPC_Significance_Test_Appliances_Added.R
181 lines (139 loc) · 7.86 KB
/
EPC_Significance_Test_Appliances_Added.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# June 2022, Despina Manouseli, d.manouseli@ucl.ac.uk. Jessica Few, jessica.few@ucl.ac.uk
#Significance tests for metered PEUI and EPC modelled PEUI
# results of this analysis are published in this paper: https://doi.org/10.1016/j.enbuild.2023.113024
library(ggplot2)
library(dplyr)
library(broom)
library(purrr)
# Locations used throughout the script, plus the combined metered/EPC table
# for Scotland that every later step reads from.
save_data_loc2 <- "P:/196860_SERLAnnual_Report/Working/JF/Data/Processed/EPC_Analysis/"
metered_comb2 <- read.csv(
  paste0(save_data_loc2, "Metered_combined_dt_Scotland.csv")
)
# save_data_loc2 already ends in "/", so the resulting path contains
# "//Scotland/"; kept exactly as in the original.
save_output_loc <- paste0(save_data_loc2, "/Scotland/")
# Heating-metered homes.
# The aim is to test whether the two measures of consumption (metered PEUI and
# EPC-modelled PEUI) differ in a statistically significant way.
# First look at how the differences are distributed: do they look normal?
#
# Diff = metered - modelled. POSITIVE values therefore mean the metered value
# is HIGHER than the EPC-modelled value (the EPC under-estimates demand);
# NEGATIVE values mean the EPC over-estimates demand.
# NOTE(review): the paired t-tests later in this script call
# t.test(epc_tot_peui, metered_primary_eui, paired = TRUE), whose estimate is
# modelled - metered, i.e. the opposite sign to Diff. Confirm which convention
# is intended when reporting.
metered_comb2[["Diff"]] <-
  metered_comb2[["metered_primary_eui"]] - metered_comb2[["epc_tot_peui"]]
# perc_diff expresses Diff as a percentage of the EPC-modelled value.
metered_comb2[["perc_diff"]] <-
  100 * metered_comb2[["Diff"]] / metered_comb2[["epc_tot_peui"]]
# Basic comparison of metered against modelled: histogram of Diff, one facet
# per EPC band (free axes), to judge by eye whether the differences look
# normally distributed within each band.
g1 <- ggplot(metered_comb2, aes(x = Diff, fill = currentEnergyRatingMerge)) +
  geom_histogram() +
  facet_wrap(~currentEnergyRatingMerge, scales = "free") +
  guides(fill = guide_legend(title = "EPC Band")) +
  labs(x = "Difference between modelled and measured EUI (kWh/year/m2)")
g1
# g1 is the most recently built plot, so naming it here saves the same figure
# the original saved implicitly.
ggsave(
  filename = "Difference between modeled and measured EUI - Hist_per_EPC.png",
  plot = g1,
  device = png,
  path = save_data_loc2,
  width = 8,
  height = 6,
  units = "in",
  dpi = 450
)
# How many houses per EPC band?
metered_comb2 %>% dplyr::count(currentEnergyRatingMerge)
# Normality checks on Diff.
# The "A and B" band should be tested using Kolmogorov-Smirnov.
# Source: Mishra et al. 2019. Descriptive Statistics and Normality Tests for
# Statistical Data.
metered_comb2$currentEnergyRatingMerge <-
  as.factor(metered_comb2$currentEnergyRatingMerge)
# Shapiro-Wilk per band; each result is stored as an htest object in the
# list column `test` of `tests`.
tests <- metered_comb2 %>%
  group_by(currentEnergyRatingMerge) %>%
  do(test = shapiro.test(.$Diff))
# Kolmogorov-Smirnov for the "A and B" band: pull out its Diff values as a
# plain numeric vector.
dfAB <- metered_comb2[, c("currentEnergyRatingMerge", "Diff")]
dfAB <- filter(dfAB, currentEnergyRatingMerge == "A and B")
dfAB <- dfAB[, c("Diff")]
# ks.test(x, "pnorm") with no further arguments compares x against the
# STANDARD normal N(0, 1), which tests the scale of Diff rather than its
# shape. Compare against a normal with the sample's own mean and sd instead.
# Because these parameters are estimated from the same data, the reported
# p-value is only approximate (it tends to be conservative).
ks.test(
  dfAB, "pnorm",
  mean = mean(dfAB, na.rm = TRUE),
  sd = sd(dfAB, na.rm = TRUE)
)
# Summary statistics (mean, median, sd, min, max) of Diff for each EPC band,
# written to CSV in the output folder.
# plyr is called through its namespace instead of being attached:
# library(plyr) after library(dplyr) masks dplyr verbs such as summarise(),
# rename(), count() and mutate() for the remainder of the session.
summary_stats <- plyr::ddply(
  metered_comb2,
  plyr::.(currentEnergyRatingMerge),
  plyr::summarise,
  mean = mean(Diff),
  median = median(Diff),
  sd = sd(Diff),
  min = min(Diff),
  max = max(Diff)
)
write.csv(
  summary_stats,
  paste0(save_output_loc, "Summary_Stats_for_Diff_per_EPC_Band2.csv"),
  row.names = FALSE
)
# Box plot of Diff for each EPC band.
# The band is on the x axis and Diff on the y axis, so the "Difference ..."
# label belongs on y (it was previously applied to x via xlab()).
gbox <- ggplot(metered_comb2, aes(x = currentEnergyRatingMerge, y = Diff)) +
  geom_boxplot() +
  xlab("EPC Band") +
  ylab("Difference between modelled and measured EUI (kWh/year/m2)")
gbox
# Save the box plot explicitly rather than relying on the last plot built.
ggsave(
  "Difference between modeled and measured EUI - Box_per_EPC.png",
  plot = gbox,
  path = save_data_loc2,
  device = png,
  width = 8, height = 6, units = "in", dpi = 450
)
# Paired t-test across all homes (all EPC bands together): EPC-modelled PEUI
# against metered PEUI, 95% confidence level, tidied into a one-row table.
df_all <- tidy(
  with(
    metered_comb2,
    t.test(epc_tot_peui, metered_primary_eui, paired = TRUE, conf.level = 0.95)
  )
)
df_all
# Save results. `parameter` holds the degrees of freedom of the paired test;
# we want to report n, so rename it and add 1.
df_all <- df_all %>%
  dplyr::rename(mean_difference = estimate, n = parameter) %>%
  dplyr::mutate(n = n + 1)
write.csv(
  df_all,
  paste0(save_output_loc, "Paired_test_for_significant_diff_Scotland.csv"),
  row.names = FALSE
)
# Same idea for the percentage difference: a one-sample t-test of perc_diff
# (default null of zero mean), 95% confidence level.
df_all_perc <- tidy(t.test(metered_comb2$perc_diff, conf.level = 0.95))
df_all_perc
# `parameter` holds the degrees of freedom; report n instead, so add 1.
df_all_perc <- df_all_perc %>%
  dplyr::rename(mean_perc_difference = estimate, n = parameter) %>%
  dplyr::mutate(n = n + 1)
write.csv(
  df_all_perc,
  paste0(save_output_loc, "Paired_test_for_significant_perc_diff.csv"),
  row.names = FALSE
)
# Paired two-sided t-test (EPC-modelled vs metered PEUI) run separately for
# each EPC band; one tidy row per band.
# `alternative` is spelled out in full rather than relying on partial
# matching of `alt`, and the result is ungrouped so later steps operate on a
# plain table.
df_ttest_NEW <- metered_comb2 %>%
  group_by(currentEnergyRatingMerge) %>%
  do(tidy(t.test(.$epc_tot_peui, .$metered_primary_eui,
    mu = 0, alternative = "two.sided",
    paired = TRUE,
    conf.level = 0.95
  ))) %>%
  ungroup()
df_ttest_NEW
# Rename some columns to make SDC checking easier.
df_ttest_NEW <- dplyr::rename(
  df_ttest_NEW,
  mean_difference = estimate, n = parameter
)
# `parameter` was the degrees of freedom; we just want to report n, so add 1.
df_ttest_NEW$n <- df_ttest_NEW$n + 1
write.csv(
  df_ttest_NEW,
  paste0(save_output_loc, "Paired_test_for_significant_diff_by_EPC_band.csv"),
  row.names = FALSE
)
# Paired Wilcoxon signed-rank test (EPC-modelled vs metered PEUI) for the
# "F and G" band only.
fg_homes <- metered_comb2 %>%
  filter(currentEnergyRatingMerge == "F and G")
df_wilcox_NEW <- fg_homes %>%
  group_by(currentEnergyRatingMerge) %>%
  do(tidy(wilcox.test(.$epc_tot_peui, .$metered_primary_eui,
    paired = TRUE
  ))) %>%
  ungroup()
df_wilcox_NEW
# Drop the test statistic before export.
df_wilcox_NEW <- subset(df_wilcox_NEW, select = -c(statistic))
# Report n as the number of complete (modelled, metered) pairs in the band,
# which is what the paired test actually uses. The previous
# sum(band == "F and G") returned NA whenever the band column contained NA
# and counted rows with a missing value in either measure.
df_wilcox_NEW$n <- sum(
  complete.cases(fg_homes[, c("epc_tot_peui", "metered_primary_eui")])
)
write.csv(
  df_wilcox_NEW,
  paste0(save_output_loc, "Wilcox_for_significant_diff_Bands_FG.csv"),
  row.names = FALSE
)
# Tests by the new_home_epc variable.
# Box plot of Diff for each new_home_epc category. The category is on the
# x axis and Diff on the y axis, so the "Difference ..." label belongs on y
# (it was previously applied to x via xlab()).
gbox <- ggplot(metered_comb2, aes(x = new_home_epc, y = Diff)) +
  geom_boxplot() +
  xlab("New Home EPC") +
  ylab("Difference between modelled and measured EUI (kWh/year/m2)")
gbox
# Save the box plot explicitly rather than relying on the last plot built.
ggsave(
  "Difference between modeled and measured EUI - box_per_NEW_HOME.png",
  plot = gbox,
  path = save_data_loc2,
  device = png,
  width = 8, height = 6, units = "in", dpi = 450
)
# Histogram of Diff faceted by new_home_epc (free axes): do the differences
# look normally distributed within each of the categories?
g4 <- ggplot(metered_comb2, aes(x = Diff, fill = new_home_epc)) +
  geom_histogram() +
  facet_wrap(~new_home_epc, scales = "free") +
  guides(fill = guide_legend(title = "New Home EPC")) +
  labs(x = "Difference between modelled and measured EUI (kWh/year/m2)")
g4
# g4 is the most recently built plot, so naming it here saves the same figure
# the original saved implicitly.
ggsave(
  filename = "Difference between modeled and measured EUI - Hist_per_NewHome.png",
  plot = g4,
  device = png,
  path = save_data_loc2,
  width = 8,
  height = 6,
  units = "in",
  dpi = 450
)
# How many houses per new_home_epc category?
metered_comb2 %>% dplyr::count(new_home_epc)
# Paired two-sided t-test (EPC-modelled vs metered PEUI) run separately for
# each new_home_epc category; one tidy row per category.
# `alternative` is spelled out in full rather than relying on partial
# matching of `alt`, and the result is ungrouped before further use.
df_ttest_NEW_home <- metered_comb2 %>%
  group_by(new_home_epc) %>%
  do(tidy(t.test(.$epc_tot_peui, .$metered_primary_eui,
    mu = 0, alternative = "two.sided",
    paired = TRUE,
    conf.level = 0.95
  ))) %>%
  ungroup()
df_ttest_NEW_home
df_ttest_NEW_home <- dplyr::rename(
  df_ttest_NEW_home,
  mean_difference = estimate, n = parameter
)
# `parameter` was the degrees of freedom; we just want to report n, so add 1.
df_ttest_NEW_home$n <- df_ttest_NEW_home$n + 1
write.csv(
  df_ttest_NEW_home,
  paste0(save_output_loc, "Paired_test_for_significant_Diff_newHome.csv"),
  row.names = FALSE
)
# Tests by the match_sap_occupancy variable.
# Box plot of Diff for each match_sap_occupancy category. The category is on
# the x axis and Diff on the y axis, so the "Difference ..." label belongs on
# y (it was previously applied to x via xlab()).
gbox <- ggplot(metered_comb2, aes(x = match_sap_occupancy, y = Diff)) +
  geom_boxplot() +
  xlab("Match SAP occupancy") +
  ylab("Difference between modelled and measured EUI (kWh/year/m2)")
gbox
# Histogram of Diff faceted by match_sap_occupancy (free axes): do the
# differences look normally distributed within each of the categories?
g4 <- ggplot(metered_comb2, aes(x = Diff, fill = match_sap_occupancy)) +
  geom_histogram() +
  facet_wrap(~match_sap_occupancy, scales = "free") +
  guides(fill = guide_legend(title = "Match SAP occupancy")) +
  labs(x = "Difference between modelled and measured EUI (kWh/year/m2)")
g4
# How many houses per match_sap_occupancy group?
metered_comb2 %>% dplyr::count(match_sap_occupancy)
# Paired two-sided t-test (EPC-modelled vs metered PEUI) run separately for
# each match_sap_occupancy group; one tidy row per group.
# `alternative` is spelled out in full rather than relying on partial
# matching of `alt`, and the result is ungrouped before further use.
df_ttest_NEW_match_sap_occupancy <- metered_comb2 %>%
  group_by(match_sap_occupancy) %>%
  do(tidy(t.test(.$epc_tot_peui, .$metered_primary_eui,
    mu = 0, alternative = "two.sided",
    paired = TRUE,
    conf.level = 0.95
  ))) %>%
  ungroup()
df_ttest_NEW_match_sap_occupancy <- dplyr::rename(
  df_ttest_NEW_match_sap_occupancy,
  mean_difference = estimate, n = parameter
)
# `parameter` was the degrees of freedom; we just want to report n, so add 1.
df_ttest_NEW_match_sap_occupancy$n <- df_ttest_NEW_match_sap_occupancy$n + 1
write.csv(
  df_ttest_NEW_match_sap_occupancy,
  paste0(save_output_loc, "Paired_test_for_significant_Diff_match_occupancy.csv"),
  row.names = FALSE
)