-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimpute_weather_data.R
144 lines (108 loc) · 3.59 KB
/
impute_weather_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# AUTHOR : Elisa Covato
# DATE : 10/11/2019
# DESCRIPTION: We use this script to impute the missing entries in the
# weather dataset.
#####################################################################
library(mice)
library(tidyverse)
# GET DATA -------
# Load the raw weather table. Text columns are kept as character
# vectors (not factors) at this stage.
weather <- read.csv(
  file = "./data/weather_data.csv",
  sep = ",",
  stringsAsFactors = FALSE
)
# Print a compact overview of the columns and their types.
str(object = weather)
# FIXING PRECIPITATION FIELDS -------
# Precipitation type
# 1. If both precipitation intensities (average and maximum) are known
#    and equal to 0, then precip_type is set to "no_precip".
# The rows are selected with a vectorised logical mask rather than a
# `for (i in 1:nrow(weather))` loop: `1:nrow()` yields c(1, 0) for a
# data frame without rows, which makes the loop body fail.
# The is.na() guards keep the mask free of NA values (NA & FALSE is
# FALSE), so rows with an unknown intensity are never selected.
no_precip_rows <- !is.na(weather$precip_intensity_avg) &
  !is.na(weather$precip_intensity_max) &
  weather$precip_intensity_avg == 0 &
  weather$precip_intensity_max == 0
weather$precip_type[no_precip_rows] <- "no_precip"
# Precipitation type
# 2. When both precipitation intensities (average and maximum) are
#    missing, the precipitation is not known: precip_type is set to NA
#    for those rows, whatever value it currently holds.
# Vectorised mask instead of a `1:nrow(weather)` loop, so a data frame
# without rows is handled as well. is.na() never returns NA, so the
# mask is always TRUE/FALSE.
unknown_precip_rows <- is.na(weather$precip_intensity_avg) &
  is.na(weather$precip_intensity_max)
weather$precip_type[unknown_precip_rows] <- NA
# Precipitation type
# 3. Store precip_type as a factor (one level per distinct value)
weather$precip_type <- as.factor(weather$precip_type)
# Precipitation average
# Some of the values for precip_intensity_avg are 0 even though
# precip_intensity_max is greater than 0. These averages are replaced
# with NA so that they are predicted later by the imputation.
# Vectorised mask instead of a `1:nrow(weather)` loop, so a data frame
# without rows is handled as well. The is.na() guards keep the mask
# free of NA values (NA & FALSE is FALSE).
inconsistent_avg_rows <- !is.na(weather$precip_intensity_avg) &
  !is.na(weather$precip_intensity_max) &
  weather$precip_intensity_avg == 0 &
  weather$precip_intensity_max > 0
weather$precip_intensity_avg[inconsistent_avg_rows] <- NA
# COUNTING MISSING DATA -------
# Number of empty strings ("") in each column.
# na.rm = TRUE is required: `. == ""` is NA wherever the value is NA,
# and without it the count of every column containing an NA would
# itself be NA.
# A formula lambda is used instead of funs(), which is deprecated in
# dplyr.
weather %>%
  summarise_all(~ sum(. == "", na.rm = TRUE))
# Number of NA values in each column.
weather %>%
  summarise_all(~ sum(is.na(.)))
# IMPUTATION -------
# We are now ready to impute the missing data
# 1. Create prediction matrix
# The prediction matrix is derived from the percentage of usable cases
# of each pair of fields, computed from the pairwise missingness
# counts returned by md.pairs(): 100 * mr / (mr + mm), rounded.
percentage_usable_cases <- md.pairs(weather)
percentage_usable_cases <- round(
  100 * percentage_usable_cases[["mr"]] /
    (percentage_usable_cases[["mr"]] + percentage_usable_cases[["mm"]])
)
# A NaN percentage (0 / 0) means the pair cannot be used for the
# imputation: it is set to 0.
percentage_usable_cases[is.nan(percentage_usable_cases)] <- 0
# Every remaining non-zero percentage is set to 1: that pair will be
# used for the imputation.
percentage_usable_cases[percentage_usable_cases != 0] <- 1
# Prediction matrix (0/1 entries)
predM <- percentage_usable_cases
# Choose imputation method for each variable to impute
# Start with every variable set to "" (the value used here for the
# fields that are not given an imputation method), in the column order
# expected by the imputation call.
meth <- c(
  day = "", month = "", year = "",
  precip_intensity_max = "", precip_intensity_avg = "", precip_type = "",
  wind_speed_max = "", wind_speed_avg = "",
  gust_max = "", gust_avg = "",
  temp_min = "", temp_max = "", temp_avg = "",
  temp_day = "", temp_night = "",
  humidity = ""
)
# Variables imputed with the "cart" method
meth[c(
  "precip_intensity_max", "precip_type", "gust_max",
  "temp_day", "temp_night"
)] <- "cart"
# Variables imputed with the "pmm" method
meth[c("precip_intensity_avg", "gust_avg")] <- "pmm"
# Imputation
# Run mice() on the weather data with the prediction matrix `predM`
# and the per-variable methods `meth`, producing m = 5 imputed
# datasets.
# maxit must be positive: with maxit = 0 mice() performs no iterations
# and only returns its starting imputations (random draws from the
# observed values), so `predM` and `meth` would have no effect on the
# imputed values. 5 is the mice() default number of iterations.
weather_imputed <- mice(
  data = weather,
  predictorMatrix = predM,
  method = meth,
  maxit = 5,
  m = 5
)
# Imputation - visualization
# Diagnostic plots of the imputation object, covering all imputations.
stripplot(x = weather_imputed)
densityplot(x = weather_imputed) # all imputation
# Per-variable density plots, kept for interactive use:
#densityplot(weather_imputed, ~precip_intensity_max)
#densityplot(weather_imputed, ~precip_intensity_avg)
#densityplot(weather_imputed, ~precip_type)
#densityplot(weather_imputed, ~gust_max)
#densityplot(weather_imputed, ~gust_avg)
# Imputation - complete dataset
# Extract the first imputed dataset (action = 1L is the default).
# The mice:: prefix is needed because tidyr, attached by tidyverse,
# also exports a complete() function.
weather_complete <- mice::complete(weather_imputed, action = 1L)
# SAVE DATA -------
# Write the completed dataset to CSV, without a row-name column.
write.csv(
  x = weather_complete,
  file = "./data/weather_data_complete.csv",
  row.names = FALSE
)