#######################################################################
#######################################################################
#######################################################################
###### ######
###### Testing routines for simulating high- ######
###### dimensional multivariate normal distributions ######
###### ######
#######################################################################
#######################################################################
#######################################################################
source("BigMVNgen.r")
#
# Generate some sampling locations uniformly distributed on the sphere
# NB to achieve this, latitudes must be sampled from a density
# proportional to cos(lat) with latitude in radians, otherwise points
# will be denser at the poles. Not that it really matters for
# illustrative purposes.
#
set.seed(2000)
Nsites <- 28000
site.coords <- matrix(ncol=2,nrow=Nsites)
site.coords[,1] <- 180*(asin(2*runif(Nsites)-1)/pi)
site.coords[,2] <- 360*runif(Nsites)-180
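#
# Quick sanity check on the latitude sampling (a sketch, not part of
# the original tests): if latitudes have density proportional to
# cos(lat) then sin(lat) is uniform on (-1,1), so a Kolmogorov-Smirnov
# test against that uniform should not reject
#
print(ks.test(sin(site.coords[,1]*pi/180),"punif",-1,1))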
#
# Split latitude and longitude ranges into N.grid bins each - with
# N.grid = 13 that's 169 groups in total, which means about 165 sites
# in each group on average for the 28000 sites generated above (although
# the equatorial grid cells have more sites, obviously, because they're bigger)
#
N.grid <- 13
group.id <- 1 + (N.grid*floor(N.grid*(site.coords[,1]+90)/180)) +
floor(N.grid*(site.coords[,2]+180)/360)
cat("NUMBERS OF SITES PER GROUP:\n")
print(table(group.id))
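#
# (Sketch, not in the original script) the grid indexing convention can
# be checked by inverting the group IDs back to latitude and longitude bins
#
lat.bin <- (group.id-1) %/% N.grid     # 0 to N.grid-1, south to north
lon.bin <- (group.id-1) %% N.grid      # 0 to N.grid-1, west to east
stopifnot(all(group.id == 1 + N.grid*lat.bin + lon.bin))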
#
# Now define the "neighbourhoods" of each grid cell. This could be
# done on a distance basis; but this would require that we compute
# all pairwise distances (which kind of defeats the purpose of
# trying to get everything done in something like linear
# computational time). The cunning plan is to calculate the
# distances from the grid cell centres to each site, and to
# define the neighbourhood of each grid cell to be all sites that
# are within an appropriate distance of the corresponding grid
# cell centre. For "appropriate distance", it seems reasonable
# to take, say, the maximum distance between adjacent
# grid cell centres (this means we're guaranteed to include
# anything half-way into the next cell)
#
cell.centres <- cbind(rep(-90 + (((1:N.grid)-0.5)*180/N.grid),each=N.grid),
rep(-180 + (((1:N.grid)-0.5)*360/N.grid),N.grid))
sites.to.cells <- howfar(cell.centres,site.coords)
cells.to.cells <- howfar(cell.centres,cell.centres)
max.dist <- max(cells.to.cells[row(cells.to.cells)-col(cells.to.cells) == 1])
neighbours <- vector("list",N.grid^2)
for (i in 1:(N.grid^2)) {
wanted.sites <- (group.id != i) & (sites.to.cells[i,] < max.dist)
neighbours[[i]] <- (1:Nsites)[wanted.sites]
}
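#
# (Sketch) summarise the neighbourhood sizes as a quick check - every
# grid cell should pick up a reasonable number of sites from the
# adjacent cells at this resolution
#
cat("SUMMARY OF NEIGHBOURHOOD SIZES:\n")
print(summary(sapply(neighbours,length)))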
#
# Function to define an exponential covariance structure. The
# covariance between sites separated by a distance of d km is
# sigma^2 * exp(-phi*d). The vector theta below contains
# two elements: (sigma^2,phi). coords is a two-column
# matrix of latitudes and longitudes
# KW: This substitutes for a real covariance structure by assuming that
# covariances decay exponentially with distance: from 1 at 0 km, to
# 1/e (about 0.37) at 1000 km, to essentially zero (about 0.007) at 5000 km
#
exp.corr <- function(theta,coords) {
  d <- howfar(coords,coords)
  theta[1] * exp(-theta[2]*d)
}
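#
# (Illustration, not in the original script) with theta = c(1,0.001),
# as used below, the implied correlations at separations of 0, 1000 and
# 5000 km are roughly 1, 0.37 and 0.007 respectively
#
print(round(exp(-0.001*c(0,1000,5000)),3))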
#
# sites.to.cells is VERY big, so remove it and do garbage collection
#
rm(sites.to.cells); gc()
theta <- c(1,0.001)
#
# For this test script, could just pass the correlation function
# through to rbigmvn.setup(). However, for illustrative
# purposes the next few lines demonstrate how to prepare a
# list of covariance matrices that can be passed over instead.
# In practice, the call to exp.corr() below would be replaced
# by code that calculates the appropriate empirical covariance
# matrices, but hopefully the general idea is clear. As it stands,
# the code is a bit inefficient because some pairs of sites
# will appear in more than one group and the code doesn't
# "remember" the pairs that it has already done. Worse things
# happen at sea.
#
# NB the "wanted.sites" command below is critical: as supplied,
# it ensures that the sites are extracted in *exactly* the same
# order that they appear in the full site list, which is
# necessary to avoid generating nonsense!
#
M <- N.grid^2
Sigmalist <- vector("list",M)
for (i in 1:M) {
cat(paste("Calculating covariances: group",i,"of",M,"...\r"))
wanted.sites <- ((1:Nsites) %in% neighbours[[i]]) | (group.id == i)
Sigmalist[[i]] <- exp.corr(theta,site.coords[wanted.sites,])
}
cat("\n")
#
# Now do all the preliminary calculations for the sampling
#
cat("Setting up neighbourhoods and covariance structures ...\n")
test.setup <- rbigmvn.setup(mu=1:Nsites,coords=site.coords,
groups=group.id,neighbours=neighbours,
coord.type="geographical",method="factorize",
Sigmalist=Sigmalist)
#
# And generate realisations at those 28000 sites (the graphics
# window is for the trace plot)
# KW: do we need 10000 realisations now that we don't need to thin
# (using the "factorize" method)? If not then perhaps only 600 (50
# years of monthly data) or 1200 (100 years) will do - 600 is used below
#
if (dev.cur()==1) x11(width=8,height=6)
par(lwd=2,ask=TRUE,mar=c(3,3,2,2),mgp=c(2,0.75,0),oma=c(1,1,1,1),mfrow=c(1,1))
cat("Simulating ...\n")
test <- rbigmvnorm(600,setup=test.setup,nburnin=100,monitor=FALSE)
#
# Check the properties for, say, the first 10 sites (with Q-Q plots
# for the first 12 below, to fill a 3*4 plot array)
#
cat("PROPERTIES OF FIRST 10 SITES:\n")
cat("Sample means: "); print(round(colMeans(test[,1:10]),3))
cat("Theoretical means: "); print(round(1:10,3))
cat("Sample covariance matrix:\n")
print(round(cov(test[,1:10]),2))
cat("Theoretical covariance matrix:\n")
print(round(exp.corr(theta,site.coords[1:10,]),2))
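#
# (Sketch) a one-number summary of the agreement - with 600 realisations
# the Monte Carlo error on each covariance entry is of order
# 1/sqrt(600), i.e. about 0.04, if the draws are close to independent
#
cat("Max abs difference, sample vs theoretical covariance:",
    round(max(abs(cov(test[,1:10]) - exp.corr(theta,site.coords[1:10,]))),3),"\n")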
par(mfrow=c(3,4),oma=c(1,1,3,1))
for (i in 1:12) {
qqnorm(test[,i],main=paste("Site",i))
}
mtext("Normal Q-Q plots for first 12 sites",outer=TRUE,line=1)
#
# That looks OK, but the sites are obviously well separated and hence
# effectively independent. What about sites that are close together -
# and in different groups? Take sites within 10 degrees (lat,long) of
# (18,36): a window that straddles grid cell boundaries, so it contains
# sites from several different groups (and, with 28000 sites, a few
# hundred of them - the Q-Q plots below will therefore page)
#
wanted.sites <- (abs(site.coords[,1]-18) < 10) & (abs(site.coords[,2]-36) < 10)
cat("PROPERTIES OF SITES WITHIN 10 DEGREES OF (18,36):\n")
cat("Site numbers and groups:\n")
print(rbind((1:Nsites)[wanted.sites],group.id[wanted.sites]))
cat("Sample means: "); print(round(colMeans(test[,wanted.sites]),3))
cat("Theoretical means: "); print(round((1:Nsites)[wanted.sites],3))
cat("Sample covariance matrix:\n")
print(round(cov(test[,wanted.sites]),2))
cat("Theoretical covariance matrix:\n")
print(round(exp.corr(theta,site.coords[wanted.sites,]),2))
par(mfrow=c(2,4))
for (i in (1:Nsites)[wanted.sites]) {
qqnorm(test[,i],main=paste("Site",i))
}
mtext("Normal Q-Q plots for sites within 10 degrees of (18,36)",outer=TRUE,line=1)
#
# Finally, check results for sites around +/-180 longitude, to ensure
# that wrapping is handled correctly (NB the latitude range from -13.5
# to +13.5 was originally chosen to give 12 sites in total, filling a
# 3*4 array of plots; with 28000 sites the window contains many more,
# and the plots will page)
#
wanted.sites <- (abs(site.coords[,1]) < 13.5) & (abs(site.coords[,2]) > 170)
cat("PROPERTIES OF SITES AROUND (0,180):\n")
cat("Site numbers and groups:\n")
print(rbind((1:Nsites)[wanted.sites],group.id[wanted.sites]))
cat("Sample means: "); print(round(colMeans(test[,wanted.sites]),3))
cat("Theoretical means: "); print(round((1:Nsites)[wanted.sites],3))
cat("Sample covariance matrix:\n")
print(round(cov(test[,wanted.sites]),2))
cat("Theoretical covariance matrix:\n")
print(round(exp.corr(theta,site.coords[wanted.sites,]),2))
par(mfrow=c(3,4))
for (i in (1:Nsites)[wanted.sites]) {
qqnorm(test[,i],main=paste("Site",i))
}
mtext("Normal Q-Q plots for sites around (0,180)",outer=TRUE,line=1)
#
# Stop here: the sections below are optional extras, to be run by hand
# after removing or commenting out this stop()
#
stop()
#
# Just out of interest, here's a demonstration of how one might
# fit a VAR model to those data using least squares. Consider
# the rows of the matrix we've just generated to be "time points",
# and the columns to be "variables" (which they are). Exploit
# the fact that if the response variable in a call to lm() is
# a matrix, a separate regression model will be fitted for
# each column; and the coefficients themselves will be stored
# in columns (this doubtless exploits the fact that the design
# matrix is the same for all of the regressions, so that
# things like (X'X)^-1 have to be calculated only once). The
# estimate of Phi is then just the transpose of the returned
# coefficient matrix. Neat!
#
# NB the "-1" in the model formula is to exclude the intercept
# in order to fit a VAR model with zero mean.
#
cat("\nFitting VAR(1) model to output ...\n")
VAR.fit <- lm(test[-1,] ~ test[-nrow(test),] - 1)
Phi <- t(coef(VAR.fit))
par(mfrow=c(1,1))
plot(density(diag(Phi)),
main="Diagonal coefficients of Phi matrix in VAR(1) model for simulated sequence")
rug(diag(Phi))
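#
# (Standalone sketch, not part of the tests) the matrix-response trick
# can be verified on a small simulated VAR(1) where Phi is known. NB
# the regression above needs more realisations than sites to be full
# rank - with fewer, lm() sets the aliased coefficients to NA
#
set.seed(1)
Phi.true <- diag(c(0.5,0.3,0.2))
Z <- matrix(0,nrow=500,ncol=3)
for (t in 2:500) Z[t,] <- Z[t-1,] %*% t(Phi.true) + rnorm(3)
demo.fit <- lm(Z[-1,] ~ Z[-nrow(Z),] - 1)
print(round(t(coef(demo.fit)),2))      # should be close to diag(0.5,0.3,0.2)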
#
# Stop here too: the final timing experiment below is also optional
#
stop()
#
# All looking good. Now: just out of interest, how long will it take to
# do the setup for 100,000 sites (which is the order of magnitude for
# the ISTI project)? Probably split into something like a 30*30 grid
# here, so that there are ~100 sites per block and 900 blocks. Storage
# could be an issue given that we've just stored a 600*28000 simulation
# matrix, so get rid of it.
#
rm(test.setup); rm(test); gc()
cat("Generating 100000 new site locations ...\n")
Nsites <- 100000
site.coords <- matrix(ncol=2,nrow=Nsites)
site.coords[,1] <- 180*(asin(2*runif(Nsites)-1)/pi)
site.coords[,2] <- 360*runif(Nsites)-180
N.grid <- 30
group.id <- 1 + (N.grid*floor(N.grid*(site.coords[,1]+90)/180)) +
floor(N.grid*(site.coords[,2]+180)/360)
cell.centres <- cbind(rep(-90 + (((1:N.grid)-0.5)*180/N.grid),each=N.grid),
rep(-180 + (((1:N.grid)-0.5)*360/N.grid),N.grid))
sites.to.cells <- howfar(cell.centres,site.coords)
cells.to.cells <- howfar(cell.centres,cell.centres)
max.dist <- max(cells.to.cells[row(cells.to.cells)-col(cells.to.cells) == 1])
neighbours <- vector("list",N.grid^2)
for (i in 1:(N.grid^2)) {
wanted.sites <- (group.id != i) & (sites.to.cells[i,] < max.dist)
neighbours[[i]] <- (1:Nsites)[wanted.sites]
}
#
# sites.to.cells is VERY big, so remove it and do garbage collection
#
rm(sites.to.cells); gc()
theta <- c(1,0.001)
cat("Setting up neighbourhoods and covariance structures ...\n")
test.setup <- rbigmvn.setup(mu=1:Nsites,covfunc=exp.corr,covpars=theta,
coords=site.coords,groups=group.id,neighbours=neighbours,
coord.type="geographical")
test <- rbigmvnorm(1,setup=test.setup)
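#
# (Sketch) to answer the timing question directly, one further draw can
# be timed with system.time(); the rbigmvn.setup() call above could be
# wrapped in the same way to measure the setup cost
#
print(system.time(rbigmvnorm(1,setup=test.setup)))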