####################################################################
####################################################################
#
# HOTSPOT ANALYSIS
# 2 parametric models (persistence, conditional on presence)
# 1 spatial model (Getis-Ord Gi*)
# (2nd spatial model, KDE, done in ArcGIS)
#
# COMPARATIVE ANALYSIS
# Percent congruence
# Pearson's correlation
#
# Runtime: ~16 minutes
#
# Authors: Sussman A.L., Gardner B., Adams E.M., Salas L., Kenow K.P.,
# Luukkonen D.R., Monfils M.J., Mueller W.P., Williams K.A., Leduc-Lapierre M., and Zipkin E.F.
#
####################################################################
####################################################################
#set working directory
setwd("C:/Users/")
#read in standardized data
blocks_surv <- read.csv("data_blocks.csv") #distinct block list
bird_recs_persist <- read.csv("data_birds_persistence.csv") #bird data for persistence model
bird_recs <- read.csv("data_birds.csv") #bird data for conditional on presence & Gi* models (also used for kde in ArcGIS)
#set species code for selected species/group
species = 'ALLSP'
####### KERNEL DENSITY ESTIMATION MODEL (Spatial Approach #1) ######
#
# Sources:
# Wilson L.J., McSorley C.A., Gray C.M., Dean B.J., Dunn T.E., Webb A., and Reid J.B. 2009.
# Radio-telemetry as a tool to define protected areas for seabirds in the marine environment. Biological Conservation 142:1808-1817.
# O'Brien S.H., Webb A., Brewer M.J., and Reid J.B. 2012. Use of kernel density estimation and maximum curvature to set
# Marine Protected Area boundaries: Identifying a Special Protection Area for wintering red-throated divers in the UK.
# Biological Conservation 156:15-21.
# Suryan R.M., Santora J.A., and Veit R.R. 2012. New approach for using remotely sensed chlorophyll a to
# identify seabird hotspots. Marine Ecology Progress Series 451:213-225.
# Wong S.N.P., Gjerdrum C., Morgan K.H., and Mallory M.L. 2014. Hotspots in cold seas: The composition, distribution,
# and abundance of marine birds in the North American Arctic. Journal of Geophysical Research: Oceans 119:1691-1705.
#
####################################################################
# this model is done in ArcGIS using the kernel density tool, because at the time of this
# analysis R could not perform KDE with weighting (e.g., by abundance or density values)
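#OPTIONAL sketch: weighted KDE has since become available in R, e.g. spatstat's
#density.ppp() accepts a 'weights' argument; a self-contained toy illustration
#(simulated points, not the survey data); uncomment to run if spatstat is installed
#library(spatstat)
#toy.pts<-ppp(runif(100), runif(100), window = owin(c(0, 1), c(0, 1)))
#toy.kde<-density(toy.pts, weights = rgamma(100, shape = 2), sigma = 0.1) #weighted KDE surface
#plot(toy.kde)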
#### GETIS-ORD Gi* HOTSPOT ANALYSIS MODEL (Spatial Approach #2) ####
#
# Sources:
# Getis A. and Boots B. 1978. Models of spatial processes: an approach to the study of point, line, and area patterns.
# Cambridge University Press, Cambridge, England. 198pp.
# Getis A. and Ord J.K. 1992. The analysis of spatial association by use of distance statistics. Geographical Analysis 24(3):189-206.
# Santora J.A., Reiss C.S., Loeb V.J., and Veit R.R. 2010. Spatial association between hotspots of baleen whales and demographic
# patterns of Antarctic krill Euphausia superba suggests size-dependent predation. Marine Ecology Progress Series 405:255-269.
# Kuletz K.J., Ferguson M.C., Hurley B., Gall A.E., Labunski E.A., and Morgan T.C. 2015. Seasonal spatial patterns in seabird and
# marine mammal distribution in the eastern Chukchi and western Beaufort seas: Identifying biologically important areas.
# Progress in Oceanography 136:175-200.
#
####################################################################
#load libraries
library(maptools)
library(rgdal)
library(spdep)
library(plyr)
library(data.table)
#import shapefile of grid cells with standardized effort-corrected counts
#(could also use the grid cell layer joined to the bird data and convert it to a spatial feature...)
#note: maptools (readShapePoly) and rgdal have since been retired from CRAN; sf::st_read() is the current equivalent
abund_shp<-readShapePoly("C:/Users/ALLSP_poly.shp")
#set projection (important for maintaining km in transects)
gl_albers<-CRS("+proj=aea +lat_1=42.122774 +lat_2=49.01518 +lat_0=45.568977 +lon_0=-84.455955 +x_0=1000000 +y_0=1000000 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ")
proj4string(abund_shp) <- gl_albers
#merge shapefile with block list to eliminate all blocks with transects <1km
g.data<-merge(x = abund_shp, y = blocks_surv, by = "grid_id" )
g.data<-g.data[which(g.data$trans_len>=1),]
#calculate contiguity based neighbors (i.e., sharing boundary)
data_nbr<-poly2nb(g.data,queen=FALSE) #rook method
data_nbr<-include.self(data_nbr) #include self
#create row standardized spatial weights matrix, "listw" object (default style = "W")
nbr_w<- nb2listw(data_nbr)
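#optional sanity check (sketch): distribution of neighbor counts per block;
#with include.self() each block counts itself plus its rook neighbors
summary(card(data_nbr))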
#run Getis-Ord G-test (uses the data column and the spatial weights matrix)
G <- localG(g.data$abun_std, nbr_w)
#convert localG object to dataframe
t.df <- do.call("rbind", lapply(G, as.data.frame))
row.names(t.df)<-g.data$grid_id
#bin z-scores by confidence level: 90, 95, 99% (critical z-scores: 1.645, 1.96, 2.575); positive = hotspot, negative = coldspot
t.df$bin<- ifelse(t.df$'X[[i]]' <=-2.575, "-3",
ifelse( (t.df$'X[[i]]' >-2.575 & t.df$'X[[i]]' <= -1.96) , "-2",
ifelse( (t.df$'X[[i]]' >-1.96 & t.df$'X[[i]]' <= -1.645) , "-1",
ifelse( (t.df$'X[[i]]' >-1.645 & t.df$'X[[i]]' < 1.645) , "0",
ifelse( (t.df$'X[[i]]' >=1.645 & t.df$'X[[i]]' <1.96) , "1",
ifelse( (t.df$'X[[i]]' >=1.96 & t.df$'X[[i]]' <2.575) , "2",
ifelse(t.df$'X[[i]]'>=2.575, "3",
NA ))))))) # all other values map to NA
#convert back to dataframe and merge with orig data (data.merge) to get block counts and num times surveyed included in output
setDT(t.df, keep.rownames = TRUE)[]
t.df$grid_id <- as.numeric(t.df$rn)
colnames(t.df) <- c("blockid","zscore","bin","grid_id")
t.merg<-merge(x = t.df, y = g.data, by= "grid_id")
gstat.hs<-as.data.frame(t.merg)
row.names(gstat.hs)<-gstat.hs$grid_id
#select specific columns for output
gstat.hs<-gstat.hs[,c(1,3:5,15,18:20,25)]
#export model output
gstat_out<-paste(species,"_gstat.csv",sep="", collapse = "")
write.csv(gstat.hs, (gstat_out))
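#optional summary (sketch): number of blocks per confidence bin
#(3 = 99% hotspot, -3 = 99% coldspot, 0 = not significant)
table(gstat.hs$bin)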
############ PERSISTENCE MODEL (Parametric Approach #1) ############
#
# Sources:
# Suryan R.M., Santora J.A., and Veit R.R. 2012.
# New approach for using remotely sensed chlorophyll a to identify seabird hotspots. Marine Ecology Progress Series 451:213-225.
# Santora J.A. and Veit R.R. 2013. Spatio-temporal persistence of top predator hotspots near the Antarctic Peninsula.
# Marine Ecology Progress Series 487:287-304.
# Johnson S.M., Connelly E.E., Williams K.A., Adams E.M., Stenhouse I.J., and Gilbert A.T. 2015.
# Integrating data across survey methods to identify spatial and temporal patterns in wildlife distributions in:
# Wildlife densities and habitat use across temporal and spatial scales on the Mid-Atlantic Outer Continental Shelf:
# Final report to the Department of Energy Efficiency and Renewable Energy Wind & Water Power Technologies Office.
# Williams K.A., Connelly E.E., Johnson S.M., and Stenhouse I.J. (eds.) Award Number: DE-EE0005362.
# Report BRI 2015-11, Biodiversity Research Institute, Portland, Maine. 56 pp.
#
####################################################################
#create function that loops through all survey data
persist.hs.function<-function(data){
require(fitdistrplus) #pull in library to fit gamma distribution to data
all<-data #read in data
survey_prob<-matrix(NA,nrow=nrow(all)) #make blank matrix to store survey specific hotspots
all_prob<-matrix(NA,nrow=nrow(all)) #make blank matrix to store all survey hotspots
#get probability of being a hotspot for a unique survey
for (i in 1:nrow(all)){
surv<-all[i,]$surveyid #this is a factor of distinct survey IDs
tmp<-all[which(all$surveyid==surv),] #pull out all the data from a given survey for this point
if(nrow(tmp)>1){
tmp.g<-fitdist(tmp$block_survey_count,"gamma",method="mme") #fit a gamma distribution to the survey data
survey_prob[i]<-pgamma(all[i,9],shape=tmp.g$estimate[1],rate=tmp.g$estimate[2]) #assign a quantile to the data point given the gamma distribution we just fit
}
}
#get probability of being a hotspot for ALL surveys
all.g<-fitdist(all$block_survey_count,"gamma",method="mme") #fit a gamma distribution to all data (fit once; it is identical for every row)
for (i in 1:nrow(all)){
all_prob[i]<-pgamma(all[i,9],shape=all.g$estimate[1],rate=all.g$estimate[2]) #assign a quantile to the data point given the gamma distribution for all data
}
combo_prob<-(survey_prob+all_prob)/2 #combine the two probabilities together, essentially overall hotspots plus survey-specific hotspots (averages the two)
all.final<-data.frame(all,survey_prob,all_prob,combo_prob) #put all new calculated fields into dataframe with original data
plot(all.g) #plot the cumulative density frequencies, QQ plot, PP plot
dev.off() #turn off plot
return(all.final)
}
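#toy illustration of the quantile idea used above (sketch, simulated data):
#fit a gamma by moment matching, then ask where one count sits under that fit
library(fitdistrplus)
set.seed(123) #reproducibility of the toy example only
toy.counts<-rgamma(200, shape = 2, rate = 0.5)
toy.fit<-fitdist(toy.counts, "gamma", method = "mme")
pgamma(toy.counts[1], shape = toy.fit$estimate[1], rate = toy.fit$estimate[2]) #quantile in [0,1]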
#merge bird data with surveyed block list and remove all blocks that have less than 1km transects
persist.data.blocks<-merge(x = bird_recs_persist, y = blocks_surv, by = "grid_id" )
persist.data.blocks<-persist.data.blocks[which(persist.data.blocks$trans_len>=1),]
#run function on bird data
persist.hs<-persist.hs.function(data=persist.data.blocks)
persist.hs[is.na(persist.hs)] <- 0 #replace NAs with 0
#export model output
persist_out<-paste(species,"_persistence.csv",sep="", collapse = "")
write.csv(persist.hs, (persist_out))
###### CONDITIONAL ON PRESENCE MODEL (Parametric Approach #2) ######
#
# Sources:
# Kinlan, B.P., E.F. Zipkin, A.F. O'Connell, and C. Caldow. 2012.
# Statistical analyses to support guidelines for marine avian sampling:
# final report. U.S. Department of the Interior, Bureau of Ocean Energy Management,
# Office of Renewable Energy Programs, Herndon, VA. OCS Study BOEM 2012-101. NOAA Technical Memorandum NOS NCCOS 158. xiv+77 pp.
# Zipkin E.F., Kinlan B.P., Sussman A., Rypkema D., Wimer M., and O'Connell A.F. 2015.
# Statistical guidelines for assessing marine avian hotspots and coldspots:
# A case study on wind energy development in the U.S. Atlantic Ocean. Biological Conservation 191: 216-223.
#
####################################################################
#load libraries
library(fitdistrplus)
#aggregate on unique blocks using the total standardized effort-corrected count
cond.blocks<-with(bird_recs, aggregate(bird_recs[,12], data.frame(grid_id), sum))
colnames(cond.blocks) <- c("grid_id",species)
data.merge<-merge(x = cond.blocks, y = blocks_surv, by = "grid_id" ) #merge with surveyed blocks
data.merge$stdabund<-data.merge[,species]/data.merge$num_times_surveyed #add column to standardize for effort (number of times surveyed)
data.merge<-data.merge[which(data.merge$trans_len>=1),] #cutout all blocks that have less than 1km transect
#pull out all records for the species
x=data.merge$stdabund
names(x)=data.merge$grid_id
n=length(x)
#pull out all the nonzero counts
x1=x[which(x>0)]
n1=length(x1)
#create a vector with the number of times that a block was surveyed, including times when no individuals were observed
times.surv = data.merge$num_times_surveyed
names(times.surv)=data.merge$grid_id
#create a vector with the mean count of each block
species.mean = rep(0,length(times.surv))
names(species.mean)=names(times.surv)
#fill in the vector with the mean values for each block
for(k in 1:length(species.mean)) {
a=which(names(x)==names(times.surv[k]))
species.mean[k]=data.merge$stdabund[a]
}
#find the prevalence
prev=n1/n
#num simulations
nsim=10000
#fit lognormal distribution to nonzero counts
fit.lnorm <- fitdist(x1, "lnorm")
#pull out the parameter estimates
ln.mean = fit.lnorm$estimate[1]
ln.sd = fit.lnorm$estimate[2]
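#optional diagnostic (sketch): fitdistrplus provides a plot method for fitdist
#objects (density, CDF, Q-Q, and P-P panels) to eyeball the lognormal fit
plot(fit.lnorm)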
#simulate data for lnorm distribution given lognormal distribution mean and sd on count data
rn.binom <- rbinom(n*nsim,1,prev) #binom = prob of seeing a bird (yes/no)
binmat<-matrix(rn.binom,nrow=n,ncol=nsim)
rn.lnorm <- rlnorm(n*nsim,ln.mean,ln.sd) #lnorm = okay, we see a bird, what's our species count?
lnmat<-matrix(rn.lnorm, nrow=n, ncol=nsim)
#combine into matrix: 1767 rows (num blocks surveyed) by 10,000 cols (num simulations)
y=binmat*lnmat
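#optional sanity check (sketch): zeros in y come only from the binomial draws
#(rlnorm is strictly positive), so the simulated prevalence should be close to prev
mean(y > 0)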
#estimate the hotspot value
#get the 3x mean value for each of the 10,000 columns in the simulated data y (computed once, outside the loop)
threex.simmean<-3*colMeans(y)
ln.hotpvalue.3xmean=rep(NA,length(times.surv)) #create empty vector
for(j in 1:length(times.surv)) {
ln.hotpvalue.3xmean[j]=length(which(threex.simmean<species.mean[j]))/dim(y)[2]
} #loop through all surveyed blocks
#how many of those simulated 3x-mean values are smaller than the block's observed mean? (divide by the number of simulations to get a proportion)
#a value close to 0 indicates that the location is not a hotspot
#a value close to 1 indicates that the location is a significant hotspot
#combine results & export model output
conditional.hs=cbind(as.numeric(names(times.surv)),times.surv,species.mean, ln.hotpvalue.3xmean)
rownames(conditional.hs)=names(times.surv)
colnames(conditional.hs)=c("grid_id","times_sampled","mean_count", "hotpvalue_3xmean")
conditional.hs<-as.data.frame(conditional.hs)
conditional.hs_out<-paste(species,"_conditional.csv",sep="", collapse = "")
write.csv(conditional.hs, (conditional.hs_out))
####################### COMPARATIVE ANALYSIS #######################
#
# Format and combine results, look at percent congruence
# and run pearson's correlations on all four hotspot models
#
####################################################################
#load libraries
library(limma)
library(psych)
################### PREPARE HOTSPOT MODEL RESULTS ##################
#read in kernel density estimation hotspot analysis data (from ArcGIS)
kde.hs<-read.table("ALLSP_kde.csv", sep=",",header=TRUE)
names(kde.hs) <- tolower(names(kde.hs)) #change column names to lower case
names(kde.hs)[names(kde.hs)=="gid"] <- "grid_id" #rename gid column to grid_id to match the other tables
kde.hs<-merge(x=blocks_surv,y=kde.hs,by="grid_id",all.x=TRUE) #merge with blocks list
kde.hs[is.na(kde.hs)] <- 0 #set NAs to zero
#Gi*
names(gstat.hs)[names(gstat.hs)=="X"] <- "grid_id" #change grid col name to grid_id to match others
gstat.hs[is.na(gstat.hs)] <- 0 #set NAs to zero
#read in hotspot persistence (persistence calculated in PostgreSQL)
persist.hs.tot <- read.csv("ALLSP_persistence_totals.csv",header=TRUE)
persist.hs.tot[is.na(persist.hs.tot)] <- 0 #set NAs to zero
#conditional on presence
conditional.hs.b<-merge(x=blocks_surv,y=conditional.hs,by="grid_id",all=TRUE) #merge with blocks list
conditional.hs.b[is.na(conditional.hs.b)] <- 0 #set NAs to zero
################ COMBINE RESULTS INTO ONE DATAFRAME ################
#create empty df, and join each model's hotspot data
all.values<-data.frame(matrix(NA, ncol = 6, nrow = 1767)) #use real NA values (not the string "NA"); all columns are filled below
colnames(all.values)<-c("grid_id","num_times_surv","kde","gstat","persistence","conditional")
all.values$grid_id<-blocks_surv$grid_id
all.values$num_times_surv<-blocks_surv$num_times_surveyed[match(all.values$grid_id, blocks_surv$grid_id)]
row.names(all.values)<-all.values$grid_id
#populate models' columns
all.values$persistence<-persist.hs.tot$prop_hotspot_75[match(all.values$grid_id, persist.hs.tot$grid_id)]
all.values$conditional<-conditional.hs.b$hotpvalue_3xmean[match(all.values$grid_id, conditional.hs.b$grid_id)]
all.values$kde<-kde.hs$mean[match(all.values$grid_id, kde.hs$grid_id)]
all.values$gstat<-gstat.hs$zscore[match(all.values$grid_id, gstat.hs$grid_id)]
#we only want to run rank/correlations on blocks with 4 or more sampling events
all.values<-all.values[which(all.values$num_times_surv>=4),]
####################### LOOK AT % CONGRUENCE #######################
#determine if value/grid cell falls within the top 25% (1) or not (0)
all.values$kde.bin<-ifelse(all.values$kde <= quantile(all.values$kde,c(0.75)), 0,1)
all.values$gstat.bin<-ifelse(all.values$gstat <= quantile(all.values$gstat,c(0.75)), 0,1)
all.values$persistence.bin<-ifelse(all.values$persistence <= quantile(all.values$persistence,c(0.75)), 0,1)
all.values$conditional.bin<-ifelse(all.values$conditional <= quantile(all.values$conditional,c(0.75)), 0,1)
hs.bins<-all.values[,7:10] #pull the four hotspot indicator (bin) columns into one df (named hs.bins rather than c, which would mask base::c)
#create venn diagram to look at percent congruence
meth.venn<- vennCounts(hs.bins)
vennDiagram(meth.venn,names=c("kde","getis-ord gi*", "persistence","conditional"),circle.col =c("skyblue", "pink1", "mediumorchid", "orange") )
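#pairwise percent congruence (sketch): proportion of blocks where each pair of
#models agrees on hotspot (1) vs not (0); complements the venn diagram above
congr<-sapply(hs.bins, function(a) sapply(hs.bins, function(b) mean(a == b)))
round(congr, 2)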
##################### RUN PEARSON'S CORRELATION ####################
#for each model, create a new df and add a model.75 column: raw values above the 75th percentile are kept, everything else is set to 0 (i.e., taking the top 25%)
#kde
kde_top25<-transform(all.values, kde.75 = ifelse(all.values$kde <= quantile(all.values$kde,0.75), 0, all.values$kde))
#select only relevant cols
kde_top25<-kde_top25[,c(2,1,3,7,11)]
names(kde_top25)[names(kde_top25)=="grid_id"] <- "kde_grid_id"
#Gi*
gstat_top25<-transform(all.values, gstat.75 = ifelse(all.values$gstat <= quantile(all.values$gstat,0.75),0, all.values$gstat))
#select only relevant cols
gstat_top25<-gstat_top25[,c(1,4,8,11)]
names(gstat_top25)[names(gstat_top25)=="grid_id"] <- "gstat_grid_id"
#persistence
persistence_top25<-transform(all.values, persistence.75 = ifelse(all.values$persistence <= quantile(all.values$persistence,0.75), 0, all.values$persistence))
#select only relevant cols
persistence_top25<-persistence_top25[,c(1,5,9,11)]
names(persistence_top25)[names(persistence_top25)=="grid_id"] <- "persistence_grid_id"
#conditional
conditional_top25<-transform(all.values, conditional.75 = ifelse(all.values$conditional <= quantile(all.values$conditional,0.75), 0, all.values$conditional))
#select only relevant cols
conditional_top25<-conditional_top25[,c(1,6,10,11)]
names(conditional_top25)[names(conditional_top25)=="grid_id"] <- "conditional_grid_id"
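#equivalent compact form (sketch): the four *.75 columns above can also be built
#in one loop; stored in a separate df (top25) so the column indexing above is untouched
top25<-all.values[, "grid_id", drop = FALSE]
for (m in c("kde", "gstat", "persistence", "conditional")) {
thr<-quantile(all.values[[m]], 0.75)
top25[[paste0(m, ".75")]]<-ifelse(all.values[[m]] <= thr, 0, all.values[[m]])
}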
#combine all models together
allmethods_top25=cbind(kde_top25,gstat_top25,persistence_top25,conditional_top25)
#pull out the four raw-value columns for correlations
allmethods_top25_corr_rawvals<-allmethods_top25[,c(3,7,11,15)]
#run correlations
cor.four.top25.raw<-corr.test(allmethods_top25_corr_rawvals,use = "complete",method="pearson",adjust="bonferroni", alpha=.05,ci=TRUE)
#export correlations & all model values for mapping
four.top25.raw.out<-paste(species,"_vals.csv",sep="", collapse = "")
write.csv(allmethods_top25, (four.top25.raw.out))
cor.four.top25.raw.out<-paste(species,"_corr.csv",sep="", collapse = "")
write.csv(cor.four.top25.raw$r, (cor.four.top25.raw.out))
#Determine which correlations are significant
library(Hmisc)
library(corrplot)
cortest<-rcorr(as.matrix(allmethods_top25[,c(3,7,11,15)]),type="pearson") #run correlation on the model raw value columns (the same four columns used in corr.test above)
cortest.r<-cortest$r #save correlations
cortest.p<-cortest$P #save p-values
corrplot.mixed(cortest.r,tl.col="black",lower.col="black",p.mat = cortest.p, sig.level = 0.05) #plot; insignificant cells are crossed out by default (insig = "pch"); use insig = "blank" to leave them empty
title(paste(species," Significant Correlations",sep="", collapse = ""),line=-0.75) #add title
#In the figure above, correlations with p-values > 0.05 are considered insignificant.
#Insignificant coefficients are either crossed out (the default, insig = "pch") or left blank (insig = "blank").
#Positive correlations are displayed in blue and negative correlations in red.
#Color intensity and circle size are proportional to the correlation coefficients.