-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsnp-cutoff.within_host_diversity.R
569 lines (441 loc) · 24.5 KB
/
snp-cutoff.within_host_diversity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
# If you find this code useful, please cite:
# Coll et al. Definition of a genetic relatedness cutoff to exclude recent transmission of meticillin-resistant Staphylococcus aureus: a genomic epidemiology analysis. Lancet Microbe. 2020 Dec;1(8):e328-e335. doi: 10.1016/S2666-5247(20)30149-X. PMID: 33313577; PMCID: PMC7721685. (https://www.thelancet.com/journals/lanmic/article/PIIS2666-5247(20)30149-X/fulltext)
# Instructions on using this R script:
# 1. Change working_dir variable to include the full path of your working directory
# 2. Change full path to SupplementaryData1.xlsx
# 3. The R package gdata needs to be installed, which is used to load .xlsx files
# 4. The R package ggplot2 needs to be installed, which is used for plotting
# 5. The R package lme4 needs to be installed, which is used to run linear mixed models
# 6. The R package svglite needs to be installed, which is used to save plots as SVG files
##########################################################################################################
### 1. INPUT FILES ####
##########################################################################################################
# Full path of the working directory. When left empty the current working directory is kept,
# because setwd("") stops the script with "cannot change working directory".
working_dir = "";
if(nzchar(working_dir)){ setwd(working_dir) }
dataS1_file = "SupplementaryData1.xlsx"
# library() fails immediately when a package is missing; require() would only return FALSE
library(gdata)
library(ggplot2)
library(svglite)
# Each sheet of the supplementary file holds the SNP distances of one reference mapping (MGEs removed):
#   sheet 1 > st22_ref.whole : whole chromosome of the ST22 strain HO 5096 0412 reference genome
#   sheet 2 > st22_ref.core  : core genome portion of the ST22 strain HO 5096 0412 reference genome
#   sheet 3 > st30_ref.whole : whole genome portion of the ST30 strain MRSA252 reference genome
#   sheet 4 > st30_ref.core  : core genome portion of the ST30 strain MRSA252 reference genome
mapping_sheets = c("st22_ref.whole", "st22_ref.core", "st30_ref.whole", "st30_ref.core")
# Mapping to analyse: change this single value to switch data set. Only the selected sheet is read.
# The default is the mapping that was effectively selected before (the last of the four assignments).
output_sufix = "st30_ref.core";
if(!(output_sufix %in% mapping_sheets))
{
  stop(paste0("Unknown output_sufix: ", output_sufix), call. = FALSE)
}
dataS1 = read.xls(dataS1_file, sheet = match(output_sufix, mapping_sheets), header = TRUE);
dim(dataS1)
# [1] 1557 12
# Total number of MRSA isolates used
length(unique(c(as.vector(dataS1$SequencingTag1),as.vector(dataS1$SequencingTag2))))
# [1] 1276
##########################################################################################################
### 2. PERCENTAGE OF MIXED STRAINS ####
##########################################################################################################
# Number of individuals with more than one isolate
length(unique(dataS1$AnonymisedPatientId))
# [1] 459
# Number of individuals with more than one strain (as defined by having isolates from different CCs or
# isolates from the same CC but different clades, labelled as outliers)
# TRUE for every pairwise comparison whose Note mentions "outlier" (grepl() returns FALSE for NA notes)
is_outlier = grepl("outlier", dataS1$Note)
length(unique(dataS1$AnonymisedPatientId[is_outlier]))
# [1] 23
# Removing outliers
# A logical mask is used instead of dataS1[-which(...),]: with no outlier present, -which() is an empty
# index and would drop every row instead of none.
dataS1 = dataS1[!is_outlier,]
dim(dataS1)
# [1] 1510 12
##########################################################################################################
### 3. SNP DISTANCES AMONG ISOLATES COLLECTED ON THE SAME DAY ####
##########################################################################################################
# Isolates from the same patient collected on the same day will be used to calculate the cloud of diversity
dataS1sd = dataS1[which(dataS1$TimeGap==0),];
# Candidate same-day comparisons (row numbers of dataS1sd) for every patient. They do not depend on the
# random draw, so they are worked out once instead of once per resampling iteration.
individuals = unique(as.vector(dataS1sd$AnonymisedPatientId))
candidate_rows = lapply(individuals, function(individual)
{
  rows = which(dataS1sd$AnonymisedPatientId==individual)
  # Extracting total number of collection dates available per host
  dates_host = unique(c(as.vector(dataS1sd$CollectionDate1[rows]), as.vector(dataS1sd$CollectionDate2[rows])))
  # If multiple collection dates are available, select earliest one
  if(length(dates_host)>1)
  {
    earliest_date = min(as.Date(dates_host, format="%Y-%m-%d"))
    rows = which(dataS1sd$AnonymisedPatientId==individual & as.Date(dataS1sd$CollectionDate1) == earliest_date)
  }
  rows
})
# Patients left without any usable comparison (e.g. missing patient id or unparsable date) cannot be
# sampled from and are skipped; sample() would otherwise stop on the empty set
candidate_rows = candidate_rows[lengths(candidate_rows) > 0]
# The random draw of one comparison per patient is repeated to see how much the 95th percentile
# depends on which comparison happens to be picked
number_resamples = 100
percentiles_95 = rep(NA_real_, number_resamples)
for(r in seq_len(number_resamples))
{
  # Extracting only one comparison per host: the only one available, or a random one otherwise.
  # (sample() is not called on a single row number because sample(n, 1) would draw from 1:n.)
  keepInd = vapply(candidate_rows, function(rows)
  {
    if(length(rows)==1){ rows } else { sample(rows, 1) }
  }, integer(1))
  # Keeping one randomly drawn isolate pair per patient. Despite its name, dataS1sd_max does NOT hold
  # the pair with the maximum SNP distance.
  dataS1sd_max = dataS1sd[keepInd,]
  percentiles_95[r] = quantile(dataS1sd_max$SNPs, probs = 0.95)
}
# dataS1sd_max now holds the draw of the last iteration; it is reused in the sections below
dim(dataS1sd_max)
# [1] 82 12
quantile(percentiles_95)
# 0% 25% 50% 75% 100%
# 12.00 12.95 13.90 13.95 13.95 > st22_ref.whole
# 6.90 7.95 8.90 9.85 9.90 > st22_ref.core
# 15.00 20.70 20.70 22.60 22.65 > st30_ref.whole
# 6.9500 7.9500 9.8500 9.8625 9.9000 > st30_ref.core
##########################################################################################################
### 4. EMPIRICAL DISTRIBUTION OF CLOUD OF DIVERSITY ####
##########################################################################################################
# The "cloud of diversity" follows an exponential distribution
# Two data frames are summarised below:
#   dataS1sd     : all same-day (TimeGap == 0) pairwise comparisons
#   dataS1sd_max : one comparison per patient, as drawn at random in the last resampling iteration of
#                  section 3 (so its quantiles can change slightly between runs)
# The numbers in the comments are the outputs recorded by the authors.
# Quartiles over all same-day comparisons
quantile(dataS1sd$SNPs)
# 0% 25% 50% 75% 100%
# 0 1 3 5 82 > st22_ref.whole
# Quartiles over one comparison per patient
quantile(dataS1sd_max$SNPs)
# 0% 25% 50% 75% 100%
# 0 1 3 5 82
# 95th percentile over one comparison per patient
# (recorded output not labelled with a mapping; presumably st22_ref.whole - TODO confirm)
quantile(dataS1sd_max$SNPs, probs = 0.95)
# 95%
# 13.95
# 95th percentile over all same-day comparisons
quantile(dataS1sd$SNPs, probs = 0.95)
# 95%
# 14.85
## Plots
# Empirical distribution of the cloud of diversity across all CCs
# Width and height (in inches) passed to ggsave() for the cloud-of-diversity plots
plot_width = 6; plot_height = 5;
# Dot plot of SNP distances, one dot per row of `data` drawn in row order, with dashed guide lines
# marking the 95th percentile of data$SNPs (rounded to a whole number of SNPs).
# Arguments:
#   data          : data frame with a numeric SNPs column. The rows must already be sorted by
#                   increasing SNPs: the vertical guide line is placed at the last row whose value is
#                   at or below the cutoff.
#   text_x_offset : x position of the "95 percentile = ..." label
#   plot_title    : title of the plot
# Returns the ggplot object; nothing is drawn or saved here. The y axis is limited to 0-100, so dots
# above 100 SNPs are not shown.
plot_cloud_of_diversity = function(data, text_x_offset, plot_title)
{
  size_dot = 1; size_axis_lines = 0.3; text_y_offset = 4; font = "Times"; dot_color = "dimgray";
  axis_text_size = 15; axis_title_size = 20; ann_text_size = 5;
  # Cutoff on the y axis and the row index (x axis) where it is reached
  co_y = round(as.numeric(quantile(data$SNPs, probs = 0.95)));
  rows_within_cutoff = which(data$SNPs <= co_y)
  if(length(rows_within_cutoff) == 0)
  {
    stop("No value of data$SNPs is at or below the rounded 95th percentile", call. = FALSE)
  }
  co_x = max(rows_within_cutoff)
  # The guide lines are added with annotate(): constants placed inside aes() of a geom_segment() layer
  # are recycled over the plot data and draw one identical segment per row on top of each other.
  g1 <- ggplot(data, aes(x=seq_len(nrow(data)), y=SNPs)) +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_blank(), axis.line = element_line(colour = "black", size = size_axis_lines), axis.ticks = element_line(size = size_axis_lines)) +
    geom_point(shape = 21, colour = dot_color, fill = dot_color, size = size_dot) +
    ylim(0, 100) +
    annotate("segment", x = 0, y = co_y, xend = co_x, yend = co_y, linetype="dashed", size=size_axis_lines) +
    annotate("segment", x = co_x, y = 0, xend = co_x, yend = co_y, linetype="dashed", size=size_axis_lines) +
    annotate("text", x = 0 + text_x_offset, y = co_y + text_y_offset, label = paste("95 percentile =",co_y,"SNPs",sep=" "), family=font, size=ann_text_size) +
    ylab("Number of SNPs") +
    xlab("Patients") +
    ggtitle(plot_title) +
    theme(text = element_text(family = font)) +
    theme(axis.text = element_text(size=axis_text_size, color="black"), axis.title=element_text(size=axis_title_size), title=element_text(size=axis_title_size))
  g1
}
# Plot the one-comparison-per-patient data, ordered by increasing SNP distance.
# Note that dataS1sd is overwritten here with that ordered subset.
snp_order = order(as.numeric(dataS1sd_max$SNPs))
dataS1sd = dataS1sd_max[snp_order,]
# Label position and title of the plot
text_x_offset = 15
plot_title = "Empirical Cloud of Diversity"
g1 = plot_cloud_of_diversity(data = dataS1sd, text_x_offset = text_x_offset, plot_title = plot_title)
# Save as PDF in the working directory; the file name carries the mapping suffix
plot_file = paste0("empirical_clould_of_diversity.allCCs.", output_sufix, ".pdf")
ggsave(filename = plot_file, plot = g1, device = "pdf", units = "in", width = plot_width, height = plot_height, dpi = 300)
##########################################################################################################
### 6. APPLYING LINEAR MIXED MODELS ####
##########################################################################################################
# Linear mixed models are applied to calculate the SNP accumulation rate and to model the "cloud of diversity"
# The number of SNPs between MRSA isolates (SNPs) is modelled as a function of the time gap (TimeGap) between isolates
# The intercept (that is, number of SNPs at time 0) is interpreted as the "cloud of diversity" at time 0 and assumed to
# vary by patient (AnonymisedPatientId, random variable)
library(lme4)
# Fixed effect: TimeGap (its coefficient is the SNP accumulation rate per unit of TimeGap).
# Random effect: one intercept per patient, (1|AnonymisedPatientId).
# Fitted on all pairwise comparisons currently held in dataS1.
lmer_all = lmer(SNPs ~ TimeGap + (1|AnonymisedPatientId), data=dataS1)
# Printed only when this line is run at the console; nothing is shown when the file is source()d
summary(lmer_all)
##########################################################################################################
### 5. PLOT SNP DISTANCES OVER TIME ####
##########################################################################################################
# Binning data point by TimeGap in months
# Twelve 30-day bins [0,30), [30,60), ..., [330,360) labelled "1" to "12" (factor levels in that order).
# Time gaps outside [0,360) are left as NA.
dataS1$bin = cut(dataS1$TimeGap, breaks = seq(0,360,30), labels = seq(1,12,1), right = FALSE)
# Creating X labels: month number plus the number of pairwise comparisons in that bin
x_tick_labels = paste0(levels(dataS1$bin),"\n(N=",table(dataS1$bin),")")
size_axis_lines = 0.3; axis_x_text_size = 8; axis_y_text_size = 15; axis_title_size = 20;
# Font family of the plot text. It has to be defined here: the `font` variable set inside
# plot_cloud_of_diversity() is local to that function, so using it at this level stopped with
# "object 'font' not found".
font = "Times";
plot_title = "Number of SNPs over time"
# Boxplot of binned SNP distances per month
# NOTE(review): limits=c(0,25) removes comparisons with more than 25 SNPs before the box statistics
# are computed (coord_cartesian() would only zoom) - confirm this is intended.
snp_boxplot = ggplot(dataS1,aes(x = bin, y = SNPs)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black",
        size = size_axis_lines), axis.ticks = element_line(size = size_axis_lines)) +
  geom_boxplot(varwidth = TRUE, notch=FALSE, outlier.size = 0.5) +
  scale_y_continuous(limits=c(0,25)) +
  scale_x_discrete(labels=x_tick_labels) +
  ylab("Number of SNPs") +
  xlab("Time distance in months\n(Pairwise Comparisons)") +
  ggtitle(plot_title) +
  theme(text = element_text(family = font)) +
  theme(axis.text.x = element_text(size=axis_x_text_size, color="black"), axis.text.y = element_text(size=axis_y_text_size, color="black"),
        axis.title=element_text(size=axis_title_size), title=element_text(size=axis_title_size))
# Save as SVG in the working directory; the file name carries the mapping suffix
boxplot_file = paste0("snp_distances_over_time.allCCs.",output_sufix,".svg");
ggsave(boxplot_file, plot = snp_boxplot, device = "svg", width = 6, height = 5, dpi = 300, units = "in")
##########################################################################################################
### 6. CALCULATION OF THE MRSA SUBSTITUTION RATE ####
##########################################################################################################
## Length of the reference and total length of its MGE regions for the mapping selected by
## output_sufix. They used to be assigned one after the other, so the ST22 core-genome values were
## always the ones in effect, whatever mapping had been loaded.
genome_lengths = switch(output_sufix,
  "st22_ref.whole" = c(chromosome = 2832299, mge = 159127), # ST22 chromosome reference and its MGEs
  "st22_ref.core"  = c(chromosome = 1759534, mge = 2809),   # ST22 core-genome reference and its MGEs
  "st30_ref.whole" = c(chromosome = 2902619, mge = 314479), # ST30 chromosome reference and its MGEs
  "st30_ref.core"  = c(chromosome = 1754228, mge = 3866),   # ST30 core-genome reference and its MGEs
  stop(paste0("Unknown output_sufix: ", output_sufix), call. = FALSE))
chromosome_length = genome_lengths[["chromosome"]];
mge_regions_length = genome_lengths[["mge"]];
# Number of sites the SNPs were called on (reference minus MGE regions)
sites_analysed = chromosome_length - mge_regions_length
### Using MRSA isolates from all CCs
# Fixed-effect estimate of TimeGap: SNPs accumulated per unit of TimeGap (days)
timegap_coefficient_all = coef(summary(lmer_all))["TimeGap","Estimate"]
timegap_coefficient_all
# [1] 0.01120942
# [1] 0.007115821
## Converting units to SNPs per genome per year
substitution_rate_all = timegap_coefficient_all*365
substitution_rate_all
# [1] 4.091438
# [1] 2.597275
## 95% confidence interval
CI = confint(lmer_all, "TimeGap", level = 0.95)
CI_lower_bound = CI[1]*365
CI_lower_bound
# [1] 2.409668
# [1] 1.475942
CI_upper_bound = CI[2]*365
CI_upper_bound
# [1] 5.774858
# [1] 3.719656
## Converting units to substitutions per site per year
substitution_rate_all_ps = substitution_rate_all/sites_analysed;
substitution_rate_all_ps
# [1] 1.530556e-06
# [1] 1.478475e-06
CI_lower_bound_ps = CI_lower_bound/sites_analysed
CI_lower_bound_ps
# [1] 9.014263e-07
# [1] 8.401667e-07
CI_upper_bound_ps = CI_upper_bound/sites_analysed
CI_upper_bound_ps
# [1] 2.160302e-06
# [1] 2.117381e-06
##########################################################################################################
### 7. APPLYING LINEAR MIXED MODELS WITH SUB-SAMPLED DATA ####
##########################################################################################################
# Because patients differ in the number of isolate genomes available per sample (colonies sequenced)
# the dataset needs to be deduplicated to keep only one isolate per sample. This is done by:
# 1. Keeping all different samples (i.e. collection dates) per patient
# 2. Keeping 1 isolate genome per sample
# 3. If a patient has a single sample (i.e. single collection date), two isolates from this sample are kept
# Isolates are selected randomly
library(lme4)
number_iterations = 100
# One value per iteration. Filled with NA so that an interrupted run cannot be mistaken for a complete one.
mutation_rates_iterations = rep(NA_real_, number_iterations)     # TimeGap coefficient
mutation_rates_lci_iterations = rep(NA_real_, number_iterations) # lower bound of its 95% CI
mutation_rates_uci_iterations = rep(NA_real_, number_iterations) # upper bound of its 95% CI
beta_zero_iterations = rep(NA_real_, number_iterations)          # 95th percentile of the patient intercepts
host_ids = unique(as.vector(dataS1$AnonymisedPatientId))
length(host_ids)
# [1] 445

# Returns the unique sequencing tags of the isolates of patient `host_id` collected on
# `collection_date`, looking at both members of every pairwise comparison in `data`.
isolates_on_date = function(data, host_id, collection_date)
{
  same_host = data$AnonymisedPatientId == host_id
  as_first = which(same_host & data$CollectionDate1 == collection_date)
  as_second = which(same_host & data$CollectionDate2 == collection_date)
  unique(c(as.vector(data$SequencingTag1[as_first]), as.vector(data$SequencingTag2[as_second])))
}

for(i in seq_len(number_iterations))
{
  print(paste0("Iteration number: ",i))
  # Isolates kept in this iteration, one list element per host
  host_isolates_kept = vector("list", length(host_ids))
  # Across all hosts, sub-sample randomly to de-duplicate dataset
  for(h in seq_along(host_ids))
  {
    hhh = which(dataS1$AnonymisedPatientId == host_ids[h])
    # Sample only one CC
    host_clonal_complexes = as.character(unique(c(as.vector(dataS1$ClonalComplex1[hhh]), as.vector(dataS1$ClonalComplex2[hhh]))))
    host_clonal_complex = sample(host_clonal_complexes,1)
    # Select comparison of sampled CC
    hhh = which(dataS1$AnonymisedPatientId == host_ids[h] & dataS1$ClonalComplex1 == host_clonal_complex)
    # Get all available collection dates
    host_collection_dates = unique(c(as.vector(dataS1$CollectionDate1[hhh]),as.vector(dataS1$CollectionDate2[hhh])))
    # NOTE(review): the sampled CC only determines the collection dates. The isolates below are drawn
    # from every comparison of the host on that date, whatever their CC - confirm this is intended.
    if(length(host_collection_dates)==1)
    {
      # If only one available collection date/sample > keep two random isolates
      host_isolates = sample(isolates_on_date(dataS1, host_ids[h], host_collection_dates[1]), 2)
    } else
    {
      # Else, for each collection date, randomly sample one isolate.
      # lapply() runs zero times when the sampled CC has no comparison (CC present in ClonalComplex2
      # only); a 1:length() loop would run for indices 1 and 0 and stop inside sample().
      host_isolates = unlist(lapply(host_collection_dates, function(collection_date)
      {
        sample(isolates_on_date(dataS1, host_ids[h], collection_date), 1)
      }))
    }
    # list() wrapper: assigning NULL with [[h]] would delete the element and shift the others
    host_isolates_kept[h] = list(host_isolates)
  }
  host_isolates_kept = unlist(host_isolates_kept)
  print(paste0("Number of isolates sub-sampled: ",length(host_isolates_kept)))
  # Keeping comparisons in which both isolates were sub-sampled
  iii = which(dataS1$SequencingTag1 %in% host_isolates_kept & dataS1$SequencingTag2 %in% host_isolates_kept)
  print(paste0("Number of pairwise comparisons sub-sampled: ",length(iii)))
  print(paste0("Number of patients sub-sampled: ",length(unique(dataS1$AnonymisedPatientId[iii]))))
  # Hosts that ended up without any comparison in this iteration
  missing_hosts = host_ids[!(host_ids %in% dataS1$AnonymisedPatientId[iii])]
  print(paste0("Missing sub-sampled patients: ",paste(missing_hosts, collapse = ";")))
  ### Running linear mixed model
  dataS1_sub = droplevels(dataS1[iii,])
  dataS1_sub$AnonymisedPatientId=as.factor(dataS1_sub$AnonymisedPatientId)
  # lmer_all is deliberately overwritten: after the loop it holds the model of the last iteration
  lmer_all = lmer(SNPs ~ TimeGap + (1|AnonymisedPatientId), data=dataS1_sub)
  timegap_coefficient_all = coef(summary(lmer_all))["TimeGap","Estimate"]
  mutation_rates_iterations[i] = timegap_coefficient_all
  CI = confint(lmer_all, "TimeGap", level = 0.95)
  mutation_rates_lci_iterations[i] = CI[1]
  mutation_rates_uci_iterations[i] = CI[2]
  # Per-patient intercepts (fixed intercept + patient deviation) and their 95th percentile
  beta0_all = as.vector(unlist(coef(lmer_all)$AnonymisedPatientId["(Intercept)"]))
  beta_zero_iterations[i] = quantile(beta0_all, probs = 0.95)
}
##### Mutation rates and 95% cloud of diversity across all 100 iterations
# Distribution of the per-iteration rates, converted from SNPs per day to SNPs per genome per year
quantile(mutation_rates_iterations*365)
# 0% 25% 50% 75% 100%
# 4.431645 4.607433 4.707698 4.817113 4.913721 > st22_ref.whole
# 2.727992 2.852357 2.935401 2.988398 3.095441 > st22_ref.core
# 4.513096 4.749575 4.902059 5.119699 5.441598 > st30_ref.whole
# 2.704350 2.839135 2.887466 2.952478 3.069959 > st30_ref.core
# Distribution of the per-iteration 95th percentile of the patient intercepts
quantile(beta_zero_iterations)
# 0% 25% 50% 75% 100%
# 18.51329 18.93635 19.22067 19.47765 19.72289 > st22_ref.whole
# 9.776901 10.004914 10.452521 10.822996 10.846875 > st22_ref.core
# 18.18067 19.29616 21.44925 22.16469 22.29634 > st30_ref.whole
# 10.02485 10.08829 10.36068 10.61637 10.66863 > st30_ref.core
### Extracting median substitution rate and 95% CI across iterations
# Length of the reference and of its MGE regions for the selected mapping. An unknown suffix stops
# the script instead of silently reusing lengths left over from an earlier section.
genome_lengths = switch(output_sufix,
  "st22_ref.whole" = c(chromosome = 2832299, mge = 159127), # ST22 chromosome reference and its MGEs
  "st22_ref.core"  = c(chromosome = 1759534, mge = 2809),   # ST22 core-genome reference and its MGEs
  "st30_ref.whole" = c(chromosome = 2902619, mge = 314479), # ST30 chromosome reference and its MGEs
  "st30_ref.core"  = c(chromosome = 1754228, mge = 3866),   # ST30 core-genome reference and its MGEs
  stop(paste0("Unknown output_sufix: ", output_sufix), call. = FALSE))
chromosome_length = genome_lengths[["chromosome"]];
mge_regions_length = genome_lengths[["mge"]];
# The median is taken as an actual iteration (lower median) so that the confidence interval of that
# same iteration can be reported with it. The rank follows the number of iterations run
# (50 for 100 iterations) instead of being fixed at 50.
median_rank = ceiling(length(mutation_rates_iterations)/2)
mutation_rate_median = sort(mutation_rates_iterations)[median_rank]
# First iteration that produced the median rate
median_iteration = match(mutation_rate_median, mutation_rates_iterations)
mutation_rate_lci = mutation_rates_lci_iterations[median_iteration]
mutation_rate_uci = mutation_rates_uci_iterations[median_iteration]
# Rate in the units of the model: SNPs per genome per unit of TimeGap (days)
print(paste(mutation_rate_median," (",mutation_rate_lci," - ",mutation_rate_uci,")",sep=""))
# [1] "0.0128724506584772 (0.00781873051982872 - 0.0179269008529202)" > st22_ref.whole
# [1] "0.0134591253302896 (0.0070021257138012 - 0.0199192055155959)" > st30_ref.whole
# [1] "0.00800455270067963 (0.00463717411557472 - 0.0113724084147899)" > st22_ref.core
# [1] "0.00791734559036188 (0.00448982538564908 - 0.0113475954839798)" > st30_ref.core
# Converting units to SNPs per genome per year
print(paste(mutation_rate_median*365," (",mutation_rate_lci*365," - ",mutation_rate_uci*365,")",sep=""))
# [1] "4.69844449034419 (2.85383663973748 - 6.54331881131588)" > st22_ref.whole
# [1] "4.9125807455557 (2.55577588553744 - 7.2705100131925)" > st30_ref.whole
# [1] "2.92166173574806 (1.69256855218477 - 4.15092907139832)" > st22_ref.core
# [1] "2.88983114048209 (1.63878626576192 - 4.14187235165264)" > st30_ref.core
# Converting units to substitutions per site per year
print(paste(mutation_rate_median*365/(chromosome_length-mge_regions_length)," (",mutation_rate_lci*365/(chromosome_length-mge_regions_length)," - ",mutation_rate_uci*365/(chromosome_length-mge_regions_length),")",sep=""))
# [1] "1.75762894806028e-06 (1.06758436783622e-06 - 2.44777321149401e-06)" > st22_ref.whole
# [1] "1.89811244583203e-06 (9.87495222645389e-07 - 2.80916411523043e-06)" > st30_ref.whole
# [1] "1.66312982154183e-06 (9.63479515681038e-07 - 2.36287926192109e-06)" > st22_ref.core
# [1] "1.65099056108513e-06 (9.36255623557821e-07 - 2.36629471598026e-06)" > st30_ref.core
##########################################################################################################
### 8. MODELED DISTRIBUTION OF CLOUD OF DIVERSITY ####
##########################################################################################################
# Per-patient intercepts of the model currently stored in lmer_all
# (the sub-sampling loop refits lmer_all on every iteration, so this is its last fit)
patient_coefficients = coef(lmer_all)$AnonymisedPatientId
beta0_all = patient_coefficients[["(Intercept)"]]
quantile(beta0_all)
# 0% 25% 50% 75% 100%
# -0.5326214 2.2241942 4.2607700 7.8025847 56.8797944
# -0.6022719 1.2223954 2.0550299 3.8494356 34.6246406
quantile(beta0_all, probs = 0.95)
# 95%
# 19.43245
# 10.51029
## Plots
# Modelled distribution of the cloud of diversity across all CCs
# Intercepts in increasing order, with their rank in column X
beta0_all_df = data.frame(X = seq(1, length(beta0_all), 1), SNPs = sort(beta0_all))
# Label position and title of the plot
text_x_offset = 70
plot_title = "Modelled Cloud of Diversity"
g3 = plot_cloud_of_diversity(data = beta0_all_df, text_x_offset = text_x_offset, plot_title = plot_title)
# Save as PDF in the working directory; the file name carries the mapping suffix
plot_file = paste0("modelled_clould_of_diversity.allCCs.", output_sufix, ".pdf")
ggsave(filename = plot_file, plot = g3, device = "pdf", units = "in", width = plot_width, height = plot_height, dpi = 300)
##########################################################################################################
####**** 9. SIMULATION MODEL OF TRANSMISSION ****####
##########################################################################################################
# Load in the required functions
# These are the distribution sampler / the curve for the number of SNPs over time / the simulation model
source("simu_transmission_fn.R")
# library() fails immediately when the package is missing; require() would only return FALSE
library(reshape2)
## Simulation population
npat = 459 # number of patient samples in cohort 1
nruns = 200000 # number of transmission samples - can be increased to reduce variation in number of SNPs
ndays = 180 # time between samples
## Choose which mapping: All of CC22/CC30 or core genome only
map <- "CC22" # ST22 strain HO 5096 0412 mapping data
#map <- "CC30"
#map <- "CC22_core"
#map <- "CC30_core"
## mu = substitution_rate (SNPs per genome per year, converted to per day)
## An unknown map stops here instead of leaving mu undefined.
mu = switch(map,
  "CC22" = 4.7 / 365,
  "CC30" = 4.9 / 365,
  "CC22_core" = 2.9 / 365,
  "CC30_core" = 2.9 / 365,
  stop(paste0("Unknown map: ", map), call. = FALSE))
## Parameters from model fit (second column of the per-mapping CSV file)
param_general_fit <- read.csv(paste0("output/param_general_fit_",map,".csv"))[,2]
## Data for time zero transferred variability
## Which sheet?
## NOTE(review): section 1 describes sheet 2 as the ST22 core-genome mapping and sheet 3 as the ST30
## whole-genome mapping, which does not agree with CC30 = 2 and CC22_core = 3 below. The numbers are
## kept as they were - confirm against the sheet order of SupplementaryData1.xlsx.
sheet_num = switch(map,
  "CC22" = 1,
  "CC30" = 2,
  "CC22_core" = 3,
  "CC30_core" = 4)
dataS1 = read.xls(dataS1_file, sheet = sheet_num, header = TRUE)
# Removing outliers
# Logical mask instead of dataS1[-which(...),], which drops every row when no outlier is present
dataS1 = dataS1[!grepl("outlier",dataS1$Note),]
# Same day
dd0 = dataS1[which(dataS1$TimeGap == 0),] # 104 pairs
# Proportion of same-day pairs per SNP distance (hist() also draws the histogram)
# NOTE(review): with these breaks hist() uses right-closed bins and include.lowest = TRUE, so pairs
# with 0 SNPs and pairs with 1 SNP are counted together in the first bin - confirm this is what the
# sampler in simu_transmission_fn.R expects.
h <- hist(dd0$SNPs, breaks = seq(0,max(dd0$SNPs),1))
t0prob_dist <- h$counts/sum(h$counts)
t0prob_dist[t0prob_dist <= 0.01] <- 0 ## Remove those at < 1%: has big impact on rMyDist.
nonz <- which(t0prob_dist > 0) ## Make the length of the vector equal to the max number of SNPs (remove those 0 values above this)
t0prob_dist <- t0prob_dist[1:max(nonz)]
###### Sample from the baseline variance in the source patient at transmission ##############################
## The simulation is repeated several times: some random variation of the maximum is expected,
## depending on the number of runs used, and the repeats give a range for it.
n_repeats = 10
m <- numeric(n_repeats)
for(i in seq_len(n_repeats)){
  ss <- simu_runs(ndays,mu,npat,nruns, param_general_fit, t0prob_dist)
  ## Maximum number of SNPs needed to capture 95% of the transmission events
  ## (= below this, capture 95% of all transmissions)
  limits = ss$store_limits
  rows_95 = which(limits$variable == "95%")
  m[i] <- max(limits[rows_95,"value"])
}
## Plot the output from the last run
# g5 = ggplot(ss$store_limits, aes(x=value, fill = variable)) + geom_histogram(aes(y=..density..), binwidth = 1, position = "identity") +
# facet_wrap(~variable) + guides(fill=FALSE) + scale_y_continuous(paste0("Density across ", nruns, " simulations")) + scale_x_continuous("Number of SNPs")
#
# plot_file = paste0("simulation_model_distribution_of_SNPs_",map,".pdf");
#
# ggsave(plot_file, plot = g5, device = "pdf", width = plot_width, height = plot_height, dpi = 300, units = "in")
## Results: largest value and range of the maxima over the repeats
max(m)
range(m)
## 95% of transmission events (max [range])
## map = CC22 = 17 (16 - 17)
## map = CC30 = 55 (52 - 55)
## map = CC22_core = 12 (11 - 12)
## map = CC30_core = 12 (12 - 12)