-
Notifications
You must be signed in to change notification settings - Fork 0
/
impact_unsampled_locations.R
113 lines (101 loc) · 5.89 KB
/
impact_unsampled_locations.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
## This script reproduces the analysis evaluating how unsampled locations
## impact the computation of the relative risk of observing identical sequences
## between sampled locations.
library(tidyverse)
## Load characteristics of WA counties. The columns used below are `county`
## and `is_west` (flag identifying the counties of Western WA).
df_char_counties <- read.csv('../data/maps/county_wa.csv') %>% as_tibble()

## Counties flagged as Western WA. The flag is compared with TRUE rather than
## T: T is an ordinary variable that can be reassigned, TRUE is a reserved word.
vec_counties_west <- df_char_counties$county[df_char_counties$is_west == TRUE]

## Load the relative risk of observing identical sequences between two counties.
## The columns used below are `group_1`, `group_2`, `n_pairs` and `RR`.
df_RR_counties <- readRDS('../results/RR_county/df_RR_county_0_mut_away.rds')
## Relative risk recomputed from Western WA only: every pair involving a county
## outside vec_counties_west is discarded before the totals are taken.
## RR_full keeps the value estimated from the full dataset for comparison.
df_RR_only_west <- df_RR_counties %>%
  rename(RR_full = RR) %>%
  filter(group_1 %in% vec_counties_west & group_2 %in% vec_counties_west) %>%
  ## Total number of pairs per county appearing as group_1 ...
  group_by(group_1) %>%
  mutate(n_pairs_1_x = sum(n_pairs)) %>%
  ## ... and per county appearing as group_2
  group_by(group_2) %>%
  mutate(n_pairs_x_2 = sum(n_pairs)) %>%
  ungroup() %>%
  ## Total number of pairs over all the retained rows
  mutate(n_pairs_x_x = sum(n_pairs)) %>%
  mutate(RR_only_west = n_pairs / n_pairs_1_x / n_pairs_x_2 * n_pairs_x_x) %>%
  ## Keep only the rows with group_1 >= group_2 (presumably one orientation
  ## of each pair of counties -- TODO confirm against the input file)
  filter(group_1 >= group_2)
## Relative risk recomputed from Eastern WA only: every pair involving a county
## listed in vec_counties_west is discarded before the totals are taken.
## RR_full keeps the value estimated from the full dataset for comparison.
df_RR_only_east <- df_RR_counties %>%
  rename(RR_full = RR) %>%
  filter(!(group_1 %in% vec_counties_west | group_2 %in% vec_counties_west)) %>%
  ## Total number of pairs per county appearing as group_1 ...
  group_by(group_1) %>%
  mutate(n_pairs_1_x = sum(n_pairs)) %>%
  ## ... and per county appearing as group_2
  group_by(group_2) %>%
  mutate(n_pairs_x_2 = sum(n_pairs)) %>%
  ungroup() %>%
  ## Total number of pairs over all the retained rows
  mutate(n_pairs_x_x = sum(n_pairs)) %>%
  mutate(RR_only_east = n_pairs / n_pairs_1_x / n_pairs_x_2 * n_pairs_x_x) %>%
  ## Keep only the rows with group_1 >= group_2 (presumably one orientation
  ## of each pair of counties -- TODO confirm against the input file)
  filter(group_1 >= group_2)
## Spearman correlation, rounded to 2 digits, between the relative risk from
## the full dataset and the relative risk from each subsampled dataset
cor_only_east <- round(
  cor(df_RR_only_east$RR_full, df_RR_only_east$RR_only_east,
      method = 'spearman'),
  digits = 2
)
cor_only_west <- round(
  cor(df_RR_only_west$RR_full, df_RR_only_west$RR_only_west,
      method = 'spearman'),
  digits = 2
)

## Stand-in value for null relative risks: half of the smallest strictly
## positive relative risk found in either of the two compared columns
zero_value_east <- with(df_RR_only_east, {
  rr_both <- c(RR_full, RR_only_east)
  min(rr_both[rr_both > 0]) * 0.5
})
zero_value_west <- with(df_RR_only_west, {
  rr_both <- c(RR_full, RR_only_west)
  min(rr_both[rr_both > 0]) * 0.5
})
## Display the results
## Eastern WA: relative risk from the full dataset (x) against the relative
## risk recomputed from Eastern counties only (y), both on log axes. Null
## values are moved to zero_value_east, whose axis break is labelled 0.
plt_cor_east <- df_RR_only_east %>%
  mutate(
    RR_full_crop = ifelse(RR_full != 0, RR_full, zero_value_east),
    RR_only_east_crop = ifelse(RR_only_east != 0, RR_only_east, zero_value_east)
  ) %>%
  ggplot() +
  ## Correlation annotation; its one-row data carries the two faceting
  ## variables, which assigns the label to a single panel
  geom_text(
    data = tibble(RR_full = 1, RR_only_east = 1),
    mapping = aes(x = 3, y = 2000),
    label = paste0('Spearman r = ', cor_only_east)
  ) +
  geom_point(mapping = aes(x = RR_full_crop, y = RR_only_east_crop)) +
  ## Null and non-null values go to separate panels (strips are hidden below)
  facet_grid(
    (RR_full == 0.) ~ (RR_only_east != 0.),
    scales = 'free', space = 'free'
  ) +
  scale_x_continuous(
    name = expression(RR['full dataset']),
    trans = 'log',
    breaks = c(zero_value_east, 0.1, 1, 10, 100, 1000, 10000),
    labels = expression(0, 10^{-1}, 10^{0}, 10^{1}, 10^{2}, 10^{3}, 10^{4}),
    expand = expansion(add = 0.5)
  ) +
  scale_y_continuous(
    name = expression(RR['only Eastern WA counties']),
    trans = 'log',
    breaks = c(zero_value_east, 0.1, 1, 10, 100, 1000, 10000),
    labels = expression(0, 10^{-1}, 10^{0}, 10^{1}, 10^{2}, 10^{3}, 10^{4}),
    expand = expansion(add = 0.5)
  ) +
  theme_classic() +
  theme(
    axis.text = element_text(size = 13),
    axis.title = element_text(size = 13),
    strip.background = element_blank(),
    strip.text = element_blank()
  )
## Western WA: relative risk from the full dataset (x) against the relative
## risk recomputed from Western counties only (y), both on log axes. Null
## values are moved to zero_value_west, whose axis break is labelled 0.
plt_cor_west <- df_RR_only_west %>%
  mutate(
    RR_full_crop = ifelse(RR_full != 0, RR_full, zero_value_west),
    RR_only_west_crop = ifelse(RR_only_west != 0, RR_only_west, zero_value_west)
  ) %>%
  ggplot() +
  ## Correlation annotation; its one-row data carries the two faceting
  ## variables, which assigns the label to a single panel
  geom_text(
    data = tibble(RR_full = 1, RR_only_west = 1),
    mapping = aes(x = 3, y = 2000),
    label = paste0('Spearman r = ', cor_only_west)
  ) +
  geom_point(mapping = aes(x = RR_full_crop, y = RR_only_west_crop)) +
  ## Null and non-null values go to separate panels (strips are hidden below)
  facet_grid(
    (RR_full == 0.) ~ (RR_only_west != 0.),
    scales = 'free', space = 'free'
  ) +
  scale_x_continuous(
    name = expression(RR['full dataset']),
    trans = 'log',
    breaks = c(zero_value_west, 0.1, 1, 10, 100, 1000, 10000),
    labels = expression(0, 10^{-1}, 10^{0}, 10^{1}, 10^{2}, 10^{3}, 10^{4}),
    expand = expansion(add = 0.5)
  ) +
  scale_y_continuous(
    name = expression(RR['only Western WA counties']),
    trans = 'log',
    breaks = c(zero_value_west, 0.1, 1, 10, 100, 1000, 10000),
    labels = expression(0, 10^{-1}, 10^{0}, 10^{1}, 10^{2}, 10^{3}, 10^{4}),
    expand = expansion(add = 0.5)
  ) +
  theme_classic() +
  theme(
    axis.text = element_text(size = 13),
    axis.title = element_text(size = 13),
    strip.background = element_blank(),
    strip.text = element_blank()
  )
## Assemble the two scatter plots on one row (Western WA first, Eastern WA
## second) with automatically generated panel labels, then draw the figure.
## ggarrange() is called through its namespace because the only package this
## script attaches is tidyverse, which does not provide it; the function and
## its labels = 'AUTO' option come from ggpubr.
panel_impact_unsampled_locations <- ggpubr::ggarrange(
  plt_cor_west, plt_cor_east,
  nrow = 1, ncol = 2, labels = 'AUTO'
)
plot(panel_impact_unsampled_locations)
# pdf('../plots/figure_framework/impact_unsampled_locations.pdf', height = 3.5, width = 7.5)
# plot(panel_impact_unsampled_locations)
# dev.off()
# png('../plots/figure_framework/impact_unsampled_locations.png', height = 3.5, width = 7.5, res = 350, units = 'in')
# plot(panel_impact_unsampled_locations)
# dev.off()