-
Notifications
You must be signed in to change notification settings - Fork 0
/
impact_subsampling_RR_East_West.R
118 lines (100 loc) · 7.69 KB
/
impact_subsampling_RR_East_West.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
## This script evaluates how subsampling impacts our comparison of the
## RR of observing identical sequences between counties by adjacency status
## depending on Eastern / Western WA membership.
library(tidyverse)
library(ggpubr)
col_west <- 'dodgerblue3'
col_east <- 'coral1'
col_east_west <- 'darkslateblue'
## Load adjacency matrix
df_adj_county <- readRDS('../../data/maps/df_adj_county.rds')
## Load characteristics of WA counties
df_char_counties <- read.csv('../../data/maps/county_wa.csv') %>% as_tibble()
## Load the relative risk of observing identical sequences between two counties
df_RR_counties <- read_csv('../../results/RR_county/df_RR_county_0_mut_away.csv') %>%
left_join(df_adj_county, by = c('group_1' = 'county_1', 'group_2' = 'county_2')) %>%
mutate(is_same_county = (group_1 == group_2)) %>%
left_join(df_char_counties %>% select(county, is_west), by = c('group_1' = 'county')) %>%
rename(is_west_1 = is_west) %>%
left_join(df_char_counties %>% select(county, is_west), by = c('group_2' = 'county')) %>%
rename(is_west_2 = is_west) %>%
mutate(region = case_when(is_west_1 + is_west_2 == 2 ~ 'West - West',
is_west_1 + is_west_2 == 1 ~ 'West - East',
is_west_1 + is_west_2 == 0 ~ 'East - East'),
region = factor(region, levels = c('West - West', 'West - East', 'East - East'))) %>%
mutate(is_adjacent_same_county = case_when(is_same_county == T ~ 'Same county',
is_adjacent == T ~ 'Adjacent counties',
TRUE ~ 'Non adjacent counties'),
is_adjacent_same_county = factor(is_adjacent_same_county, levels = c('Non adjacent counties', 'Adjacent counties', 'Same county')))
########## County-level analysis
## Evaluate whether the fact that we do not observe any difference in the RR between
## adjacent and non adjacent counties in Western and Eastern WA can be explained by
## the fact that we are comparing a different number of groups.
## Procedure: Subsample a fraction of pairs of adjacent and non-adjacent counties
## for W/W, W/E and E/E to systematically compare the same number of pairs.
## Then, we perform a Wilcoxon test and report p-values.
## Dataframe with the number of pairs available in adjacent and non-adjacent counties
## by Eastern / Western WA status
df_pairs_by_counties <- df_RR_counties %>%
filter(group_1 > group_2) %>%
group_by(region, is_adjacent_same_county) %>%
summarise(n_pairs = n()) %>% ungroup()
## Lowest number of pairs of adjacent counties across regions
min_n_pairs_adjacent <- min(df_pairs_by_counties$n_pairs[df_pairs_by_counties$is_adjacent_same_county == 'Adjacent counties'])
## Lowest number of pairs of non adjacent counties across regions
min_n_pairs_non_adjacent <- min(df_pairs_by_counties$n_pairs[df_pairs_by_counties$is_adjacent_same_county == 'Non adjacent counties'])
## Vector from which to subsample
vec_RR_counties_WE_adj <- df_RR_counties[df_RR_counties$region == 'West - East' & df_RR_counties$is_adjacent_same_county == 'Adjacent counties' & (df_RR_counties$group_1 > df_RR_counties$group_2), ]$RR
vec_RR_counties_WE_non_adj <- df_RR_counties[df_RR_counties$region == 'West - East' & df_RR_counties$is_adjacent_same_county == 'Non adjacent counties' & (df_RR_counties$group_1 > df_RR_counties$group_2), ]$RR
vec_RR_counties_west_adj <- df_RR_counties[df_RR_counties$region == 'West - West' & df_RR_counties$is_adjacent_same_county == 'Adjacent counties' & (df_RR_counties$group_1 > df_RR_counties$group_2), ]$RR
vec_RR_counties_east_adj <- df_RR_counties[df_RR_counties$region == 'East - East' & df_RR_counties$is_adjacent_same_county == 'Adjacent counties' & (df_RR_counties$group_1 > df_RR_counties$group_2), ]$RR
vec_RR_counties_west_non_adj <- df_RR_counties[df_RR_counties$region == 'West - West' & df_RR_counties$is_adjacent_same_county == 'Non adjacent counties' & (df_RR_counties$group_1 > df_RR_counties$group_2), ]$RR
vec_RR_counties_east_non_adj <- df_RR_counties[df_RR_counties$region == 'East - East' & df_RR_counties$is_adjacent_same_county == 'Non adjacent counties' & (df_RR_counties$group_1 > df_RR_counties$group_2), ]$RR
## Perform subsampling
n_rep <- 1000
set.seed(2000)
df_sim <- Reduce('bind_rows', lapply(1:n_rep, FUN = function(i_rep){
subsample_RR_west_adj <- sample(vec_RR_counties_west_adj, size = min_n_pairs_adjacent, replace = F)
subsample_RR_east_adj <- sample(vec_RR_counties_east_adj, size = min_n_pairs_adjacent, replace = F)
subsample_RR_WE_adj <- sample(vec_RR_counties_WE_adj, size = min_n_pairs_adjacent, replace = F)
subsample_RR_west_non_adj <-sample(vec_RR_counties_west_non_adj, size = min_n_pairs_non_adjacent, replace = F)
subsample_RR_east_non_adj <- sample(vec_RR_counties_east_non_adj, size = min_n_pairs_non_adjacent, replace = F)
subsample_RR_WE_non_adj <- sample(vec_RR_counties_WE_non_adj, size = min_n_pairs_non_adjacent, replace = F)
c('p_val_west' = wilcox.test(x = subsample_RR_west_adj, y = subsample_RR_west_non_adj)$p.value,
'p_val_east' = wilcox.test(x = subsample_RR_east_adj, y = subsample_RR_east_non_adj)$p.value,
'p_val_WE' = wilcox.test(x = subsample_RR_WE_adj, y = subsample_RR_WE_non_adj)$p.value,
'rep' = i_rep)
}))
df_for_boxplot_p_val <- df_sim %>%
summarise(y05_west = quantile(p_val_west, 0.05), y25_west = quantile(p_val_west, 0.25), y50_west = quantile(p_val_west, 0.5), y75_west = quantile(p_val_west, 0.75), y95_west = quantile(p_val_west, 0.95),
y05_east = quantile(p_val_east, 0.05), y25_east = quantile(p_val_east, 0.25), y50_east = quantile(p_val_east, 0.5), y75_east = quantile(p_val_east, 0.75), y95_east = quantile(p_val_east, 0.95),
y05_WE = quantile(p_val_WE, 0.05), y25_WE = quantile(p_val_WE, 0.25), y50_WE = quantile(p_val_WE, 0.5), y75_WE = quantile(p_val_WE, 0.75), y95_WE = quantile(p_val_WE, 0.95))
plt_impact_subsampling_wilcoxon <- df_sim %>% ggplot() +
geom_jitter(aes(x = 'West - West', y = p_val_west), alpha = 0.08, col = 'darkgrey',
width = 0.3, height = 0.) +
geom_boxplot(data = df_for_boxplot_p_val,
aes(x = 'West - West', ymin = y05_west, lower = y25_west, middle = y50_west, upper = y75_west, ymax = y95_west),
stat = "identity", fill = NA, width = 0.7, color = col_west) +
geom_jitter(aes(x = 'West - East', y = p_val_WE), alpha = 0.08, col = 'darkgrey',
width = 0.3, height = 0.) +
geom_boxplot(data = df_for_boxplot_p_val,
aes(x = 'West - East', ymin = y05_WE, lower = y25_WE, middle = y50_WE, upper = y75_WE, ymax = y95_WE),
stat = "identity", fill = NA, width = 0.7, color = col_east_west) +
geom_jitter(aes(x = 'East - East', y = p_val_east), alpha = 0.08, col = 'darkgrey',
height = 0., width = 0.3) +
geom_boxplot(data = df_for_boxplot_p_val,
aes(x = 'East - East', ymin = y05_east, lower = y25_east, middle = y50_east, upper = y75_east, ymax = y95_east),
stat = "identity", fill = NA, width = 0.7, color = col_east) +
scale_x_discrete(name = '', limits = rev) +
scale_y_continuous(trans = 'log10',
name = 'p-value (Wilcoxon test for difference in RR\nbetween adjacent and non-adjacent counties)',
breaks = c(1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.),
labels = c(expression(10^{-5}),
expression(10^{-4}), expression(10^{-3}),
expression(10^{-2}), expression(10^{-1}),
expression(10^{0}))) +
theme_classic() +
theme(axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1.),
axis.title = element_text(size = 12))
plot(plt_impact_subsampling_wilcoxon)