This repository has been archived by the owner on Sep 15, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
R4SME 15 strategies.Rmd
379 lines (301 loc) · 9.37 KB
/
R4SME 15 strategies.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
---
title: "R for SME 15: Strategies of analysis"
author: "[Andrea Mazzella](https://github.com/andreamazzella)"
output: html_document
---
```{r Load packages}
library(haven)
library(pubh)
library(epiDisplay)
library(magrittr)
library(tidyverse)
```
## Data import and exploration
Make sure you have the mortality.dta dataset in the same folder as this .rmd file.
```{r Import}
# Read the Stata dataset; the relative path expects mortality.dta to sit in
# the same folder as this .Rmd file.
mortality <- read_dta("./mortality.dta")

# Convert every labelled column to a factor.
# across() + where() replaces the superseded scoped verb mutate_if().
mortality %<>% mutate(across(where(is.labelled), as_factor))
```
Explore the data.
- What is the study design?
- How many individuals?
```{r Explore}
# Compact overview: one row per variable with its type and first values.
glimpse(mortality)

# Per-variable summary statistics.
summary(mortality)

# Open the data viewer only in an interactive session: View() needs a GUI
# and can stop the document from knitting when run non-interactively.
if (interactive()) {
  View(mortality)
}
```
## Data management
- How many variables?
- Which variables code for the main exposure and for the outcome?
- Which are your fixed confounders?
- What are the other potential confounders?
- How are these variables coded?
- Which variables do we need to recode, rename, categorise?
```{r Data management}
# died -> death (factor with readable labels)
mortality %<>%
  mutate(death = factor(died,
                        levels = c(0, 1),
                        labels = c("No", "Yes")))
# Cross-tabulate the new variable against the old one to check the recode.
mortality %$% table(death, died)

# District -> factor (done with mutate(), like the other recodes in this chunk)
mortality %<>% mutate(district = as.factor(district))
mortality %$% table(district)

# Compound size -> three bands.
# cut() intervals are right-closed by default, so the groups are
# (0, 9], (9, 19] and (19, Inf].
mortality %<>%
  mutate(compound_grp = cut(
    compound_size,
    breaks = c(0, 9, 19, +Inf),
    labels = c("1-9", "10-19", "20-55")
  ))
# Check the banding, then look at outcome and exposure by band (row %).
mortality %$% table(compound_size, compound_grp)
mortality %$% tabpct(compound_grp, death, percent = "row", graph = FALSE)
mortality %$% tabpct(compound_grp, vimp, percent = "row", graph = FALSE)
# Ethnic: simplify by recoding -> ethnic2
# The listed groups are merged into "Other"; levels not listed are kept.
mortality %$% table(ethnic)
mortality %<>%
  mutate(
    ethnic2 = recode(
      ethnic,
      "Fulani" = "Other",
      "Gwari" = "Other",
      "Kiwollo" = "Other",
      # NOTE(review): this key has a leading space. recode() matches level
      # names exactly, so confirm the level in the data is really " Surubu";
      # the cross-tabulation below will show whether it was recoded.
      " Surubu" = "Other"
    )
  )
mortality %$% table(ethnic, ethnic2)
mortality %$% table(ethnic2)

# Occupation: simplify by recoding -> occ_sector
# First inspect the original categories against outcome and exposure (row %).
mortality %$% table(occupation)
mortality %$% tabpct(occupation, death, percent = "row", graph = FALSE)
mortality %$% tabpct(occupation, vimp, percent = "row", graph = FALSE)
# Collapse into three sectors: Agriculture, Service and Other.
mortality %<>%
  mutate(
    occ_sector = recode(
      occupation,
      "Farmer" = "Agriculture",
      "Fishing" = "Agriculture",
      "Hunting" = "Agriculture",
      "Artisan" = "Other",
      "Trader" = "Service",
      "Homemaker" = "Service",
      "Priest/Imam" = "Other",
      "Teacher/Civil servant, etc." = "Other",
      "Traditional healer" = "Other",
      "Child below school age" = "Other",
      "Unemployed" = "Other",
      "Student" = "Other"
    )
  )
mortality %$% tabpct(occ_sector, death, percent = "row", graph = FALSE)
# Education
# Inspect the original categories against outcome and exposure (row %).
mortality %$% table(education)
mortality %$% tabpct(education, death, percent = "row", graph = FALSE)
mortality %$% tabpct(education, vimp, percent = "row", graph = FALSE)

# Collapse into fewer groups -> education_grp.
# Levels not listed in recode() keep their original label.
mortality %<>%
  mutate(
    education_grp = recode(
      education,
      "Koranic education only" = "Adult/Koranic education",
      "Adult education only" = "Adult/Koranic education",
      "Primary" = "Formal education",
      "Secondary" = "Formal education",
      "Post secondary" = "Formal education"
    )
  )

# Check the recode, then repeat the row-percentage tables on the new groups.
mortality %$% table(education, education_grp)
mortality %$% table(education_grp)
mortality %$% tabpct(education_grp, death, percent = "row", graph = FALSE)
mortality %$% tabpct(education_grp, vimp, percent = "row", graph = FALSE)
# Systolic BP -> three bands.
# cut() intervals are right-closed: (0, 119], (119, 139], (139, Inf].
mortality %<>%
  mutate(systolic_grp = cut(
    systolic,
    breaks = c(0, 119, 139, +Inf),
    labels = c("normal", "pre-hypertension", "hypertension")
  ))
mortality %$% table(systolic, systolic_grp)
class(mortality$systolic_grp)

# HR -> binary indicator: pulse above 99 is labelled "tachycardia".
mortality %<>%
  mutate(tachycardia = cut(
    pulse,
    breaks = c(0, 99, +Inf),
    labels = c("normal", "tachycardia")
  ))
# Check the banding, then look at outcome and exposure by group (row %).
mortality %$% table(pulse, tachycardia)
mortality %$% tabpct(tachycardia, death, percent = "row", graph = FALSE)
mortality %$% tabpct(tachycardia, vimp, percent = "row", graph = FALSE)
```
## Descriptive analysis
NB: an alternative way to create a quick Table 1 is tableone::CreateTableOne() - check it out! (Thanks, Julian!)
### Describe the sample in terms of outcome and exposure
- What's the incidence risk of death?
- What's the prevalence of visual impairment?
```{r}
# One-way frequency tables (no cumulative percentages, no plot).
# Outcome: death
mortality %$% tab1(death, cum.percent = FALSE, graph = FALSE)
# Exposure: visual impairment
mortality %$% tab1(vimp, cum.percent = FALSE, graph = FALSE)
```
### Describe the sample in terms of age and sex
```{r}
# Age
mortality %$% tab1(agegrp, cum.percent = FALSE, graph = FALSE)

# Sex
mortality %$% tab1(sex, cum.percent = FALSE, graph = FALSE)

# Age and sex: bar chart of age group counts, filled by sex
mortality %>%
  ggplot(aes(agegrp)) +
  geom_bar(aes(fill = sex)) +
  scale_fill_viridis_d(option = "D") +
  labs(title = "Age and sex distribution", x = "Age", y = "frequency")
```
### Describe the sample in terms of geography
```{r}
# Area
mortality %$% tab1(area, cum.percent = FALSE, graph = FALSE)

# District
mortality %$% tab1(district, cum.percent = FALSE, graph = FALSE)

# Village ID number (not very informative: too many villages)
mortality %$% tab1(vcode, cum.percent = FALSE, graph = FALSE)
```
### Describe the sample in terms of socio-economic status
```{r}
# Compound size: numeric summary and histogram of the raw variable
print("Compound size")
mortality %$% summary(compound_size)
mortality %>%
  ggplot(aes(x = compound_size)) +
  geom_histogram()

# Ethnic origin (recoded)
mortality %$% tab1(ethnic2, cum.percent = FALSE, graph = FALSE)

# Religion
mortality %$% tab1(religion, cum.percent = FALSE, graph = FALSE)

# Occupation (recoded into sectors)
mortality %$% tab1(occ_sector, cum.percent = FALSE, graph = FALSE)

# Education (recoded)
mortality %$% tab1(education_grp, cum.percent = FALSE, graph = FALSE)
```
### Describe the sample in terms of medical status
```{r}
# Continuous variables get a histogram; grouped versions get a frequency table.

# Systolic BP
mortality %>%
  ggplot(aes(x = systolic)) +
  geom_histogram()
mortality %$% tab1(systolic_grp, cum.percent = FALSE, graph = FALSE)

# Diastolic BP
mortality %>%
  ggplot(aes(x = diastolic)) +
  geom_histogram()

# Mean arterial BP
mortality %>%
  ggplot(aes(x = map)) +
  geom_histogram()

# HR
mortality %>%
  ggplot(aes(x = pulse)) +
  geom_histogram()
mortality %$% tab1(tachycardia, cum.percent = FALSE, graph = FALSE)

# Weight: histogram, then mean and SD ignoring missing values
mortality %>%
  ggplot(aes(x = weight)) +
  geom_histogram()
print("Weight")
print("mean")
mortality %$% mean(weight, na.rm = TRUE)
print("sd")
mortality %$% sd(weight, na.rm = TRUE)

# Height: histogram, then mean and SD ignoring missing values
mortality %>%
  ggplot(aes(x = height)) +
  geom_histogram()
print("Height")
print("mean")
mortality %$% mean(height, na.rm = TRUE)
print("sd")
mortality %$% sd(height, na.rm = TRUE)

# BMI
mortality %>%
  ggplot(aes(x = bmi)) +
  geom_histogram()
mortality %$% tab1(bmigrp, cum.percent = FALSE, graph = FALSE)

# Microfilaria
mortality %>%
  ggplot(aes(x = mfpermg)) +
  geom_histogram()
mortality %$% tab1(mfpos, cum.percent = FALSE, graph = FALSE)
mortality %$% tab1(mfgrp, cum.percent = FALSE, graph = FALSE)
```
## Crude analysis
- What is the crude OR for death in the visually impaired?
```{r}
# With chi-squared test: 2x2 table of death by visual impairment (no plot)
mortality %$% cc(death, vimp, graph = FALSE)

# With logistic regression: same crude association, vimp as the only predictor
glm(death ~ vimp,
    data = mortality,
    family = binomial()) %>%
  logistic.display()
```
## Stratified analysis - MH method
```{r}
# mhor() is called with an explicit epiDisplay:: prefix so that this
# package's version is used regardless of what else is attached.

# Age
mortality %$% epiDisplay::mhor(death, vimp, agegrp, graph = FALSE)

# Sex
mortality %$% epiDisplay::mhor(death, vimp, sex, graph = FALSE)
```
```{r}
# Death by visual impairment, stratified by each geographic variable in turn.

# Area
mortality %$% epiDisplay::mhor(death, vimp, area, graph = FALSE)

# District
mortality %$% epiDisplay::mhor(death, vimp, district, graph = FALSE)

# Village number
mortality %$% epiDisplay::mhor(death, vimp, vcode, graph = FALSE)
```
```{r}
# Death by visual impairment, stratified by each socio-economic variable
# (using the grouped/recoded versions created in the data management chunk).

# Compound size
mortality %$% epiDisplay::mhor(death, vimp, compound_grp, graph = FALSE)

# Ethnic origin
mortality %$% epiDisplay::mhor(death, vimp, ethnic2, graph = FALSE)

# Religion
mortality %$% epiDisplay::mhor(death, vimp, religion, graph = FALSE)

# Occupation
mortality %$% epiDisplay::mhor(death, vimp, occ_sector, graph = FALSE)

# Education
mortality %$% epiDisplay::mhor(death, vimp, education_grp, graph = FALSE)
```
```{r}
# Death by visual impairment, stratified by each categorical medical variable.
# Headings with no call underneath have no stratified analysis in this chunk.

# Systolic BP
mortality %$% epiDisplay::mhor(death, vimp, systolic_grp, graph = FALSE)

# Diastolic BP
# Mean arterial BP

# HR
mortality %$% epiDisplay::mhor(death, vimp, tachycardia, graph = FALSE)

# Weight
# Height

# BMI
mortality %$% epiDisplay::mhor(death, vimp, bmigrp, graph = FALSE)

# Microfilaria
mortality %$% epiDisplay::mhor(death, vimp, mfpos, graph = FALSE)
mortality %$% epiDisplay::mhor(death, vimp, mfgrp, graph = FALSE)
```
## Stratified analysis - logistic regression
A possible model:
```{r}
# Main-effects model: visual impairment plus age group, systolic BP group
# and BMI group. Stored as `logit_without` (i.e. without interaction terms)
# so the interaction models below can be compared against it.
logit_without <- glm(
  death ~ vimp + agegrp + systolic_grp + bmigrp,
  data = mortality,
  family = binomial()
)

# Display the fitted model
logit_without %>% logistic.display()
```
- How do you interpret these results?
## Analysis for interaction - logistic regression
```{r}
# Same covariates as the main-effects model, with a vimp-by-age-group
# interaction added (vimp * agegrp expands to both main effects plus
# their interaction).
logit_with_age <- glm(
  death ~ vimp * agegrp + systolic_grp + bmigrp,
  data = mortality,
  family = binomial()
)
logit_with_age %>% logistic.display()

# Likelihood ratio test: interaction model vs main-effects model
lrtest(logit_with_age, logit_without)
```
```{r}
# Same covariates as the main-effects model, with a vimp-by-systolic-group
# interaction added.
logit_with_sys <- glm(
  death ~ vimp * systolic_grp + agegrp + bmigrp,
  data = mortality,
  family = binomial()
)
logit_with_sys %>% logistic.display()

# Likelihood ratio test: interaction model vs main-effects model
lrtest(logit_with_sys, logit_without)
```
```{r}
# Same covariates as the main-effects model, with a vimp-by-BMI-group
# interaction added.
logit_with_BMI <- glm(
  death ~ vimp * bmigrp + systolic_grp + agegrp,
  data = mortality,
  family = binomial()
)
logit_with_BMI %>% logistic.display()

# Likelihood ratio test: interaction model vs main-effects model
lrtest(logit_with_BMI, logit_without)
```