-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathage_metadata.Rmd
301 lines (266 loc) · 11.7 KB
/
age_metadata.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
---
title: "age_metadata_analysis"
output: html_document
---
# create new age_years column in dataset21and22
```{r}
# Add an all-missing numeric age_years column; the chunks below fill it in.
dataset21and22[["age_years"]] <- NA_real_
```
# add "host age" data to "age_years" column
# create table for data not added to "age_years" -> "host_age_exceptions_df"
```{r}
column_name <- "host age"

# Convert free-text "host age" values to whole-column age_years values.
# Works on whole vectors instead of indexing the data frame one cell at a
# time, which is much faster and also behaves on a zero-row table
# (a `1:nrow()` loop does not).
host_age <- as.character(dataset21and22[[column_name]])

age_years <- dataset21and22[["age_years"]]
if (is.null(age_years)) {
  # age_years has not been created yet: start from an all-missing column
  age_years <- rep(NA_real_, nrow(dataset21and22))
}

# Numeric part of every value: keep only digits and "." and parse the rest.
# Values that do not parse (e.g. two decimal points) intentionally stay NA,
# so the coercion warning is silenced for this single call only.
host_age_number <- suppressWarnings(as.numeric(gsub("[^0-9.]", "", host_age)))

# grepl() returns FALSE for NA, so rows without a "host age" match no rule.
has_y <- grepl("y", host_age, ignore.case = TRUE)
has_m <- grepl("m", host_age, ignore.case = TRUE)
has_day <- grepl("day", host_age, ignore.case = TRUE)
has_dash <- grepl("-", host_age)

# The rules are applied in this fixed order; a later rule overwrites an
# earlier one when both match the same row.

### YEARS ###
# "years" unit (y), excluding ranges ("-"), days, and anything with an "m"
is_years <- has_y & !has_dash & !has_day & !has_m
age_years[is_years] <- host_age_number[is_years]

# exception "10 year-old" (contains "-" but is a single age)
is_year_old <- grepl("year-old", host_age, ignore.case = TRUE)
age_years[is_year_old] <- host_age_number[is_year_old]

# exception "0-1 years" (-> 1 y.o.)
is_zero_to_one <- grepl("0-1 years", host_age, ignore.case = TRUE)
age_years[is_zero_to_one] <- 1

### MONTHS ###
# months -> years, rounded to whole years (so e.g. 4 months becomes 0)
is_months <- has_m & !has_y
age_years[is_months] <- round(host_age_number[is_months] / 12)

### DAYS ###
# days -> years, rounded to whole years
age_years[has_day] <- round(host_age_number[has_day] / 365)

# a capital "D" anywhere in the value is also treated as a day unit
has_capital_d <- grepl("D", host_age)
age_years[has_capital_d] <- round(host_age_number[has_capital_d] / 365)

### NEONATE EXCEPTION ###
# add to "age_years" as 0 years
is_neonate <- grepl("neonate", host_age, ignore.case = TRUE)
age_years[is_neonate] <- 0

dataset21and22[["age_years"]] <- age_years

# number of samples that now have an age in years
sum(!is.na(dataset21and22[["age_years"]]))
# host_age_exceptions_df <- subset(dataset21and22[,c(1,29)], !is.na(dataset21and22$`host age`) & is.na(dataset21and22$`age_years`))
```
# add "age" column data to "age_years" column
```{r}
# Fill age_years from the "age" column for samples that did not get a value
# from "host age". The only special case is values containing "day", which are
# converted to years; every other value is assumed to already be in years and
# is converted with as.numeric() (values that are not plain numbers become NA).
# Done on whole columns rather than row by row, so it is fast and also safe on
# a zero-row table.
age_raw <- dataset21and22[["age"]]
age_years <- dataset21and22[["age_years"]]

# rows still lacking age_years that do have an "age" value
needs_age <- is.na(age_years) & !is.na(age_raw)

# values with "day": strip everything but digits and ".", then days -> years
in_days <- needs_age & grepl("day", age_raw)
age_years[in_days] <- round(as.numeric(gsub("[^0-9.]", "", age_raw[in_days])) / 365)

# directly transfer all other data from "age" to "age_years"
as_is <- needs_age & !in_days
age_years[as_is] <- as.numeric(age_raw[as_is])

dataset21and22[["age_years"]] <- age_years
```
### For "host age" values without a unit, check the BioProject manually to confirm the unit. Download the list of BioSample IDs for each project, then add the ages to age_years using the confirmed unit.
```{r}
# Log of previous passes of this chunk (one pass per BioSample list):
# sum of age_years values BEFORE = 667
# N samples w/ host age but not age_years = 1715 ; after list 0309 = 1563
# N samples w/ age_years = 3826 ; after list 0309 = 3978
# N samples w/ host age but not age_years = 1563 ; after list 0310 = 1445
# N samples w/ age_years = 3978 ; after list 0310 = 4096
# N samples w/ host age but not age_years = 1445 ; after list 0310 = 1389
# N samples w/ age_years = 4096 ; after list 0310_02 = 4152

# Inputs for this pass: the BioSample list of one BioProject and the unit that
# was confirmed for its unit-less "host age" values.
Bioproject <- X0310_02_biosampleList_years
age_unit <- "years"

# make list of samples in bioproject (first column of the list table)
sampleIDs <- as.vector(Bioproject[[1]])

# A misspelled unit would otherwise add nothing without any hint.
if (!age_unit %in% c("years", "months", "days")) {
  warning("Unrecognised age_unit '", age_unit, "': no ages will be added",
          call. = FALSE)
}

host_age <- dataset21and22[["host age"]]

# as.numeric() is used purely as an "is this a plain number?" test (no dashes
# or words), so its coercion warning is silenced for this call only.
host_age_is_numeric <- !is.na(suppressWarnings(as.numeric(host_age)))

# A row is filled only if it has a "host age", has no age_years yet, its
# "host age" is a plain number, and the sample belongs to this bioproject.
eligible <- !is.na(host_age) &
  is.na(dataset21and22[["age_years"]]) &
  host_age_is_numeric &
  dataset21and22[["sample_id"]] %in% sampleIDs

# numeric value actually stored: digits and "." only
host_age_value <- as.numeric(gsub("[^0-9.]", "", host_age[eligible]))

# counters of ages added per unit in this pass
n_y <- 0
n_m <- 0
n_d <- 0

if (age_unit == "years") {
  # years: add "host age" directly to "age_years"
  dataset21and22[["age_years"]][eligible] <- host_age_value
  n_y <- sum(eligible)
} else if (age_unit == "months") {
  # months: convert to whole years, then add to "age_years"
  dataset21and22[["age_years"]][eligible] <- round(host_age_value / 12)
  n_m <- sum(eligible)
} else if (age_unit == "days") {
  # days: convert to whole years, then add to "age_years"
  dataset21and22[["age_years"]][eligible] <- round(host_age_value / 365)
  n_d <- sum(eligible)
}

# ONLY DO FOR AGE ASSUMPTION dataset21and22
# (unit not confirmed: keep ages that are reasonably in years, from 1-100)
# NOTE(review): the original commented-out test joined the two bounds with `|`,
# which accepts every value; `&` is presumably what was meant - confirm before
# enabling.
# plausible <- suppressWarnings(as.numeric(host_age))
# fill <- eligible & !is.na(plausible) & plausible >= 1 & plausible <= 100
# dataset21and22[["age_years"]][fill] <- as.numeric(gsub("[^0-9.]", "", host_age[fill]))

# AFTER X01_biosampleList_days, sum of age_years values = 1484
# ages added = 817
# AFTER X01_biosampleList_years, sum of age_years values = 3487
# ages added = 2003
# AFTER X01=2_biosampleList_years, sum of age_years values = 3826
# ages added = 339
# AFTER age assumption, sum of age_years values = 4152
# ages added = 326

# N samples w/ age_years
sum(!is.na(dataset21and22[["age_years"]]))

# N samples w/ host age but not age_years
host_age_unresolved <- !is.na(dataset21and22[["host age"]]) &
  is.na(dataset21and22[["age_years"]])
sum(host_age_unresolved)

# Rows still unresolved, columns 1 and 29 only.
# NOTE(review): column 1 is used as the BioSample ID elsewhere in this file;
# column 29 is presumably "host age" - confirm, or select by name instead.
host_age_exceptions_df <- dataset21and22[host_age_unresolved, c(1, 29)]
# View(host_age_exceptions_df)
# View(table(host_age_exceptions_df$`host age`))
print(n_y)
print(n_m)
print(n_d)
```
# Retrieve BioSample IDs for samples whose "host age" unit is unconfirmed AND whose "host age" value is numeric (no dashes, i.e. not a binned age range)
```{r}
# Collect the BioSample IDs (first column) of samples that still have no
# age_years but whose "host age" is a plain number, i.e. only the unit is
# unknown. Built with one logical mask instead of growing a list inside a
# row loop.

# as.numeric() is only an "is it a plain number?" test here (NA host ages
# also yield NA), so its coercion warning is silenced for this call only.
host_age_numeric <- suppressWarnings(as.numeric(dataset21and22[["host age"]]))
needs_unit_check <- is.na(dataset21and22[["age_years"]]) & !is.na(host_age_numeric)

# kept as a list, one ID per element, as before
biosample_ID_toCheckUnits <- as.list(dataset21and22[[1]][needs_unit_check])

# export as xlsx file
# write.xlsx(biosample_ID_toCheckUnits, file = "/Users/rebeccachristensen/Desktop/Cremer_Lab_2022/dsrAB_Biosample_Metadata_Analysis/biosample_ID_list_toCheckUnits_0310_02.xlsx")
```
# add "host age" and "age_years" values to binned age ranges
# bin_9yrSize : ages binned in 10-year-wide intervals, each covering ages 9 years apart end to end (e.g. 0-9, 10-19, ...)
# bin_5yrCutoff : binned ages with cutoff of 5 (<5, >=5)
```{r}
# Bin every sample's age into two character columns of subject_df:
#   bin_9yrSize   - interval labels "[0,10)", "[10,20)", ..., "[100,110]"
#   bin_5yrCutoff - "[0,5)" or "[5,110)"
# Numeric ages (age_years) are binned with cut(); samples that only have a
# pre-binned "host age" text value are mapped through the lookup tables below.

# Bin the continuous age_years values. Intervals are closed on the left
# (right = FALSE). The labels are stored as character so the text-based bins
# can be written into the same columns afterwards.
subject_df[["bin_9yrSize"]] <- as.character(
  cut(subject_df[["age_years"]], breaks = seq(0, 110, by = 10),
      include.lowest = TRUE, right = FALSE)
)
subject_df[["bin_5yrCutoff"]] <- as.character(
  cut(subject_df[["age_years"]], breaks = c(0, 5, 110), right = FALSE)
)

# "host age" range -> bin_9yrSize label. Ranges that straddle two of these
# bins (e.g. "26-30") are deliberately absent and keep NA in bin_9yrSize.
bin_9yr_by_host_age <- c(
  "0-4" = "[0,10)", "<5" = "[0,10)", "1 to 5" = "[0,10)", "5-9" = "[0,10)",
  "10-19" = "[10,20)",
  "20-29" = "[20,30)", "21 to 25" = "[20,30)",
  "30-39" = "[30,40)",
  "40-49" = "[40,50)",
  "50-59" = "[50,60)", "51 to 55" = "[50,60)",
  "60-69" = "[60,70)", "61 to 65" = "[60,70)",
  "70-79" = "[70,80)", "71 to 75" = "[70,80)",
  "80-89" = "[80,90)",
  "90-99" = "[90,100)"
)

# "host age" ranges on either side of the 5-year cutoff. "1 to 5" straddles
# the cutoff and is therefore in neither set.
host_age_under_5 <- c("0-4", "<5")
host_age_5_and_over <- c(
  "5-9", "10-19", "20-29", "21 to 25", "30-39", "40-49",
  "50-59", "51 to 55", "60-69", "61 to 65", "70-79", "71 to 75",
  "80-89", "90-99",
  # don't fit into a single bin_9yrSize interval, but are clearly >= 5
  "26-30", "26-55 years", "35-44", "45-54", "55-64", "56 to 60",
  "65-74", "76 to 80", ">90"
)

# Only samples without age_years but with a "host age" value are touched;
# matching is on the exact text of the value.
host_age <- as.character(subject_df[["host age"]])
text_only <- is.na(subject_df[["age_years"]]) & !is.na(host_age)

fill_9yr <- text_only & host_age %in% names(bin_9yr_by_host_age)
subject_df[["bin_9yrSize"]][fill_9yr] <-
  unname(bin_9yr_by_host_age[host_age[fill_9yr]])

subject_df[["bin_5yrCutoff"]][text_only & host_age %in% host_age_under_5] <- "[0,5)"
subject_df[["bin_5yrCutoff"]][text_only & host_age %in% host_age_5_and_over] <- "[5,110)"
```