-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbiosample_per_subject.Rmd
262 lines (207 loc) · 9.87 KB
/
biosample_per_subject.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
---
title: "Biosample per Subject"
output: html_document
---
# Let's make subject_df from dataset21and22!
```{r}
# Metadata columns that look like they could carry a subject identifier,
# picked by eye from the column names of dataset21and22.
# View(colnames(dataset21and22))
variables_related_to_subject <- c(
  "label",
  "sample name",
  "unique ID",
  "host subject id",
  "description",
  "Replication #",
  "Test person",
  "biospecimen repository",
  "submitted subject id",
  "patient_name",
  "Patient_name",
  "sample_alias",
  "ID",
  "patient",
  "individual name",
  "individual",
  "subject_id"
)

# Browse the value counts of one candidate column at a time.
# Index 17 selects "subject_id"; change the index to inspect another column.
for (variable in variables_related_to_subject[17]) {
  View(table(dataset21and22[, variable]))
}

# Spot check: list the samples recorded for one particular subject.
subset(
  dataset21and22$sample_id,
  dataset21and22$subject_id == "ERP005989|mother_64"
)

# "unique ID" may also be of interest, but the studies behind it need to be
# looked into first.
# Columns that should be used to identify single subjects:
subject_variables <- c(
  "host subject id",
  "Test person",
  "submitted subject id",
  "patient_name",
  "Patient_name",
  "patient",
  "individual name",
  "individual",
  "subject_id"
)

# With the subject columns chosen, samples can be collected into a data frame
# with one row per subject ID. Before that: work out how many samples have
# values in more than one of these columns, and decide which column takes
# priority when parsing the data.
```
```{r}
# first, let's figure out how many samples have overlapping data for the variables, and decide which variable should have priority when parsing the data
# sum(!is.na(dataset21and22$`host subject id`) & !is.na(dataset21and22$`individual name`))
# create "merged_subject_id" column and add data for all variables in subject_variables list
# make sure there are no "missing" values -- change these to NA!
# na_values <- c("missing",
# "Missing",
# "",
# " ",
# "not collected",
# "not applicable",
# "Not Applicable",
# "Not Collected",
# "N/A",
# "n/a",
# "not collected",
# "not determined",
# "not provided",
# "Unknown",
# "unknown",
# "not available",
# "Not collected",
# "Not applicable",
# "Not available",
# "na",
# "NOT COLLECTED")
#
# for (variable in subject_variables) {
# for (replace_value in na_values) {
# dataset21and22[[variable]] <- replace(dataset21and22[[variable]], dataset21and22[[variable]] == replace_value, NA)
# }
# }
# dataset21and22$merged_subject_id <- NA
# Columns used to identify a single subject, in priority order: the merge loop
# kept below (disabled) walks this vector front to back and only fills
# merged_subject_id where it is still NA, so earlier columns win.
subject_variables <- c(
  "host subject id",
  "Test person",
  "submitted subject id",
  "patient_name",
  "Patient_name",
  "patient",
  "individual name",
  "individual",
  "subject_id"
)

# Disabled: row-by-row construction of merged_subject_id, kept for reference.
# for (variable in subject_variables){
# for (n_row in 1:nrow(dataset21and22)) {
# if (is.na(dataset21and22[n_row, "merged_subject_id"])) {
# if (!is.na(dataset21and22[[n_row, variable]])) {
# dataset21and22[n_row, "merged_subject_id"] <- dataset21and22[[n_row, variable]]
# }
# }
# }
# }

# Number of samples that have a subject id.
sum(!is.na(dataset21and22$merged_subject_id))
# 7159 samples with subject id

# Number of distinct subject ids. NA is removed first because unique() keeps NA
# as a value of its own, which would count "no subject id" as one more subject.
merged_ids <- dataset21and22$merged_subject_id
length(unique(merged_ids[!is.na(merged_ids)]))
# earlier note: 2079 unique subject ids! not that bad, but i suppose it really
# depends on the available metadata...
# NOTE(review): that figure was presumably taken with NA still counted, so the
# value printed here is expected to be one lower -- confirm on the next run.
```
```{r}
# Collapse the sample-level table `dataset` into `subject_df`, one row per
# merged_subject_id:
#   * samples without a merged_subject_id are left out
#   * each subject keeps the columns of its first sample (in row order)
#   * sample_id becomes the ", "-separated list of all the subject's samples
#   * an OTU column that is 0 in the first sample is set to 1 when any sample of
#     the same subject has it at 1 (retain TRUE / "1" values)
# This replaces a row-by-row loop that grew subject_df with rbind(); the result
# is the same, but it is vectorised, works on an empty table, and does not stop
# on NA in an OTU column (NA is simply treated as "not 1").

# OTU columns, selected by their position in `dataset`
OTU_names <- colnames(dataset)[c(11:45, 146:158, 448:454)]

# only samples that carry a subject id take part
samples_with_id <- dataset[!is.na(dataset$merged_subject_id), , drop = FALSE]

# the first sample of every subject seeds that subject's row
is_first_sample <- !duplicated(samples_with_id$merged_subject_id)
subject_df <- samples_with_id[is_first_sample, , drop = FALSE]

# For every sample, the row of subject_df it belongs to. Stored as a factor
# with explicit levels so split() returns exactly one group per subject, in
# subject_df row order.
subject_row <- factor(
  match(samples_with_id$merged_subject_id, subject_df$merged_subject_id),
  levels = seq_len(nrow(subject_df))
)

# all sample ids of a subject, joined in row order: "id1, id2, ..."
subject_df$sample_id <- vapply(
  split(samples_with_id$sample_id, subject_row),
  toString,
  character(1),
  USE.NAMES = FALSE
)

for (OTU in OTU_names) {
  # TRUE for subjects where at least one sample has this OTU at 1
  present_in_any_sample <- vapply(
    split(samples_with_id[[OTU]] %in% 1, subject_row),
    any,
    logical(1),
    USE.NAMES = FALSE
  )
  # change OTU 0 --> 1 only where the subject row currently holds 0
  promote <- subject_df[[OTU]] %in% 0 & present_in_any_sample
  if (any(promote)) {
    subject_df[[OTU]][promote] <- 1
  }
}
```
```{r}
# Recalculate the total number of OTUs per subject in subject_df
# (same calculation as the one used for the per-sample totals).
# One total per block of OTU columns; the blocks are selected by position.
subject_df$`n_leaf-level_OTUs_11to45` <- rowSums(subject_df[11:45])
subject_df$`n_leaf-level_OTUs_146to158` <- rowSums(subject_df[146:158])
subject_df$`n_leaf-level_OTUs_448to454` <- rowSums(subject_df[448:454])

# Grand total = sum of the three block totals. The columns are picked by name
# rather than by position (previously 457:459), so the total stays correct even
# if the block totals do not sit at exactly those positions.
block_total_columns <- c(
  "n_leaf-level_OTUs_11to45",
  "n_leaf-level_OTUs_146to158",
  "n_leaf-level_OTUs_448to454"
)
subject_df$`n_leaf-level_OTUs` <- rowSums(subject_df[block_total_columns])

# flag subjects with at least one OTU counted
subject_df$SRM_present <- subject_df$`n_leaf-level_OTUs` > 0
```
# finding project accession for every. damn. biosample.
# yes, you heard me right
```{r}
# Working df = data (copy of dataset21and22).
#
# Goal: fill data$merged_project_accession, one BioProject per run of this chunk.
# Earlier checks (kept for reference):
# sum(!is.na(data$ProjectAccession))
# notes from those checks disagree: "731 biosamples have info for project
# accession" vs. "welp. none of the samples have a project accession."
# first, add all values from ProjectAccession to merged_project_accession
# data$merged_project_accession <- data$ProjectAccession
#
# Strategy:
# step 1: manually find the project accession for the first unassigned sample
# step 2: download the list of samples associated with that project
# step 3: match that list ("project_samples") against the still-unassigned
#         samples ("samples") to get "matched_samples"
# step 4: write the project accession into merged_project_accession for the
#         matched samples
# step 5: recompute "samples" -- wash, rinse, repeat with the next project
#
# Projects handled so far: PRJNA514245, PRJNA442434, PRJNA178162, PRJEB26092,
# PRJDB4597, PRJDB6127, PRJDB6498, PRJDB6872, PRJDB7521, PRJDB7714, PRJDB8054

# Make sure the target column exists, so the very first run does not fail.
if (!"merged_project_accession" %in% names(data)) {
  data$merged_project_accession <- NA_character_
}

# Samples that still need a project. Computed here rather than carried over
# from a previous run, so the chunk also works in a fresh session.
samples <- as.vector(subset(data$sample_id, is.na(data$merged_project_accession)))

# steps 1-2: the project being processed in this run and its sample list
project_accession <- "PRJDB8054"
PRJDB8054 <- read_csv("Bioproject_Biosample_Lists/PRJDB8054.txt", col_names = FALSE, show_col_types = FALSE)
# The sample list is read from the table loaded above instead of from a
# combined list of every earlier project table, which only existed in the
# interactive session. The last column is used, as before.
# NOTE(review): the list files presumably hold one accession per line -- confirm.
project_samples <- as.vector(PRJDB8054[[ncol(PRJDB8054)]])

# step 3
matched_samples <- intersect(project_samples, samples)

# step 4: only fill rows that have no accession yet
needs_accession <- is.na(data$merged_project_accession) &
  data$sample_id %in% matched_samples
data$merged_project_accession[needs_accession] <- project_accession

# progress report
length(matched_samples)
# View(table(data$merged_project_accession))
sum(!is.na(data$merged_project_accession))

# step 5: remaining samples; the first one is the next to look up by hand
samples <- as.vector(subset(data$sample_id, is.na(data$merged_project_accession)))
samples[1]
```
```{r}
# Projects with only one sample: assign the accession directly to the first
# sample that is still unassigned.
sample <- samples[1]
proj_acc <- "PRJDB4440"

# which() drops NA comparisons. A logical row index containing NA makes
# data-frame assignment fail, which would happen as soon as `samples` is
# exhausted (samples[1] is NA) or sample_id has missing values.
rows_to_fill <- which(data$sample_id == sample)
data[rows_to_fill, "merged_project_accession"] <- proj_acc

# progress report
sum(!is.na(data$merged_project_accession))

# remaining samples; the first one is the next to look up
samples <- as.vector(subset(data$sample_id, is.na(data$merged_project_accession)))
samples[1]
```
## OKAY. We are not doing this by hand. I finally learned how to use NCBI's E-utilities, so I wrote a bash script parser to get the project for every sample!!! Exciting.
# While that's running, I'm going to run the same script on a shorter list of samples, namely those that: 1) do not have a merged_subject_id and 2) have a value for age_years or one of the binned age columns (bin_9yrSize, bin_5yrCutoff).
# The code below builds that list of samples!
```{r}
# Count samples by whether they have a subject id and/or age information, then
# export the ids of samples that have age information but no subject id.
# All counts are computed from the data; the figures in the comments are the
# values noted when this was first run.

# TRUE where the sample has a merged_subject_id
has_subject_id <- !is.na(dataset21and22$merged_subject_id)

# TRUE where the sample has age_years or either of the binned age columns
has_age_info <- !is.na(dataset21and22$age_years) |
  !is.na(dataset21and22$bin_9yrSize) |
  !is.na(dataset21and22$bin_5yrCutoff)

# n samples that have merged_subject_id = 7,159
sum(has_subject_id)
# n samples that have merged_subject_id and age information = 4,405
sum(has_subject_id & has_age_info)
# n samples that have merged_subject_id but no age information = 2,754
# (previously the hard-coded subtraction 7159 - 4405)
sum(has_subject_id & !has_age_info)
# n samples that have age_years, bin_9yrSize, or bin_5yrCutoff but not merged_subject_id = 4,703
sum(!has_subject_id & has_age_info)

# sample ids to look up: age information present, subject id missing
samples <- as.vector(dataset21and22$sample_id[!has_subject_id & has_age_info])
write_lines(samples, file = "biosample_accession_withAgeData_noSubjectID.txt", sep = "\n")
```