-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_ml_table.R
executable file
·137 lines (107 loc) · 3.78 KB
/
make_ml_table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Command-line arguments that follow the script name.
#   1st = output folder (dir ML_tables will be created)
args <- commandArgs(trailingOnly = TRUE)
library(tidyverse)
# Ask R not to turn strings into factors when building data frames.
options(stringsAsFactors = FALSE)

# Feature directories; every one is expected to hold the same set of
# per-feature text files that are listed further down in this script.
# NOTE(review): these are absolute paths starting at the filesystem root --
# confirm that no base-directory prefix is missing.
path1 <- "/Features_threePrime"
path2 <- "/Features_ICE"
path3 <- "/Features_ncRNA/lncRNA"
path4 <- "/Features_ncRNA/ncRNA"
path5 <- "/Features_ncRNA/pseudoGene"
path6 <- "/Features_fivePrime"
# Resolve the output folder from the first command-line argument and make
# sure <out>/ML_tables exists before any input file is read.
out <- args[1]
if (is.na(out) || !nzchar(out)) {
  # Without this check a missing argument would silently become "NA/ML_tables".
  stop("usage: make_ml_table.R <output folder>", call. = FALSE)
}

# output path
out.path <- paste0(out, "/ML_tables")

if (!dir.exists(out.path)) {
  # dir.create() is used instead of pasting the argument into a shell
  # `mkdir` command: it copes with spaces and shell metacharacters in the
  # path, works without a POSIX shell, and reports failure through its
  # return value. `recursive = TRUE` also creates a missing output folder.
  if (!dir.create(out.path, recursive = TRUE)) {
    stop("could not create output directory: ", out.path, call. = FALSE)
  }
  # The former `mkdir -m a=rwx` set rwx for everyone regardless of the
  # umask; do the same explicitly.
  Sys.chmod(out.path, mode = "0777", use_umask = FALSE)
}
# polyA signal files: one per feature directory, in path1..path6 order.
pa_signal <- paste0(
  c(path1, path2, path3, path4, path5, path6),
  "/polyA_signal.txt"
)
# nt_di_freq files: one per feature directory, in path1..path6 order.
ntf_di <- paste0(
  c(path1, path2, path3, path4, path5, path6),
  "/nt_di_freq.txt"
)
# Per-tissue expression feature files: one per feature directory, in
# path1..path6 order.
# The variable shares its name with base::exp(); calls such as exp(x) still
# reach the function because R skips non-function objects when looking up a
# function name.
exp <- paste0(
  c(path1, path2, path3, path4, path5, path6),
  "/all_tissue_exp_feat_colwise.txt"
)
# phastCons feature files: one per feature directory, in path1..path6 order.
phastCons <- paste0(
  c(path1, path2, path3, path4, path5, path6),
  "/phastCons_feat.txt"
)
# DNA structural properties (mean structural features): one file per feature
# directory, in path1..path6 order.
dsp <- paste0(
  c(path1, path2, path3, path4, path5, path6),
  "/structural_feat_mean.txt"
)
# Repeat-overlap files: one per feature directory, in path1..path6 order.
repeats <- paste0(
  c(path1, path2, path3, path4, path5, path6),
  "/repeats.txt"
)
# ---- Load every feature family and assemble the ML table -------------------
#
# Each feature family is spread over six files (one per feature directory).
# The families are combined column-wise, so their rows MUST describe the same
# ids in the same order. Every family is therefore aligned to the ids of the
# polyA signal table before its `id` column is dropped; a family whose id set
# differs stops the script instead of silently pairing features with the
# wrong id (or being recycled by cbind()).

# Read all files of one feature family, stack them, and keep the first row
# encountered for each `id`.
read_feature_family <- function(files) {
  files %>%
    map(read.table, header = TRUE) %>%
    bind_rows() %>%
    distinct(id, .keep_all = TRUE)
}

# Return `tbl` with its rows ordered like `ids`; stop when `tbl` does not hold
# exactly that set of ids. `label` only names the table in the error message.
align_to_ids <- function(tbl, ids, label) {
  row_idx <- match(ids, as.character(tbl$id))
  if (nrow(tbl) != length(ids) || anyNA(row_idx)) {
    stop(
      "feature table '", label, "' does not contain exactly the same ids ",
      "as the polyA signal table",
      call. = FALSE
    )
  }
  aligned <- tbl[row_idx, , drop = FALSE]
  rownames(aligned) <- NULL
  aligned
}

# polyA signal: defines the reference ids and contributes the `id` column.
pa_tbl <- read_feature_family(pa_signal)
ref_ids <- as.character(pa_tbl$id)

df1 <- pa_tbl %>%
  dplyr::select(c(id, polyA_signal)) %>%
  mutate(polyA_signal = as.factor(polyA_signal))

# Expression: collapse the per-tissue columns into two row means.
df2 <- read_feature_family(exp) %>%
  align_to_ids(ref_ids, "expression") %>%
  transmute(
    meanPd = rowMeans(dplyr::select(., contains("pd_"))),
    meanEE = rowMeans(dplyr::select(., contains("EE_")))
  )

# phastCons features.
df3 <- read_feature_family(phastCons) %>%
  align_to_ids(ref_ids, "phastCons") %>%
  dplyr::select(-c(id, class))

# DNA structural properties.
df4 <- read_feature_family(dsp) %>%
  align_to_ids(ref_ids, "structural") %>%
  dplyr::select(-c(id, class))

# Repeats: `fracOverlap` is exposed as `RepeatsOverlap`. The rename is done
# with base R so the script does not depend on plyr, which it never loads.
repeats_tbl <- read_feature_family(repeats) %>%
  align_to_ids(ref_ids, "repeats")
names(repeats_tbl)[names(repeats_tbl) == "fracOverlap"] <- "RepeatsOverlap"
df5 <- repeats_tbl %>%
  dplyr::select(-c(id, class))

# Dinucleotide frequencies: the only table that keeps its `class` column, so
# this is where `class` in the final table comes from.
df6 <- read_feature_family(ntf_di) %>%
  align_to_ids(ref_ids, "nt_di_freq") %>%
  dplyr::select(-c(id))

# Column-bind the aligned tables, relabel class "3-UTR" as "UTR", drop rows
# with any missing value and use `id` as row names.
data <- cbind(df1, df2, df3, df4, df5, df6) %>%
  mutate(class = as.factor(ifelse(class %in% "3-UTR", "UTR", class))) %>%
  tidyr::drop_na() %>%
  remove_rownames() %>%
  column_to_rownames("id")
# Serialise the assembled table into the ML_tables output directory.
file <- paste0(out.path, "/ml_table.rds")
saveRDS(data, file = file)