-
Notifications
You must be signed in to change notification settings - Fork 0
/
summarise_f3uter_cv.R
executable file
·406 lines (355 loc) · 18.1 KB
/
summarise_f3uter_cv.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
args <- commandArgs(TRUE) # 1st argument: ML output directory
library(rlist)
library(matrixStats)
library(ROCR)
library(e1071)
library(PRROC)
library(tidyverse)
# Stop early with a usage hint when the output directory is not supplied or
# does not exist; every later step reads from or writes below it.
if (length(args) < 1 || !dir.exists(args[1])) {
  stop("Usage: Rscript summarise_f3uter_cv.R <ML output directory>",
       call. = FALSE)
}
outPath <- args[1]
# Create <outPath>/Summary without going through a shell, so paths containing
# spaces or shell metacharacters are handled safely. An already existing
# directory is left untouched.
summary_dir <- paste0(outPath, "/Summary")
if (dir.create(summary_dir, showWarnings = FALSE)) {
  # Equivalent of `mkdir -m a=rwx`: force rwx for everyone, ignoring the umask.
  Sys.chmod(summary_dir, mode = "0777", use_umask = FALSE)
}
# Stack every per-run ROC-AUC table found anywhere below outPath (files are
# read without a header) and write the combined rows to Summary/auc.txt.
# The pattern escapes both dots and is anchored at the end of the file name so
# that only names ending in ".auc.txt" are picked up.
auc <- list.files(
  path = outPath, pattern = "\\.auc\\.txt$",
  recursive = TRUE, full.names = TRUE
) %>%
  map(read.table, header = FALSE) %>%
  bind_rows()
file1 <- paste0(outPath, "/Summary/auc.txt")
write.table(auc, file = file1, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = FALSE)
# Same procedure for the per-run PR-AUC tables (names ending in ".prauc.txt").
prauc <- list.files(
  path = outPath, pattern = "\\.prauc\\.txt$",
  recursive = TRUE, full.names = TRUE
) %>%
  map(read.table, header = FALSE) %>%
  bind_rows()
file2 <- paste0(outPath, "/Summary/prauc.txt")
write.table(prauc, file = file2, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = FALSE)
# Combine one family of per-run tables into a wide summary table.
#   pattern: regular expression matched against file names below outPath
#   id_col:  name of the output column that holds the row labels
# Every matching file is read without a header, using its first column as row
# names; the value columns of all files are placed side by side and two
# columns are appended: the per-row mean and standard deviation across runs.
# Returns a data frame: <id_col>, one column per run value column, mean, sd.
combine_runs <- function(pattern, id_col) {
  run_files <- list.files(path = outPath, pattern = pattern,
                          recursive = TRUE, full.names = TRUE)
  if (length(run_files) == 0) {
    stop("No files matching '", pattern, "' found under ", outPath,
         call. = FALSE)
  }
  runs <- run_files %>%
    map(read.table, header = FALSE, row.names = 1) %>%
    list.cbind()
  # Work on the full value matrix so that a single run (one value column)
  # stays two-dimensional instead of collapsing to a vector.
  values <- as.matrix(runs)
  combined <- cbind(
    setNames(data.frame(rownames(runs), stringsAsFactors = FALSE), id_col),
    runs
  )
  rownames(combined) <- NULL
  # NOTE(review): the run columns all share the name read.table() assigns
  # (e.g. V2); plain `$<-` assignment sidesteps any duplicate-name checks a
  # mutate() call would apply - confirm against the installed dplyr.
  combined$mean <- rowMeans(values)
  combined$sd <- rowSds(values)
  combined
}

# Per-run mean-decrease-in-accuracy tables
decAcu <- combine_runs("\\.decAccuracy\\.txt$", "features")
file3 <- paste0(outPath, "/Summary/decAccuracy.txt")
write.table(decAcu, file = file3, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
# Per-run mean-decrease-in-Gini tables
decGini <- combine_runs("\\.decGini\\.txt$", "features")
file4 <- paste0(outPath, "/Summary/decGini.txt")
write.table(decGini, file = file4, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
# Per-run result statistics tables
stats <- combine_runs("\\.results\\.txt$", "stats")
file5 <- paste0(outPath, "/Summary/stats.txt")
write.table(stats, file = file5, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
# Combine the per-run prediction objects stored as RDS files.
# Every file below outPath whose name matches the pattern is read with
# readRDS(); sapply() then simplifies the collected results where it can.
# NOTE(review): what sapply() returns here (list, vector or matrix) depends on
# what each RDS file holds - confirm against the code that writes them.
res.pred <- list.files(path = outPath, pattern = "\\.resPred.rds", recursive = TRUE, full.names = TRUE) %>%
sapply(.,readRDS, USE.NAMES=FALSE)
# Drop every attribute (names, dim, ...) so only the bare values remain.
attributes(res.pred) <- NULL
# Same procedure for the matching label objects.
# NOTE(review): predictions and labels are paired later purely by position, so
# this presumably relies on list.files() returning both file sets in the same
# order - verify the file naming scheme guarantees that.
res.labels <- list.files(path = outPath, pattern = "\\.resLabels.rds", recursive = TRUE, full.names = TRUE) %>%
sapply(.,readRDS, USE.NAMES=FALSE)
attributes(res.labels) <- NULL
########################################################################
######################## ROCR curve ####################################
# Wrap the collected scores and labels in a ROCR prediction object, then
# derive the ROC coordinates: true positive rate (y) against false positive
# rate (x).
pred.all <- prediction(predictions = res.pred, labels = res.labels)
perf.all <- performance(pred.all, measure = "tpr", x.measure = "fpr")
# Threshold-average a performance object across folds (modified from the
# threshold-averaging code of the ROCR package).
#
# Args:
#   perf: object with list slots x.values, y.values and alpha.values (as
#         produced by ROCR::performance), one list element per fold.
#
# Side effects (the rest of the script reads these globals):
#   perf.avg.data - copy of `perf` (after cleaning) whose x/y values of every
#                   fold are re-sampled onto one common cutoff grid; its
#                   alpha.values slot keeps the per-fold cutoffs.
#   perf.rocr.avg - single curve: row-wise mean of the re-sampled x and y
#                   values, with the common cutoff grid as alpha.values.
#
# Returns: the averaged object (same as perf.rocr.avg), invisibly.
avg.results <- function (perf)
{
  # Averaging is defined over cutoffs, so at least one fold with cutoff values
  # is required; fail with a clear message instead of an obscure one later.
  if (length(perf@x.values) == 0 || length(perf@alpha.values) == 0) {
    stop("avg.results() needs at least one fold with cutoff (alpha) values",
         call. = FALSE)
  }

  ## for infinite cutoff, assign maximal finite cutoff + mean difference
  ## between adjacent cutoff pairs
  perf@alpha.values <- lapply(perf@alpha.values, function(cutoffs) {
    finite <- cutoffs[is.finite(cutoffs)]
    cutoffs[is.infinite(cutoffs)] <- max(finite) + mean(abs(diff(finite)))
    cutoffs
  })

  ## remove samples with x or y not finite
  for (i in seq_along(perf@x.values)) {
    keep <- is.finite(perf@x.values[[i]]) & is.finite(perf@y.values[[i]])
    perf@alpha.values[[i]] <- perf@alpha.values[[i]][keep]
    perf@x.values[[i]] <- perf@x.values[[i]][keep]
    perf@y.values[[i]] <- perf@y.values[[i]][keep]
  }

  ## common cutoff grid: spans all folds, as many points as the longest fold,
  ## ordered from the highest cutoff to the lowest
  all.cutoffs <- unlist(perf@alpha.values)
  alpha.values <- rev(seq(min(all.cutoffs), max(all.cutoffs),
                          length.out = max(lengths(perf@alpha.values))))

  ## linear interpolation of each fold onto the grid; rule = 2 extends the
  ## end values beyond a fold's own cutoff range, tied cutoffs are averaged
  resample <- function(cutoffs, values) {
    approxfun(cutoffs, values, rule = 2, ties = mean)(alpha.values)
  }
  perf.sampled <- perf
  for (i in seq_along(perf.sampled@y.values)) {
    perf.sampled@x.values[[i]] <- resample(perf@alpha.values[[i]],
                                           perf@x.values[[i]])
    perf.sampled@y.values[[i]] <- resample(perf@alpha.values[[i]],
                                           perf@y.values[[i]])
  }

  perf.avg <- perf.sampled
  perf.avg.data <<- perf.avg
  perf.avg@x.values <- list(rowMeans(data.frame(perf.avg@x.values)))
  perf.avg@y.values <- list(rowMeans(data.frame(perf.avg@y.values)))
  perf.avg@alpha.values <- list(alpha.values)
  perf.rocr.avg <<- perf.avg
  invisible(perf.avg)
}
# Write the three summary tables of one threshold-averaged curve.
#   fold_perf: object whose x.values / y.values hold one re-sampled curve per
#              fold (perf.avg.data as set by avg.results())
#   mean_perf: object holding the single averaged curve (perf.rocr.avg)
#   out_dir:   ML output directory; files go to <out_dir>/Summary
#   prefix:    file name prefix, e.g. "roc" or "pr"
# Files written (tab separated, with header, no row names):
#   <prefix>Values.cv.txt   long table: x, y, type1 = "fold<k>", type2 = "cv"
#   <prefix>MeanSd.txt      one column of y per fold, then sd and mean per row
#   <prefix>Values.mean.txt averaged curve: x, y, type1 = type2 = "mean"
write_curve_tables <- function(fold_perf, mean_perf, out_dir, prefix) {
  fold_x <- fold_perf@x.values
  fold_y <- fold_perf@y.values
  out_stem <- paste0(out_dir, "/Summary/", prefix)

  # Long table built in one step instead of growing it with rbind() per fold
  y_long <- unlist(fold_y)
  curve_long <- data.frame(
    x = unlist(fold_x),
    y = y_long,
    type1 = rep(paste0("fold", seq_along(fold_y)), times = lengths(fold_y)),
    type2 = rep("cv", length(y_long)),
    stringsAsFactors = FALSE
  )
  write.table(curve_long, file = paste0(out_stem, "Values.cv.txt"),
              sep = "\t", quote = FALSE, row.names = FALSE)

  # Wide table; mean and sd are both computed from the fold columns only
  curve_wide <- data.frame(fold_y)
  fold_mean <- rowMeans(curve_wide)
  fold_sd <- apply(curve_wide, 1, sd)
  curve_wide$sd <- fold_sd
  curve_wide$mean <- fold_mean
  write.table(curve_wide, file = paste0(out_stem, "MeanSd.txt"),
              sep = "\t", quote = FALSE, row.names = FALSE)

  # Averaged curve
  y_mean <- unlist(mean_perf@y.values)
  curve_mean <- data.frame(
    x = unlist(mean_perf@x.values),
    y = y_mean,
    type1 = rep("mean", length(y_mean)),
    type2 = rep("mean", length(y_mean)),
    stringsAsFactors = FALSE
  )
  write.table(curve_mean, file = paste0(out_stem, "Values.mean.txt"),
              sep = "\t", quote = FALSE, row.names = FALSE)
  invisible(NULL)
}

# ROC curve: avg.results() stores its output in the globals perf.avg.data
# (per fold) and perf.rocr.avg (mean), which are exported straight away.
avg.results(perf.all)
write_curve_tables(perf.avg.data, perf.rocr.avg, outPath, "roc")
######################################################################
######################## PR curve ####################################
# Precision (y) against recall (x); the globals are overwritten by this
# second avg.results() call before being exported again.
perf.pr.all <- performance(pred.all, "prec", "rec")
avg.results(perf.pr.all)
write_curve_tables(perf.avg.data, perf.rocr.avg, outPath, "pr")
###########################################
######## Other plots ######################
###########################################
# Flatten a performance object into one long data frame.
#
# Args:
#   perf.obj: object with list slots x.values and y.values (as produced by
#             ROCR::performance), one list element per fold.
#
# Returns: data frame with one row per point and the columns
#   x     - values of all folds from x.values, stacked in fold order
#   y     - matching values from y.values
#   type1 - "fold<k>", k being the position of the fold in the list
#   type2 - constant "cv"
#   An object without folds yields a zero-row data frame with these columns.
format_data_from_rocr_object <- function(perf.obj){
  fold_x <- perf.obj@x.values
  fold_y <- perf.obj@y.values

  # Nothing to stack: return the empty shape instead of failing
  if (length(fold_x) == 0 && length(fold_y) == 0) {
    return(data.frame(x = numeric(0), y = numeric(0),
                      type1 = character(0), type2 = character(0),
                      stringsAsFactors = FALSE))
  }

  x <- unlist(fold_x, use.names = FALSE)
  y <- unlist(fold_y, use.names = FALSE)
  if (length(x) != length(y)) {
    stop("x.values and y.values hold a different number of points",
         call. = FALSE)
  }

  # Built in one step: no data frame is grown inside a loop
  data.frame(
    x = if (is.null(x)) numeric(0) else x,
    y = if (is.null(y)) numeric(0) else y,
    type1 = rep(sprintf("fold%d", seq_along(fold_y)), times = lengths(fold_y)),
    type2 = rep("cv", length(y)),
    stringsAsFactors = FALSE
  )
}
# Draw one "<metric> vs prediction cutoff" figure and save it as a PNG.
#   pred:     ROCR prediction object
#   measure:  ROCR performance measure name, e.g. "mat" or "acc"
#   y_label:  title of the y axis
#   png_file: path of the PNG file to write
#   cutoffs:  cutoffs that get a boxplot. They are matched exactly with %in%,
#             so they are spelled as literals rather than generated by seq().
# The red line joins the mean of the metric at every distinct cutoff; the
# boxplots show the metric at the selected cutoffs only.
# Returns png_file invisibly.
plot_metric_vs_cutoff <- function(pred, measure, y_label, png_file,
                                  cutoffs = c(0.02, 0.1, 0.2, 0.3, 0.4, 0.5,
                                              0.6, 0.7, 0.8, 0.9, 1)) {
  metric_all <- format_data_from_rocr_object(performance(pred, measure))
  metric_subset <- metric_all %>% filter(x %in% cutoffs)
  metric_mean <- metric_all %>%
    group_by(x) %>%
    summarise(mean = mean(y))

  fig <- ggplot(data = metric_subset, aes(x = factor(x), y = y)) +
    geom_line(data = metric_mean,
              aes(x = as.factor(x), y = mean, group = 1), color = "red") +
    geom_boxplot(lwd = 0.55, alpha = 0.8, fill = "red", width = 20,
                 outlier.shape = 21, outlier.size = 0.8,
                 outlier.fill = "grey") +
    theme_bw() +
    theme(
      plot.title = element_text(size = 13, hjust = 0.5),
      legend.key.size = unit(0.40, "cm"),
      legend.title = element_text(size = 11),
      legend.text = element_text(size = 11),
      legend.position = c(0.85, 0.20),
      legend.background = element_rect(fill = "transparent", colour = NA),
      axis.text.x = element_text(size = 10),
      axis.text.y = element_text(size = 10),
      panel.border = element_blank(),
      axis.title.x = element_text(size = 12),
      axis.title.y = element_text(size = 12, angle = 90),
      panel.background = element_rect(fill = "transparent"),
      plot.background = element_rect(fill = "transparent", colour = NA)
    ) +
    theme(axis.line = element_line(color = "black", size = 0.4)) +
    scale_x_discrete(name = "Prediction cutoff",
                     breaks = seq(0, 1, by = 0.1)) +
    scale_y_continuous(name = y_label, limits = c(0, 1))

  png(png_file, bg = "transparent", units = "in",
      width = 4.25, height = 3.75, res = 600)
  # Close the device even if rendering fails
  on.exit(dev.off(), add = TRUE)
  # Explicit print(): a ggplot object is only drawn automatically when it is
  # the visible result of a top-level expression.
  print(fig)
  invisible(png_file)
}

# One figure per performance measure, all written to <outPath>/Summary
plot_metric_vs_cutoff(pred.all, "mat", "Matthews correlation coefficient",
                      paste0(outPath, "/Summary/mcc_vs_cutoff.png"))
plot_metric_vs_cutoff(pred.all, "err", "Error rate",
                      paste0(outPath, "/Summary/err_vs_cutoff.png"))
plot_metric_vs_cutoff(pred.all, "f", "Precision-recall f-measure",
                      paste0(outPath, "/Summary/f_vs_cutoff.png"))
plot_metric_vs_cutoff(pred.all, "acc", "Accuracy",
                      paste0(outPath, "/Summary/acc_vs_cutoff.png"))
plot_metric_vs_cutoff(pred.all, "sens", "Sensitivity",
                      paste0(outPath, "/Summary/sens_vs_cutoff.png"))
plot_metric_vs_cutoff(pred.all, "spec", "Specificity",
                      paste0(outPath, "/Summary/spec_vs_cutoff.png"))