-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_signature_discovery.R
75 lines (60 loc) · 2.13 KB
/
1_signature_discovery.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Gene Signature: Discovery Analysis
# Copyright (C) ydavidchen 2022
# Start from an empty global environment.
# NOTE(review): rm(list = ls()) is kept for interactive reruns; restarting the
# R session is the more reliable way to get a clean state.
rm(list = ls())
# Move to this script's folder so the relative source() call below resolves.
# This is attempted only inside RStudio with a saved document: rstudioapi
# errors when no RStudio session is attached (e.g. under Rscript), and an
# unsaved/console context reports an empty path, for which setwd() would fail.
# In those cases the working directory is left as-is and is expected to
# already be the folder that contains GeneSigUtils.R.
local({
  in_rstudio <- requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()
  if (in_rstudio) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
    if (nzchar(script_path)) {
      setwd(dirname(script_path))
    }
  }
})
# Project helpers and constants; SEED is not defined in this file and is
# presumably provided by GeneSigUtils.R -- TODO confirm.
source("GeneSigUtils.R")
library(glmnet)
set.seed(SEED)
## Sample groups & input data:
HIGH_GROUP <- c("CID3941", "CID4290A", "CID3948")
LOW_GROUP <- c("CID4067", "CID4530N")
# Gene identifiers to keep: first column (V1) of the file at PATH_COMMONSET
commonSet <- read.table(PATH_COMMONSET)[["V1"]]
# Count table; DIR_WU is defined outside this file
wu <- read.csv(
  paste0(DIR_WU, "GSE176078_Wu_etal_2021_bulkRNAseq_raw_counts_resaved.csv")
)
## 1. Keep only genes found in the shared gene set, then index rows by gene:
wu <- wu[with(wu, wus95idmgmtGenes %in% commonSet), , drop = FALSE]
rownames(wu) <- wu[["wus95idmgmtGenes"]]
# The identifier column is dropped once it has become the row names
wu[["wus95idmgmtGenes"]] <- NULL
## 2a. Sample-wise normalization (across ALL samples)
# Diagnostic plot before normalization: the data frame is flattened into one
# numeric vector, so every gene-by-sample entry is pooled into one histogram.
hist( as.numeric(as.matrix(wu)) ) #counts
# normalize_expr_mat() is not defined in this file (presumably GeneSigUtils.R).
# It receives every sample column still present at this point, i.e. before the
# rows are restricted to HIGH_GROUP/LOW_GROUP further down.
wu <- normalize_expr_mat(as.matrix(wu))
# Diagnostic plot after normalization
# (assumes the helper returns a numeric matrix -- TODO confirm).
hist(wu)
## 2b. Winsorize normalized data:
# Printed at top level to inspect the value range before limits are applied.
summary(as.numeric(wu))
# custom_winsorize() and WINSOR are defined outside this file; the two limits
# passed are symmetric around zero (-WINSOR, WINSOR).
# NOTE(review): presumably values beyond the limits are clamped -- verify in the helper.
wu <- custom_winsorize(wu, -WINSOR, WINSOR)
# Diagnostic plot after winsorization
hist(wu)
## 3. Samples as rows (HIGH_GROUP first, then LOW_GROUP) with matching labels:
wu <- t(wu)[c(HIGH_GROUP, LOW_GROUP), ]
# 1 = HIGH_GROUP sample, 0 = LOW_GROUP sample, in the same order as the rows
y <- factor(rep(c(1, 0), times = c(length(HIGH_GROUP), length(LOW_GROUP))))
## 4. Fit the model
# Checkpoint: row order must line up with the labels in y
stopifnot(identical(rownames(wu), c(HIGH_GROUP, LOW_GROUP)))
# Binomial glmnet fit with alpha = 0.5 over the default lambda path
eNet <- glmnet(x = wu, y = y, family = "binomial", alpha = 0.5)
plot(eNet)
# The smallest lambda on the fitted path is the one used downstream
lambda_best <- min(eNet$lambda)
lambda_best #0.009797275
## 5. Collect the stored (non-zero) coefficients at lambda_best:
# coef() returns a sparse column; slot @i holds 0-based row indices of the
# stored entries and slot @x their values, so +1 converts to R's 1-based names.
# NOTE(review): glmnet coefficient output normally has an "(Intercept)" row, so
# it may appear in the Gene column -- confirm this is what downstream code expects.
coefs <- local({
  beta <- coef(eNet, s = lambda_best)
  stored_rows <- beta@i + 1
  data.frame(
    Gene = beta@Dimnames[[1]][stored_rows],
    Coef = beta@x
  )
})
## Visualization
# Per-sample annotation: "Yes" where y is 1 (HIGH_GROUP rows), "No" otherwise
samp_annot <- data.frame(
  HighKi67 = ifelse(y == 1, "Yes", "No"),
  row.names = c(HIGH_GROUP, LOW_GROUP)
)
# Heatmap input is genes x samples, limited to columns of wu named in coefs$Gene.
# wrapper_heatmap() and CL_PARAMS are defined outside this file.
wrapper_heatmap(
  t(wu[, colnames(wu) %in% coefs$Gene]),
  clustering_distance_rows = CL_PARAMS[1],
  clustering_method = CL_PARAMS[2],
  col_annot = samp_annot,
  showcn = TRUE
)
# Bar chart of the coefficients, bars ordered by coefficient value
ggplot(coefs, aes(x = reorder(Gene, Coef), y = Coef)) +
  geom_bar(stat = "identity") +
  labs(x = "Gene", y = "Elastic-net coefficient") +
  scale_y_continuous(breaks = seq(from = -1.3, to = 0.5, by = 0.2)) +
  BARPLOT_THEME
## Export matrix & object:
# save(list=c("coefs","commonSet","eNet"), file=paste0(DIR_OUT,"enet_object.RData"), compress=TRUE)
# write.csv(coefs, paste0(DIR_OUT,"enet_coefs.csv"), row.names=FALSE, quote=FALSE)