-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmotif_distribution_from_summits.R
169 lines (129 loc) · 7.09 KB
/
motif_distribution_from_summits.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
###############
## Author : Pooja Sethiya
## Institute : Chris Lab Faculty of Health Science / University of Macau.
## Email : yb57662@umac.mo
## Date : 08 Oct 2018
###############
#--- Input
# 1. Genome file (FASTA)
# 2. Summits file in BED format (preferably MACS2 output)
# 3. Motif list, copied to the clipboard (two columns: motif group, motif sequence)
# 4. Flanking base pairs taken on each side of the summit (e.g. 100)
# 5. Whether to also search the reverse complement of the motifs.
#    Works only for DNA (A,T,G,C,N) sequences.
genome_fasta <- "genome_chromosomes.fasta"
macs2_summit_file <- "rcoa_summits.bed"
# read.clipboard() lives in psych, which is only attached inside
# motif_distirbution_from_summits(); call it through its namespace so this
# line works before the function has ever been run.
mymotifs <- psych::read.clipboard(header = FALSE)
# Example clipboard content (column 1 = motif group, column 2 = motif sequence):
# SYGGRG CTGGAG
# SYGGRG CTGGGG
# SYGGRG CCGGAG
# SYGGRG CCGGGG
# SYGGRG GTGGAG
# SYGGRG GTGGGG
# SYGGRG GCGGAG
# SYGGRG GCGGGG
# GGCSS GGCCC
# GGCSS GGCCG
# GGCSS GGCGC
# GGCSS GGCGG
#--- run as
# NOTE: motif_distirbution_from_summits() is defined further down in this file;
# it must already be loaded in the session before this call is evaluated.
motif_distirbution_from_summits(macs2_summit_file, genome_fasta, mymotifs, flank_from_summit = 100, motif_revComplement = "FALSE")
#-- load function first
# Locate motif occurrences in fixed-size windows around peak summits and plot
# where they fall relative to the summit.
#
# Arguments:
#   macs2_summit_file   Path to a BED file of summits (read with import.bed()).
#   genome_fasta        Path to the genome FASTA file. Sequence names are
#                       truncated at the first space.
#   mymotifs            Motif table. Column 1 is the label used for grouping
#                       and colouring, column 2 is the sequence searched for.
#                       A single-column input is also accepted; each sequence
#                       is then used as its own label.
#   flank_from_summit   Number of bases added on each side of every summit.
#   motif_revComplement "TRUE"/"FALSE" (or logical). When true, the reverse
#                       complement of every motif sequence is searched as well
#                       and reported under the same label.
#
# Side effects (all written to the current working directory, prefixed with
# the basename of macs2_summit_file):
#   *_peaks_with_motif.tab   tab-delimited table: genes, Freq, start, end, motif
#   *_motifdistribution.pdf  one point per motif occurrence per binding site
#   *_motifdensity.pdf       density of motif start positions per label
#
# Returns (invisibly) the value of the last ggsave() call, or NULL when no
# motif occurrence was found and the plots are skipped.
motif_distirbution_from_summits <- function(macs2_summit_file, genome_fasta, mymotifs, flank_from_summit, motif_revComplement = "FALSE") {
  #--- Load packages
  library(GenomicRanges)
  library(IRanges)
  library(tidyverse)
  library(seqinr)
  library(rtracklayer)
  library(BSgenome)
  library(psych)
  library(reshape2)

  macs2_summits <- import.bed(macs2_summit_file)
  print(head(macs2_summits))

  #--- Extend every summit by flank_from_summit bases on both sides
  summit_flank <- macs2_summits + flank_from_summit
  print(head(summit_flank))

  #---- Get the sequence
  dna <- readDNAStringSet(genome_fasta)
  names(dna) <- gsub(" .*", "", names(dna))

  #---- Keep only regions that lie completely inside their chromosome
  chrom_bounds <- GRanges(
    seqnames = names(dna),
    ranges = IRanges(start = 1, end = width(dna))
  )
  flank_region_within_bound <- subsetByOverlaps(summit_flank, chrom_bounds, type = "within")
  message("Binding sites within genomic range: ", length(flank_region_within_bound))
  if (length(flank_region_within_bound) == 0) {
    stop("No summit region lies within the genome boundaries.", call. = FALSE)
  }

  #---- Get sequence of the within-range regions
  flank_seq <- getSeq(dna, flank_region_within_bound)
  site_names <- flank_region_within_bound$name
  if (is.null(site_names)) {
    # BED files without a name column: fall back to "chr:start-end" labels so
    # that every hit can still be attributed to a binding site.
    site_names <- as.character(flank_region_within_bound)
  }
  names(flank_seq) <- site_names
  site_widths <- width(flank_seq)

  #--- Split the motif table into labels (column 1) and sequences (column 2)
  motif_table <- as.matrix(mymotifs)
  if (ncol(motif_table) == 1) {
    motif_table <- cbind(motif_table, motif_table)
  }
  motif_labels <- as.character(motif_table[, 1])
  motif_seqs <- as.character(motif_table[, 2])
  if (length(motif_seqs) == 0) {
    stop("`mymotifs` does not contain any motif.", call. = FALSE)
  }

  #--- Optionally add the reverse complement of every motif sequence
  if (isTRUE(as.logical(motif_revComplement))) {
    rc_seqs <- as.character(reverseComplement(DNAStringSet(motif_seqs)))
    # unique() drops palindromic motifs (and motifs whose reverse complement is
    # already listed under the same label) so they are not counted twice.
    combined <- unique(data.frame(
      label = c(motif_labels, motif_labels),
      sequence = c(motif_seqs, rc_seqs),
      stringsAsFactors = FALSE
    ))
    motif_labels <- combined$label
    motif_seqs <- combined$sequence
  }

  #--- Compute the location of each motif on the given sequences
  hits_per_motif <- lapply(seq_along(motif_seqs), function(i) {
    # fixed = "subject": IUPAC ambiguity codes in the motif are expanded,
    # letters in the genomic sequence are matched literally.
    hits <- vmatchPattern(motif_seqs[i], flank_seq, fixed = "subject")
    n_per_site <- as.integer(elementNROWS(hits))
    site_idx <- rep(seq_along(n_per_site), n_per_site)
    hit_ranges <- unlist(hits)
    data.frame(
      genes = site_names[site_idx],
      Freq = n_per_site[site_idx],
      start = as.numeric(start(hit_ranges)),
      end = as.numeric(end(hit_ranges)),
      motif = rep(motif_labels[i], length(site_idx)),
      gene_width = site_widths[site_idx],
      stringsAsFactors = FALSE,
      row.names = NULL
    )
  })

  #---- All motif occurrences, one row per hit
  genes_with_motifs <- as_tibble(do.call("rbind", hits_per_motif))
  write_delim(
    genes_with_motifs[, c("genes", "Freq", "start", "end", "motif")],
    paste0(basename(macs2_summit_file), "_peaks_with_motif.tab"),
    delim = "\t", col_names = TRUE
  )
  message("motif occurrences: ", nrow(genes_with_motifs))
  print(table(genes_with_motifs$motif))

  if (nrow(genes_with_motifs) == 0) {
    warning("No motif occurrence found in the binding sites; plots skipped.", call. = FALSE)
    return(invisible(NULL))
  }

  end_5 <- paste0("-", flank_from_summit, "bp")
  end_3 <- paste0(flank_from_summit, "bp")
  region_width <- unique(genes_with_motifs$gene_width)
  if (length(region_width) > 1) {
    # The axis needs exactly three breaks; with mixed region sizes use the
    # widest one so that no point is dropped.
    warning("Binding sites differ in width; x axis is scaled to the widest one.", call. = FALSE)
    region_width <- max(region_width)
  }
  x_scale <- scale_x_continuous(
    limits = c(0, region_width),
    breaks = c(0, region_width / 2, region_width),
    labels = c(end_5, "summit", end_3)
  )

  #-- Plot the motifs on the binding sites
  gg <- ggplot(genes_with_motifs, aes(x = start, y = genes, color = motif, shape = motif)) +
    geom_point(alpha = 0.8, size = 1.8) +
    geom_vline(data = genes_with_motifs, aes(xintercept = gene_width / 2), color = "blue", size = 2) +
    ylab("binding sites") +
    xlab("") +
    theme_classic() +
    x_scale +
    theme(
      legend.position = "top",
      axis.ticks.y = element_blank(),
      legend.text = element_text(face = "bold", colour = "black", size = 12, angle = 0),
      axis.text.y = element_blank(),
      axis.text.x = element_text(face = "bold", colour = "black", size = 12, angle = 0)
    )
  print(gg)
  #--- Save output file
  ggsave(paste0(basename(macs2_summit_file), "_motifdistribution.pdf"), plot = gg, device = "pdf", width = 5, height = 7)

  #-- Plot density
  gd <- ggplot(genes_with_motifs, aes(start, fill = motif)) +
    geom_density(alpha = 0.7) +
    xlab("") +
    theme_classic() +
    x_scale +
    theme(
      legend.position = "top",
      legend.text = element_text(face = "bold", colour = "black", size = 12, angle = 0),
      axis.text.y = element_text(face = "bold", colour = "black", size = 12, angle = 0),
      axis.text.x = element_text(face = "bold", colour = "black", size = 12, angle = 0)
    )
  print(gd)
  ggsave(paste0(basename(macs2_summit_file), "_motifdensity.pdf"), plot = gd, device = "pdf", width = 5, height = 7)
}