# transLoci2GenoLoci_AHCPSM_v1.py
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 9 21:20:14 2021
@convert transLoci to genoLoci based on the reference GTF annotation
@author: Hang Qin
Platform: Linux, python3.6
Notes: the GTF file must be consistent with the transcriptome reference used for mapping when running DENA; for example, download the cdna.fa and .gtf files of the same release version from the Ensembl database
"""
import argparse, os, sys
import pandas as pd
def CalDistance(dic_exonLen, line):
    """Locate the exon that contains a transcript position.

    <dic_exonLen>: exon lengths per transcript, e.g. {'AT1G01010.1': [283, 281, 610]}
    <line>: six-element record <transid, transLoci, motif, modRN, totalRN, ratio>

    Walks the exon lengths in list order, accumulating them, and stops at the
    first exon whose cumulative length is strictly greater than transLoci.

    Returns {"transID": ..., "distance": ..., "exonID": ...} where "exonID" is
    the 0-based index of that exon in the list and "distance" is the cumulative
    length up to and including that exon minus transLoci. Returns an empty dict
    when transLoci is not smaller than the summed exon lengths.
    """
    transID, transLoci, _motif, _modRN, _totalRN, _ratio = line[:]
    cumulative = 0
    for exon_index, exon_length in enumerate(dic_exonLen[transID]):
        cumulative += int(exon_length)
        if cumulative > int(transLoci):
            return {
                "transID": transID,
                "distance": cumulative - int(transLoci),
                "exonID": exon_index,
            }
    # Position lies beyond the last exon: nothing to report.
    return {}
def get_GenoLoci(df, inf, ouf):
    """Map the transcript positions listed in <inf> onto the genome and write <ouf>.

    <df>: exon table, one row per exon, e.g.
        Chr  start  end   strand  geneid     transid      exonid  geneName  exonLength
        1    3631   3913  +       AT1G01010  AT1G01010.1  1       NAC001    283
        1    3996   4276  +       AT1G01010  AT1G01010.1  2       NAC001    281
        1    4486   5095  +       AT1G01010  AT1G01010.1  3       NAC001    610
    <inf>: tab-separated file without a header row, six columns, e.g.
        transid      transLoci  motif  modRN  totalRN  ratio
        AT1G01620.1  606        AAACA  13     376      0.034574468085106384
        AT1G01820.1  1203       AAACA  10     26       0.38461538461538464
    <ouf>: tab-separated file with a header row and the columns
        Chr genoLoci strand geneid transid transLoci motif modRN totalRN ratio geneName

    For every input row the exon containing transLoci is found with
    CalDistance; genoLoci is then
        "+" strand: end of that exon   - distance
        "-" strand: start of that exon + distance
    Rows whose transid is absent from <df> get "NA" in all five added columns,
    rows with any other strand value get "NA" as genoLoci, and rows for which
    the conversion raises (e.g. transLoci beyond the transcript) get "Outlier"
    in all five added columns.
    """
    by_transcript = df.groupby('transid')
    # Per-transcript lists keep the row order of <df>.
    # NOTE(review): the arithmetic below presumably expects exons in transcript
    # order, as listed in the GTF - confirm for the annotation in use.
    dic_exonLen = by_transcript['exonLength'].apply(list).to_dict()    # {'AT1G01010.1': [283, 281, 610]}
    dic_leftBoundary = by_transcript['start'].apply(list).to_dict()    # {'AT1G01010.1': [3631, 3996, 4486]}
    dic_rightBoundary = by_transcript['end'].apply(list).to_dict()     # {'AT1G01010.1': [3913, 4276, 5095]}
    dic_strand = by_transcript['strand'].apply(set).apply(list).to_dict()      # {'AT1G01010.1': ["+"]}
    dic_geneid = by_transcript['geneid'].apply(set).apply(list).to_dict()      # {'AT1G01010.1': ["AT1G01010"]}
    dic_Chr = by_transcript['Chr'].apply(set).apply(list).to_dict()            # {'AT1G01010.1': ["1"]}
    dic_geneName = by_transcript['geneName'].apply(set).apply(list).to_dict()  # {'AT1G01010.1': ["NAC001"]}
    all_transids = set(df['transid'].tolist())

    # Malformed lines (too many fields) are dropped with a warning.
    # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines` exists since
    # pandas 1.3. Older pandas rejects the new keyword with TypeError, in which
    # case the legacy spelling is used.
    read_options = dict(sep="\t", header=None, low_memory=False)
    try:
        df_inf = pd.read_csv(inf, on_bad_lines="warn", **read_options)
    except TypeError:
        df_inf = pd.read_csv(inf, error_bad_lines=False, **read_options)
    df_inf.columns = ['transid', 'transLoci', 'motif', 'modRN', 'totalRN', 'ratio']

    Chrom, GenoLoci, Strand, GeneID, GeneName = [], [], [], [], []
    for record in df_inf.itertuples(index=False, name=None):
        transID = record[0]
        if str(transID) not in all_transids:
            Chr = genoLoci = strand = geneid = geneName = "NA"
        else:
            try:
                Chr = str(dic_Chr[transID][0])
                geneid = str(dic_geneid[transID][0])
                geneName = str(dic_geneName[transID][0])
                strand = str(dic_strand[transID][0])
                # {"transID": ..., "distance": ..., "exonID": ...}; empty when
                # the position is past the transcript end (-> KeyError below).
                dicLoci = CalDistance(dic_exonLen, list(record))
                exonNum = int(dicLoci["exonID"])
                if strand == "+":
                    genoLoci = int(dic_rightBoundary[transID][exonNum]) - int(dicLoci["distance"])
                elif strand == "-":
                    genoLoci = int(dic_leftBoundary[transID][exonNum]) + int(dicLoci["distance"])
                else:
                    genoLoci = "NA"
            except Exception:
                # Best effort: flag rows that cannot be converted instead of
                # aborting, but let KeyboardInterrupt/SystemExit propagate.
                Chr = genoLoci = strand = geneid = geneName = "Outlier"
        Chrom.append(Chr)
        GenoLoci.append(genoLoci)
        Strand.append(strand)
        GeneID.append(geneid)
        GeneName.append(geneName)

    df_inf.insert(loc=0, column="Chr", value=Chrom)
    df_inf.insert(loc=1, column="genoLoci", value=GenoLoci)
    df_inf.insert(loc=2, column="strand", value=Strand)
    df_inf.insert(loc=3, column="geneid", value=GeneID)
    df_inf["geneName"] = GeneName
    df_inf.to_csv(ouf, sep='\t', index=False)
def Tair10(GTF):  # Arabidopsis_thaliana.TAIR10.50.gtf
    """Build the exon table ``<GTF>_deal`` from a GTF file.

    <GTF>: path of the GTF file; the result is written next to it as
    ``<GTF>_deal`` (tab-separated, with header):
        Chr  start  end   strand  geneid     transid      exonid  geneName  exonLength
        1    3631   3913  +       AT1G01010  AT1G01010.1  1       NAC001    283

    Only records whose third whitespace-separated field is "exon" are used.
    The attribute keys gene_id, transcript_id, exon_number and gene_name are
    stripped so that their values become plain tab-separated fields, and the
    fields are then picked by position. exonLength is end - start + 1.

    The file is parsed in-process (no awk/sed/cut), so paths containing spaces
    or shell metacharacters work and no scratch files are created in the
    current directory.
    """
    import io  # function-scope import: only needed for the in-memory table

    # Applied in this order to every exon record.
    replacements = (
        ('gene_id ', ''),
        ('transcript_id ', ''),
        ('exon_number ', ''),
        ('gene_name ', ''),
        ('"', ''),
        (';', ''),
        (' ', '\t'),
    )
    # 0-based field positions: Chr, start, end, strand, geneid, transid, exonid, geneName
    wanted = (0, 3, 4, 6, 8, 9, 10, 11)

    rows = []
    with open(GTF, encoding="utf-8", errors="replace") as handle:
        for raw in handle:
            record = raw.rstrip("\n")
            tokens = record.split()
            if len(tokens) < 3 or tokens[2] != "exon":
                continue
            for old, new in replacements:
                record = record.replace(old, new)
            fields = record.split("\t")
            rows.append("\t".join(fields[i] for i in wanted if i < len(fields)))

    # Parsing the text with read_csv keeps pandas' column type inference;
    # an input without exon records raises pandas' EmptyDataError here.
    df = pd.read_csv(io.StringIO("\n".join(rows)), sep="\t", header=None, low_memory=False)
    df[8] = df[2] - df[1] + 1
    df.columns = ['Chr', 'start', 'end', 'strand', 'geneid', 'transid', 'exonid', 'geneName', 'exonLength']
    tmpf = "_".join([GTF, "deal"])
    df.to_csv(tmpf, sep='\t', index=False)
def S_cerevisiae(GTF):  # Saccharomyces_cerevisiae.R64-1-1.50.gtf
    """Build the exon table for <GTF> by delegating to the Tair10 parser."""
    return Tair10(GTF)
def S_pombe(GTF):  # Schizosaccharomyces_pombe.ASM294v2.50.gtf
    """Build the exon table for <GTF> by delegating to the Tair10 parser."""
    return Tair10(GTF)
def Celegans101(GTF):  # Caenorhabditis_elegans.WBcel235.103.gtf
    """Build the exon table for <GTF> by delegating to the Tair10 parser."""
    return Tair10(GTF)
def Human38(GTF):  # Homo_sapiens.GRCh38.103.gtf
    """Build the exon table ``<GTF>_deal`` from a GTF file with versioned IDs.

    <GTF>: path of the GTF file; the result is written next to it as
    ``<GTF>_deal`` (tab-separated, with header):
        Chr  start  end  strand  geneid  transid  exonid  geneName  exonLength

    Only records whose third whitespace-separated field is "exon" are used.
    The attribute keys gene_id, gene_version, transcript_id, exon_number and
    gene_name are stripped, and the transcript_version value is glued onto the
    transcript id with a dot, so transid reads "<transcript_id>.<version>".
    Fields are then picked by position (the gene_version field is skipped).
    exonLength is end - start + 1.

    The file is parsed in-process (no awk/sed/cut), so paths containing spaces
    or shell metacharacters work and no scratch files are created in the
    current directory.
    """
    import io  # function-scope import: only needed for the in-memory table

    # Applied in this order to every exon record; the transcript_version rule
    # must run after 'transcript_id ' has been removed and before the quotes go.
    replacements = (
        ('gene_id ', ''),
        ('gene_version ', ''),
        ('transcript_id ', ''),
        ('"; transcript_version "', '.'),
        ('exon_number ', ''),
        ('gene_name ', ''),
        ('"', ''),
        (';', ''),
        (' ', '\t'),
    )
    # 0-based field positions: Chr, start, end, strand, geneid, transid, exonid, geneName
    wanted = (0, 3, 4, 6, 8, 10, 11, 12)

    rows = []
    with open(GTF, encoding="utf-8", errors="replace") as handle:
        for raw in handle:
            record = raw.rstrip("\n")
            tokens = record.split()
            if len(tokens) < 3 or tokens[2] != "exon":
                continue
            for old, new in replacements:
                record = record.replace(old, new)
            fields = record.split("\t")
            rows.append("\t".join(fields[i] for i in wanted if i < len(fields)))

    # Parsing the text with read_csv keeps pandas' column type inference;
    # an input without exon records raises pandas' EmptyDataError here.
    df = pd.read_csv(io.StringIO("\n".join(rows)), sep="\t", header=None, low_memory=False)
    df[8] = df[2] - df[1] + 1
    df.columns = ['Chr', 'start', 'end', 'strand', 'geneid', 'transid', 'exonid', 'geneName', 'exonLength']
    tmpf = "_".join([GTF, "deal"])
    df.to_csv(tmpf, sep='\t', index=False)
def Mouse(GTF):  # Mus_musculus.GRCm39.103.gtf
    """Build the exon table for <GTF> by delegating to the Human38 parser."""
    return Human38(GTF)
def Zebrefish(GTF):  # Danio_rerio.GRCz11.103_Zebrafish.gtf
    """Build the exon table for <GTF> by delegating to the Human38 parser."""
    return Human38(GTF)
def Pop_tri(GTF):  # Populus_trichocarpa.Pop_tri_v3.50.gtf
    """Build the exon table ``<GTF>_deal`` from a GTF file.

    <GTF>: path of the GTF file; the result is written next to it as
    ``<GTF>_deal`` (tab-separated, with header):
        Chr  start  end  strand  geneid  transid  exonid  geneName  exonLength

    Only records whose third whitespace-separated field is "exon" are used.
    The attribute keys gene_id, transcript_id, exon_number and gene_source are
    stripped so that their values become plain tab-separated fields, and the
    fields are then picked by position. The fourth attribute value therefore
    lands in the geneName column.
    NOTE(review): presumably that is the gene_source value because this
    annotation has no gene_name attribute - confirm against the GTF.
    exonLength is end - start + 1.

    The file is parsed in-process (no awk/sed/cut), so paths containing spaces
    or shell metacharacters work and no scratch files are created in the
    current directory.
    """
    import io  # function-scope import: only needed for the in-memory table

    # Applied in this order to every exon record.
    replacements = (
        ('gene_id ', ''),
        ('transcript_id ', ''),
        ('exon_number ', ''),
        ('gene_source ', ''),
        ('"', ''),
        (';', ''),
        (' ', '\t'),
    )
    # 0-based field positions: Chr, start, end, strand, geneid, transid, exonid, geneName
    wanted = (0, 3, 4, 6, 8, 9, 10, 11)

    rows = []
    with open(GTF, encoding="utf-8", errors="replace") as handle:
        for raw in handle:
            record = raw.rstrip("\n")
            tokens = record.split()
            if len(tokens) < 3 or tokens[2] != "exon":
                continue
            for old, new in replacements:
                record = record.replace(old, new)
            fields = record.split("\t")
            rows.append("\t".join(fields[i] for i in wanted if i < len(fields)))

    # Parsing the text with read_csv keeps pandas' column type inference;
    # an input without exon records raises pandas' EmptyDataError here.
    df = pd.read_csv(io.StringIO("\n".join(rows)), sep="\t", header=None, low_memory=False)
    df[8] = df[2] - df[1] + 1
    df.columns = ['Chr', 'start', 'end', 'strand', 'geneid', 'transid', 'exonid', 'geneName', 'exonLength']
    tmpf = "_".join([GTF, "deal"])
    df.to_csv(tmpf, sep='\t', index=False)
def TransLoci2GenoLoci():
    """Run the transLoci -> genoLoci conversion described by the global FLAGS.

    Reads FLAGS.select (species name), FLAGS.gtf, FLAGS.input and FLAGS.output.
    The exon table ``<gtf>_deal`` is built with the parser of the selected
    species unless a file of that name already exists; it is then loaded and
    handed to get_GenoLoci together with the input and output paths.

    Exits through sys.exit with a message when the species is not supported.
    """
    # Explicit dispatch table instead of eval() on a command-line value.
    parsers = {
        "Tair10": Tair10,
        "Human38": Human38,
        "Celegans101": Celegans101,
        "Pop_tri": Pop_tri,
        "S_cerevisiae": S_cerevisiae,
        "S_pombe": S_pombe,
        "Mouse": Mouse,
        "Zebrefish": Zebrefish,
    }
    slt = FLAGS.select
    gtf, inf, ouf = FLAGS.gtf, FLAGS.input, FLAGS.output
    if slt not in parsers:
        # The message is generated from the table so it always lists every
        # accepted species (Mouse and Zebrefish used to be missing).
        sys.exit("please select the right species(%s), default: Tair10" % ", ".join(parsers))
    sys.stderr.write("You select the %s species!\n" % (slt))

    tmpf = "_".join([gtf, "deal"])
    if os.path.isfile(tmpf):
        # NOTE: an existing table is reused as-is, whichever species parser
        # produced it; delete the file to force a rebuild.
        sys.stderr.write("%s has existed.\n" % (tmpf))
    else:
        parsers[slt](gtf)
    df = pd.read_csv(tmpf, sep="\t", low_memory=False)
    get_GenoLoci(df, inf, ouf)
    sys.stderr.write("%s has finished the conversion of transLoci to genoLoci.\n" % (inf))
if __name__ == "__main__":
    # Command-line entry point: parse the options, publish them as the
    # module-level FLAGS namespace, then run the conversion.
    parser = argparse.ArgumentParser(
        description='covert trans coordinate to geno coordinate. Notes: the gtf file must be consistent with the transcriptome reference used for mapping in running DENA, such as download the cdna.fa and .gtf files of the same release version from Ensembl database'
    )
    parser.add_argument(
        '-s', '--select',
        default="Tair10",
        help="select the species <Tair10, Human38, Celegans101, Pop_tri, S_cerevisiae, S_pombe, Mouse, Zebrefish>, default: Tair10",
    )
    parser.add_argument(
        '-g', '--gtf',
        required=True,
        help="the gtf file download from ensembl database",
    )
    parser.add_argument(
        '-i', '--input',
        required=True,
        help="trans coordinate files from DENA predict, containing six columns <transid,transLoci,motif,modRN,totalRN,ratio>",
    )
    parser.add_argument(
        '-o', '--output',
        required=True,
        help="name of Output file",
    )
    # TransLoci2GenoLoci reads its settings from this module-level name;
    # parse_args() without arguments parses sys.argv[1:].
    FLAGS = parser.parse_args()
    TransLoci2GenoLoci()