forked from maximilianh/pubMunch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimgtToFasta
executable file
·136 lines (114 loc) · 4.29 KB
/
imgtToFasta
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
import sys
from optparse import OptionParser
import glob, os.path
import collections, logging
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# Command-line definition. The usage text doubles as the --help banner;
# optparse expands %prog to the script name and %default to an option's default.
parser = OptionParser(usage="""usage: %prog [options] DATFILE - convert imgt.dat to fasta format on stdout. Only process human sequences. Annotate with paper info in fasta id line.
to convert fastsa file to db:
/cluster/bin/blast/x86_64/ncbi-blast-2.2.24+/bin/makeblastdb -in imgt.fa -dbtype nucl
to blast on db:
/cluster/bin/blast/x86_64/ncbi-blast-2.2.24+/bin/tblastn -query david.fa -db imgt.fa
""")

# -f may be repeated; each occurrence is appended to options.filterWords
# (which stays None when the flag is never given).
parser.add_option(
    "-f",
    "--filter",
    action="append",
    dest="filterWords",
    metavar="WORDLIST_COMMASEP",
    help="can be specified several times, keyword, output only records that contain one of the keywords in their fasta-id line, not case sensitive (includes the paper's title, authors. Example: -f ankylos -f spondilitis -f rheuma -f arthritis",
)

# -o selects the organism whose records are extracted.
parser.add_option(
    "-o",
    "--organism",
    action="store",
    dest="organism",
    default="Homo sapiens",
    metavar="NAME",
    help="organism to extract, default %default",
)

# Parsed at module level: `options` and `args` are read by the main section below.
options, args = parser.parse_args()
# ==== FUNCTIONs =====
#def writeList(fname, list):
#of = open(fname, "w")
#for row in list:
#row = [str(d) for d in row]
#of.write("\t".join(row))
#of.write("\n")
#of.close()
# RECORDS
class Reference:
    """One literature reference belonging to an IMGT record.

    Carries three lists -- authors, titles and journals -- that all start
    out empty and are filled by appending to them directly.
    """

    def __init__(self):
        # Fresh, independent lists for every instance.
        self.authors, self.titles, self.journals = [], [], []
class IMGTRecord:
    """One entry of an IMGT flat file.

    Attributes:
        id:      text of the entry's ID line ("" until one is assigned).
        species: text of the entry's OS line ("" until one is assigned).
        refList: reference objects exposing ``authors``, ``titles`` and
                 ``journals`` lists.
        seqs:    sequence fragments, in file order.
        genes:   dict mapping a feature name to a gene name.
    """

    def __init__(self):
        # id/species get string defaults so that an entry lacking an ID or OS
        # line can still be inspected (e.g. species.startswith(...)) instead
        # of failing with AttributeError.
        self.id = ""
        self.species = ""
        self.refList = []
        self.seqs = []
        self.genes = {}

    def refString(self):
        """Return every reference flattened into a single string.

        Each reference is rendered as ``authors_titles_journals`` where the
        items of each list are joined with ";"; the references themselves are
        joined with "___". Returns "" when there are no references.
        """
        parts = []
        for ref in self.refList:
            fields = [
                ";".join(ref.authors),
                ";".join(ref.titles),
                ";".join(ref.journals),
            ]
            parts.append("_".join(fields))
        return "___".join(parts)
def parseImgt(fh):
    """Iterate over an IMGT flat file, yielding one IMGTRecord per entry.

    Args:
        fh: iterable of text lines, e.g. an open file.

    Yields:
        IMGTRecord objects filled from the ID, OS, RN/RT, FT and sequence
        lines (only reference titles are collected; authors and journals stay
        empty). An entry is yielded once, when its "//" terminator is read; an
        entry that is still open at end of input is yielded then. Lines that
        are not inside an entry are ignored, so empty input yields nothing.
    """
    rec = None
    grabGenes = False
    grabSeq = False
    geneType = None
    for line in fh:
        tag = line[:5].strip()
        data = line[5:].strip()

        if tag == "ID":
            rec = IMGTRecord()
            rec.id = data
            grabSeq = False
            grabGenes = False
            continue

        if rec is None:
            # Header text, or lines after a "//": there is no entry to attach to.
            continue

        if tag == "OS":
            rec.species = data
        elif tag == "RN":
            rec.refList.append(Reference())
        elif tag == "RT":
            rec.refList[-1].titles.append(data)
        elif tag == "FT":
            # Feature-table lines are column based: the 16 columns after the
            # tag hold the feature name (blank on qualifier lines) and the
            # remainder holds the location or a /qualifier. Slice the
            # *unstripped* text, otherwise a qualifier line loses its blank
            # name columns and "/gene=" is never recognised.
            ftField = line[5:].rstrip("\r\n")
            ftName = ftField[:16].strip()
            desc = ftField[16:].strip()
            if ftName != "":
                # A new feature starts: stop collecting for the previous one.
                grabGenes = False
            if ftName in ("V_region", "J_segment"):
                geneType = ftName
                grabGenes = True
            if grabGenes and desc.startswith("/gene="):
                rec.genes[geneType] = desc.split("=", 1)[1].strip('"')
        elif tag == "SQ":
            grabSeq = True
        elif tag == "" and grabSeq:
            # Sequence lines: drop the trailing position counter and the
            # blanks between blocks of bases.
            rec.seqs.append(data.strip("0123456789").replace(" ", ""))
        elif tag == "//":
            yield rec
            # Mark the entry as delivered so it is not yielded again at EOF.
            rec = None
            grabSeq = False

    if rec is not None:
        # Last entry had no "//" terminator.
        yield rec


def filterConvertImgt(inFname, filterWords, species="Homo sapiens"):
    """Write selected entries of an IMGT flat file to stdout as FASTA.

    Args:
        inFname: path of the IMGT flat file to read.
        filterWords: list of keywords, or None. When given, an entry is kept
            only if its reference string contains at least one keyword
            (compared case-insensitively).
        species: an entry is kept only if its species text starts with this.

    Output, per kept entry: a header line ">" + "id|references|genes" with
    spaces replaced by "_", followed by one line holding the sequence. Genes
    are rendered as "feature=gene" pairs, sorted by feature and joined by "_".
    """
    if filterWords is not None:
        filterWords = [fw.upper() for fw in filterWords]
        logging.info("%s", filterWords)

    with open(inFname) as fh:
        for rec in parseImgt(fh):
            # getattr: tolerate entries that never had a species assigned.
            if not getattr(rec, "species", "").startswith(species):
                continue

            refString = rec.refString()
            if filterWords is not None:
                refStringUp = refString.upper()
                if not any(word in refStringUp for word in filterWords):
                    continue

            # genes holds (feature, gene) pairs; format each pair to text
            # before joining (joining the raw tuples raises TypeError).
            geneString = "_".join(
                "%s=%s" % pair for pair in sorted(rec.genes.items())
            )
            idFields = rec.id.split()
            seqId = idFields[0] if idFields else ""
            header = (seqId + "|" + refString + "|" + geneString).replace(" ", "_")
            sys.stdout.write(">" + header + "\n")
            sys.stdout.write("".join(rec.seqs) + "\n")
# ----------- MAIN --------------
# The input file is mandatory: without it, show the usage text and fail.
if not args:
    parser.print_help()
    # sys.exit rather than the bare exit() helper, which is injected by the
    # site module for interactive use and is not guaranteed to exist.
    sys.exit(1)

inFname = args[0]                     # only the first positional argument is used
filterWordList = options.filterWords  # None when -f was never given
filterOrg = options.organism
filterConvertImgt(inFname, filterWordList, filterOrg)