-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdemultiplexON.py
153 lines (133 loc) · 5.9 KB
/
demultiplexON.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import swalign
import argparse
import yaml
from Bio.Seq import Seq
from Bio import SeqIO
"""
To separate (demultiplex) pooled HLA reads, we have to generate handle sequences (and their reverse complement parts)
to align to the FASTQ reads. For Oxford Nanopore we are expecting ~90% reliability, meaning that in the ~50bps long
handle we will have approx 5bps errors, indels mostly.
Processing steps:
- generate handles from yaml
- get reads from the FASTQ, and try to align a handle to the beginning of the read. Note, there are usually a ~15bps
chunk extra prefix that precedes the prefix
- if a read has only a single best scoring handle, store as it is
- if the handle alignment is ambiguous, store in a separate file (we will find out later what to do, maybe use
different alignment scoring)
- if no handle found, store as junk
"""
def parse_args():
    """Parse command-line arguments for the demultiplexer.

    Returns:
        argparse.Namespace with:
            indexes -- path to the YAML file defining handle prefix/postfix and indexes
            reads   -- path to the FASTQ file with pooled reads
            mm      -- number of mismatches tolerated in a handle (string; cast to int by caller)
    """
    parser = argparse.ArgumentParser(description='Demultiplexing Oxford Nanopore reads')
    # typo fixed in help text: "definiton" -> "definition"
    parser.add_argument('-y', help='the YAML definition of handle prefix, postfix and indexes',
                        required=True, dest="indexes")
    parser.add_argument('-r', help='FASTQ file containing reads', required=True, dest="reads")
    parser.add_argument('-m', help='number of mismatches expected in the handle sequence',
                        required=True, dest="mm")
    return parser.parse_args()
def readYAMLConf(yamlfile):
    """Load the handle/index configuration from a YAML file.

    Args:
        yamlfile: path to the YAML configuration file.

    Returns:
        A plain dict keyed by the top-level YAML sections
        (empty dict for an empty file).
    """
    with open(yamlfile, 'r') as ymlfile:
        # safe_load: plain yaml.load() without an explicit Loader is deprecated
        # and can execute arbitrary Python from a crafted config file.
        cfg = yaml.safe_load(ymlfile)
    # An empty YAML file parses to None; the original section-copy loop is
    # equivalent to a shallow dict copy.
    return dict(cfg) if cfg else {}
def generateSeqHandles(anIndexCfg):
    """Build the forward handle sequences and their reverse complements.

    The YAML config to parse looks like:
        handles:
            prefix: "TTAGTCTCCGACGGCAGGCTTCAAT"
            postfix: "ACGCACCCACCGGGACTCAG"
        indexes: [
            "ACAGTC",
            "TGATGC",
            "TCTCAG"
        ]
    Each handle is prefix + index + postfix:
        TTAGTCTCCGACGGCAGGCTTCAAT-ACAGTC-ACGCACCCACCGGGACTCAG
        prefix                   -index - postfix

    Args:
        anIndexCfg: dict with "handles" (prefix/postfix) and "indexes" entries.

    Returns:
        (forwardIdx, reverseIdx) -- parallel lists of handle strings; the i-th
        reverse entry is the reverse complement of the i-th forward entry.
    """
    handlePrefix = anIndexCfg["handles"]["prefix"]
    handlePostfix = anIndexCfg["handles"]["postfix"]
    forwardIdx = [handlePrefix + index + handlePostfix
                  for index in anIndexCfg["indexes"]]
    # Reverse-complement with a plain translation table covering the IUPAC
    # nucleotide codes (upper and lower case) -- avoids constructing a
    # Bio.Seq object per handle just for this one operation.
    complement = str.maketrans("ACGTUWSMKRYBDHVNacgtuwsmkrybdhvn",
                               "TGCAAWSKMYRVHDBNtgcaawskmyrvhdbn")
    reverseIdx = [handle.translate(complement)[::-1] for handle in forwardIdx]
    return (forwardIdx, reverseIdx)
def getBestMatches(aSeq, aHandleList, anSWAligner, aMismatch):
    """Return every handle whose alignment against aSeq stays under the
    mismatch limit.

    Note: despite the name, this collects ALL handles with fewer than
    aMismatch mismatches (in input order), not only the single best-scoring
    one; the caller treats len != 1 as ambiguous/junk.

    Args:
        aSeq: read fragment (string) to align the handles against.
        aHandleList: candidate handle sequences.
        anSWAligner: object with an align(seq, handle) method returning an
            alignment that exposes a .mismatches count.
        aMismatch: exclusive upper bound on tolerated mismatches.
    """
    return [handle for handle in aHandleList
            if anSWAligner.align(aSeq, handle).mismatches < aMismatch]
def swFactory():
    """Build the Smith-Waterman local aligner used throughout this script.

    Scoring: match +2, mismatch -1, gap open -1, gap extension -1,
    no extension decay; local (not global) alignment over the full target.
    """
    MATCH_SCORE = 2
    MISMATCH_SCORE = -1
    GAP_PENALTY = -1
    GAP_EXTENSION_PENALTY = -1
    GAP_EXTENSION_DECAY = 0.0
    matrix = swalign.NucleotideScoringMatrix(MATCH_SCORE, MISMATCH_SCORE)
    return swalign.LocalAlignment(
        matrix,
        GAP_PENALTY,
        GAP_EXTENSION_PENALTY,
        gap_extension_decay=GAP_EXTENSION_DECAY,
        verbose=False,
        globalalign=False,
        full_query=False)
def fileFactory(aHandleList):
    """Open one output FASTQ file per handle sequence.

    The file for handle S is named "S.dmx.fastq" in the current directory.

    Returns:
        dict mapping each handle string to its open, writable file object;
        callers are responsible for closing them.
    """
    return {handle: open(handle + ".dmx.fastq", "w") for handle in aHandleList}
def writeFASTQRecord(aRecord, aFASTQFile):
    """Write a single FASTQ record to aFASTQFile by hand.

    SeqIO.write() was observed to misbehave on very long reads, hence the
    manual export. The header line is non-standard: it carries the sequence
    and quality-string lengths after the read id (kept as-is so downstream
    consumers see the same output).

    Args:
        aRecord: object exposing .id, a str()-able .seq, and
            .letter_annotations["phred_quality"] (list of ints), as a
            Biopython SeqRecord does.
        aFASTQFile: open, writable text file; flushed after each record.
    """
    readId = aRecord.id
    # Use the public SeqRecord API rather than poking __dict__ internals
    # (the private ._data attribute changed to bytes in newer Biopython,
    # which would silently break the old access path).
    seqStr = str(aRecord.seq)
    quals = aRecord.letter_annotations['phred_quality']
    # Phred scores -> ASCII with the standard Sanger +33 offset.
    qualsStr = "".join(chr(q + 33) for q in quals)
    aFASTQFile.write("@" + readId + " " + str(len(seqStr)) + " " + str(len(qualsStr)) + "\n")
    aFASTQFile.write(seqStr + "\n")
    aFASTQFile.write("+\n")
    aFASTQFile.write(qualsStr + "\n")
    aFASTQFile.flush()
def demultiplexFastqByBestMatch(aFASTQFile, aHandleList, aMismatch, isForward=True):
    """Split a FASTQ file into per-handle files by Smith-Waterman alignment.

    Each handle gets its own output file (see fileFactory). A read is written
    to a handle's file only when exactly one handle aligns under the mismatch
    limit; ambiguous reads (several matches) and junk (no match) are skipped.

    Args:
        aFASTQFile: path to the input FASTQ file.
        aHandleList: handle sequences to demultiplex by.
        aMismatch: exclusive mismatch limit passed to getBestMatches.
        isForward: align handles against the start of each read (True) or
            the end (False, for reverse-complement handles).
    """
    # we are exporting each handle into a different file;
    # this dictionary maps sequence handles to their open files
    exportFiles = fileFactory(aHandleList)
    sw = swFactory()
    try:
        # 'rU' mode was removed in Python 3.11; plain text mode already
        # performs universal newline translation.
        with open(aFASTQFile, 'r') as fh:
            for idx, record in enumerate(SeqIO.parse(fh, "fastq")):
                seqStr = str(record.seq)
                # The handle sits at the start of forward reads and at the
                # end of reverse reads; only align against that window.
                if isForward:
                    seqStr = seqStr[0:100]
                else:
                    seqStr = seqStr[len(seqStr) - 99:]
                # bestMatches lists every handle under the mismatch limit
                bestMatches = getBestMatches(seqStr, aHandleList, sw, aMismatch)
                if len(bestMatches) == 1:  # a single best match: store it
                    # FASTQ export via SeqIO looked buggy for very long
                    # reads, so records are exported manually.
                    writeFASTQRecord(record, exportFiles[bestMatches[0]])
                    print("Wrote record " + str(idx) + " " + record.id +
                          " to " + exportFiles[bestMatches[0]].name)
    finally:
        # be nice and close the exported files, even if parsing raised
        for seq in aHandleList:
            exportFiles[seq].close()
    print("ready ")
def main():
    """Entry point: parse arguments, build handles, demultiplex both orientations."""
    args = parse_args()  # command-line options
    indexes = readYAMLConf(args.indexes)  # handle prefix/postfix/index definitions
    forwardHnd, reverseHnd = generateSeqHandles(indexes)
    mismatchLimit = int(args.mm)
    # One pass per orientation; each pass writes its own per-handle files.
    demultiplexFastqByBestMatch(args.reads, reverseHnd, mismatchLimit, isForward=False)
    demultiplexFastqByBestMatch(args.reads, forwardHnd, mismatchLimit, isForward=True)


if __name__ == '__main__':
    main()