forked from cbg-ethz/pangolin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cooc-mutbamscan
executable file
·209 lines (177 loc) · 6.85 KB
/
cooc-mutbamscan
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import sys
import os
#import re
import argparse
import csv
import json
import yaml
import gzip
import pysam
# ---- command-line parsing --------------------------------------------------
# Input is either a TSV samples list (-s) or explicit alignment files (-a),
# mutually exclusive; results can go to JSON, YAML and/or raw TSV, or be
# dumped verbatim to the terminal (-d).
argparser = argparse.ArgumentParser(
    description="scan amplicon (covered by long read pairs) for mutation cooccurence")
inputgroup = argparser.add_mutually_exclusive_group(required=True)
inputgroup.add_argument('-s', '--samples', metavar='TSV',
                        type=str, dest='samples', help="samples list tsv")
inputgroup.add_argument('-a', '--alignments', metavar='BAM/CRAM', nargs='+',
                        dest='alignments', help="alignment files")
argparser.add_argument('-p', '--prefix', metavar='PATH', required=False, default="working/samples",
                       type=str, dest='prefix', help="prefix for where to look at align files when using TSV samples list")
argparser.add_argument('-j', '--json', metavar='JSON', required=False, default=None,
                       type=str, dest='json', help="output results to a JSON file")
argparser.add_argument('-y', '--yaml', metavar='YAML', required=False, default=None,
                       type=str, dest='yaml', help="output results to a YAML file")
argparser.add_argument('-t', '--tsv', metavar='TSV', required=False, default=None,
                       type=str, dest='tsv', help="output results to a (raw) TSV file")
argparser.add_argument('-d', '--dump',
                       action="store_true", dest='dump', help="dump the python object to the terminal")
args = argparser.parse_args()

# sanity-check the inputs before doing any real work
if args.samples is not None:
    assert os.path.isfile(args.samples), f"cannot find sample file {args.samples}"
else:
    for align in args.alignments:
        # fix: original message misspelled "alginment"
        assert os.path.isfile(align), f"cannot find alignment file {align}"
def test_read(read, mut_dict):
"""
test if mutations listed in mut_dict are present in the pysam read
returns a list with:
found_site: list of site present in the read (no matter content)
found_mut: list of those position which have the mutations variant
"""
# WARNING pysam is 0-based! (see here: https://pysam.readthedocs.io/en/latest/faq.html#pysam-coordinates-are-wrong )
# 1. check which mutation' sites are in range of that read
found_site = [p for p,m in mut_dict.items() if read.reference_start <= (p-1) <= (read.reference_end-len(m))]
if not len(found_site):
return (None, None) # sites aren't present no point checking variants
# 2. of those sites, check which content mutations' variants
read_dict = dict(zip(read.get_reference_positions(full_length=True), read.query_sequence))
found_mut = []
for p in found_site:
l=len(mut_dict[p]) #lenght
# base present ?
srch=[(p-1+i) in read_dict for i in range(l)]
if all(srch): # all positions found!
# check if it's the expected mutation(s)
if all([read_dict[p-1+i] == mut_dict[p][i] for i in range(l)]):
found_mut.append(p)
elif not any(srch): # none position found! (entire deletion)
# check if we're hunting for a string of deletions (-)
if '-' * l == mut_dict[p]:
found_mut.append(p)
# TODO give some thoughs about partial deletions
if len(found_mut): # found mutation as sites
return (found_site, found_mut)
else: # sites present, but no mutation found
return (found_site, None)
# scan an amplicon for a specific set of mutations
def scanamplicon(read_iter, mut_dict):
    """
    Scan all reads of one amplicon for the mutations in mut_dict.

    Groups reads into pairs by query name (R1/R2), tests each mate with
    test_read(), and tallies per pair how many distinct mutation sites
    were covered and how many carried the expected variant.

    Returns a dict with two histograms:
      "sites": {distinct_sites_covered: pair_count}
      "muts":  {distinct_mutations_found: pair_count}
    """
    # TODO inefficient, could be streamed (with an accumulator for pairs)
    # group reads into (potential) pairs keyed by query name
    pairs = {}
    for read in read_iter:
        key = str(read.query_name)
        mate = 'R1' if read.is_read1 else 'R2'
        pairs.setdefault(key, {})[mate] = read
    print("amplion:", len(pairs))

    # tally the mutation sites and the presence of variant in there accross all reads
    # TODO merge with above
    all_muts = []
    all_sites = []
    for mates in pairs.values():
        site_out = []
        mut_out = []
        for read in mates.values():
            (t_pos, t_read) = test_read(read, mut_dict)
            if t_pos is not None:
                site_out.extend(t_pos)
            if t_read is not None:
                mut_out.extend(t_read)
        if site_out:
            all_sites.append(site_out)
        if mut_out:
            all_muts.append(mut_out)

    sites_cnt = np.unique([len(set(s)) for s in all_sites], return_counts=True)
    print("sites:", len(all_sites), sites_cnt)
    muts_cnt = np.unique([len(set(m)) for m in all_muts], return_counts=True)
    print("muts:", len(all_muts), muts_cnt)

    # turn the np.unique (values, counts) pairs into plain dict histograms,
    # empty when no pair covered a site / carried a mutation
    return {
        "sites": dict(zip(sites_cnt[0].tolist(), sites_cnt[1].tolist())) if all_sites else {},
        "muts": dict(zip(muts_cnt[0].tolist(), muts_cnt[1].tolist())) if all_muts else {},
    }
# TODO better naming
def findbam(prefix, batch, sample):
    '''Return the expected path of the alignment file for a given
    prefix, batch and sample (layout: prefix/sample/batch/alignments/).'''
    #TODO: assertion?
    return f"{prefix}/{sample}/{batch}/alignments/REF_aln.bam"
def scanbam(alnfname, amplicons):
    '''scan an alignment file found at alnfname

    For every amplicon (dict: name -> [start, end, mut_dict]) that has
    at least one mutation of interest, fetch the reads overlapping the
    region on reference rq_chr (module-level constant) and tally
    co-occurrences with scanamplicon().

    Returns a dict: amplicon name -> scanamplicon() result.
    '''
    amp_results = {}
    # mode "r" lets pysam auto-detect SAM/BAM/CRAM; the original hard-coded
    # "rb" (BAM only), contradicting the CRAM files advertised by -a
    with pysam.AlignmentFile(alnfname, "r") as alnfile:
        for amp_name, amp in amplicons.items():
            (rq_b, rq_e, mut_dict) = amp
            # we need at least 2 to compute co-occurence
            if len(mut_dict) < 1:  # HACK 2:
                continue
            print(f"amplicon_{amp_name}", rq_b, rq_e, mut_dict, sep='\t', end='\t')
            amplicon_iter = alnfile.fetch(rq_chr, rq_b, rq_e)
            amp_results[amp_name] = scanamplicon(amplicon_iter, mut_dict)
    return amp_results
# hardcode
# TODO compute from mutations
# Amplicon table: name -> [fetch_start, fetch_end, mut_dict]
# - the name suffix encodes the variant the signature belongs to (UK/SA/EU)
# - fetch_start/fetch_end delimit the region passed to alnfile.fetch()
# - mut_dict maps 1-based reference positions to the expected variant
#   string; a run of '-' denotes a deletion of that length
# NOTE(review): in '72_UK' the site 21991 lies past fetch end 21988 —
# reads fetched there can still extend beyond the window, but confirm
# this is intended
amplicons={
    '72_UK': [21718, 21988, {21765: '------', 21991: '---'}],
    '78_UK': [23502, 23810, {23604: 'A', 23709: 'T'}],
    '92_UK': [27827, 28102, {27972: 'T', 28048: 'T', 28111: 'G'}],
    '93_UK': [28147, 28414, {28111: 'G', 28280: 'CTA'}],
    '76_SA': [22879, 23142, {23012: 'A', 23063: 'T'}],
    # '76_UK': [22879, 23142, {23063: 'T',23271: 'A'}],
    '77_EU': [23194, 23464, {23403: 'G'}],
}
# reference sequence name (SARS-CoV-2 genome) used when fetching reads in scanbam()
rq_chr='NC_045512.2'
# Build the per-sample results table; keys are sample names (TSV mode) or
# alignment file paths (-a mode), values are scanbam() results.
# (Cleanup: the original initialized i=0 and table={} redundantly in both
# branches and carried a dead, commented-out debug counter.)
table = {}
if args.samples is not None:
    # samples given through the TSV list: column 1 = sample, column 2 = batch
    # TODO real jobs instead of a loop
    with open(args.samples, 'rt', encoding='utf-8', newline='') as tf:  # this file has the same content as the original experiment
        for r in csv.reader(tf, delimiter='\t'):  # dialect='excel-tab'
            sample, batch = r[:2]
            print(sample)
            alnfname = findbam(args.prefix, batch, sample)
            table[sample] = scanbam(alnfname, amplicons)
else:
    # alignment files given directly through the -a option
    for alnfname in args.alignments:
        # NOTE(review): the file path doubles as the sample name here —
        # original carried the same doubt ("Is that right?")
        sample = alnfname
        table[sample] = scanbam(alnfname, amplicons)
#
# dumps, for being able to take it from here
#
# print to the terminal when asked explicitly, or when no structured
# output (JSON/YAML) was requested at all
if args.dump or not (args.json or args.yaml):
    print(table)
if args.json:
    with open(args.json, 'wt') as jf:
        json.dump(obj=table, fp=jf)
if args.yaml:
    with open(args.yaml, 'wt') as yf:
        print(yaml.dump(table, sort_keys=False), file=yf)
# raw data into pands.DataFrame
if args.tsv:
    # flatten {sample: {amplicon: {category: count}}} into one row per
    # sample with (amplicon, category) tuple columns
    flat = {
        sample: {
            (amp, cat): table[sample][amp][cat]
            for amp in table[sample]
            for cat in table[sample][amp]
        }
        for sample in table
    }
    raw_table_df = pd.DataFrame.from_dict(data=flat, orient='index')
    raw_table_df.to_csv(args.tsv, sep="\t", compression={'method': 'infer'})