-
Notifications
You must be signed in to change notification settings - Fork 180
/
Copy pathdigest_genome.py
executable file
·195 lines (167 loc) · 6.68 KB
/
digest_genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python
# HiC-Pro
# Copyleft 2015 Institut Curie
# Author(s): Nelle Varoquaux, Nicolas Servant
# Contact: nicolas.servant@curie.fr
# This software is distributed without any guarantee under the terms of the
# GNU General
# Public License, either Version 2, June 1991 or Version 3, June 2007.
"""
Script to extract restriction fragments from a FASTA file and write them to a BED file
"""
import argparse
import re
import os
import sys
import numpy as np
# Built-in restriction enzymes, keyed by lower-case enzyme name.  Each value
# is a list of motif strings in which '^' marks the cutting position.
RE_cutsite = {
    "mboi": ["^GATC"],
    "dpnii": ["^GATC"],
    "bglii": ["A^GATCT"],
    "hindiii": ["A^AGCTT"],
}
def _find_cut_positions(sequence, patterns, offset):
    """Return the sorted cut positions of all motifs in one sequence.

    ``patterns`` are compiled zero-width lookahead regexes, so overlapping
    occurrences of a motif are all reported.  ``offset[k]`` is added to the
    start of every match of ``patterns[k]``.
    """
    positions = []
    for rs, pattern in enumerate(patterns):
        shift = offset[rs]
        positions.extend(m.start() + shift
                         for m in pattern.finditer(sequence))
    positions.sort()
    return positions


def find_re_sites(filename, sequences, offset):
    """Locate the restriction cut positions on every contig of a FASTA file.

    Matching is case-insensitive: both the motifs and the FASTA sequence
    are lower-cased before searching.

    Parameters
    ----------
    filename : str
        Path to the FASTA file.
    sequences : list of str
        Restriction motifs (without the '^' marker).  They are inserted
        verbatim into a regular expression.
    offset : list of int
        ``offset[k]`` is added to the start of each match of
        ``sequences[k]`` to obtain the cut position.

    Returns
    -------
    contig_names : list of str
        First whitespace-delimited token of each header line, without the
        leading '>'.
    all_indices : list of list of int
        Sorted cut positions, one list per contig, in the same order as
        ``contig_names``.  Text found before the first header is ignored,
        so a file without any header yields two empty lists.

    Notes
    -----
    Prints each contig name while reading.  If a contig name occurs twice,
    a message is printed and the process exits via ``sys.exit(-1)``.
    """
    # Zero-width lookahead so that overlapping motif occurrences are found.
    # Compiled once: the patterns do not depend on the contig.
    patterns = [re.compile("(?={})".format(seq.lower()))
                for seq in sequences]

    contig_names = []
    all_indices = []
    chr_id = None
    chunks = []  # sequence lines of the contig currently being read
    with open(filename, 'r') as infile:
        for line in infile:
            if line.startswith(">"):
                name = line.split()[0][1:]
                print("{}...".format(name))
                # Flush the previous contig, if any, before starting anew
                if chr_id is not None:
                    all_indices.append(
                        _find_cut_positions("".join(chunks), patterns, offset))
                chunks = []
                chr_id = name
                if chr_id in contig_names:
                    print("The fasta file contains several instance of {}. Exit.".format(chr_id))
                    sys.exit(-1)
                contig_names.append(chr_id)
            else:
                # Still inside the same contig: keep accumulating sequence
                chunks.append(line.lower().strip())

    # Flush the last contig; nothing to flush when no header was ever seen
    if chr_id is not None:
        all_indices.append(
            _find_cut_positions("".join(chunks), patterns, offset))

    return contig_names, all_indices
def find_chromsomose_lengths(reference_filename):
    """Compute the sequence length of every record of a FASTA file.

    Parameters
    ----------
    reference_filename : str
        Path to the FASTA file.

    Returns
    -------
    chromosome_names : list of str
        Full header text of each record (everything after '>', stripped).
    chromosome_lengths : numpy.ndarray
        Number of sequence characters of each record, with surrounding
        whitespace of every line excluded.  Lines found before the first
        header are ignored.  When the file contains no header at all the
        array holds a single ``None`` placeholder, which is kept so that
        callers indexing position 0 keep working.
    """
    chromosome_names = []
    chromosome_lengths = []
    length = None  # stays None until the first header has been seen
    with open(reference_filename, 'r') as infile:
        for line in infile:
            if line.startswith(">"):
                chromosome_names.append(line[1:].strip())
                # Close the previous record before opening a new one
                if length is not None:
                    chromosome_lengths.append(length)
                length = 0
            elif length is not None:
                length += len(line.strip())
            # else: blank/text line before the first header -> skipped
            # instead of failing on ``None += int``
    chromosome_lengths.append(length)
    return chromosome_names, np.array(chromosome_lengths)
def replaceN(cs):
    """Expand every 'N' wildcard of a motif into the four nucleotides.

    Parameters
    ----------
    cs : str
        Restriction motif; only upper-case 'N' is treated as a wildcard.

    Returns
    -------
    list of str
        All concrete motifs obtained by substituting A, C, G and T for each
        'N' (``4 ** k`` entries for ``k`` wildcards).  The left-most 'N'
        varies slowest.  A motif without 'N' is returned unchanged as a
        single-element list.
    """
    expanded = [cs]
    # Each pass resolves the left-most 'N' of every candidate, so all
    # candidates always hold the same number of wildcards and checking the
    # first one is enough.
    while 'N' in expanded[0]:
        expanded = [seq.replace('N', nuc, 1)
                    for seq in expanded
                    for nuc in "ACGT"]
    return expanded
if __name__ == "__main__":
    # Command line: digest_genome.py FASTA -r MOTIF [MOTIF ...] [-o OUT.bed]
    parser = argparse.ArgumentParser()
    parser.add_argument('fastafile')
    parser.add_argument('-r', '--restriction_sites',
                        dest='res_sites',
                        nargs='+',
                        # Without this, omitting -r left res_sites as None
                        # and the script died with a TypeError below.
                        required=True,
                        help=("The cutting position has to be specified using "
                              "'^'. For instance, -r A^AGCTT for HindIII "
                              "digestion. Several restriction enzyme can be "
                              "specified."))
    parser.add_argument('-o', '--out', default=None)
    args = parser.parse_args()

    filename = args.fastafile
    out = args.out

    # Every -r value may itself be a comma-separated list of motifs
    cutsites = [m for s in args.res_sites for m in s.split(',')]

    # Translate enzyme names to motifs, then split each motif into the bare
    # sequence and the position of the '^' cut marker.
    allowed_chars = ('A', 'T', 'G', 'C', 'N', '^')
    sequences = []
    offset = []
    for cs in cutsites:
        if cs.lower() in RE_cutsite:
            cseq = ''.join(RE_cutsite[cs.lower()])
        else:
            cseq = cs

        offpos = cseq.find('^')
        if offpos == -1:
            print("Unable to detect offset for {}. Please, use '^' to specify "
                  "the cutting position, i.e A^AGCTT for HindIII "
                  "digestion.".format(cseq))
            sys.exit(-1)

        for nuc in set(cseq):
            if nuc not in allowed_chars:
                print("Find unexpected character ['{}']in restriction motif".format(nuc))
                print("Note that multiple motifs should be separated by a space or a comma")
                sys.exit(-1)

        offset.append(offpos)
        sequences.append(cseq.replace('^', ''))

    # Replace all N in restriction motifs; every expanded motif inherits
    # the offset of the motif it was derived from.
    sequences_without_N = []
    offset_without_N = []
    for seq, off in zip(sequences, offset):
        nrs = replaceN(seq)
        sequences_without_N.extend(nrs)
        offset_without_N.extend([off] * len(nrs))
    sequences = sequences_without_N
    offset = offset_without_N

    if out is None:
        out = os.path.splitext(filename)[0] + "_fragments.bed"

    print("Analyzing", filename)
    print("Restriction site(s)", ",".join(sequences))
    print("Offset(s)", ','.join(str(x) for x in offset))

    # Read fasta file and look for rs per chromosome
    contig_names, all_indices = find_re_sites(filename, sequences, offset=offset)
    _, lengths = find_chromsomose_lengths(filename)

    # Turn the cut positions of each chromosome into (begin, end) intervals:
    # 0 -> first cut, cut -> next cut, last cut -> chromosome length.
    valid_fragments = []
    for i, indices in enumerate(all_indices):
        begins = [0] + list(indices)
        ends = list(indices) + [lengths[i]]
        valid_fragments.append(list(zip(begins, ends)))

    # Write results
    print("Writing to {} ...".format(out))
    with open(out, 'w') as outfile:
        for chrom_name, fragments in zip(contig_names, valid_fragments):
            frag_id = 0
            for begin, end in fragments:
                # Skip empty intervals, e.g. when the enzyme cuts at the
                # first position of the chromosome
                if end > begin:
                    frag_id += 1
                    frag_name = "HIC_{}_{}".format(str(chrom_name), int(frag_id))
                    outfile.write("{}\t{}\t{}\t{}\t0\t+\n".format(
                        str(chrom_name), int(begin), int(end), str(frag_name)))