-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathMetaCC.py
329 lines (261 loc) · 14.9 KB
/
MetaCC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#########The structure of the main script is modified from bin3C########
from Script.raw_contact import ContactMatrix, ContactMatrix_LC
from Script.normalized_contact import NormCCMap, NormCCMap_LC
from Script.predict_species_number import gen_bestk
from Script.cluster import ClusterBin
from Script.post_processing import Postprocess, merge_bin
from Script.exceptions import ApplicationException
from Script.utils import load_object, save_object, make_dir, gen_bins, gen_sub_bins, make_random_seed
from Script.normcc import normcc, normcc_LC
import scipy.sparse as scisp
import argparse
import warnings
import logging
import shutil
import sys
import os
# Silence warnings for the remainder of the run. The "ignore" action is
# installed without any category or module restriction, so it applies to
# every warning, not only deprecation notices coming from dependencies.
warnings.filterwarnings("ignore")
# Version text embedded in the banner built by mk_version(), which is printed
# for -V/--version and written to the debug log at start-up.
__version__ = '1.0.0, released at 03/2023'
if __name__ == '__main__':
def mk_version():
    """Return the application banner: 'MetaCC v' followed by ``__version__``."""
    banner_template = 'MetaCC v{}'
    return banner_template.format(__version__)
def out_name(base, suffix):
    """Return ``base`` immediately followed by ``suffix`` as one string.

    Both arguments are rendered with their default (empty format-spec)
    formatting, so non-string values are accepted.
    """
    pieces = (format(base), format(suffix))
    return ''.join(pieces)
def ifelse(arg, default):
    """Return ``arg`` unless it is ``None``, in which case return ``default``.

    Only ``None`` triggers the fallback; other falsy values such as ``0``,
    ``''`` or ``False`` are returned unchanged.
    """
    return default if arg is None else arg
# Fallback values used when the matching command-line option is not given.
runtime_defaults = dict(
    min_len=1000,        # --min-len: minimum acceptable contig length
    min_signal=2,        # --min-signal: minimum acceptable Hi-C signal
    min_mapq=30,         # --min-mapq: minimum acceptable mapping quality
    min_match=30,        # --min-match: required matches per accepted alignment
    thres=0.05,          # --thres: fraction of normalized contacts discarded
    min_binsize=150000,  # --min-binsize: minimum bin size used in output
)
# Directory that contains this script, resolved from argv[0].
# NOTE(review): not referenced by the rest of the script as shown.
script_directory = os.path.dirname(os.path.abspath(sys.argv[0]))

# Options shared by every sub-command; attached below through ``parents=``.
global_parser = argparse.ArgumentParser(add_help=False)
global_parser.add_argument('-V', '--version', default=False, action='store_true', help='Show the application version')
global_parser.add_argument('-v', '--verbose', default=False, action='store_true', help='Verbose output')
global_parser.add_argument('--cover', default=False, action='store_true', help='Cover existing files')
global_parser.add_argument('--log', help='Log file path [OUTDIR/MetaCC.log]')

parser = argparse.ArgumentParser(description='MetaCC: a scalable and integrative analysis framework for both short-read and long-read metagenomic Hi-C datasets')
# The shared -V/--version flag above only exists on the sub-parsers, where it
# can be reached only after all required positionals have been supplied.
# Registering a top-level version action makes ``MetaCC.py -V`` work by itself.
parser.add_argument('-V', '--version', action='version', version=mk_version(),
                    help='Show the application version')

subparsers = parser.add_subparsers(title='commands', dest='command', description='Valid commands',
                                   help='choose an analysis stage for further options')
# Without a sub-command the parsed namespace has neither ``version`` nor
# ``OUTDIR``, and the script would fail later with an AttributeError. Making
# the command mandatory turns that into a regular argparse usage error.
subparsers.required = True

cmd_norm = subparsers.add_parser('norm', parents=[global_parser],
                                 description='Normalize contacts.')
cmd_cl = subparsers.add_parser('bin', parents=[global_parser],
                               description='Do the binning.')
cmd_pp = subparsers.add_parser('postprocess', parents=[global_parser],
                               description='post-processing step on partially containminated bins.')
cmd_test = subparsers.add_parser('test', parents=[global_parser],
                                 description='pipeline testing.')

# ---- Normalization sub-parser input ----
cmd_norm.add_argument('--min-len', type=int,
                      help='Minimum acceptable contig length [1000]')
cmd_norm.add_argument('--min-signal', type=int,
                      help='Minimum acceptable Hi-C signal [2]')
cmd_norm.add_argument('--min-mapq', type=int,
                      help='Minimum acceptable mapping quality [30]')
cmd_norm.add_argument('--min-match', type=int,
                      help='Accepted alignments must being N matches [30]')
# ``action='append'`` collects repeated -e options into a list; the value is
# None when the option is never given.
cmd_norm.add_argument('-e', '--enzyme', metavar='NEB_NAME', action='append',
                      help='Case-sensitive enzyme name. Use multiple times for multiple enzymes')
cmd_norm.add_argument('--thres', type=float,
                      help='the fraction of discarded NormCC-normalized Hi-C contacts [0.05]')
cmd_norm.add_argument('FASTA', help='Reference fasta sequence')
cmd_norm.add_argument('BAM', help='Input bam file in query order')
cmd_norm.add_argument('OUTDIR', help='Output directory')

# ---- Clustering sub-parser input ----
cmd_cl.add_argument('--min-binsize', type=int,
                    help='Minimum bin size used in output [150000]')
cmd_cl.add_argument('--num-gene', type=int,
                    help='Number of maker genes detected, automatically detected if not input')
cmd_cl.add_argument('--seed', type=int, default=None,
                    help='Random seed')
cmd_cl.add_argument('FASTA', help='Reference fasta sequence')
cmd_cl.add_argument('OUTDIR', help='Output directory of sub bins')

# ---- Post-processing sub-parser input ----
cmd_pp.add_argument('--min-binsize', type=int,
                    help='Minimum bin size used in output [150000]')
cmd_pp.add_argument('FASTA', help='Reference fasta sequence')
cmd_pp.add_argument('CHECKM', help='CheckM result')
cmd_pp.add_argument('OUTDIR', help='Output directory of sub bins')

# ---- Pipeline self-test sub-parser input ----
# OUTDIR is an option with a default here, not a positional, so ``test`` can
# run without any arguments.
cmd_test.add_argument('--OUTDIR', type=str, default='Test/out_test', help='Output directory of testing results')

args = parser.parse_args()
# A version request short-circuits everything else.
if args.version:
    version_banner = mk_version()
    print(version_banner)
    sys.exit(0)

# Create the output directory; --cover is forwarded to make_dir.
try:
    make_dir(args.OUTDIR, args.cover)
except IOError:
    print('Error: cannot find out directory or the directory already exists')
    sys.exit(1)

# Always start from an empty scratch folder inside the output directory:
# drop any leftover from a previous run, then create it anew.
temp_folder = os.path.join(args.OUTDIR, 'tmp')
if os.path.exists(temp_folder):
    shutil.rmtree(temp_folder)
os.mkdir(temp_folder)
# Route warnings issued through the ``warnings`` module into logging.
logging.captureWarnings(True)
logger = logging.getLogger('main')

# The root logger passes every level on; each handler applies its own threshold.
root = logging.getLogger('')
root.setLevel(logging.DEBUG)

# Shared message layout for console and file output.
formatter = logging.Formatter(fmt='%(levelname)-8s | %(asctime)s | %(name)7s | %(message)s')


def _attach_handler(handler, level):
    """Set the threshold and shared formatter on ``handler`` and add it to the root logger."""
    handler.setLevel(level)
    handler.setFormatter(formatter)
    root.addHandler(handler)


# Console output: INFO by default, DEBUG when --verbose is given.
ch = logging.StreamHandler()
_attach_handler(ch, logging.DEBUG if args.verbose else logging.INFO)

# File output: every level, appended to --log or OUTDIR/MetaCC.log.
log_path = args.log if args.log is not None else os.path.join(args.OUTDIR, 'MetaCC.log')
fh = logging.FileHandler(log_path, mode='a')
_attach_handler(fh, logging.DEBUG)

# Record some environmental details at debug level.
for env_detail in (
        mk_version(),
        sys.version.replace('\n', ' '),
        'Command line: {}'.format(' '.join(sys.argv)),
):
    logger.debug(env_detail)
# Dispatch on the selected sub-command. Each branch reads from / writes to
# args.OUTDIR and uses temp_folder for intermediate files.
try:
    if args.command == 'norm':
        # Filtering options shared by both contact-matrix builders; any option
        # not given on the command line falls back to runtime_defaults.
        contact_kwargs = dict(
            min_mapq=ifelse(args.min_mapq, runtime_defaults['min_mapq']),
            min_len=ifelse(args.min_len, runtime_defaults['min_len']),
            min_match=ifelse(args.min_match, runtime_defaults['min_match']),
            min_signal=ifelse(args.min_signal, runtime_defaults['min_signal']),
        )
        thres = ifelse(args.thres, runtime_defaults['thres'])
        # NOTE(review): presumably written by the contact-matrix step — confirm.
        contig_file = os.path.join(temp_folder, 'contig_info.csv')

        logger.info('Begin constructing raw contact matrix...')
        if args.enzyme is not None:
            # Enzyme-aware path.
            cm = ContactMatrix(args.BAM,
                               args.enzyme,
                               args.FASTA,
                               args.OUTDIR,
                               **contact_kwargs)
            logger.info('Raw contact matrix construction finished')

            logger.info('Begin normalizing raw contacts by NormCC...')
            norm_result = normcc(contig_file)
            # Construct the normalized matrix of Hi-C interaction maps.
            hzmap = NormCCMap(args.OUTDIR,
                              cm.seq_info,
                              cm.seq_map,
                              norm_result,
                              thres=thres)
            logger.info('NormCC normalization finished')
        else:
            # No enzyme supplied: use the site-free (``_LC``) variants.
            cm = ContactMatrix_LC(args.BAM,
                                  args.FASTA,
                                  args.OUTDIR,
                                  **contact_kwargs)
            logger.info('Raw contact matrix construction finished')

            logger.info('Begin normalizing raw contacts by site-free NormCC due to no enzyme input detected...')
            norm_result = normcc_LC(contig_file)
            # Construct the normalized matrix of Hi-C interaction maps.
            hzmap = NormCCMap_LC(args.OUTDIR,
                                 cm.seq_info,
                                 cm.seq_map,
                                 norm_result,
                                 thres=thres)
            logger.info('Site-free NormCC normalization finished')

        # Remove all intermediate files, then persist the results.
        shutil.rmtree(temp_folder)
        scisp.save_npz(os.path.join(args.OUTDIR, 'Normalized_contact_matrix.npz'), hzmap.seq_map.tocsr())
        save_object(os.path.join(args.OUTDIR, 'NormCC_normalized_contact'), hzmap)
        logger.info('Normalization results have been saved')

    elif args.command == 'bin':
        norm_path = os.path.join(args.OUTDIR, 'NormCC_normalized_contact.gz')
        if not os.path.exists(norm_path):
            raise IOError('Please run the NormCC normalization step before binning')

        # Load the normalization instance to access the normalized Hi-C contact maps.
        logger.info('Loading normalized contact maps by NormCC from: {}'.format(norm_path))
        hzmap = load_object(norm_path)

        # Scan marker genes to determine the hyperparameter of the Leiden clustering.
        if args.num_gene is None:
            logger.info('Begin scanning marker genes...')
            args.num_gene = gen_bestk(args.OUTDIR, args.FASTA)
            if args.num_gene == 0:
                logger.warning('No marker gene is detected from the assembled contigs!')

        logger.info('There are {} marker genes in the assembled contigs'.format(args.num_gene))

        # Draw a random seed only when none was supplied. An explicit
        # ``--seed 0`` is a legitimate value (the ``test`` command passes seed 0
        # to ClusterBin as well) and must not be replaced, so compare against
        # None instead of relying on truthiness.
        if args.seed is None:
            args.seed = make_random_seed()

        logger.info('The random seed for clustering is {}'.format(args.seed))
        cluster_process = ClusterBin(args.OUTDIR, hzmap.name, hzmap.len, hzmap.seq_map,
                                     ifelse(args.min_binsize, runtime_defaults['min_binsize']),
                                     args.num_gene, args.seed)

        logger.info('Writing bins...')
        gen_bins(args.FASTA, os.path.join(temp_folder, 'cluster.txt'), os.path.join(args.OUTDIR, 'BIN'))
        shutil.rmtree(temp_folder)  # Remove all intermediate files
        logger.info('MetaCC binning fininshed.')

    elif args.command == 'postprocess':
        norm_path = os.path.join(args.OUTDIR, 'NormCC_normalized_contact.gz')
        if not os.path.exists(norm_path):
            logger.error('File NormCC_normalized_contact.gz is not found')
            raise IOError('Please run the NormCC normalization step before postprocessing the contaminated bins')
        if not os.path.exists(os.path.join(args.OUTDIR, 'BIN')):
            logger.error('BIN directory is not found')
            # The binning step belongs to this tool (MetaCC); name it correctly.
            raise IOError('Please run the MetaCC binning step before postprocessing the contaminated bins')

        logger.info('Loading normalized contact map instance from: {}'.format(norm_path))
        hzmap = load_object(norm_path)

        post = Postprocess(args.OUTDIR, args.CHECKM, hzmap.name, hzmap.len, hzmap.seq_map,
                           ifelse(args.min_binsize, runtime_defaults['min_binsize']))

        logger.info('Writing final bins...')
        # temp_folder is OUTDIR/tmp, i.e. the same location the sub-bin files use.
        gen_sub_bins(args.FASTA, os.path.join(temp_folder, 'cluster_sub.txt'), os.path.join(temp_folder, 'SUB_BIN'))
        merge_bin(args.OUTDIR, args.CHECKM)
        shutil.rmtree(temp_folder)  # Remove all intermediate files
        logger.info('Post-processing finished.')

    elif args.command == 'test':
        logger.info('Begin to test MetaCC...')
        # Fixed inputs for the self-test; the paths are relative to the
        # current working directory.
        # NOTE(review): the ``norm`` command passes a list of enzyme names
        # (argparse ``append``) while a bare string is passed here — confirm
        # ContactMatrix accepts both forms.
        ENZ = 'HindIII'
        FASTA = 'Test/final.contigs.fa'
        BAM = 'Test/MAP_SORTED.bam'
        OUT = args.OUTDIR

        logger.info('Begin to test the contact map construction section...')
        cm = ContactMatrix(BAM,
                           ENZ,
                           FASTA,
                           OUT,
                           min_mapq=runtime_defaults['min_mapq'],
                           min_len=runtime_defaults['min_len'],
                           min_match=runtime_defaults['min_match'],
                           min_signal=0)
        logger.info('Contact map construction section works!')

        logger.info('Begin to test the NormCC normalization module...')
        logger.info('Normalizing raw contacts by NormCC...')
        contig_file = 'Test/contig_info_test.csv'
        norm_result = normcc(contig_file)
        # Construct the normalized matrix of Hi-C interaction maps.
        hzmap = NormCCMap(OUT,
                          cm.seq_info,
                          cm.seq_map,
                          norm_result,
                          thres=runtime_defaults['thres'])
        logger.info('NormCC Normalization module works!')

        logger.info('Begin to test the MetaCC binning module...')
        logger.info('Begin scanning marker genes...')
        logger.info('Leiden clustering starts...')
        # Minimum bin size, marker-gene count and seed are all fixed to 0 here.
        cluster_process = ClusterBin(OUT, hzmap.name, hzmap.len, hzmap.seq_map,
                                     0, 0, 0)
        logger.info('MetaCC binning module works!')

        # Discard everything the self-test produced.
        shutil.rmtree(OUT, ignore_errors=True)
        logger.info('Testing finished!')

except ApplicationException as ex:
    # Report the exception text as well; a bare label leaves the user with
    # no indication of what actually failed.
    logger.error('ApplicationException Error: {}'.format(ex))
    sys.exit(1)