#! /usr/bin/env python3
# Author: Kristopher Kieft, UW-Madison
# kieft@wisc.edu
# PropagAtE: Prophage Activity Estimator
# Version: v1.0.0
# Release date: January 21 2021
# dev: 22
# Modified by Steven Sutcliffe
# Allows prophages with no flanking regions to be included using the bacterial MAG mean coverage
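# Approach: rather than comparing each prophage to flanking host coverage on the same
# scaffold, prophage coverage is compared against the pooled per-nucleotide coverage of the
# whole bacterial bin (MAG) the scaffold belongs to, so prophages that span an entire
# scaffold (no flanking sequence) can still be evaluated.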
# Imports
try:
import warnings
warnings.filterwarnings("ignore")
import argparse
import subprocess
import sys
import random
import time
import datetime
from datetime import date
from scipy import stats
import statistics
from scipy.stats import mannwhitneyu
from scipy.stats import combine_pvalues
import math
import uuid
import logging
import os
from Bio.SeqIO.FastaIO import SimpleFastaParser
except Exception as e:
sys.stderr.write("\nError: please verify dependancy imports are installed and up to date:\n\n")
sys.stderr.write(str(e) + "\n\n")
exit(1)
from sys import exit
# Set up variables
start = time.time()
start_time = str(datetime.datetime.now().time()).rsplit(".",1)[0]
u = str(uuid.uuid1()).split("-")[0]
# propagate = argparse.ArgumentParser(description='Example 1: python3 PropagAtE_run.py -f scaffolds.fasta -r forward.fastq reverse.fastq -o output.tsv -v prophage_coordinates.tsv -t threads -i ID . . . Example 2: python3 PropagAtE_run.py -s alignment.sam -o output.tsv -v prophage_coordinates.tsv')
# propagate.add_argument('--version', action='version', version='PropagAtE v1.0.0')
# # Input required
# propagate.add_argument('-s', type=str, nargs=1, default = [''], help='input SAM (.sam) sequence alignment file (skip -sb/-b/-r/-f).')
# propagate.add_argument('-b', type=str, nargs=1, default = [''], help='input BAM (.bam) sequence alignment file (skip -sb/-s/-r/-f).')
# propagate.add_argument('-sb', type=str, nargs=1, default = [''], help='input sorted BAM (.bam) sequence alignment file (skip -b/-s/-r/-f).')
# propagate.add_argument('-r', type=str, nargs=2, default = ['',''], help='input forward and reverse read files (.fastq) separated by a space (skip -sb/-b/-s, requires -f)')
# propagate.add_argument('-v', type=str, nargs=1, required = True, help='VIBRANT "integrated_prophage_coordinates" file or custom prophage coordinates file.')
# # Optional
# propagate.add_argument('-o', type=str, nargs=1, default = ['PropagAtE_result.' + str(u) + '.tsv'], help='name of output tab-separated file for results.')
# propagate.add_argument('-f', type=str, nargs=1, default = [''], help='if input is -r provide genomes/scaffolds (.fna, .fastq, .fa, .fsa).')
# propagate.add_argument('-t', type=str, nargs=1, default = ['1'], help='threads used for Bowtie2 mapping (if input is -r and -f).')
# propagate.add_argument('-i', type=str, nargs=1, default = [''], help='unique ID to append to Bowtie2 SAM output for reference to read sets used (not required, useful when analyzing multiple read sets).')
# propagate.add_argument('-p', type=str, nargs=1, default = ['0.05'], help='p-value cutoff for Mann-Whitney statistical test (default=0.05).')
# propagate.add_argument('-g', type=str, nargs=1, default = ['1'], choices=["0", "1", "2", "3"], help='number of gap extensions allowed in read alignment (per read) [default=1, options=0-3].')
# propagate.add_argument('-m', type=str, nargs=1, default = ['3'], choices=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], help='number of mismatches allowed in read alignment (per read) [default=3, options=0-10].')
# propagate.add_argument('-n', type=str, nargs=1, default = ['5'], help='number of coverage sub-samples to take for Mann-Whitney statistical analysis [default=5, minimum=5].')
# propagate.add_argument('-a', type=str, nargs=1, default = ['4'], choices=["0", "1", "2", "3", "4"], help='remove outlier coverage values "a" standard deviations from the average [default=4].')
# propagate.add_argument('-e', type=str, nargs=1, default = ['0.75'], help="minimum effect size for significance by Cohen's d test [default=0.75, minimum=0.6].")
# propagate.add_argument('-c', type=str, nargs=1, default = ['1.65'], help="minimum prophage:host coverage ratio for significance [default=1.65, minimum=1.5].")
# propagate.add_argument('-y', type=str, nargs=1, default = [''], help='term to distinguish host from prophage name in genome/scaffold definition lines [default="_fragment_"].')
# propagate.add_argument('-x', type=str, nargs=1, default = [''], help="path to Samtools executable if not in PATH.")
# propagate.add_argument('-clean', action='store_true', help='use this setting to remove (if applicable) any generated SAM, unsorted BAM and Bowtie2 index files. Will retain any user input files, raw reads and sorted BAM files (default=off).')
# propagate.add_argument('-all', action='store_true', help='use this setting to keep all aligned reads (ignore -g and -m, default=off).')
# propagate.add_argument('-z', type=str, nargs=1, default = [''], help='character(s) used to replace spaces in genome/scaffold names. Use this setting if spaces were replaced in the alignment file (SAM/BAM) but not in the prophage coordinates file.')
# Parse arguments
args = False
samfile = ''
bamfile = ''
sortbamfile = "SRR828661_das_check_bin_abundance.gap-1_mm-3.sorted.bam"
fasta = ''
forward = ''
reverse = ''
threads = ''
vibe = "propagate_coordinates"
id = ''
pval = 0.05
gaps = 1
mismatches = 3
subs = 5
outliers = 4
effect = 0.75
ratio_cutoff = 1.65
termsplit = ''
###Making a split for the bin as well:
binsplit = '_k119_'
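# Assumption: scaffold names follow a MEGAHIT-style pattern such as the hypothetical
# "bin.1_k119_2041", where everything before '_k119_' identifies the bin/MAG. Adjust
# binsplit if your naming convention differs.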
outfile = "test.tsv"
outpath = "Test_Bowtie"
clean = False
args_all = False
try:
temp_out = str(outfile).rsplit("/",1)[1]
except Exception:
temp_out = str(outfile)
outfile = outpath + str(temp_out)
sampath = '/usr/local/bin/samtools'
spaces = ''
replace = False
if termsplit == '':
termsplit = "_fragment_"
if spaces == '':
spaces = '~~'
else:
replace = True
if sampath != '':
if "samtools" == sampath[-8:]:
sampath = sampath[:-8]
if sampath[-1] != '/':
sampath += '/'
# log file
logfilename = str(outfile).rsplit(".",1)[0] + ".log"
subprocess.run("rm " + logfilename + " 2> /dev/null", shell=True)
logging.basicConfig(filename=logfilename, level=logging.INFO, format='%(message)s')
# verify inputs and versions
if termsplit[0] == " ":
sys.stderr.write("\nThe -y input term cannot start with a space. If you tried '-y -gene' then try '-y-gene' instead. Exiting.\n")
logging.info("\nThe -y input term cannot start with a space. If you tried '-y -gene' then try '-y-gene' instead. Exiting.\n")
exit(1)
if outfile.rsplit(".",1)[1] == "log":
sys.stderr.write("\nThe output file cannot end in .log, suggested is .tsv. Exiting.\n")
logging.info("\nThe output file cannot end in .log, suggested is .tsv. Exiting.\n")
exit(1)
if (samfile != '' and bamfile != '') or (samfile != '' and forward != '') or (bamfile != '' and forward != '') or (samfile != '' and sortbamfile != '') or (sortbamfile != '' and bamfile != '') or (forward != '' and sortbamfile != ''):
sys.stderr.write("\nOnly one input file (-s, -sb, -b, -r) is allowed. Exiting.\n")
logging.info("\nOnly one input file (-s, -sb, -b, -r) is allowed. Exiting.\n")
exit(1)
try:
if (forward.rsplit('.',1)[1] != 'fastq' or reverse.rsplit('.',1)[1] != 'fastq') and ('.'.join(forward.rsplit('.',2)[1:]) != 'fastq.gz' or '.'.join(reverse.rsplit('.',2)[1:]) != 'fastq.gz'):
sys.stderr.write("\nError: Provided reads files must both have the extension .fastq or .fastq.gz. Exiting.\n")
logging.info("\nError: Provided reads files must both have the extension .fastq or .fastq.gz. Exiting.\n")
exit(1)
except Exception:
pass
try:
if forward != '' and reverse != '' and fasta.rsplit('.',1)[1] != 'fa' and fasta.rsplit('.',1)[1] != 'fna' and fasta.rsplit('.',1)[1] != 'fasta' and fasta.rsplit('.',1)[1] != 'fsa':
sys.stderr.write("\nError: Provided fasta file must have the extension .fasta, .fna, .fa or .fsa. Exiting.\n")
logging.info("\nError: Provided fasta file must have the extension .fasta, .fna, .fa or .fsa. Exiting.\n")
exit(1)
except Exception:
pass
if forward != '' and reverse != '' and fasta == '':
sys.stderr.write("\nError: Reads (-r) were provided but a sequence file (-f) was not. Exiting.\n")
logging.info("\nError: Reads (-r) were provided but a sequence file (-f) was not. Exiting.\n")
exit(1)
try:
if samfile.rsplit('.',1)[1] != 'sam':
sys.stderr.write("\nError: Provided sam file must have the extension .sam. Exiting.\n")
logging.info("\nError: Provided sam file must have the extension .sam. Exiting.\n")
exit(1)
except Exception:
pass
try:
if bamfile.rsplit('.',1)[1] != 'bam':
sys.stderr.write("\nError: Provided bam file must have the extension .bam. Exiting.\n")
logging.info("\nError: Provided bam file must have the extension .bam. Exiting.\n")
exit(1)
except Exception:
pass
try:
if sortbamfile.rsplit('.',1)[1] != 'bam':
sys.stderr.write("\nError: Provided bam (sorted) file must have the extension .bam. Exiting.\n")
logging.info("\nError: Provided bam (sorted) file must have the extension .bam. Exiting.\n")
exit(1)
except Exception:
pass
if not os.path.exists(vibe):
sys.stderr.write("\nError: could not identify prophage coordinates file (-v). Exiting.\n")
logging.info("\nError: could not identify prophage coordinates file (-v). Exiting.\n")
exit(1)
if effect < 0.6:
sys.stderr.write("\nError: Cohen's d effect size should not be set below 0.6. Exiting.\n")
logging.info("\nError: Cohen's d effect size should not be set below 0.6. Exiting.\n")
exit(1)
if ratio_cutoff < 1.5:
sys.stderr.write("\nError: ratio cutoff should not be set below 1.5. Exiting.\n")
logging.info("\nError: ratio cutoff should not be set below 1.5. Exiting.\n")
exit(1)
if subs < 5:
sys.stderr.write("\nError: subsample number should not be set below 5. Exiting.\n")
logging.info("\nError: subsample number should not be set below 5. Exiting.\n")
exit(1)
try:
subprocess.check_output("which " + sampath + "samtools", shell=True)
except Exception:
sys.stderr.write("\nError: samtools does not appear to be installed, is not in the system's PATH or is not in the indicated path (-x). Exiting.\n")
logging.info("\nError: samtools does not appear to be installed, is not in the system's PATH or is not in the indicated path (-x). Exiting.\n")
exit(1)
try:
subprocess.check_output(sampath + "samtools depth -a placeholder.bam 2> " + outpath + u + "_samtools_checkfile.txt", shell=True)
except Exception as e:
with open(outpath + u + "_samtools_checkfile.txt",'r') as checkfile:
for line in checkfile:
if "depth: invalid option -- 'a'" in line:
sys.stderr.write("\nError: samtools version is incorrect. Please update and verify samtools '-a' flag is available.")
sys.stderr.write("\n Alternatively use the '-x' flag to provide the location of the correct samtools executable.")
sys.stderr.write("\n Example update using conda: 'conda install -c bioconda samtools'. Exiting.\n")
logging.info("\nError: samtools version is incorrect. Please update and verify samtools '-a' flag is available.")
logging.info("\n Alternatively use the '-x' flag to provide the location of the correct samtools executable.")
subprocess.run("rm " + outpath + u + "_samtools_checkfile.txt 2> /dev/null", shell=True)
exit(1)
subprocess.run("rm " + outpath + u + "_samtools_checkfile.txt 2> /dev/null", shell=True)
if forward == '' and fasta != '':
logging.info("Note: Reads not provided, ignoring input fasta file.\n")
if forward == '' and threads != '1':
logging.info("Note: Reads not provided, using 1 thread.\n")
if forward == '' and id != '':
logging.info("Note: Reads not provided, ignoring identifier ID input.\n")
# Statistics
def MWdownsampling(phage, background):
n = 1
pvalue_list = []
while n <= subs:
cov_p_100 = random.sample(phage, 100)
cov_h_100 = random.sample(background, 100)
if sum(cov_p_100) != 0 and sum(cov_h_100) != 0:
stat, pvalue = mannwhitneyu(cov_p_100, cov_h_100)
pvalue_list.append(pvalue)
n += 1
result = combine_pvalues(pvalue_list)
return result
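# MWdownsampling repeatedly draws 100 per-nucleotide coverage values from the prophage and
# from the host background, runs a Mann-Whitney U test on each pair of subsamples, and
# combines the resulting p-values (Fisher's method, the scipy combine_pvalues default).
# Note that random.sample requires each input list to hold at least 100 values.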
def cohenD(phage_mean, phage_sd, host_mean, host_sd):
pool = math.sqrt((phage_sd**2+host_sd**2)/2)
d = (host_mean-phage_mean)/pool
return abs(d)
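# cohenD is the absolute standardized mean difference using the pooled standard deviation
# sqrt((phage_sd**2 + host_sd**2) / 2). Quick sanity check with hypothetical values:
# cohenD(10.0, 2.0, 4.0, 2.0) -> |(4 - 10) / 2| = 3.0.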
##### ----------------------------------------------------------------------------------------------------------------------- #####
logging.info("Command: %s" % ' '.join(sys.argv))
logging.info("Outfolder: %s" % outpath)
logging.info("")
logging.info("Date: %s" % str(date.today()))
logging.info("Time: %s" % str(datetime.datetime.now().time()).rsplit(".",1)[0])
logging.info("Program: PropagAtE v1.0.0")
logging.info("Run ID: %s\n" % u)
logging.info("Time (min) | Log ")
logging.info("--------------------------------------------------------------------")
reads_input = False
sam_input = False
bam_input = False
sortbam_input = False
# If input is reads/fasta run Bowtie2
if forward != '' and fasta != '':
reads_input = True
logging.info("%s Reads input identified, using %s threads to run Bowtie2." % (str(round((time.time() - float(start))/60,1)),threads))
try:
subprocess.check_output("which bowtie2", shell=True)
except Exception:
sys.stderr.write("\nError: Bowtie2 does not appear to be installed or is not in the system's PATH. Exiting.\n")
logging.info("\nError: Bowtie2 does not appear to be installed or is not in the system's PATH. Exiting.\n")
exit(1)
try:
temp = fasta.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = fasta.rsplit(".",1)[0]
logging.info("%s Checking format of FASTA file, replacing spaces if necessary." % str(round((time.time() - float(start))/60,1)))
with open(fasta, 'r') as fasta_in, open(outpath + base + ".no-spaces.fasta", 'w') as fasta_out:
for name,seq in SimpleFastaParser(fasta_in):
if " " in name:
replace = True
name = name.replace(" ", str(spaces))
fasta_out.write(">" + str(name) + "\n" + str(seq) + "\n")
if replace == True:
logging.info('%s Spaces identified in sequence names. Replaced with "~~".' % str(round((time.time() - float(start))/60,1)))
fasta = outpath + base + ".no-spaces.fasta"
if not os.path.exists(outpath + str(base)+".bowtie2.index.1.bt2"):
logging.info("%s Building index ..." % str(round((time.time() - float(start))/60,1)))
build = True
subprocess.run("bowtie2-build " + str(fasta) + " " + outpath + str(base)+".bowtie2.index > /dev/null 2> /dev/null", shell=True)
else:
build = False
logging.info("%s Existing index identified, skipping build ..." % str(round((time.time() - float(start))/60,1)))
logging.info("%s Mapping reads ..." % str(round((time.time() - float(start))/60,1)))
if id != '':
subprocess.run("bowtie2 -x " + outpath + str(base)+".bowtie2.index -1 " + str(forward) + " -2 " + str(reverse) + " -S " + outpath + str(base)+"." + str(id) + ".sam -q -p " + str(threads) + " --no-unal > /dev/null 2> /dev/null", shell=True)
else:
subprocess.run("bowtie2 -x " + outpath + str(base)+".bowtie2.index -1 " + str(forward) + " -2 " + str(reverse) + " -S " + outpath + str(base)+".sam -q -p " + str(threads) + " --no-unal > /dev/null 2> /dev/null", shell=True)
if clean == True and build == True:
subprocess.run("rm " + outpath + str(base)+".bowtie2.index.* 2> /dev/null", shell=True)
logging.info("%s Bowtie2 done." % str(round((time.time() - float(start))/60,1)))
if str(id) != "":
samfile = outpath + str(base)+"." + str(id) + ".sam"
else:
samfile = outpath + str(base)+".sam"
subprocess.run("rm " + outpath + base + ".no-spaces.fasta 2> /dev/null", shell=True)
# Read in prophage coordinate data
prophage_dict = {}
prophage_dict_frags = {}
vibrant_genomes_full = []
logging.info("%s Generating a list of all prophage regions ..." % str(round((time.time() - float(start))/60,1)))
logging.info("%s Distinguishing host from prophage names by the term '%s' ..." % (str(round((time.time() - float(start))/60,1)),str(termsplit)))
with open(vibe, 'r') as vibrant:
phage_list = vibrant.read().replace("\n","\t").split("\t")
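# The coordinates file is flattened into one tab-separated list and walked at a fixed
# stride (8 fields per row for a VIBRANT "integrated_prophage_coordinates" file, 4 for a
# custom scaffold/fragment/start/stop file), starting at each row's fragment column.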
if phage_list[0] == "scaffold" and phage_list[1] == "fragment" and phage_list[5] == "nucleotide start" and phage_list[6] == "nucleotide stop":
n = 9
while n < len(phage_list):
name = str(phage_list[n]).split(termsplit)[0]
if replace == True:
name = name.replace(" ", str(spaces))
if name[0] == "'" or name[0] == '"':
name = name[1:]
if replace == True:
frag = str(phage_list[n]).replace(" ", str(spaces))
else:
frag = str(phage_list[n])
if frag[0] == "'" or frag[0] == '"':
frag = frag[1:-1]
if n == 9: # in case of error
save_name = name
save_frag = frag
if name != '' and frag != '' and name == frag: # names were unable to be split
sys.stderr.write("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n")
sys.stderr.write("Host name detected: %s\n" % name)
sys.stderr.write("Prophage name detected: %s\n" % frag)
sys.stderr.write("-y flag input detected: %s\n" % termsplit)
logging.info("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n")
logging.info("Host name detected: %s" % name)
logging.info("Prophage name detected: %s" % frag)
logging.info("-y flag input detected: %s" % termsplit)
exit(1)
prophage_dict.update({str(name) + "~" + str(phage_list[n+4]):str(frag)})
prophage_dict.update({str(name) + "~" + str(phage_list[n+5]):str(frag)})
prophage_dict_frags.update({str(name):str(frag)})
vibrant_genomes_full.append(str(name))
n += 8
elif phage_list[0] == "scaffold" and phage_list[1] == "fragment" and phage_list[2] == "start" and phage_list[3] == "stop":
n = 5
while n < len(phage_list):
name = str(phage_list[n]).split(termsplit)[0]
if replace == True:
name = name.replace(" ", str(spaces))
if name[0] == "'" or name[0] == '"':
name = name[1:]
if replace == True:
frag = str(phage_list[n]).replace(" ", str(spaces))
else:
frag = str(phage_list[n])
if frag[0] == "'" or frag[0] == '"':
frag = frag[1:-1]
if n == 5: # in case of error
save_name = name
save_frag = frag
if name != '' and frag != '' and name == frag: # names were unable to be split
sys.stderr.write("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n")
sys.stderr.write("Host name detected: %s\n" % name)
sys.stderr.write("Prophage name detected: %s\n" % frag)
sys.stderr.write("-y flag input detected: %s\n" % termsplit)
logging.info("\nError: prophage names could not be distinguished from host names. Check accuracy of -y flag input. Below are the names that caused the issue. Exiting.\n")
logging.info("Host name detected: %s" % name)
logging.info("Prophage name detected: %s" % frag)
logging.info("-y flag input detected: %s" % termsplit)
exit(1)
prophage_dict.update({str(name) + "~" + str(phage_list[n+1]):str(frag)})
prophage_dict.update({str(name) + "~" + str(phage_list[n+2]):str(frag)})
prophage_dict_frags.update({str(name):str(frag)})
vibrant_genomes_full.append(str(name))
n += 4
else:
sys.stderr.write("\nError: -v coordinates file appears to be incorrect. Either input a VIBRANT 'integrated_prophage_coordinates' file or a custom tab separated coordinates file with the header 'scaffold/fragment/start/stop'. Exiting.\n")
logging.info("\nError: -v coordinates file appears to be incorrect. Either input a VIBRANT 'integrated_prophage_coordinates' file or a custom tab separated coordinates file with the header 'scaffold/fragment/start/stop'. Exiting.\n")
exit(1)
number_hosts = list(set(vibrant_genomes_full))
logging.info("%s Number of prophage regions identified: %s" % (str(round((time.time() - float(start))/60,1)),len(vibrant_genomes_full)))
logging.info("%s Number of unique host regions identified: %s" % (str(round((time.time() - float(start))/60,1)),len(number_hosts)))
vibrant_genomes_full = list(set(vibrant_genomes_full))
###### Make a bin list not scaffold list
bin_list = []
for i in vibrant_genomes_full:
name = str(i).split(binsplit)[0]
bin_list.append(str(name))
bin_list = list(set(bin_list))
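# Example with a hypothetical name: host scaffold "bin.1_k119_2041" maps to bin "bin.1"
# after splitting on binsplit, so bin_list ends up with one entry per MAG carrying a prophage.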
# process sam/bam files
logging.info("%s Processing alignment (.sam/.bam) file." % str(round((time.time() - float(start))/60,1)))
# Gap/mismatch processing for sam file, convert sam to bam
processed = False
if samfile != '':
sam_input = True
try:
temp = samfile.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = samfile.rsplit(".",1)[0]
if args_all == False:
processed = True
logging.info("%s Alignment .sam input identified, processing gaps/mismatches ..." % str(round((time.time() - float(start))/60,1)))
tossed = 0
kept = 0
with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf:
for line in sf:
if "XG:i:" in line and "XM:i:" in line:
xg = line.split("\t")[15]
xg = xg.split(":")[2]
xm = line.split("\t")[13]
xm = xm.split(":")[2]
if int(xg) <= gaps and int(xm) <= mismatches:
psf.write(line)
kept += 1
else:
tossed += 1
else:
psf.write(line)
if reads_input == True and clean == True:
subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam
samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam'
try:
temp = samfile.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = samfile.rsplit(".",1)[0]
logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept)))
else:
logging.info("%s Alignment .sam input identified, skipping gaps/mismatches processing ..." % str(round((time.time() - float(start))/60,1)))
logging.info("%s Converting .sam to .bam ..." % str(round((time.time() - float(start))/60,1)))
subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True)
logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1)))
bamfile = outpath + str(base) + ".bam"
samfile_cleaned = False
if reads_input == True and clean == True:
samfile_cleaned = True
subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam
# Gap/mismatch processing for bam file
if bamfile != '':
bam_input = True
if sam_input == False and args_all == False:
try:
temp = bamfile.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = bamfile.rsplit(".",1)[0]
logging.info("%s Alignment .bam input identified, coverting to .sam for gap/mismatch filtering ..." % str(round((time.time() - float(start))/60,1)))
subprocess.run(sampath + "samtools view -h -o " + outpath + str(base) + ".sam " + str(bamfile) + " 2> /dev/null", shell=True)
samfile = outpath + str(base) + ".sam"
processed = True
logging.info("%s Alignment .sam input identified, processing gaps/mismatches ..." % str(round((time.time() - float(start))/60,1)))
tossed = 0
kept = 0
with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf:
for line in sf:
if "XG:i:" in line and "XM:i:" in line:
xg = line.split("\t")[15]
xg = xg.split(":")[2]
xm = line.split("\t")[13]
xm = xm.split(":")[2]
if int(xg) <= gaps and int(xm) <= mismatches:
psf.write(line)
kept += 1
else:
tossed += 1
else:
psf.write(line)
if clean == True:
subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam
samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam'
try:
temp = samfile.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = samfile.rsplit(".",1)[0]
logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept)))
logging.info("%s Converting processed .sam to .bam ..." % str(round((time.time() - float(start))/60,1)))
subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True)
logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1)))
bamfile = outpath + str(base) + ".bam"
if clean == True:
samfile_cleaned = True
subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam
if sam_input == False and args_all == True:
logging.info("%s Alignment .bam input identified, sorting .bam file ..." % str(round((time.time() - float(start))/60,1)))
try:
temp = bamfile.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = bamfile.rsplit(".",1)[0]
subprocess.run(sampath + "samtools sort -o " + outpath + str(base) + ".sorted.bam " + str(bamfile) + " 2> /dev/null", shell=True)
logging.info("%s Sorting done." % str(round((time.time() - float(start))/60,1)))
sortbam = outpath + str(base) + ".sorted.bam"
bamfile_cleaned = False
if sam_input == True and clean == True:
bamfile_cleaned = True
subprocess.run("rm " + str(bamfile), shell=True)
# Gap/mismatch processing for sorted bam file
if sortbamfile != '':
if args_all == False and processed == False:
try:
temp = sortbamfile.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = sortbamfile.rsplit(".",1)[0]
logging.info("%s Sorted .bam input identified, coverting to .sam for gap/mismatch filtering ..." % str(round((time.time() - float(start))/60,1)))
subprocess.run(sampath + "samtools view -h -o " + outpath + str(base) + ".sam " + str(sortbamfile) + " 2> /dev/null", shell=True)
samfile = outpath + str(base) + ".sam"
processed = True
logging.info("%s Processing gaps/mismatches ..." % str(round((time.time() - float(start))/60,1)))
tossed = 0
kept = 0
with open(samfile, 'r') as sf, open(outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam', 'w') as psf:
for line in sf:
if "XG:i:" in line and "XM:i:" in line:
xg = line.split("\t")[15]
xg = xg.split(":")[2]
xm = line.split("\t")[13]
xm = xm.split(":")[2]
if int(xg) <= gaps and int(xm) <= mismatches:
psf.write(line)
kept += 1
else:
tossed += 1
else:
psf.write(line)
if clean == True:
subprocess.run("rm " + str(samfile), shell=True) # remove non-processed sam
samfile = outpath + str(base)+'.gap-' + str(gaps) + '_mm-' + str(mismatches) + '.sam'
try:
temp = samfile.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = samfile.rsplit(".",1)[0]
logging.info("%s %s of %s total aligned reads were tossed for having too many gaps/mismatches ..." % (str(round((time.time() - float(start))/60,1)),str(tossed),str(tossed+kept)))
logging.info("%s Converting processed .sam to .bam ..." % str(round((time.time() - float(start))/60,1)))
subprocess.run(sampath + "samtools view -S -b " + str(samfile) + " > " + outpath + str(base) + ".bam 2> /dev/null", shell=True)
logging.info("%s Conversion done." % str(round((time.time() - float(start))/60,1)))
bamfile = outpath + str(base) + ".bam"
if clean == True:
samfile_cleaned = True
subprocess.run("rm " + str(samfile), shell=True) # remove processed or non-processed sam
logging.info("%s Sorting .bam file ..." % str(round((time.time() - float(start))/60,1)))
try:
temp = bamfile.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = bamfile.rsplit(".",1)[0]
subprocess.run(sampath + "samtools sort -o " + outpath + str(base) + ".sorted.bam " + str(bamfile) + " 2> /dev/null", shell=True)
logging.info("%s Sorting done." % str(round((time.time() - float(start))/60,1)))
sortbam = outpath + str(base) + ".sorted.bam"
if clean == True:
samfile_cleaned = True
subprocess.run("rm " + str(bamfile), shell=True) # remove processed or non-processed sam
sortbam = sortbamfile
if args_all == True:
try:
temp = sortbam.rsplit("/",1)[1]
base = temp.rsplit(".",1)[0]
except Exception:
base = sortbam.rsplit(".",1)[0]
logging.info("%s Sorted alignment .bam input identified, skipping sorting and gap/mismatch processing." % str(round((time.time() - float(start))/60,1)))
# Extract coverage for each nucleotide
temp = outpath + str(base) + ".temp-list.txt"
subprocess.run("echo '" + str(sortbam) + "' > " + str(temp), shell=True)
cov_file = str(sortbam).rsplit(".",1)[0] + ".coverage-by-nt.tsv"
logging.info("%s Indexing sorted .bam file ..." % str(round((time.time() - float(start))/60,1)))
subprocess.run(sampath + "samtools index " + str(sortbam), shell=True) # index bam file
logging.info("%s Extracting coverage information from sorted .bam file ..." % str(round((time.time() - float(start))/60,1)))
# take the whole scaffold
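# 'samtools depth -a' reports every position, including zero-coverage ones; the oversized
# 0-999999999 range simply spans the full scaffold, and -f points to a small file listing
# the sorted BAM path.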
for genome in vibrant_genomes_full:
subprocess.run(sampath + "samtools depth -r " + str(genome) + ":0-999999999 -a -f " + str(temp) + " >> " + cov_file + " 2> /dev/null", shell=True) # extract coverage info
#subprocess.run("rm " + str(temp) + " 2> /dev/null", shell=True)
#subprocess.run("rm " + str(sortbam) + ".bai 2> /dev/null", shell=True) # remove index
subprocess.run("echo 'next\t0\t0\n' >> " + str(cov_file), shell=True)
logging.info("%s Coverage extraction done." % str(round((time.time() - float(start))/60,1)))
##### Make a list of all the scaffolds
scaffold_file = str(sortbam).rsplit(".",1)[0] + ".scaffold_list.tsv"
scaffold_cov_file = str(sortbam).rsplit(".",1)[0] + ".scaffold-coverage-by-nt.tsv"
subprocess.run(sampath + "samtools idxstats " + str(sortbam) + " | cut -f 1 >> " + scaffold_file + " 2> /dev/null", shell=True)
scaffold_list = []
a_file = open(scaffold_file, "r")
for line in a_file:
stripped_line = line.strip()
scaffold_list.append(stripped_line)
a_file.close()
##### Take coverage for all scaffolds not just those that have prophages
for scaffold in scaffold_list:
subprocess.run(sampath + "samtools depth -r " + str(scaffold) + ":0-999999999 -a -f " + str(temp) + " >> " + scaffold_cov_file + " 2> /dev/null", shell=True) # extract coverage info
subprocess.run("echo 'next\t0\t0' >> " + str(scaffold_cov_file), shell=True)
if int(os.stat(cov_file).st_size) < 50: # Exit if file is empty (< 50 bytes)
if samfile == "":
samfile = "N/A"
if bamfile == "":
bamfile = "N/A"
subprocess.run("rm " + str(cov_file), shell=True)
logging.info("%s No coverage identified for scaffolds with integrated prophages. Analysis finished." % str(round((time.time() - float(start))/60,1)))
logging.info("")
logging.info("")
logging.info("CAUTION:")
logging.info(" 'No coverage identified' may be due to an incorrect -y flag.")
logging.info(" Please verify -y input correctly distinguishes host from prophage sequences.")
logging.info(" This can be verified by checking the -v or -f inputs.")
logging.info(" Example host name detected: %s" % save_name)
logging.info(" Example prophage name detected: %s" % save_frag)
logging.info(" -y flag input: %s" % termsplit)
logging.info("")
logging.info("")
logging.info("Log file: %s" % logfilename)
logging.info("Output results file: N/A")
if samfile_cleaned == False:
logging.info("sam file: %s" % samfile)
else:
logging.info("sam file: N/A")
if bamfile_cleaned == False:
logging.info("bam file: %s" % bamfile)
else:
logging.info("bam file: N/A")
logging.info("sorted bam file: %s" % sortbam)
logging.info("")
logging.info("")
logging.info("Number of active prophages identified: 0")
logging.info("")
logging.info(' ##')
logging.info(' ## ##')
logging.info(' ## ##')
logging.info('###### ## ## ## ####### ###### ##### ## ##')
logging.info('## ## ## ## ## ## ## ## ## ## ##')
logging.info('###### ###### ###### ## ### ###### ### ##')
logging.info('## ## ## ## ## ## ## ## ## ##')
logging.info('## ## ## ## ## ####### ###### ##### ##')
logging.info(' # ## #')
logging.info(' # # ## # #')
logging.info(' # # # #')
logging.info(' # #')
logging.info("")
exit(0)
# calculate average read length
total = subprocess.check_output(sampath + "samtools view -c " + str(sortbam), shell=True) # if there are fewer than 10k reads, use all reads for the length estimate
total = int(float(str(total.strip()).split("'")[1]))
if int(total) < 10000:
head = total
else:
head = 10000
length = subprocess.check_output(sampath + "samtools view " + str(sortbam) + " | head -n " + str(head) + " | cut -f 10 | wc -c", shell=True)
length = int(float(str(length.strip()).split("'")[1])/int(head))
logging.info("%s Number of valid reads used: %s." % (str(round((time.time() - float(start))/60,1)),str(total)))
logging.info("%s Estimated read length to trim by: %s." % (str(round((time.time() - float(start))/60,1)),str(length)))
### Calculate the cover for every scaffold
scaffold_coverage = {}
b_file = open(scaffold_cov_file, "r")
prev_genome = ''
host = False
temp_host = []
for line in b_file:
line = line.strip()
if str(line.split("\t")[0]) != str(prev_genome):
if host == True: # scaffold is done. Update coverage
if len(temp_host) > 0:
scaffold_coverage.update({str(prev_genome):(temp_host[:])}) # store the scaffold name plus its full per-nucleotide coverage values
host = False
temp_host = []
prev_genome = ''
if (line.split("\t")[0] != "next"):
prev_genome = str(line.split("\t")[0])
temp_host.append(str(line.split("\t")[0]))
temp_host.append(str(line.split("\t")[2]).strip("\n"))
host = True
else:
temp_host.append(str(line.split("\t")[2]).strip("\n"))
b_file.close()
###Add up all the coverages of scaffolds per bin
bin_coverage = {}
for i in bin_list:
bin_coverage.update({str(i):[]})
for key in scaffold_coverage.keys():
host = str(key.split(binsplit)[0])
if host in bin_coverage.keys():
old_line = bin_coverage[host][:]
new_line = scaffold_coverage[key][1:]
combined_list = old_line + new_line
bin_coverage[host] = combined_list
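# bin_coverage now maps each bin/MAG name to the concatenated per-nucleotide coverage values
# of all of its scaffolds; this whole-bin profile serves as the "host" background in place
# of flanking regions.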
##I will still run this to get a dictionary of "prophage coverage"
#grab only relevant genomes and count genome lengths
prophage_coverage = {}
host_coverage = {}
temp_host = []
temp_phage = []
with open(cov_file, 'r') as covfile:
logging.info("%s Identifying coverages for host/prophage regions ..." % str(round((time.time() - float(start))/60,1)))
prev_genome = ''
n = 0
host = False
phage = False
host_start = False
phage_start = False
host_name = False
phage_name = False
for line in covfile:
if str(line.split("\t")[0]) != str(prev_genome):
# scaffold is done. Update coverage
if host == True:
if len(temp_host) > 0:
host_coverage.update({str(prev_genome):temp_host[:-length]}) # trim the last N nucleotides from the last section added
if len(temp_phage) > 0:
prophage_coverage.update({prophage_dict_frags[str(prev_genome)]:temp_phage})
elif phage == True:
if len(temp_phage) > 0:
prophage_coverage.update({prophage_dict_frags[str(prev_genome)]:temp_phage[:-length]}) # trim the last N nucleotides from the last section added
if len(temp_host) > 0:
host_coverage.update({str(prev_genome):temp_host})
n = 0
temp_phage = []
temp_host = []
host = False
phage = False
host_name = False
phage_name = False
prev_genome = str(line.split("\t")[0])
n += 1 # needs to come after previous statement of writing and length counting
if (line.split("\t")[0] in vibrant_genomes_full or line.split("\t")[0] == "next"): # trim by first length, only include phages, "next" is for including last scaffold
if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) not in prophage_dict.keys() and n == 1: # is the first nucleotide host?
if host_name == False:
temp_host.append(str(line.split("\t")[0]))
host_name = True
temp_host.append(str(line.split("\t")[2]).strip("\n"))
host = True
phage = False
elif str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) in prophage_dict.keys() and n == 1:
if phage_name == False:
temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])])
phage_name = True
temp_phage.append(str(line.split("\t")[2]).strip("\n").strip("\n"))
phage = True
host = False
if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) not in prophage_dict.keys() and n > 1:
if phage == True:
if phage_name == False:
temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])])
phage_name = True
temp_phage.append(str(line.split("\t")[2]).strip("\n"))
elif host == True:
if host_name == False:
temp_host.append(str(line.split("\t")[0]))
host_name = True
temp_host.append(str(line.split("\t")[2]).strip("\n"))
if str(line.split("\t")[0]) + "~" + str(line.split("\t")[1]) in prophage_dict.keys() and n > 1: # is the nucleotide phage?
if host == True: # must mean that host is finished and phage starting
host = False
phage = True
if phage_name == False:
temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])])
phage_name = True
temp_phage.append(str(line.split("\t")[2]).strip("\n"))
elif phage == True: # phage must be done
if phage_name == False:
temp_phage.append(prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])])
phage_name = True
temp_phage.append(str(line.split("\t")[2]).strip("\n"))
prophage_coverage.update({prophage_dict[str(line.split("\t")[0]) + "~" + str(line.split("\t")[1])]:temp_phage}) # write to phage coverage, reset phage
temp_phage = []
host = True
phage = False
phage_name = False
# Statistics and metrics
### First I will make the statistics per bacterial bin to save some time:
bin_stats = {}
for key in bin_coverage:
cov_h = ([int(i) for i in bin_coverage[key][1:]])
avg_h = (statistics.mean(cov_h))
sd_h = (statistics.stdev(cov_h))
med_h = (statistics.median(cov_h))
stat_collection = [cov_h, avg_h, sd_h, med_h]
bin_stats.update({str(key):stat_collection})
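# stat_collection layout per bin: [0] per-nucleotide coverage list, [1] mean, [2] standard
# deviation, [3] median. These are computed once per bin and reused for every prophage it hosts.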
#host_coverage.update({str(prev_genome):temp_host})
###I modified this section so that I can add in my host statistics
logging.info("%s Coverage identification done." % str(round((time.time() - float(start))/60,1)))
logging.info("%s Performing statistical tests ..." % str(round((time.time() - float(start))/60,1)))
total = []
total_adj = []
with open(outfile, 'w') as output:
output.write("prophage\thost\tactive\tMW_pvalue\tCohenD\tprophage-host_ratio\tmean_difference\tprophage_len\tprophage_mean_cov\tprophage_median_cov\tprophage_sd_cov\thost_len\thost_mean_cov\thost_median_cov\thost_sd_cov\n")
for key in prophage_coverage.keys():
host = str(key.split(binsplit)[0])
if host in bin_coverage.keys():
cov_p = [int(i) for i in prophage_coverage[key][1:]]
cov_h = bin_stats[host][0]
avg_p = statistics.mean(cov_p)
sd_p = statistics.stdev(cov_p)
avg_h = bin_stats[host][1]
sd_h = bin_stats[host][2]
# if outliers > 0:
# cov_p = [x for x in cov_p if x >= (avg_p-outliers*sd_p) and x <= (avg_p+outliers*sd_p)]
# cov_h = [x for x in cov_h if x >= (avg_h-outliers*sd_h) and x <= (avg_h+outliers*sd_h)]
# #
# avg_p = statistics.mean(cov_p)
# sd_p = statistics.stdev(cov_p)
# avg_h = statistics.mean(cov_h)
# sd_h = statistics.stdev(cov_h)
med_h = bin_stats[host][3] # median host-bin coverage (index 2 holds the standard deviation)
med_p = statistics.median(cov_p)
mean_diff = avg_p - avg_h
result = MWdownsampling(cov_p, cov_h)
if mean_diff > 0:
cd = cohenD(avg_p, sd_p, avg_h, sd_h)
else:
cd = 0
host_zero_ratio = False
if avg_h > 0:
ratio = avg_p/avg_h
else:
ratio = avg_p
if avg_p >= 0.5:
host_zero_ratio = True
#
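# A prophage is called active only if all criteria hold: combined Mann-Whitney p-value <= pval
# (or 'nan'), Cohen's d >= effect, a positive prophage-minus-host mean difference, and a
# prophage:host coverage ratio >= ratio_cutoff (or, when host coverage is zero, a prophage
# mean coverage of at least 0.5).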
active = "no"
if (float(result[1]) <= float(pval) or str(result[1]) == 'nan') and cd >= effect and mean_diff > 0 and (ratio >= ratio_cutoff or host_zero_ratio == True):
active = "yes"
total.append("yes")
if len(cov_h) < 1000 or len(cov_p) < 1000: # minimum host/phage length is 1000 bp
active = 'NA'
active_adj = 'NA'
if replace == True:
output.write(str(key).replace(str(spaces), " ")+"\t"+str(host).replace(str(spaces), " ")+"\t"+str(active)+"\t"+str(result[1])+"\t"+str(cd)+"\t"+str(ratio)+"\t"+str(mean_diff)+"\t"+str(len(cov_p))+"\t"+str(avg_p)+"\t"+str(med_p)+"\t"+str(sd_p)+"\t"+str(len(cov_h))+"\t"+str(avg_h)+"\t"+str(med_h)+"\t"+str(sd_h)+"\n")
else:
output.write(str(key)+"\t"+str(host)+"\t"+str(active)+"\t"+str(result[1])+"\t"+str(cd)+"\t"+str(ratio)+"\t"+str(mean_diff)+"\t"+str(len(cov_p))+"\t"+str(avg_p)+"\t"+str(med_p)+"\t"+str(sd_p)+"\t"+str(len(cov_h))+"\t"+str(avg_h)+"\t"+str(med_h)+"\t"+str(sd_h)+"\n")
else:
if replace == True:
output.write(str(key).replace(str(spaces), " ")+"\t"+str(host).replace(str(spaces), " ")+"\tno_coverage\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\n")
else:
output.write(str(key)+"\t"+str(host)+"\tno_coverage\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\tna\n")
if samfile == "":
samfile = "N/A"
if bamfile == "":
bamfile = "N/A"
subprocess.run("rm " + str(cov_file), shell=True)
logging.info("%s Statistical tests done. Analysis finished." % str(round((time.time() - float(start))/60,1)))
logging.info("")
logging.info("")
logging.info("Log file: %s" % logfilename)
logging.info("Output results file: %s" % outfile)
if samfile_cleaned == False:
logging.info("sam file: %s" % samfile)
else:
logging.info("sam file: N/A")
if bamfile_cleaned == False:
logging.info("bam file: %s" % bamfile)
else:
logging.info("bam file: N/A")
logging.info("sorted bam file: %s" % sortbam)
logging.info("")
logging.info("")
logging.info("Number of active prophages identified: %s" % len(total))
logging.info("")
logging.info(' ##')
logging.info(' ## ##')
logging.info(' ## ##')
logging.info('###### ## ## ## ####### ###### ##### ## ##')
logging.info('## ## ## ## ## ## ## ## ## ## ##')
logging.info('###### ###### ###### ## ### ###### ### ##')
logging.info('## ## ## ## ## ## ## ## ## ##')
logging.info('## ## ## ## ## ####### ###### ##### ##')
logging.info(' # ## #')
logging.info(' # # ## # #')
logging.info(' # # # #')
logging.info(' # #')
logging.info("")
#
#
#