Merge pull request connor-lab#2 from mgcam/qc_extension

Optionally compute additional QC metrics and save them to the QC summary.
jidur · Jun 30, 2021 · 8ec11fd · 8ec11fd
2 parents 8af5152 + e5fdace
commit 8ec11fd
Showing 1 changed file with 89 additions and 38 deletions.
diff --git a/bin/qc.py b/bin/qc.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import shlex
+import argparse
 
 """
 This script can incorporate as many QC checks as required
@@ -106,26 +107,11 @@ def get_num_reads(bamfile):
     what = shlex.split(command)
 
     return subprocess.check_output(what).decode().strip()
-
-def go(args):
-    if args.illumina:
-        depth = 10
-    elif args.nanopore:
-        depth = 20
 
-    ## Depth calcs
-    ref_length = get_ref_length(args.ref)
-    depth_pos = read_depth_file(args.bam)
-
-    depth_covered_bases = get_covered_pos(depth_pos, depth)
-
-    pct_covered_bases = depth_covered_bases / ref_length * 100
-
-    ## Number of aligned reads calculaton
-    num_reads = get_num_reads(args.bam)
+def assess(fasta_file, bam_file=None, ref_length=None, depth=None):
 
     # Unknown base calcs
-    fasta = SeqIO.read(args.fasta, "fasta")
+    fasta = SeqIO.read(fasta_file, "fasta")
 
     pct_N_bases   = 0
     largest_N_gap = 0
@@ -136,42 +122,107 @@ def go(args):
         pct_N_bases = get_pct_N_bases(fasta)
         largest_N_gap = get_largest_N_gap(fasta)
 
-    	# QC PASS / FAIL
+        # QC PASS / FAIL
         if largest_N_gap >= 10000 or pct_N_bases < 50.0:
-                qc_pass = "TRUE"
-
+            qc_pass = "TRUE"
 
-    qc_line = { 'sample_name' : args.sample,
-                'pct_N_bases' : "{:.2f}".format(pct_N_bases),
-          'pct_covered_bases' : "{:.2f}".format(pct_covered_bases), 
-           'longest_no_N_run' : largest_N_gap,
-          'num_aligned_reads' : num_reads,
-                       'fasta': args.fasta, 
-                        'bam' : args.bam,
-                    'qc_pass' : qc_pass}
+    N_density = sliding_window_N_density(fasta)
 
+    # The order of keys is important
+    pairs = [('pct_N_bases', "{:.2f}".format(pct_N_bases)),
+             ('longest_no_N_run', largest_N_gap),
+             ('fasta', fasta_file),
+             ('qc_pass', qc_pass)]
+
+    depth_pos = None
+    if bam_file != None:
+       depth_pos = read_depth_file(bam_file)
+       depth_covered_bases = get_covered_pos(depth_pos, depth)
+       pct_covered_bases = depth_covered_bases / ref_length * 100
+       # Number of aligned reads calculation
+       num_reads = get_num_reads(bam_file)
+
+       pairs.insert(1,
+           ('pct_covered_bases', "{:.2f}".format(pct_covered_bases)))
+       pairs.insert(3, ('num_aligned_reads', num_reads)) 
+       pairs.insert(5, ('bam', bam_file))
+    else:
+       # Remap key names
+       pairs = [ (pair[0]+'_amd', pair[1]) for pair in pairs]
+
+    return (dict(pairs), N_density, depth_pos) 
+
+def go(args):
+    if args.illumina:
+        depth = 10
+    elif args.nanopore:
+        depth = 20
 
+    ## Depth calcs
+    ref_length = get_ref_length(args.ref)
+
+    ## Get QC values for a pair of bam-fasta files
+    (qc_values, N_density, depth_pos) = assess(
+        args.fasta, args.bam, ref_length, depth);
+    ## Get the keys in the order they were inserted
+    column_names = list(qc_values)
+    if args.ivar_md != None:
+        qc_values['ivar_md'] = args.ivar_md
+        column_names.insert(-1, 'ivar_md')  
+    ## Prepend sample name column
+    column_names.insert(0, 'sample_name')
+    qc_values['sample_name'] = args.sample
+    qc_line = qc_values
+
+    ## If given, get QC values for the additional consensus fasta file (no bam)
+    if args.fasta_amd:
+        (qc_values_amd, tmp1, tmp2) = assess(args.fasta_amd);
+        if args.ivar_md != None:
+            qc_values_amd['ivar_amd'] = args.ivar_amd
+        ## Combine two dictionaries
+        qc_line = {**qc_values, **qc_values_amd};
+        ## Set correct order for the list of column names
+        qc_pass_column = column_names.pop()
+        column_names.extend(list(qc_values_amd))
+        ## Reinstate the qc pass column as the last column
+        column_names.append(qc_pass_column)
+
+    ## Write all QC values to a CSV file
     with open(args.outfile, 'w') as csvfile:
-        header = qc_line.keys()
+        header = column_names
         writer = csv.DictWriter(csvfile, fieldnames=header)
         writer.writeheader()
         writer.writerow(qc_line)
 
-    N_density = sliding_window_N_density(fasta)
     make_qc_plot(depth_pos, N_density, args.sample)
 
 def main():
-    import argparse
 
     parser = argparse.ArgumentParser()
     group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('--nanopore', action='store_true')
-    group.add_argument('--illumina', action='store_true')
-    parser.add_argument('--outfile', required=True)
-    parser.add_argument('--sample', required=True)
-    parser.add_argument('--ref', required=True)
-    parser.add_argument('--bam', required=True)
-    parser.add_argument('--fasta', required=True)
+    group.add_argument('--nanopore', action='store_true',
+        help='''A boolean flag describing the sequencing platform used.''')
+    group.add_argument('--illumina', action='store_true',
+        help='''A boolean flag describing the sequencing platform used.''')
+    parser.add_argument('--outfile', required=True,
+        help='''The path of the output QC summary file''')
+    parser.add_argument('--sample', required=True, help='Sample name.')
+    parser.add_argument('--ref', required=True,
+        help='''The path of the reference FASTA file.''')
+    parser.add_argument('--bam', required=True,
+        help='''The path of the aligned and filtered BAM file.''')
+    parser.add_argument('--fasta', required=True,
+        help='''The path of a consensus fasta file produced by ivar using the
+                minimum depth given by the --ivar_amd argument, required.''')
+    parser.add_argument('--fasta_amd', required=False, default=None,
+        help='''The path of a consensus fasta file produced by ivar using the
+                minimum depth given by the --ivar_amd argument, optional.''')
+    parser.add_argument('--ivar_md', required=False, default=None,
+        help='''Minimum depth value used for ivar when generating the consensus
+                file given by the --fasta argument, optional.''')
+    parser.add_argument('--ivar_amd', required=False, default=None,
+        help='''Minimum depth value used for ivar when generating the consensus
+                file given by the --fasta_amd argument, optional.''') 
 
     args = parser.parse_args()
     go(args)