Skip to content

Commit

Permalink
Merge pull request connor-lab#2 from mgcam/qc_extension
Browse files Browse the repository at this point in the history
Optionally compute and save to the QC summary additional QC metrics.
  • Loading branch information
jidur authored Jun 30, 2021
2 parents 8af5152 + e5fdace commit 8ec11fd
Showing 1 changed file with 89 additions and 38 deletions.
127 changes: 89 additions & 38 deletions bin/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import matplotlib.pyplot as plt
import shlex
import argparse

"""
This script can incorporate as many QC checks as required
Expand Down Expand Up @@ -106,26 +107,11 @@ def get_num_reads(bamfile):
what = shlex.split(command)

return subprocess.check_output(what).decode().strip()

def go(args):
if args.illumina:
depth = 10
elif args.nanopore:
depth = 20

## Depth calcs
ref_length = get_ref_length(args.ref)
depth_pos = read_depth_file(args.bam)

depth_covered_bases = get_covered_pos(depth_pos, depth)

pct_covered_bases = depth_covered_bases / ref_length * 100

## Number of aligned reads calculation
num_reads = get_num_reads(args.bam)
def assess(fasta_file, bam_file=None, ref_length=None, depth=None):

# Unknown base calcs
fasta = SeqIO.read(args.fasta, "fasta")
fasta = SeqIO.read(fasta_file, "fasta")

pct_N_bases = 0
largest_N_gap = 0
Expand All @@ -136,42 +122,107 @@ def go(args):
pct_N_bases = get_pct_N_bases(fasta)
largest_N_gap = get_largest_N_gap(fasta)

# QC PASS / FAIL
# QC PASS / FAIL
if largest_N_gap >= 10000 or pct_N_bases < 50.0:
qc_pass = "TRUE"

qc_pass = "TRUE"

qc_line = { 'sample_name' : args.sample,
'pct_N_bases' : "{:.2f}".format(pct_N_bases),
'pct_covered_bases' : "{:.2f}".format(pct_covered_bases),
'longest_no_N_run' : largest_N_gap,
'num_aligned_reads' : num_reads,
'fasta': args.fasta,
'bam' : args.bam,
'qc_pass' : qc_pass}
N_density = sliding_window_N_density(fasta)

# The order of keys is important
pairs = [('pct_N_bases', "{:.2f}".format(pct_N_bases)),
('longest_no_N_run', largest_N_gap),
('fasta', fasta_file),
('qc_pass', qc_pass)]

depth_pos = None
if bam_file != None:
depth_pos = read_depth_file(bam_file)
depth_covered_bases = get_covered_pos(depth_pos, depth)
pct_covered_bases = depth_covered_bases / ref_length * 100
# Number of aligned reads calculation
num_reads = get_num_reads(bam_file)

pairs.insert(1,
('pct_covered_bases', "{:.2f}".format(pct_covered_bases)))
pairs.insert(3, ('num_aligned_reads', num_reads))
pairs.insert(5, ('bam', bam_file))
else:
# Remap key names
pairs = [ (pair[0]+'_amd', pair[1]) for pair in pairs]

return (dict(pairs), N_density, depth_pos)

def go(args):
    """Compute the QC summary for one sample, write it as CSV and plot it.

    Args:
        args: Parsed command-line namespace. Reads ``illumina``,
            ``nanopore``, ``ref``, ``fasta``, ``bam``, ``sample``,
            ``outfile`` and the optional ``fasta_amd``, ``ivar_md`` and
            ``ivar_amd`` attributes.

    Raises:
        ValueError: If neither ``args.illumina`` nor ``args.nanopore`` is set.

    Side effects:
        Writes a one-row CSV (header plus values) to ``args.outfile`` and
        calls ``make_qc_plot`` for the sample.
    """
    # Depth threshold handed to assess(); it depends on the platform flag.
    if args.illumina:
        depth = 10
    elif args.nanopore:
        depth = 20
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError("one of args.illumina or args.nanopore must be set")

    ## Depth calcs
    ref_length = get_ref_length(args.ref)

    ## Get QC values for a pair of bam-fasta files
    qc_values, N_density, depth_pos = assess(
        args.fasta, args.bam, ref_length, depth)

    ## Get the keys in the order they were inserted
    column_names = list(qc_values)
    if args.ivar_md is not None:
        qc_values['ivar_md'] = args.ivar_md
        # Keep the last column (the QC pass flag) last.
        column_names.insert(-1, 'ivar_md')

    ## Prepend sample name column
    column_names.insert(0, 'sample_name')
    qc_values['sample_name'] = args.sample
    qc_line = qc_values

    ## If appropriate, get QC values for the additional consensus fasta file
    if args.fasta_amd:
        # Only the metrics dict is needed for the additional consensus.
        qc_values_amd, _, _ = assess(args.fasta_amd)
        # Guard on the value actually stored (was checking args.ivar_md).
        if args.ivar_amd is not None:
            qc_values_amd['ivar_amd'] = args.ivar_amd
        ## Combine two dictionaries
        qc_line = {**qc_values, **qc_values_amd}
        ## Set correct order for the list of column names:
        ## take the last column off, append the extra columns, put it back.
        qc_pass_column = column_names.pop()
        column_names.extend(qc_values_amd)
        column_names.append(qc_pass_column)

    ## Write all QC values to a CSV file
    # newline='' lets the csv module control line endings itself.
    with open(args.outfile, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=column_names)
        writer.writeheader()
        writer.writerow(qc_line)

    # Reuse the N density returned by assess(); the consensus record itself
    # is not available in this function.
    make_qc_plot(depth_pos, N_density, args.sample)

def main():
    """Parse the command-line arguments and run the QC computation.

    Exactly one of ``--nanopore`` / ``--illumina`` is required, together with
    ``--outfile``, ``--sample``, ``--ref``, ``--bam`` and ``--fasta``.
    ``--fasta_amd``, ``--ivar_md`` and ``--ivar_amd`` are optional and default
    to ``None``. The parsed namespace is passed to ``go``.
    """
    # Local import keeps this entry point usable on its own.
    import argparse

    parser = argparse.ArgumentParser()

    # Each option is registered exactly once; registering the same option
    # string twice makes argparse raise a conflict error.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--nanopore', action='store_true',
        help='''A boolean flag describing the sequencing platform used.''')
    group.add_argument('--illumina', action='store_true',
        help='''A boolean flag describing the sequencing platform used.''')

    parser.add_argument('--outfile', required=True,
        help='''The path of the output QC summary file''')
    parser.add_argument('--sample', required=True, help='Sample name.')
    parser.add_argument('--ref', required=True,
        help='''The path of the reference FASTA file.''')
    parser.add_argument('--bam', required=True,
        help='''The path of the aligned and filtered BAM file.''')
    # The depth used for --fasta is given by --ivar_md (not --ivar_amd).
    parser.add_argument('--fasta', required=True,
        help='The path of a consensus fasta file produced by ivar using the\n'
             'minimum depth given by the --ivar_md argument, required.')
    parser.add_argument('--fasta_amd', required=False, default=None,
        help='The path of a consensus fasta file produced by ivar using the\n'
             'minimum depth given by the --ivar_amd argument, optional.')
    parser.add_argument('--ivar_md', required=False, default=None,
        help='Minimum depth value used for ivar when generating the consensus\n'
             'file given by the --fasta argument, optional.')
    parser.add_argument('--ivar_amd', required=False, default=None,
        help='Minimum depth value used for ivar when generating the consensus\n'
             'file given by the --fasta_amd argument, optional.')

    args = parser.parse_args()
    go(args)
Expand Down

0 comments on commit 8ec11fd

Please sign in to comment.