Skip to content

Commit

Permalink
Merge pull request #68 from artic-network/1.3.0-dev
Browse files Browse the repository at this point in the history
patch 1.2.0 > 1.2.1
  • Loading branch information
nickloman authored Jan 14, 2021
2 parents d5a0cb8 + 90cc68c commit 23a8460
Show file tree
Hide file tree
Showing 7 changed files with 229 additions and 9 deletions.
210 changes: 210 additions & 0 deletions artic/artic_mqc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#!/usr/bin/env python
import json
import re
import sys
from collections import OrderedDict

from .vcftagprimersites import read_bed_file

# Alignment_Length_Threshold is the fraction (0-1) applied by the length filter
# in getAmpliconCounts; reads that fail the filter are left out of the counts.
# It is shown as a percentage in the plot description below.
Alignment_Length_Threshold = 0.95

# Amplicon_Dropout_Val will report amplicon dropout in any amplicon which has fewer than X reads
Amplicon_Dropout_Val = 50

# Template for the amplicon plot data
# NOTE(review): "categories", "yDecimals" and "xDecimals" are the strings
# "True"/"False" rather than booleans -- confirm the MultiQC custom content
# loader interprets them as intended.
amplicon_plot_template = {
    "id": "custom_data_lineplot",
    "section_name": "ARTIC: Amplicon Coverage",
    # The threshold is stored as a fraction, so it is rendered with the "%"
    # presentation type (0.95 -> "95%") instead of being pasted in raw.
    "description": (
        "This plot summarises the number of reads that were assigned to each amplicon in the primer scheme.\n"
        "We use the align_trim report file from the ARTIC pipeline and group each read by its assigned amplicon.\n"
        "If the length of alignment between read and reference is <{:.0%} of the amplicon length, the read is discarded from the coverage plot.\n"
        "If the total number of reads assigned to an amplicon is below {} (red dashed line),\n"
        "the amplicon is marked as dropped out."
    ).format(Alignment_Length_Threshold, Amplicon_Dropout_Val),
    "plot_type": "linegraph",
    "pconfig": {
        "id": "custom_data_linegraph",
        "title": "",
        "categories": "True",
        "yDecimals": "False",
        "xDecimals": "False",
        "ylab": "# reads",
        "xlab": "amplicon",
        # horizontal marker drawn at the dropout threshold
        "yPlotLines": [{
            "color": "#FF0000",
            "width": 2,
            "dashStyle": "LongDash",
            "label": "Amplicon dropout",
            "value": Amplicon_Dropout_Val
        }]
    },
    # filled in by run(): sample name -> {amplicon number: read count}
    "data": {}
}

# Template for the stats table data
amplicon_stats_template = {
    "id": "custom_data_json_table",
    "section_name": "ARTIC: General Stats",
    "description": "A summary of stats from the consensus genome pipeline.",
    "plot_type": "table",
    "pconfig": {
        "id": "custom_data_json_table_table",
        "title": "",
        "min": 0,
        "scale": "RdYlGn-rev",
        "format": "{:,.0f}"
    },
    # filled in by run(): sample name -> {stat name: value}
    "data": {}
}

def getSchemeAmplicons(schemeFile):
    """Get the expected amplicon names from the provided scheme.

    Each primer record returned by read_bed_file is reduced to its amplicon
    prefix (the part of "Primer_ID" before "_LEFT" for "+" primers, before
    "_RIGHT" otherwise). Every amplicon must be represented by exactly two
    primers.

    Parameters
    ----------
    schemeFile : string
        The filename of the primer scheme

    Returns
    -------
    dict
        A dict of amplicon names -> zeroed counter. Names have the form
        "<amplicon>_LEFT_<amplicon>_RIGHT".

    Raises
    ------
    SystemExit
        With exit code 1 if any amplicon does not have exactly two primers.
    """
    # tally the number of primers seen for each amplicon prefix
    primer_counts = {}
    for primer in read_bed_file(schemeFile):
        suffix = "_LEFT" if primer["direction"] == "+" else "_RIGHT"
        amplicon = primer["Primer_ID"].split(suffix)[0]
        primer_counts[amplicon] = primer_counts.get(amplicon, 0) + 1

    # an amplicon is only valid when it has exactly one primer pair
    named_amplicons = {}
    for amplicon, n_primers in primer_counts.items():
        if n_primers != 2:
            print("incorrect number of primers for {} (expected 2, found {})".format(amplicon, n_primers), file=sys.stderr)
            raise SystemExit(1)
        named_amplicons["{}_LEFT_{}_RIGHT".format(amplicon, amplicon)] = 0
    return named_amplicons

def getAmpliconCounts(amplicons, align_trim_report, length_threshold=None):
    """Get the read counts per amplicon.

    Parameters
    ----------
    amplicons : dict
        Dict of amplicon names found in scheme, linked to a zeroed counter.
        The dict is updated in place and also returned.
    align_trim_report : string
        File path to the align_trim report (tab separated, one header line)
    length_threshold : float, optional
        Fraction used by the length filter. Defaults to the module level
        Alignment_Length_Threshold when not given.

    Returns
    -------
    dict
        Dict of amplicon names -> populated read counts

    Raises
    ------
    SystemExit
        With exit code 1 if the report names an amplicon that is not in
        `amplicons`.
    """
    if length_threshold is None:
        length_threshold = Alignment_Length_Threshold

    # process the align_trim report
    with open(align_trim_report, "r") as fh:

        # skip the first line (header)
        fh.readline()

        # process each line and add to counts
        for line in fh:
            line = line.rstrip()

            # tolerate blank lines (e.g. trailing newlines at the end of the report)
            if not line:
                continue
            fields = line.split('\t')

            # check read is from a properly paired amplicon
            if int(fields[12]) != 1:
                continue

            # Length filter: drop the record when the span in columns 10-11 is
            # shorter than length_threshold times the span in columns 1-2.
            # NOTE(review): the column layout is defined by align_trim, which is
            # not visible here -- confirm which span is the amplicon and which
            # is the read alignment, and that the comparison direction is the
            # intended one.
            aLen = int(fields[11]) - int(fields[10])
            rLen = int(fields[2]) - int(fields[1])
            if aLen < (length_threshold * rLen):
                continue

            # increment the read count for this amplicon
            if fields[3] not in amplicons:
                print("amplicon in align_trim report but not in primer scheme {}".format(fields[3]), file=sys.stderr)
                raise SystemExit(1)
            amplicons[fields[3]] += 1
    return amplicons

def getVCFreportInfo(vcf_report):
    """Get summary stats from a vcf_check report.

    The report is scanned line by line for the "variant records processed"
    and "variant records passed checks" messages. If a message appears more
    than once the last occurrence wins; a message that never appears counts
    as 0.

    Parameters
    ----------
    vcf_report : string
        File path to the vcf_report

    Returns
    -------
    dict
        Dict of vcf stats -> values. Currently holds the single key
        "# overlap var. fails" (records processed minus records passed).
    """
    # Read vcfcheck report and get important stuff out (NOTE: more to be added in next release)
    # compile the patterns once instead of on every line
    processed_pattern = re.compile(r'.*\t(\d+)\svariant\srecords\sprocessed')
    passed_pattern = re.compile(r'.*\t(\d+)\svariant\srecords\spassed\schecks')

    total_vars = 0
    passed_vars = 0
    with open(vcf_report, "r") as fh:
        for line in fh:
            match = processed_pattern.search(line)
            if match:
                total_vars = int(match.group(1))
            match = passed_pattern.search(line)
            if match:
                passed_vars = int(match.group(1))

    stats = dict()
    stats["# overlap var. fails"] = total_vars - passed_vars
    return stats

def run(args):
    """Collect stats from ARTIC pipeline output and generate files for use by MultiQC.

    Two JSON files are written to the current working directory:
    "<sample>.amplicon_plot_data_mqc.json" and
    "<sample>.amplicon_stats_data_mqc.json". The module level templates are
    updated in place with the data for this sample.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments providing `scheme`, `align_report`, `vcf_report`
        (may be empty/None) and `sample`.
    """
    # get the expected amplicon names, each linked to a zeroed counter
    amplicons = getSchemeAmplicons(args.scheme)

    # open align trim output and count reads per amplicon in scheme
    amplicon_counts = getAmpliconCounts(amplicons, args.align_report)

    # replace amplicon names with ints and count number of dropouts
    dropouts = 0
    amplicon_renamed_counts = dict()
    for amplicon, count in amplicon_counts.items():
        # The amplicon number is the last "_" separated token before "_LEFT".
        # Taking it from there (rather than the second token of the whole
        # name) keeps working when the scheme name itself contains "_".
        amplicon_number = int(amplicon.partition("_LEFT")[0].rsplit("_", 1)[-1])
        amplicon_renamed_counts[amplicon_number] = count
        if count < Amplicon_Dropout_Val:
            dropouts += 1

    # add counts to multiqc amplicon plot template
    amplicon_plot_template["data"][args.sample] = amplicon_renamed_counts

    # write the amplicon plot output (the with block closes the file)
    with open("{}.amplicon_plot_data_mqc.json".format(args.sample), "w") as amplicon_plot_mqc_file:
        json.dump(amplicon_plot_template, amplicon_plot_mqc_file, indent=4, sort_keys=False)

    # collect the stats for this sample
    sample_stats = dict()
    sample_stats["# low cov. amplicons"] = dropouts

    # parse VCF report if provided and add to the stats
    if args.vcf_report:
        for stat, value in getVCFreportInfo(args.vcf_report).items():
            sample_stats[stat] = value

    # add stats to multiqc stats template
    amplicon_stats_template["data"][args.sample] = sample_stats

    # write the stats output (the with block closes the file)
    with open("{}.amplicon_stats_data_mqc.json".format(args.sample), "w") as amplicon_stats_mqc_file:
        json.dump(amplicon_stats_template, amplicon_stats_mqc_file, indent=4, sort_keys=False)

def main():
    """Parse the command line arguments and hand them to run()."""
    import argparse
    parser = argparse.ArgumentParser(description='Collect stats from ARTIC pipeline output and generate files for use by MultiQC')
    parser.add_argument('--scheme', required=True, type=str, help='the amplicon scheme used')
    parser.add_argument('--align-report', required=True, type=str, help='the report file from align_trim (*.alignreport.txt)')
    parser.add_argument('--vcf-report', required=False, type=str, help='the report file from vcf_check (*.vcfreport.txt)')
    parser.add_argument('sample', type=str, help='the sample name')
    args = parser.parse_args()
    run(args)


# allow the module to be executed directly as a script
if __name__ == "__main__":
    main()

12 changes: 9 additions & 3 deletions artic/minion.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,15 @@ def run(parser, args):
# 8) check and filter the VCFs
## if using strict, run the vcf checker to remove vars present only once in overlap regions (this replaces the original merged vcf from the previous step)
if args.strict:
cmds.append("artic-tools check_vcf --dropPrimerVars --dropOverlapFails --vcfOut %s.merged.filtered.vcf %s.merged.vcf %s 2> %s.vcfreport.txt" % (args.sample, args.sample, bed, args.sample))
cmds.append("bgzip -f %s.merged.vcf" % (args.sample))
cmds.append("tabix -p vcf %s.merged.vcf.gz" % (args.sample))
cmds.append("artic-tools check_vcf --dropPrimerVars --dropOverlapFails --vcfOut %s.merged.filtered.vcf %s.merged.vcf.gz %s 2> %s.vcfreport.txt" % (args.sample, args.sample, bed, args.sample))
cmds.append("mv %s.merged.filtered.vcf %s.merged.vcf" % (args.sample, args.sample))

## if doing the medaka workflow and longshot required, do it on the merged VCF
if args.medaka and not args.no_longshot:
cmds.append("bgzip -f %s.merged.vcf" % (args.sample))
cmds.append("tabix -p vcf %s.merged.vcf.gz" % (args.sample))
cmds.append("tabix -f -p vcf %s.merged.vcf.gz" % (args.sample))
cmds.append("longshot -P 0 -F -A --no_haps --bam %s.primertrimmed.rg.sorted.bam --ref %s --out %s.merged.vcf --potential_variants %s.merged.vcf.gz" % (args.sample, ref, args.sample, args.sample))

## set up some name holder vars for ease
Expand Down Expand Up @@ -220,7 +222,11 @@ def run(parser, args):
cmds.append("cat %s.consensus.fasta %s > %s.muscle.in.fasta" % (args.sample, ref, args.sample))
cmds.append("muscle -in %s.muscle.in.fasta -out %s.muscle.out.fasta" % (args.sample, args.sample))

# 12) setup the log file and run the pipeline commands
# 12) get some QC stats
if args.strict:
cmds.append("artic_get_stats --scheme {} --align-report {}.alignreport.txt --vcf-report {}.vcfreport.txt {}" .format(bed, args.sample, args.sample, args.sample))

# 13) setup the log file and run the pipeline commands
log = "%s.minion.log.txt" % (args.sample)
logfh = open(log, 'w')
for cmd in cmds:
Expand Down
2 changes: 1 addition & 1 deletion artic/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.2.0"
__version__ = "1.2.1"
6 changes: 5 additions & 1 deletion docs/commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ artic minion <scheme> <sample>
| scheme | Y | NA | The name of the primer scheme |
| sample | Y | NA | The name of the sample |
| --medaka | N | False | Use medaka instead of nanopolish for variants |
| --medaka-model | - | NA | Medaka model to use (required if --medaka set) |
| --medaka-model | * | NA | Medaka model to use (required if --medaka set) |
| --minimap2 | N | True | Use minimap2 |
| --bwa | N | False | Use bwa instead of minimap2 |
| --normalise | N | 100 | Normalise down to moderate coverage to save runtime |
Expand All @@ -245,8 +245,12 @@ artic minion <scheme> <sample>
| --fast5-directory | N | NA | FAST5 Directory |
| --sequencing-summary | N | NA | Path to Guppy sequencing summary |
| --skip-nanopolish | N | False | Skip nanopolish |
| --no-longshot | N | False | Use medaka variant instead of longshot (experimental feature from v1.2.0) |
| --strict | N | False | Enables experimental features (from v1.2.0), including VCF overlap checks and stats |
| --dry-run | N | False | Perform a dry run of the minion pipeline, outputting commands to a log but not executing them |

* `--medaka-model` is required if `--medaka` is set.

---

## rampart
Expand Down
4 changes: 2 additions & 2 deletions docs/minion.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Finally, the consensus sequence is aligned against the reference sequence using

| file name | description |
| -------------------------- | --------------------------------------------------------------------- |
| `$SAMPLE.*.png` | bar and box plots of amplicon coverage |
| `$SAMPLE.*_mqc.json` | stats files which MultiQC can use to make a report |
| `$SAMPLE.consensus.fasta` | the consensus sequence for the input sample |
| `$SAMPLE.muscle.out.fasta` | an alignment of the consensus sequence against the reference sequence |

Expand All @@ -129,7 +129,7 @@ Finally, the consensus sequence is aligned against the reference sequence using

## Optional pipeline report

As of version 1.2.0, you can run the artic fork of MultiQC (which should be installed as part of the artic conda environment) and this will produce a report containing amplicon coverage plots and variant call information. To generate a report from within your pipeline output directory:
As of version 1.2.1, if you run the pipeline with `--strict`, you can run MultiQC (which should be installed as part of the artic conda environment) on the pipeline output directory and this will produce a report containing amplicon coverage plots and variant call information. To generate a report from within your pipeline output directory:

```
multiqc .
Expand Down
3 changes: 1 addition & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- longshot=0.4.1
- medaka=1.0.3
- minimap2=2.17
- multiqc
- muscle=3.8
- nanopolish=0.13.2
- nomkl
Expand All @@ -25,5 +26,3 @@ dependencies:
- pyvcf=0.6.8
- samtools=1.10
- tqdm
- pip:
- git+https://github.com/will-rowe/MultiQC.git@artic
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
'artic_make_depth_mask=artic.make_depth_mask:main',
'artic_fasta_header=artic.fasta_header:main',
'artic_mask=artic.mask:main',
'artic_get_stats=artic.artic_mqc:main',
],
},
author_email="n.j.loman@bham.ac.uk",
Expand Down

0 comments on commit 23a8460

Please sign in to comment.