Merge pull request #68 from artic-network/1.3.0-dev

patch 1.2.0 > 1.2.1
artic-network · Jan 14, 2021 · 23a8460 · 23a8460
2 parents d5a0cb8 + 90cc68c
commit 23a8460
Show file tree

Hide file tree

Showing 7 changed files with 229 additions and 9 deletions.
diff --git a/artic/artic_mqc.py b/artic/artic_mqc.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+import json
+import re
+import sys
+from collections import OrderedDict
+
+from .vcftagprimersites import read_bed_file
+
+# Alignment_Length_Threshold is the fraction (0.95 == 95%) that getAmpliconCounts
+# uses when comparing a read's alignment length against the amplicon length;
+# reads failing that comparison are left out of the per-amplicon counts
+Alignment_Length_Threshold = 0.95
+
+# Amplicon_Dropout_Val will report amplicon dropout in any amplicon which has fewer than X reads
+Amplicon_Dropout_Val = 50
+
+# Template for the amplicon plot data. The "data" entry is filled in per sample
+# by run() before the template is dumped to JSON.
+amplicon_plot_template = {
+    "id": "custom_data_lineplot",
+    "section_name": "ARTIC: Amplicon Coverage",
+    # the threshold is stored as a fraction, so render it with the "%" format
+    # type ("<95%") rather than printing the raw value followed by "%" ("<0.95%")
+    "description": (
+        "This plot summarises the number of reads that were assigned to each amplicon in the primer scheme.\n"
+        "We use the align_trim report file from the ARTIC pipeline and group each read by its assigned amplicon.\n"
+        "If the length of alignment between read and reference is <{:.0%} of the amplicon length, the read is discarded from the coverage plot.\n"
+        "If the total number of reads assigned to an amplicon is below {} (red dashed line),\n"
+        "the amplicon is marked as dropped out."
+    ).format(Alignment_Length_Threshold, Amplicon_Dropout_Val),
+    "plot_type": "linegraph",
+    "pconfig": {
+        "id": "custom_data_linegraph",
+        "title": "",
+        # NOTE(review): the three flags below are strings, not booleans, so they
+        # are written to the JSON as the text "True"/"False" - confirm that this
+        # is what MultiQC expects
+        "categories": "True",
+        "yDecimals": "False",
+        "xDecimals": "False",
+        "ylab": "# reads",
+        "xlab": "amplicon",
+        # horizontal marker drawn at the dropout threshold
+        "yPlotLines": [{
+            "color": "#FF0000",
+            "width": 2,
+            "dashStyle": "LongDash",
+            "label": "Amplicon dropout",
+            "value": Amplicon_Dropout_Val
+        }]
+    },
+    "data": {}
+}
+
+# Template for the stats table data. The "data" entry is filled in per sample
+# by run() before the template is dumped to JSON.
+amplicon_stats_template = {
+    "id": "custom_data_json_table",
+    "section_name": "ARTIC: General Stats",
+    "description": "A summary of stats from the consensus genome pipeline.",
+    "plot_type": "table",
+    "pconfig": {
+        "id": "custom_data_json_table_table",
+        "title": "",
+        "min": 0,
+        "scale": "RdYlGn-rev",
+        "format": "{:,.0f}"
+    },
+    "data": {}
+}
+
+def getSchemeAmplicons(schemeFile):
+    """Get the expected amplicon names from the provided scheme.
+
+    Parameters
+    ----------
+    schemeFile : string
+        The filename of the primer scheme
+
+    Returns
+    -------
+    dict
+        A dict of amplicon names -> zeroed counter, where each name has the
+        form "<amplicon>_LEFT_<amplicon>_RIGHT"
+
+    Raises
+    ------
+    SystemExit
+        If any amplicon does not have exactly two primers in the scheme
+    """
+    # count how many primers the scheme lists for each amplicon
+    primer_counts = {}
+    for primer in read_bed_file(schemeFile):
+        # the amplicon name is whatever precedes the _LEFT/_RIGHT suffix of the
+        # primer ID; any direction other than "+" is treated as a RIGHT primer
+        suffix = "_LEFT" if primer["direction"] == "+" else "_RIGHT"
+        amplicon = primer["Primer_ID"].split(suffix)[0]
+        primer_counts[amplicon] = primer_counts.get(amplicon, 0) + 1
+
+    # each amplicon must be described by exactly two primers
+    named_amplicons = {}
+    for amplicon, num_primers in primer_counts.items():
+        if num_primers != 2:
+            print("incorrect number of primers for {} (expected 2, found {})" .format(amplicon, num_primers), file=sys.stderr)
+            raise SystemExit(1)
+        named_amplicons["{}_LEFT_{}_RIGHT" .format(amplicon, amplicon)] = 0
+    return named_amplicons
+
+def getAmpliconCounts(amplicons, align_trim_report):
+    """Get the read counts per amplicon.
+
+    Parameters
+    ----------
+    amplicons : dict
+        Dict of amplicon names found in scheme, linked to a zeroed counter
+        (the counters are updated in place)
+
+    align_trim_report: string
+        File path to the align_trim report (tab separated, one header line)
+
+    Returns
+    -------
+    dict
+        The same dict that was passed in: amplicon names -> populated read counts
+
+    Raises
+    ------
+    SystemExit
+        If a counted read names an amplicon that is not in the primer scheme
+    """
+    # process the align_trim report
+    with open(align_trim_report, "r") as fh:
+
+        # skip the first line (header)
+        fh.readline()
+
+        # process each line and add to counts
+        for line in fh:
+
+            # tolerate blank lines (e.g. a trailing newline at the end of the file)
+            if not line.strip():
+                continue
+            fields = line.rstrip().split('\t')
+
+            # check read is from a properly paired amplicon
+            if int(fields[12]) != 1:
+                continue
+
+            # check the read alignment length covers enough of the amplicon
+            # NOTE(review): the report's column layout is not visible here - confirm
+            # which of columns 10-11 and 1-2 hold the amplicon and the read
+            # coordinates, and that the direction of this comparison matches the
+            # documented "<X% of amplicon length" intent
+            aLen = int(fields[11]) - int(fields[10])
+            rLen = int(fields[2]) - int(fields[1])
+            if aLen < (Alignment_Length_Threshold * rLen):
+                continue
+
+            # increment the read count for this amplicon
+            if fields[3] not in amplicons:
+                print("amplicon in align_trim report but not in primer scheme {}" .format(fields[3]), file=sys.stderr)
+                raise SystemExit(1)
+            amplicons[fields[3]] += 1
+    return amplicons
+
+def getVCFreportInfo(vcf_report):
+    """Get summary stats from the vcf_check report.
+
+    Parameters
+    ----------
+    vcf_report: string
+        File path to the vcf_report
+
+    Returns
+    -------
+    dict
+        Dict of vcf stats -> values. Currently holds the single key
+        "# overlap var. fails": the number of variant records processed minus
+        the number that passed checks (either count is 0 if its line is absent)
+    """
+    # Read vcfcheck report and get important stuff out (NOTE: more to be added in next release)
+    # compile the patterns once rather than on every line of the report
+    processed_pattern = re.compile(r'.*\t(\d+)\svariant\srecords\sprocessed')
+    passed_pattern = re.compile(r'.*\t(\d+)\svariant\srecords\spassed\schecks')
+    total_vars = 0
+    passed_vars = 0
+    with open(vcf_report, "r") as fh:
+        for line in fh:
+            # if a pattern matches on several lines, the last occurrence wins
+            match = processed_pattern.search(line)
+            if match:
+                total_vars = int(match.group(1))
+            match = passed_pattern.search(line)
+            if match:
+                passed_vars = int(match.group(1))
+    return {"# overlap var. fails": total_vars - passed_vars}
+
+def run(args):
+    """Collect stats from ARTIC pipeline output and generate files for use by MultiQC.
+
+    Writes "<sample>.amplicon_plot_data_mqc.json" and
+    "<sample>.amplicon_stats_data_mqc.json" to the current working directory.
+
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Parsed arguments providing scheme, align_report, sample and
+        vcf_report (the latter may be unset, in which case it is skipped)
+    """
+    # get a list of expected amplicon names
+    amplicons = getSchemeAmplicons(args.scheme)
+
+    # open align trim output and count reads per amplicon in scheme
+    amplicon_counts = getAmpliconCounts(amplicons, args.align_report)
+
+    # replace amplicon names with ints and count number of dropouts
+    dropouts = 0
+    amplicon_renamed_counts = dict()
+    for amplicon, count in amplicon_counts.items():
+        # the second "_"-separated token of the amplicon name is used as its number
+        amplicon_renamed_counts[int(amplicon.split('_')[1])] = count
+        if count < Amplicon_Dropout_Val:
+            dropouts += 1
+
+    # add counts to multiqc amplicon plot template
+    amplicon_plot_template["data"][args.sample] = amplicon_renamed_counts
+
+    # write the amplicon plot output (the with block closes the file)
+    with open("{}.amplicon_plot_data_mqc.json" .format(args.sample), "w") as amplicon_plot_mqc_file:
+        json.dump(amplicon_plot_template, amplicon_plot_mqc_file, indent=4, sort_keys=False)
+
+    # add counts to multiqc stats template
+    amplicon_stats_template["data"][args.sample] = dict()
+    amplicon_stats_template["data"][args.sample]["# low cov. amplicons"] = dropouts
+
+    # parse VCF report if provided and add to the stats template
+    if args.vcf_report:
+        for stat, value in getVCFreportInfo(args.vcf_report).items():
+            amplicon_stats_template["data"][args.sample][stat] = value
+
+    # write the stats output (the with block closes the file)
+    with open("{}.amplicon_stats_data_mqc.json" .format(args.sample), "w") as amplicon_stats_mqc_file:
+        json.dump(amplicon_stats_template, amplicon_stats_mqc_file, indent=4, sort_keys=False)
+
+def main():
+    """Parse the command line arguments and hand them to run()."""
+    import argparse
+    parser = argparse.ArgumentParser(description='Collect stats from ARTIC pipeline output and generate files for use by MultiQC')
+    parser.add_argument('--scheme', required=True, type=str, help='the amplicon scheme used')
+    parser.add_argument('--align-report', required=True, type=str, help='the report file from align_trim (*.alignreport.txt)')
+    parser.add_argument('--vcf-report', required=False, type=str, help='the report file from vcf_check (*.vcfreport.txt)')
+    parser.add_argument('sample', type=str, help='the sample name')
+    args = parser.parse_args()
+    run(args)
+
+# allow the module to be executed directly as a script
+if __name__ == "__main__":
+    main()
+
diff --git a/artic/minion.py b/artic/minion.py
@@ -183,13 +183,15 @@ def run(parser, args):
     # 8) check and filter the VCFs
     ## if using strict, run the vcf checker to remove vars present only once in overlap regions (this replaces the original merged vcf from the previous step)
     if args.strict:
-        cmds.append("artic-tools check_vcf --dropPrimerVars --dropOverlapFails --vcfOut %s.merged.filtered.vcf %s.merged.vcf %s 2> %s.vcfreport.txt" % (args.sample, args.sample, bed, args.sample))
+        cmds.append("bgzip -f %s.merged.vcf" % (args.sample))
+        cmds.append("tabix -p vcf %s.merged.vcf.gz" % (args.sample))
+        cmds.append("artic-tools check_vcf --dropPrimerVars --dropOverlapFails --vcfOut %s.merged.filtered.vcf %s.merged.vcf.gz %s 2> %s.vcfreport.txt" % (args.sample, args.sample, bed, args.sample))
         cmds.append("mv %s.merged.filtered.vcf %s.merged.vcf" % (args.sample, args.sample))
 
     ## if doing the medaka workflow and longshot required, do it on the merged VCF
     if args.medaka and not args.no_longshot:
         cmds.append("bgzip -f %s.merged.vcf" % (args.sample))
-        cmds.append("tabix -p vcf %s.merged.vcf.gz" % (args.sample))
+        cmds.append("tabix -f -p vcf %s.merged.vcf.gz" % (args.sample))
         cmds.append("longshot -P 0 -F -A --no_haps --bam %s.primertrimmed.rg.sorted.bam --ref %s --out %s.merged.vcf --potential_variants %s.merged.vcf.gz" % (args.sample, ref, args.sample, args.sample))
 
     ## set up some name holder vars for ease
@@ -220,7 +222,11 @@ def run(parser, args):
     cmds.append("cat %s.consensus.fasta %s > %s.muscle.in.fasta" % (args.sample, ref, args.sample))
     cmds.append("muscle -in %s.muscle.in.fasta -out %s.muscle.out.fasta" % (args.sample, args.sample))
 
-    # 12) setup the log file and run the pipeline commands
+    # 12) get some QC stats
+    if args.strict:
+        cmds.append("artic_get_stats --scheme {} --align-report {}.alignreport.txt --vcf-report {}.vcfreport.txt {}" .format(bed, args.sample, args.sample, args.sample))
+
+    # 13) setup the log file and run the pipeline commands
     log = "%s.minion.log.txt" % (args.sample)
     logfh = open(log, 'w')
     for cmd in cmds:

diff --git a/artic/version.py b/artic/version.py
@@ -1 +1 @@
-__version__ = "1.2.0"
+__version__ = "1.2.1"
diff --git a/docs/commands.md b/docs/commands.md
@@ -234,7 +234,7 @@ artic minion <scheme> <sample>
 | scheme               | Y        | NA             | The name of the primer scheme                                                                |
 | sample               | Y        | NA             | The name of the sample                                                                       |
 | --medaka             | N        | False          | Use medaka instead of nanopolish for variants                                                |
-| --medaka-model       | -        | NA             | Medaka model to use (required if --medaka set)                                               |
+| --medaka-model       | *        | NA             | Medaka model to use (required if --medaka set)                                               |
 | --minimap2           | N        | True           | Use minimap2                                                                                 |
 | --bwa                | N        | False          | Use bwa instead of minimap2                                                                  |
 | --normalise          | N        | 100            | Normalise down to moderate coverage to save runtime                                          |
@@ -245,8 +245,12 @@ artic minion <scheme> <sample>
 | --fast5-directory    | N        | NA             | FAST5 Directory                                                                              |
 | --sequencing-summary | N        | NA             | Path to Guppy sequencing summary                                                             |
 | --skip-nanopolish    | N        | False          | Skip nanopolish                                                                              |
+| --no-longshot        | N        | False          | Use medaka variant instead of longshot (experimental feature from v1.2.0)                    |
+| --strict             | N        | False          | Enables experimental features (from v1.2.0), including VCF overlap checks and stats          |
 | --dry-run            | N        | False          | Perform a dry run of the minion pipeline, outputing commands to a log but not executing them |
 
+* `--medaka-model` is required if `--medaka` is set.
+
 ---
 
 ## rampart

diff --git a/docs/minion.md b/docs/minion.md
@@ -112,7 +112,7 @@ Finally, the consensus sequence is aligned against the reference sequence using
 
 | file name                  | description                                                           |
 | -------------------------- | --------------------------------------------------------------------- |
-| `$SAMPLE.*.png`            | bar and box plots of amplicon coverage                                |
+| `$SAMPLE.*_mqc.json`       | stats files which MultiQC can use to make a report                    |
 | `$SAMPLE.consensus.fasta`  | the consensus sequence for the input sample                           |
 | `$SAMPLE.muscle.out.fasta` | an alignment of the consensus sequence against the reference sequence |
 
@@ -129,7 +129,7 @@ Finally, the consensus sequence is aligned against the reference sequence using
 
 ## Optional pipeline report
 
-As of version 1.2.0, you can run the artic fork of MultiQC (which should be installed as part of the artic conda environment) and this will produce a report containing amplicon coverage plots and variant call information. To generate a report from within your pipeline output directory:
+As of version 1.2.1, if you run the pipeline with `--strict`, you can run MultiQC (which should be installed as part of the artic conda environment) on the pipeline output directory and this will produce a report containing amplicon coverage plots and variant call information. To generate a report from within your pipeline output directory:
 
 ```
 multiqc .

diff --git a/environment.yml b/environment.yml
@@ -14,6 +14,7 @@ dependencies:
   - longshot=0.4.1
   - medaka=1.0.3
   - minimap2=2.17
+  - multiqc
   - muscle=3.8
   - nanopolish=0.13.2
   - nomkl
@@ -25,5 +26,3 @@ dependencies:
   - pyvcf=0.6.8
   - samtools=1.10
   - tqdm
-  - pip:
-      - git+https://github.com/will-rowe/MultiQC.git@artic
diff --git a/setup.py b/setup.py
@@ -40,6 +40,7 @@
             'artic_make_depth_mask=artic.make_depth_mask:main',
             'artic_fasta_header=artic.fasta_header:main',
             'artic_mask=artic.mask:main',
+            'artic_get_stats=artic.artic_mqc:main',
         ],
     },
     author_email="n.j.loman@bham.ac.uk",