Move VCF functions to io, add io_support for shell_command_runner

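These VCF functions are IO-specific utilities, so move them out of `augur.utils` and into `augur.io`, where they belong. `run_shell_command` and `shquote` move along with them, and `shell_command_runner` is relocated from `augur/util_support` to `augur/io_support`.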
nextstrain · May 24, 2022 · 4279b66 · 4279b66
1 parent 88d155f
commit 4279b66
Show file tree

Hide file tree

Showing 12 changed files with 254 additions and 241 deletions.
diff --git a/augur/align.py b/augur/align.py
@@ -6,7 +6,8 @@
 from shutil import copyfile
 import numpy as np
 from Bio import AlignIO, SeqIO, Seq, Align
-from .utils import run_shell_command, nthreads_value, shquote
+from .io import run_shell_command, shquote
+from .utils import nthreads_value
 from collections import defaultdict
 
 class AlignmentError(Exception):

diff --git a/augur/filter.py b/augur/filter.py
@@ -19,8 +19,8 @@
 
 from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates
 from .index import index_sequences, index_vcf
-from .io import open_file, read_metadata, read_sequences, write_sequences
-from .utils import AugurError, is_vcf as filename_is_vcf, write_vcf, read_strains
+from .io import open_file, read_metadata, read_sequences, write_sequences, is_vcf as filename_is_vcf, write_vcf
+from .utils import AugurError, read_strains
 
 comment_char = '#'
 

diff --git a/augur/index.py b/augur/index.py
@@ -8,8 +8,7 @@
 import sys
 import csv
 
-from .io import open_file, read_sequences
-from .utils import is_vcf, read_vcf
+from .io import open_file, read_sequences, is_vcf, read_vcf
 
 
 def register_arguments(parser):

diff --git a/augur/io.py b/augur/io.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 """Interfaces for reading and writing data also known as input/output (I/O)
 """
+import os
+import shlex
 import Bio.SeqIO
 import Bio.SeqRecord
 import sys
@@ -9,6 +11,11 @@
 from pathlib import Path
 from xopen import xopen
 
+from augur.io_support.shell_command_runner import ShellCommandRunner
+
+
+# Short alias for shlex.quote; used below to escape values placed into shell command strings.
+shquote = shlex.quote
+
 
 @contextmanager
 def open_file(path_or_buffer, mode="r", **kwargs):
@@ -199,3 +206,164 @@ def write_sequences(sequences, path_or_buffer, format="fasta"):
 
 def print_err(*args):
     print(*args, file=sys.stderr)
+
+
+def is_vcf(filename):
+    """Report whether *filename* looks like a VCF file, judging by its extension.
+
+    The check is case-insensitive and accepts both plain (``.vcf``) and
+    gzip-compressed (``.vcf.gz``) names.  Falsy values such as ``None`` or the
+    empty string are never considered VCF files.
+
+    >>> is_vcf(None)
+    False
+    >>> is_vcf("./foo")
+    False
+    >>> is_vcf("./foo.vcf")
+    True
+    >>> is_vcf("./foo.vcf.GZ")
+    True
+    """
+    if not filename:
+        return False
+    # str.endswith accepts a tuple of candidate suffixes
+    return filename.lower().endswith(('.vcf', '.vcf.gz'))
+
+
+def read_vcf(filename):
+    """Read the sample names from the column header line of a VCF file.
+
+    *filename* is opened as UTF-8 text; names ending in ".gz" (any case) are
+    read through gzip.  The first line starting with "#C" is taken as the
+    column header, and every tab-separated column after "FORMAT" is treated
+    as a sample name.
+
+    Returns a tuple of two equal but independent lists of sample names.
+
+    Raises StopIteration when no line starts with "#C", and ValueError when
+    the header line has no "FORMAT" column.
+    """
+    if filename.lower().endswith(".gz"):
+        import gzip
+        handle = gzip.open(filename, mode="rt", encoding='utf-8')
+    else:
+        handle = open(filename, encoding='utf-8')
+
+    # The context manager closes the file even when the header line is
+    # missing and next() raises.
+    with handle as file:
+        chrom_line = next(line for line in file if line.startswith("#C"))
+
+    headers = chrom_line.strip().split("\t")
+    sequences = headers[headers.index("FORMAT") + 1:]
+
+    # because we need 'seqs to remove' for VCF
+    return sequences, sequences.copy()
+
+
+def write_vcf(input_filename, output_filename, dropped_samps):
+    """Write a copy of a VCF file with the given samples removed, using VCFTools.
+
+    Builds a ``vcftools ... --recode --stdout`` shell command that reads
+    *input_filename*, drops every sample named in *dropped_samps* and
+    redirects the result to *output_filename*.  A ".gz" input name selects
+    ``--gzvcf`` instead of ``--vcf``; a ".gz" output name pipes the result
+    through ``gzip -c``.  The command is printed and then run with
+    ``raise_errors=True``.
+    """
+    input_arg = "--gzvcf" if _filename_gz(input_filename) else "--vcf"
+    output_pipe = "| gzip -c" if _filename_gz(output_filename) else ""
+
+    call = ["vcftools"]
+    call.extend("--remove-indv " + shquote(samp) for samp in dropped_samps)
+    call.extend([input_arg, shquote(input_filename), "--recode --stdout", output_pipe, ">", shquote(output_filename)])
+    command = " ".join(call)
+
+    print("Filtering samples using VCFTools with the call:")
+    print(command)
+    run_shell_command(command, raise_errors = True)
+
+    # remove vcftools log file; it is fine if there is none to remove
+    try:
+        os.remove('out.log')
+    except OSError:
+        pass
+
+
+def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
+    """
+    Writes out a VCF-style file (which seems to be minimally handleable
+    by vcftools and pyvcf) of the AA differences between sequences and the reference.
+    This is a similar format created/used by read_in_vcf except that there is one
+    of these dicts (with sequences, reference, positions) for EACH gene.
+
+    Also writes out a fasta of the reference alignment.
+
+    prot_dict maps each gene/protein name to a dict with the keys 'sequences',
+    'reference' and 'positions'.  Each entry of 'sequences' is looked up by
+    0-based position and a KeyError is taken to mean "same as reference"
+    (written as '.'), so entries are presumably dicts of position -> residue.
+    The sample columns of the VCF header come from the first gene's 'sequences'.
+
+    If vcf_file_name ends in '.gz' (any case) the finished file is compressed
+    with the gzip command.
+
+    EBH 12 Dec 2017
+    """
+    #for the header
+    seqNames = list(prot_dict[list(prot_dict.keys())[0]]['sequences'].keys())
+
+    #prepare the header of the VCF & write out
+    header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames
+    with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
+        the_file.write( "##fileformat=VCFv4.2\n"+
+                        "##source=NextStrain_Protein_Translation\n"+
+                        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
+        the_file.write("\t".join(header)+"\n")
+
+    refWrite = []
+    vcfWrite = []
+
+    #go through for every gene/protein
+    for fname, prot in prot_dict.items():
+        sequences = prot['sequences']
+        ref = prot['reference']
+        positions = prot['positions']
+
+        #write out the reference fasta
+        refWrite.append(">"+fname)
+        refWrite.append(ref)
+
+        #go through every variable position
+        #There are no deletions here, so it's simpler than for VCF nuc sequences!
+        for pi in positions:
+            pos = pi+1 #change numbering to match VCF not python
+            refb = ref[pi] #reference base at this position
+
+            #one entry per sample; '.' where the sample has nothing recorded here
+            pattern = []
+            for seq in sequences.values():
+                try:
+                    pattern.append(seq[pi])
+                except KeyError:
+                    pattern.append('.')
+
+            #get the sorted list of ALTs - minus any '.'!
+            uniques = sorted(set(pattern) - {'.'})
+            if len(uniques)==0:
+                print("UNEXPECTED ERROR WHILE CONVERTING TO VCF AT POSITION {}".format(str(pi)))
+                break
+
+            #Convert bases to the number that matches the ALT. A plain mapping is
+            #used rather than overwriting a fixed-width string array in place,
+            #which would silently truncate numbers of 10 and above to one character.
+            alt_number = {alt: str(n) for n, alt in enumerate(uniques, start=1)}
+
+            #Now convert these calls to #/# (VCF format)
+            calls = [ '.' if res=='.' else alt_number[res]+"/"+alt_number[res] for res in pattern ]
+
+            #put it all together and write it out
+            output = [fname, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls
+
+            vcfWrite.append("\t".join(output))
+
+    #write it all out
+    with open(ref_file_name, 'w', encoding='utf-8') as the_file:
+        the_file.write("\n".join(refWrite))
+
+    with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
+        the_file.write("\n".join(vcfWrite))
+
+    if vcf_file_name.lower().endswith('.gz'):
+        import os
+        #must temporarily remove .gz ending, or gzip won't zip it!
+        os.rename(vcf_file_name, vcf_file_name[:-3])
+        call = ["gzip", vcf_file_name[:-3]]
+        run_shell_command(" ".join(call), raise_errors = True)
+
+
+def run_shell_command(cmd, raise_errors=False, extra_env=None):
+    """
+    Execute the command string *cmd* via Bash, with error checking.
+
+    The return value is True when the command exits normally, and False when
+    it exits with failure while "raise_errors" is False (the default).  With
+    "raise_errors" set to True, exceptions are rethrown instead.
+
+    An optional *extra_env* mapping supplies keys and values that are
+    overlayed onto the default subprocess environment.
+    """
+    runner = ShellCommandRunner(cmd, raise_errors=raise_errors, extra_env=extra_env)
+    return runner.run()
+
+
+def _filename_gz(filename):
+    """Return True when *filename* ends with ".gz", ignoring case."""
+    lowered = filename.lower()
+    return lowered.endswith(".gz")
diff --git a/augur/util_support/shell_command_runner.py → augur/io_support/shell_command_runner.py b/augur/util_support/shell_command_runner.py → augur/io_support/shell_command_runner.py
diff --git a/augur/mask.py b/augur/mask.py
@@ -10,8 +10,8 @@
 from Bio import SeqIO
 from Bio.Seq import MutableSeq
 
-from .io import open_file, read_sequences, write_sequences
-from .utils import run_shell_command, shquote, is_vcf, load_mask_sites, VALID_NUCLEOTIDES
+from .io import open_file, read_sequences, write_sequences, run_shell_command, shquote, is_vcf
+from .utils import load_mask_sites, VALID_NUCLEOTIDES
 
 def get_chrom_name(vcf_file):
     """Read the CHROM field from the first non-header line of a vcf file.

diff --git a/augur/translate.py b/augur/translate.py
@@ -5,7 +5,8 @@
 import os, sys
 import numpy as np
 from Bio import SeqIO, SeqFeature, Seq, SeqRecord, Phylo
-from .utils import read_node_data, load_features, write_json, write_VCF_translation, get_json_name
+from .io import write_VCF_translation
+from .utils import read_node_data, load_features, write_json, get_json_name
 from treetime.vcf_utils import read_vcf
 
 class MissingNodeError(Exception):

diff --git a/augur/tree.py b/augur/tree.py
@@ -15,8 +15,8 @@
 from treetime.vcf_utils import read_vcf
 from pathlib import Path
 
-from .io import read_sequences
-from .utils import run_shell_command, nthreads_value, shquote, load_mask_sites
+from .io import read_sequences, run_shell_command, shquote
+from .utils import nthreads_value, load_mask_sites
 
 DEFAULT_ARGS = {
     "fasttree": "-nt -nosupport",