Skip to content

Commit

Permalink
Move VCF functions to io, add io_support for shell_command_runner
Browse files Browse the repository at this point in the history
These VCF functions are specifically IO-related utilities, so moving them to a better scope.
  • Loading branch information
victorlin committed May 24, 2022
1 parent 88d155f commit 4279b66
Show file tree
Hide file tree
Showing 12 changed files with 254 additions and 241 deletions.
3 changes: 2 additions & 1 deletion augur/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from shutil import copyfile
import numpy as np
from Bio import AlignIO, SeqIO, Seq, Align
from .utils import run_shell_command, nthreads_value, shquote
from .io import run_shell_command, shquote
from .utils import nthreads_value
from collections import defaultdict

class AlignmentError(Exception):
Expand Down
4 changes: 2 additions & 2 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@

from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates
from .index import index_sequences, index_vcf
from .io import open_file, read_metadata, read_sequences, write_sequences
from .utils import AugurError, is_vcf as filename_is_vcf, write_vcf, read_strains
from .io import open_file, read_metadata, read_sequences, write_sequences, is_vcf as filename_is_vcf, write_vcf
from .utils import AugurError, read_strains

comment_char = '#'

Expand Down
3 changes: 1 addition & 2 deletions augur/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import sys
import csv

from .io import open_file, read_sequences
from .utils import is_vcf, read_vcf
from .io import open_file, read_sequences, is_vcf, read_vcf


def register_arguments(parser):
Expand Down
168 changes: 168 additions & 0 deletions augur/io.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python3
"""Interfaces for reading and writing data also known as input/output (I/O)
"""
import os
import shlex
import Bio.SeqIO
import Bio.SeqRecord
import sys
Expand All @@ -9,6 +11,11 @@
from pathlib import Path
from xopen import xopen

from augur.io_support.shell_command_runner import ShellCommandRunner


shquote = shlex.quote


@contextmanager
def open_file(path_or_buffer, mode="r", **kwargs):
Expand Down Expand Up @@ -199,3 +206,164 @@ def write_sequences(sequences, path_or_buffer, format="fasta"):

def print_err(*args):
    """Print the given values to standard error.

    Accepts the same positional values as the built-in ``print`` and
    formats them the same way; only the destination stream differs.
    """
    stream = sys.stderr
    print(*args, file=stream)


def is_vcf(filename):
    """Report whether a filename has a VCF extension.

    The comparison is case-insensitive and accepts both plain (``.vcf``)
    and gzip-compressed (``.vcf.gz``) names. Empty or missing filenames
    are never considered VCF files.

    >>> is_vcf(None)
    False
    >>> is_vcf("./foo")
    False
    >>> is_vcf("./foo.vcf")
    True
    >>> is_vcf("./foo.vcf.GZ")
    True
    """
    if not filename:
        return False
    return filename.lower().endswith(('.vcf', '.vcf.gz'))


def read_vcf(filename):
    """Read the sample (sequence) names from the header of a VCF file.

    The file is scanned for the first line starting with ``#C`` (the
    ``#CHROM`` column header); every column after ``FORMAT`` is taken to be
    a sample name. Files whose name ends in ``.gz`` (case-insensitive) are
    read through ``gzip``.

    Parameters
    ----------
    filename : str
        Path to a plain or gzip-compressed VCF file.

    Returns
    -------
    tuple of (list of str, list of str)
        The sample names, twice: the second list is an independent copy so
        callers can mutate one without affecting the other.

    Raises
    ------
    StopIteration
        If the file contains no line starting with ``#C``.
    ValueError
        If the header line has no ``FORMAT`` column.
    """
    if filename.lower().endswith(".gz"):
        import gzip
        handle = gzip.open(filename, mode="rt", encoding='utf-8')
    else:
        handle = open(filename, encoding='utf-8')

    # Use a context manager so the file is closed even when no header line
    # is found and next() raises.
    with handle as file:
        chrom_line = next(line for line in file if line.startswith("#C"))

    headers = chrom_line.strip().split("\t")
    sequences = headers[headers.index("FORMAT") + 1:]

    # because we need 'seqs to remove' for VCF
    return sequences, sequences.copy()


def write_vcf(input_filename, output_filename, dropped_samps):
    """Write a copy of a VCF file with the given samples removed, using vcftools.

    Builds a ``vcftools ... --recode --stdout`` shell command that drops
    every sample in *dropped_samps*, redirects the result to
    *output_filename* (piped through ``gzip -c`` when that name ends in
    ``.gz``), prints the command, and runs it with errors raised.

    Parameters
    ----------
    input_filename : str
        Source VCF; read with ``--gzvcf`` when the name ends in ``.gz``,
        otherwise with ``--vcf``.
    output_filename : str
        Destination path for the filtered VCF.
    dropped_samps : iterable of str
        Sample names to remove (each passed as ``--remove-indv``).
    """
    input_arg = "--gzvcf" if _filename_gz(input_filename) else "--vcf"
    output_pipe = "| gzip -c" if _filename_gz(output_filename) else ""

    call = ["vcftools"]
    call.extend("--remove-indv " + shquote(sample) for sample in dropped_samps)
    call += [
        input_arg,
        shquote(input_filename),
        "--recode --stdout",
        output_pipe,
        ">",
        shquote(output_filename),
    ]
    command = " ".join(call)

    print("Filtering samples using VCFTools with the call:")
    print(command)
    run_shell_command(command, raise_errors=True)

    # Best-effort removal of the vcftools log file; it is fine if it is
    # absent or cannot be deleted.
    try:
        os.remove('out.log')
    except OSError:
        pass


def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
    """
    Writes out a VCF-style file (which seems to be minimally handleable
    by vcftools and pyvcf) of the AA differences between sequences and the reference.
    This is a similar format created/used by read_in_vcf except that there is one
    of these dicts (with sequences, reference, positions) for EACH gene.
    Also writes out a fasta of the reference alignment.

    Parameters
    ----------
    prot_dict : dict
        Maps each gene/protein name to a dict with the keys ``'sequences'``,
        ``'reference'`` and ``'positions'``. ``'sequences'`` maps a sample
        name to a mapping indexed by position (a missing position is written
        as ``.``); ``'reference'`` is indexable by position; ``'positions'``
        holds the 0-based variable positions.
        The sample columns of the VCF header are taken from the first gene.
    vcf_file_name : str
        Output VCF path. If it ends in ``.gz`` (case-insensitive) the file
        is compressed with the ``gzip`` command after writing.
    ref_file_name : str
        Output path for the reference FASTA.

    EBH 12 Dec 2017
    """
    # for the header
    seqNames = list(list(prot_dict.values())[0]['sequences'].keys())

    # prepare the header of the VCF & write out
    header = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + seqNames
    with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
        the_file.write("##fileformat=VCFv4.2\n" +
                       "##source=NextStrain_Protein_Translation\n" +
                       "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
        the_file.write("\t".join(header) + "\n")

    refWrite = []
    vcfWrite = []

    # go through for every gene/protein
    for fname, prot in prot_dict.items():
        sequences = prot['sequences']
        ref = prot['reference']
        positions = prot['positions']

        # write out the reference fasta
        refWrite.append(">" + fname)
        refWrite.append(ref)

        # go through every variable position
        # There are no deletions here, so it's simpler than for VCF nuc sequences!
        for pi in positions:
            pos = pi + 1  # change numbering to match VCF not python
            refb = ref[pi]  # reference base at this position

            # try/except is (much) faster than list comprehension!
            pattern = []
            for seq in sequences.values():
                try:
                    pattern.append(seq[pi])
                except KeyError:
                    pattern.append('.')

            # get the sorted list of ALTs - minus any '.'!
            uniques = sorted(set(pattern) - {'.'})
            if not uniques:
                print("UNEXPECTED ERROR WHILE CONVERTING TO VCF AT POSITION {}".format(str(pi)))
                # NOTE: this abandons the remaining positions of this gene.
                break

            # Map each ALT to its 1-based number. A dict lookup (rather than
            # in-place replacement in a fixed-width string array) keeps
            # multi-digit numbers such as "10" intact.
            alt_number = {alt: str(number) for number, alt in enumerate(uniques, start=1)}

            # Now convert these calls to #/# (VCF format)
            calls = [
                '.' if aa == '.' else alt_number[aa] + "/" + alt_number[aa]
                for aa in pattern
            ]

            # put it all together and write it out
            output = [fname, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls

            vcfWrite.append("\t".join(output))

    # write it all out
    with open(ref_file_name, 'w', encoding='utf-8') as the_file:
        the_file.write("\n".join(refWrite))

    with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
        the_file.write("\n".join(vcfWrite))

    if vcf_file_name.lower().endswith('.gz'):
        # must temporarily remove .gz ending, or gzip won't zip it!
        unzipped_name = vcf_file_name[:-3]
        os.rename(vcf_file_name, unzipped_name)
        # Quote the path so spaces or shell metacharacters cannot break
        # (or inject into) the command line.
        call = ["gzip", shlex.quote(unzipped_name)]
        run_shell_command(" ".join(call), raise_errors=True)


def run_shell_command(cmd, raise_errors=False, extra_env=None):
    """
    Run the given command string via Bash with error checking.

    Returns True if the command exits normally. Returns False if the command
    exits with failure and "raise_errors" is False (the default). When
    "raise_errors" is True, exceptions are rethrown.

    If an *extra_env* mapping is passed, the provided keys and values are
    overlayed onto the default subprocess environment.

    The work is delegated to a ``ShellCommandRunner`` built from the same
    arguments.
    """
    runner = ShellCommandRunner(cmd, raise_errors=raise_errors, extra_env=extra_env)
    return runner.run()


def _filename_gz(filename):
    """Return True when *filename* ends in ``.gz``, ignoring case."""
    lowered = filename.lower()
    return lowered.endswith(".gz")
File renamed without changes.
4 changes: 2 additions & 2 deletions augur/mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
from Bio import SeqIO
from Bio.Seq import MutableSeq

from .io import open_file, read_sequences, write_sequences
from .utils import run_shell_command, shquote, is_vcf, load_mask_sites, VALID_NUCLEOTIDES
from .io import open_file, read_sequences, write_sequences, run_shell_command, shquote, is_vcf
from .utils import load_mask_sites, VALID_NUCLEOTIDES

def get_chrom_name(vcf_file):
"""Read the CHROM field from the first non-header line of a vcf file.
Expand Down
3 changes: 2 additions & 1 deletion augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import os, sys
import numpy as np
from Bio import SeqIO, SeqFeature, Seq, SeqRecord, Phylo
from .utils import read_node_data, load_features, write_json, write_VCF_translation, get_json_name
from .io import write_VCF_translation
from .utils import read_node_data, load_features, write_json, get_json_name
from treetime.vcf_utils import read_vcf

class MissingNodeError(Exception):
Expand Down
4 changes: 2 additions & 2 deletions augur/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from treetime.vcf_utils import read_vcf
from pathlib import Path

from .io import read_sequences
from .utils import run_shell_command, nthreads_value, shquote, load_mask_sites
from .io import read_sequences, run_shell_command, shquote
from .utils import nthreads_value, load_mask_sites

DEFAULT_ARGS = {
"fasttree": "-nt -nosupport",
Expand Down
Loading

0 comments on commit 4279b66

Please sign in to comment.