Skip to content

Commit

Permalink
[translate] refactor sequence extraction
Browse files Browse the repository at this point in the history
Simplifies the reading of the main function, and follows the established
pattern in this file of using a pair of functions (one for VCF, one for
JSON/FASTA). Added docstrings. The only code change is to throw an
AugurError rather than `return 1`, but the result is unchanged.
  • Loading branch information
jameshadfield committed Dec 19, 2023
1 parent efb43b0 commit 0a01c59
Showing 1 changed file with 34 additions and 19 deletions.
53 changes: 34 additions & 19 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,34 @@ def get_genes_from_file(fname):

return unique_genes

def sequences_vcf(reference_fasta, vcf):
"""
Extract the nucleotide variation in the VCF
Returns a tuple
[0] The sequences as a dict of dicts. sequences → <NODE_NAME> → <POS> → <ALT_NUC> where <POS> is a 0-based int
[1] The sequence of the provided `reference_fasta` (string)
"""
if not reference_fasta:
raise AugurError("A reference Fasta is required with VCF-format input")
compress_seq = read_vcf(vcf, reference_fasta)
sequences = compress_seq['sequences']
ref = compress_seq['reference']
return (sequences, ref)

def sequences_json(node_data_json, tree):
"""
Extract the full nuc sequence for each node in the provided node-data JSON.
Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
"""
node_data = read_node_data(node_data_json, tree)
if node_data is None:
raise AugurError("could not read node data (incl sequences)")
# extract sequences from node meta data
sequences = {}
for k,v in node_data['nodes'].items():
if 'sequence' in v:
sequences[k] = v['sequence']
return sequences

def register_parser(parent_subparsers):
parser = parent_subparsers.add_parser("translate", help=__doc__)
Expand Down Expand Up @@ -358,27 +386,14 @@ def run(args):
genes = args.genes

## check file format and read in sequences
is_vcf = False
if any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
if not args.vcf_reference:
print("ERROR: a reference Fasta is required with VCF-format input")
return 1
compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
sequences = compress_seq['sequences']
ref = compress_seq['reference']
is_vcf = True
is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
check_arg_combinations(args, is_vcf)

if is_vcf:
(sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
else:
node_data = read_node_data(args.ancestral_sequences, args.tree)
if node_data is None:
print("ERROR: could not read node data (incl sequences)")
return 1
# extract sequences from node meta data
sequences = {}
for k,v in node_data['nodes'].items():
if 'sequence' in v:
sequences[k] = v['sequence']
sequences = sequences_json(args.ancestral_sequences, args.tree)

check_arg_combinations(args, is_vcf)

## load features; only requested features if genes given
features = load_features(args.reference_sequence, genes)
Expand Down

0 comments on commit 0a01c59

Please sign in to comment.