Skip to content

Commit

Permalink
[translate] refactor & improve readability
Browse files Browse the repository at this point in the history
There should be no functional changes. Co-locating the sequence reading
& feature extraction is easier to read, and were Python's scoping to be
different it'd be even nicer as we wouldn't leave around variables which
are never re-used.

As part of this `translate_vcf_feature` has changed from using an
undocumented `return None` to raising a (documented) error which (IMO)
is easier to reason with.
  • Loading branch information
jameshadfield committed Dec 19, 2023
1 parent 0a01c59 commit 03c1965
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 39 deletions.
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
* ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield)
* Incompatible arguments are now checked, especially related to VCF vs FASTA inputs.
* `--vcf-reference` and `--root-sequence` are now mutually exclusive.

* translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield)

### Bug Fixes

* translate: The 'source' ID for GFF files is now ignored as a potential gene feature (it is still used for overall nuc coords). [#1348][] (@jameshadfield)
* translate: Improvements to command line arguments. [#1348][] (@jameshadfield)
* `--tree` and `--ancestral-sequences` are now required arguments.
* separate VCF-only arguments into their own group
Expand Down
72 changes: 40 additions & 32 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,17 @@
from .utils import read_node_data, load_features, write_json, get_json_name
from treetime.vcf_utils import read_vcf
from augur.errors import AugurError
from textwrap import dedent

class MissingNodeError(Exception):
pass

class MismatchNodeError(Exception):
pass

class NoVariationError(Exception):
pass

def safe_translate(sequence, report_exceptions=False):
"""Returns an amino acid translation of the given nucleotide sequence accounting
for gaps in the given sequence.
Expand Down Expand Up @@ -125,7 +129,7 @@ def translate_feature(aln, feature):
return translations


def translate_vcf_feature(sequences, ref, feature):
def translate_vcf_feature(sequences, ref, feature, feature_name):
'''Translates a subsequence of input nucleotide sequences.
Parameters
Expand All @@ -145,6 +149,9 @@ def translate_vcf_feature(sequences, ref, feature):
translated reference gene, positions of AA differences, and AA
differences indexed by node name
Raises
------
NoVariationError : if no variable sites within this feature (across all sequences)
'''
def str_reverse_comp(str_seq):
#gets reverse-compliment of a string and returns it as a string
Expand All @@ -161,7 +168,7 @@ def str_reverse_comp(str_seq):
# Need to get ref translation to store. check if multiple of 3 for sanity.
# will be padded in safe_translate if not
if len(refNuc)%3:
print("Gene length of {} is not a multiple of 3. will pad with N".format(feature.qualifiers['Name'][0]), file=sys.stderr)
print(f"Gene length of {feature_name!r} is not a multiple of 3. will pad with N", file=sys.stderr)

ref_aa_seq = safe_translate(refNuc)
prot['reference'] = ref_aa_seq
Expand Down Expand Up @@ -205,11 +212,10 @@ def str_reverse_comp(str_seq):

prot['positions'].sort()

#if no variable sites, exclude this gene
# raise an error if no variable sites observed
if len(prot['positions']) == 0:
return None
else:
return prot
raise NoVariationError()
return prot

def construct_mut(start, pos, end):
return str(start) + str(pos) + str(end)
Expand Down Expand Up @@ -334,14 +340,22 @@ def sequences_json(node_data_json, tree):
Extract the full nuc sequence for each node in the provided node-data JSON.
Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
"""
node_data = read_node_data(node_data_json, tree)
node_data = read_node_data(node_data_json)
if node_data is None:
raise AugurError("could not read node data (incl sequences)")
# extract sequences from node meta data
sequences = {}
for k,v in node_data['nodes'].items():
if 'sequence' in v:
sequences[k] = v['sequence']
tree_nodes = {c.name for c in tree.find_clades()}
tree_nodes_missing_sequences = tree_nodes - set(sequences.keys())
if len(tree_nodes_missing_sequences):
raise AugurError(dedent(f"""\
{len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON.
These must be present under 'nodes' → <node_name> → 'sequence'.
This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here.
"""))
return sequences

def register_parser(parent_subparsers):
Expand Down Expand Up @@ -378,46 +392,40 @@ def check_arg_combinations(args, is_vcf):
def run(args):
## read tree and data, if reading data fails, return with error code
tree = Phylo.read(args.tree, 'newick')
is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
check_arg_combinations(args, is_vcf)

# If genes is a file, read in the genes to translate
if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
genes = get_genes_from_file(args.genes[0])
else:
genes = args.genes

## check file format and read in sequences
is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
check_arg_combinations(args, is_vcf)

if is_vcf:
(sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
else:
sequences = sequences_json(args.ancestral_sequences, args.tree)


## load features; only requested features if genes given
features = load_features(args.reference_sequence, genes)
if features is None:
print("ERROR: could not read features of reference sequence file")
return 1
print("Read in {} features from reference sequence file".format(len(features)))

### translate every feature - but not 'nuc'!
## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature
## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank.
translations = {}
deleted = []
for fname, feat in features.items():
if is_vcf:
trans = translate_vcf_feature(sequences, ref, feat)
if trans:
translations[fname] = trans
else:
deleted.append(fname)
else:
if feat.type != 'source':
translations[fname] = translate_feature(sequences, feat)

if len(deleted) != 0:
print("{} genes had no mutations and so have been be excluded.".format(len(deleted)))
if is_vcf:
(sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
features_without_variation = []
for fname, feat in features.items():
if feat.type=='source':
continue
try:
translations[fname] = translate_vcf_feature(sequences, ref, feat, fname)
except NoVariationError:
features_without_variation.append(fname)
if len(features_without_variation):
print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))
else:
sequences = sequences_json(args.ancestral_sequences, tree)
translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'}

## glob the annotations for later auspice export
#
Expand Down
4 changes: 2 additions & 2 deletions tests/functional/translate/cram/genes.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ as a feature ('nuc' in this case)
> --reference-sequence "$DATA/reference.source.gff" \
> --genes gene2 gene3 \
> --output-node-data "aa_muts.genes-args.json"
Validating schema of .+ (re)
Couldn't find gene gene3 in GFF or GenBank file
Read in 2 features from reference sequence file
Validating schema of .+ (re)
amino acid mutations written to .+ (re)

$ python3 "$SCRIPTS/diff_jsons.py" \
Expand All @@ -37,9 +37,9 @@ Using a text file rather than command line arguments
> --genes "genes.txt" \
> --output-node-data "aa_muts.genes-txt.json"
Read in 2 specified genes to translate.
Validating schema of .+ (re)
Couldn't find gene gene3 in GFF or GenBank file
Read in 2 features from reference sequence file
Validating schema of .+ (re)
amino acid mutations written to .+ (re)

$ python3 "$SCRIPTS/diff_jsons.py" \
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/translate/cram/translate-with-genbank.t
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ Translate amino acids for genes using a GenBank file.
> --reference-sequence "$DATA/zika/zika_outgroup.gb" \
> --genes CA PRO \
> --output-node-data aa_muts.json
Validating schema of '.+nt_muts.json'... (re)
Read in 3 features from reference sequence file
Validating schema of '.+nt_muts.json'... (re)
amino acid mutations written to .* (re)

$ python3 "$SCRIPTS/diff_jsons.py" $DATA/zika/aa_muts_genbank.json aa_muts.json \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --ancestral-sequences "${DATA}/zika/nt_muts.json" \
> --reference-sequence "genemap.gff" \
> --output-node-data aa_muts.json
Validating schema of '.+/nt_muts.json'... (re)
Read in 2 features from reference sequence file
Validating schema of '.+/nt_muts.json'... (re)
amino acid mutations written to .* (re)

Other than the sequence ids which will include a temporary path, the JSONs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --ancestral-sequences "${DATA}/zika/nt_muts.json" \
> --reference-sequence genemap.gff \
> --output-node-data aa_muts.json
Validating schema of '.+/nt_muts.json'... (re)
Read in 2 features from reference sequence file
Validating schema of '.+/nt_muts.json'... (re)
amino acid mutations written to .* (re)

$ python3 "${SCRIPTS}/diff_jsons.py" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
> --output-node-data aa_muts.json \
> --alignment-output translations.vcf \
> --vcf-reference-output translations_reference.fasta
Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N
Gene length of 'rrs' is not a multiple of 3. will pad with N
Read in 187 specified genes to translate.
Read in 187 features from reference sequence file
162 genes had no mutations and so have been be excluded.
Expand Down

1 comment on commit 03c1965

@victorlin
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jameshadfield added in #1348 (comment):

Unfortunately a few commits were squashed together in this PR. It's now been merged so I'm not going to change anything.

03c1965 contains what was meant to be three commits - a small refactor + the following commits

[translate] require nodes to have sequences

This is the implicit expectation, and is true for all of our pipelines.
In theory it's possible to translate without having the full sequence
attached to the node by reconstructing mutations on the root, but that
is not the approach taken by the current code.

As we explicitly check the tree nodes against the sequences in the
node-data JSON we can skip the automatic tests optionally performed when
reading the node-data JSON.

Closes #1345
diff --git a/CHANGES.md b/CHANGES.md
index 64f40df5..1cfe73a2 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -8,7 +8,7 @@
 * ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield)
      * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. 
      * `--vcf-reference` and `--root-sequence` are now mutually exclusive.
-
+* translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield)
 * translate: Improvements to command line arguments.  [#1348][] (@jameshadfield)
     * `--tree` and `--ancestral-sequences` are now required arguments.
     * separate VCF-only arguments into their own group
diff --git a/augur/translate.py b/augur/translate.py
index 461738b2..1fc73040 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -20,6 +20,7 @@ from .io.vcf import write_VCF_translation
 from .utils import read_node_data, load_features, write_json, get_json_name
 from treetime.vcf_utils import read_vcf
 from augur.errors import AugurError
+from textwrap import dedent
 
 class MissingNodeError(Exception):
     pass
@@ -339,7 +340,7 @@ def sequences_json(node_data_json, tree):
     Extract the full nuc sequence for each node in the provided node-data JSON.
     Returns a dict, keys are node names and values are a string of the genome sequence (nuc)
     """
-    node_data = read_node_data(node_data_json, tree)
+    node_data = read_node_data(node_data_json)
     if node_data is None:
         raise AugurError("could not read node data (incl sequences)")
     # extract sequences from node meta data
@@ -347,6 +348,14 @@ def sequences_json(node_data_json, tree):
     for k,v in node_data['nodes'].items():
         if 'sequence' in v:
             sequences[k] = v['sequence']
+    tree_nodes = {c.name for c in tree.find_clades()}
+    tree_nodes_missing_sequences = tree_nodes - set(sequences.keys())
+    if len(tree_nodes_missing_sequences):
+        raise AugurError(dedent(f"""\
+            {len(tree_nodes_missing_sequences)} nodes on the tree are missing nucleotide sequences in the node-data JSON.
+            These must be present under 'nodes' → <node_name> → 'sequence'.
+            This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here.
+            """))
     return sequences
 
 def register_parser(parent_subparsers):
@@ -412,7 +421,7 @@ def run(args):
         if len(features_without_variation):
             print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
     else:
-        sequences = sequences_json(args.ancestral_sequences, args.tree)
+        sequences = sequences_json(args.ancestral_sequences, tree)
         translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if feat.type != 'source'}
 
     ## glob the annotations for later auspice export
[translate] skip parsing source (nuc) as a feature

This check is already in place for non-VCF inputs, and my guess is it
was omitted here as the TB pipeline's GFF file didn't include a 'source'
annotation. I don't think 'source' is actually a valid GFF ID and I
suspect we've just been applying the INSDC/GenBank term to GFF files,
but it is one of the two fields parsed by `load_features` and there are
GFF files in Nextstrain build pipelines which use it. Modifying the
underlying `load_features` would be a better solution, but that's a
bigger project for another day.

We additionally update the error message to use the same feature name we
export.

Closes #591
Supersedes #1109
diff --git a/CHANGES.md b/CHANGES.md
index 1cfe73a2..54b14f8d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -9,6 +9,7 @@
      * Incompatible arguments are now checked, especially related to VCF vs FASTA inputs. 
      * `--vcf-reference` and `--root-sequence` are now mutually exclusive.
 * translate: Tree nodes are checked against the node-data JSON input to ensure sequences are present. [#1348][] (@jameshadfield)
+* translate: The 'source' ID for GFF files is now ignored as a potential gene feature. [#1348][] (@jameshadfield)
 * translate: Improvements to command line arguments.  [#1348][] (@jameshadfield)
     * `--tree` and `--ancestral-sequences` are now required arguments.
     * separate VCF-only arguments into their own group
diff --git a/augur/translate.py b/augur/translate.py
index 1fc73040..8c7253c3 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -129,7 +129,7 @@ def translate_feature(aln, feature):
     return translations
 
 
-def translate_vcf_feature(sequences, ref, feature):
+def translate_vcf_feature(sequences, ref, feature, feature_name):
     '''Translates a subsequence of input nucleotide sequences.
 
     Parameters
@@ -168,7 +168,7 @@ def translate_vcf_feature(sequences, ref, feature):
     # Need to get ref translation to store. check if multiple of 3 for sanity.
     # will be padded in safe_translate if not
     if len(refNuc)%3:
-        print("Gene length of {} is not a multiple of 3. will pad with N".format(feature.qualifiers['Name'][0]), file=sys.stderr)
+        print(f"Gene length of {feature_name!r} is not a multiple of 3. will pad with N", file=sys.stderr)
 
     ref_aa_seq = safe_translate(refNuc)
     prot['reference'] = ref_aa_seq
@@ -409,13 +409,16 @@ def run(args):
     print("Read in {} features from reference sequence file".format(len(features)))
 
     ## Read in sequences & for each sequence translate each feature _except for_ the source (nuc) feature
+    ## Note that `load_features` _only_ extracts {'gene', 'source'} for GFF files, {'CDS', 'source'} for GenBank.
     translations = {}
     if is_vcf:
         (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
         features_without_variation = []
         for fname, feat in features.items():
+            if feat.type=='source':
+                continue
             try:
-                translations[fname] = translate_vcf_feature(sequences, ref, feat)
+                translations[fname] = translate_vcf_feature(sequences, ref, feat, fname)
             except NoVariationError:
                 features_without_variation.append(fname)
         if len(features_without_variation):
diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
index f0d25bee..1de2856e 100644
--- a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
+++ b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
@@ -15,7 +15,7 @@ Translate amino acids for genes using a GFF3 file where the gene names are store
   >   --output-node-data aa_muts.json \
   >   --alignment-output translations.vcf \
   >   --vcf-reference-output translations_reference.fasta
-  Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N
+  Gene length of 'rrs' is not a multiple of 3. will pad with N
   Read in 187 specified genes to translate.
   Read in 187 features from reference sequence file
   162 genes had no mutations and so have been be excluded.

Please sign in to comment.