Skip to content

Commit

Permalink
[translate] guarantee nuc annotation produced
Browse files Browse the repository at this point in the history
This builds off the preceding 3 commits which guarantee that a 'nuc'
feature will be parsed from the reference file. We now guarantee it'll
be exported in the node-data JSON.

Note that the change to the TB aa_muts.json test file was due to a bug
in the previous code, where `'type': feat.type` would incorrectly
reuse the last defined `feat` in the preceding loop. (I think this is a
pitfall of using large "real-life" test files, as it's impractical to
manually check the source of truth we are comparing against.)

Since the 'nuc' feature is guaranteed to exist, we can also check it
against the existing nuc annotation within `augur ancestral`, where
applicable.

Closes #953, although there is good commentary in that issue about
improving our parsing of GFFs more generally than what is implemented here.

Closes #1346
  • Loading branch information
jameshadfield committed Dec 5, 2023
1 parent c965e92 commit d379136
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 8 deletions.
6 changes: 6 additions & 0 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,12 @@ def run(args):
from .utils import load_features
## load features; only requested features if genes given
features = load_features(args.annotation, args.genes)
# Ensure the already-created nuc annotation coordinates match those parsed from the reference file
if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or
features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")

print("Read in {} features from reference sequence file".format(len(features)))
for gene in args.genes:
print(f"Processing gene: {gene}")
Expand Down
14 changes: 7 additions & 7 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,19 +429,19 @@ def run(args):
# Note that BioPython FeatureLocations use
# "Pythonic" coordinates: [zero-origin, half-open)
# Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
annotations = {}
annotations = {
'nuc': {'start': features['nuc'].location.start+1,
'end': features['nuc'].location.end,
'strand': '+',
'type': features['nuc'].type, # (unused by auspice)
'seqid': args.reference_sequence} # (unused by auspice)
}
for fname, feat in features.items():
annotations[fname] = {'seqid':args.reference_sequence,
'type':feat.type,
'start':int(feat.location.start)+1,
'end':int(feat.location.end),
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
if is_vcf: #need to add our own nuc
annotations['nuc'] = {'seqid':args.reference_sequence,
'type':feat.type,
'start': 1,
'end': len(ref),
'strand': '+'}

## determine amino acid mutations for each node
try:
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/translate/data/tb/aa_muts.json
Original file line number Diff line number Diff line change
Expand Up @@ -1055,7 +1055,7 @@
"seqid": "translate/data/tb/Mtb_H37Rv_NCBI_Annot.gff",
"start": 1,
"strand": "+",
"type": "gene"
"type": "region"
},
"opcA": {
"end": 1625365,
Expand Down

0 comments on commit d379136

Please sign in to comment.