[translate] guarantee nuc annotation produced

This builds off the preceding 3 commits, which guarantee that a 'nuc' feature will be parsed from the reference file. We now guarantee that it will also be exported in the node-data JSON. Note that the change to the TB aa_muts.json test file was due to a bug in the previous code, where `'type': feat.type` (in the VCF-only 'nuc' block after the loop) would incorrectly reuse the last `feat` defined in the preceding loop. (I think this is a pitfall of using large "real-life" test files, as it's impractical to manually check the source of truth we are comparing against.) Since the 'nuc' feature is guaranteed to exist, we can also check it against the existing nuc annotation within `augur ancestral`, where applicable. Closes #953, although there is good commentary in that issue about improving our parsing of GFFs more generally than is implemented here. Closes #1346
nextstrain · Dec 5, 2023 · d379136 · d379136
1 parent c965e92
commit d379136
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 8 deletions.
diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -323,6 +323,12 @@ def run(args):
         from .utils import load_features
         ## load features; only requested features if genes given
         features = load_features(args.annotation, args.genes)
+        # Ensure the already-created nuc annotation coordinates match those parsed from the reference file
+        if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or
+            features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
+            raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
+                f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")
+
         print("Read in {} features from reference sequence file".format(len(features)))
         for gene in args.genes:
             print(f"Processing gene: {gene}")

diff --git a/augur/translate.py b/augur/translate.py
@@ -429,19 +429,19 @@ def run(args):
     # Note that BioPython FeatureLocations use
     # "Pythonic" coordinates: [zero-origin, half-open)
     # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
-    annotations = {}
+    annotations = {
+        'nuc': {'start': features['nuc'].location.start+1,
+                'end':   features['nuc'].location.end,
+                'strand': '+',
+                'type':  features['nuc'].type,     # (unused by auspice)
+                'seqid': args.reference_sequence}  # (unused by auspice)
+    }
     for fname, feat in features.items():
         annotations[fname] = {'seqid':args.reference_sequence,
                               'type':feat.type,
                               'start':int(feat.location.start)+1,
                               'end':int(feat.location.end),
                               'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
-    if is_vcf: #need to add our own nuc
-        annotations['nuc'] = {'seqid':args.reference_sequence,
-                              'type':feat.type,
-                              'start': 1,
-                              'end': len(ref),
-                              'strand': '+'}
 
     ## determine amino acid mutations for each node
     try:

diff --git a/tests/functional/translate/data/tb/aa_muts.json b/tests/functional/translate/data/tb/aa_muts.json
@@ -1055,7 +1055,7 @@
       "seqid": "translate/data/tb/Mtb_H37Rv_NCBI_Annot.gff",
       "start": 1,
       "strand": "+",
-      "type": "gene"
+      "type": "region"
     },
     "opcA": {
       "end": 1625365,