[utils] only allow GFFs with one record

Also catches the edge case where a GFF has no valid rows. The printed error messages should be helpful enough to identify the GFF formatting error(s).
nextstrain · Dec 20, 2023 · 4b670f3 · 4b670f3
1 parent d03ce41
commit 4b670f3
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 27 deletions.
diff --git a/augur/utils.py b/augur/utils.py
@@ -164,7 +164,7 @@ def load_features(reference, feature_names=None):
     Raises
     ------
     AugurError
-        If the reference file doesn't exist
+        If the reference file doesn't exist, or is malformed / empty
     """
     #checks explicitly for GFF otherwise assumes Genbank
     if not os.path.isfile(reference):
@@ -197,43 +197,60 @@ def _read_gff(reference, feature_names):
     features : dict
         keys: feature names, values: <class 'Bio.SeqFeature.SeqFeature'>
         Note that feature names may not be equivalent to GenBank feature keys
+
+    Raises
+    ------
+    AugurError
+        If the reference file contains no valid data rows or multiple different seqids
     """
     from BCBio import GFF
     valid_types = ['gene', 'source']
     features = {}
+
     with open(reference, encoding='utf-8') as in_handle:
-        for rec in GFF.parse(in_handle, limit_info={"gff_type": valid_types}):
-            for feat in rec.features:
-                # Check for gene names stored in qualifiers commonly used by
-                # virus-specific gene maps first (e.g., 'gene',
-                # 'gene_name'). Then, check for qualifiers used by non-viral
-                # pathogens (e.g., 'locus_tag').
-                if feature_names is not None:
-                    if "gene" in feat.qualifiers and feat.qualifiers["gene"][0] in feature_names:
-                        fname = feat.qualifiers["gene"][0]
-                    elif "gene_name" in feat.qualifiers and feat.qualifiers["gene_name"][0] in feature_names:
-                        fname = feat.qualifiers["gene_name"][0]
-                    elif "locus_tag" in feat.qualifiers and feat.qualifiers["locus_tag"][0] in feature_names:
-                        fname = feat.qualifiers["locus_tag"][0]
-                    else:
-                        fname = None
+        # Note that `GFF.parse` doesn't always yield GFF records in the order
+        # one may expect, but since we raise AugurError if there are multiple
+        # this doesn't matter.
+        gff_entries = list(GFF.parse(in_handle, limit_info={'gff_type': valid_types}))
+        if len(gff_entries) == 0:
+            raise AugurError(f"Reference {reference!r} contains no valid data rows. Valid GFF types (3rd column) are {', '.join(valid_types)}.")
+        elif len(gff_entries) > 1:
+            raise AugurError(f"Reference {reference!r} contains multiple seqids (first column). Augur can only handle GFF files with a single seqid.")
+        else:
+            rec = gff_entries[0]
+
+        for feat in rec.features:
+            # Check for gene names stored in qualifiers commonly used by
+            # virus-specific gene maps first (e.g., 'gene',
+            # 'gene_name'). Then, check for qualifiers used by non-viral
+            # pathogens (e.g., 'locus_tag').
+            if feature_names is not None:
+                if "gene" in feat.qualifiers and feat.qualifiers["gene"][0] in feature_names:
+                    fname = feat.qualifiers["gene"][0]
+                elif "gene_name" in feat.qualifiers and feat.qualifiers["gene_name"][0] in feature_names:
+                    fname = feat.qualifiers["gene_name"][0]
+                elif "locus_tag" in feat.qualifiers and feat.qualifiers["locus_tag"][0] in feature_names:
+                    fname = feat.qualifiers["locus_tag"][0]
                 else:
-                    if "gene" in feat.qualifiers:
-                        fname = feat.qualifiers["gene"][0]
-                    elif "gene_name" in feat.qualifiers:
-                        fname = feat.qualifiers["gene_name"][0]
-                    else:
-                        fname = feat.qualifiers["locus_tag"][0]
-                if feat.type == "source":
-                    fname = "nuc"
-
-                if fname:
-                    features[fname] = feat
+                    fname = None
+            else:
+                if "gene" in feat.qualifiers:
+                    fname = feat.qualifiers["gene"][0]
+                elif "gene_name" in feat.qualifiers:
+                    fname = feat.qualifiers["gene_name"][0]
+                else:
+                    fname = feat.qualifiers["locus_tag"][0]
+            if feat.type == "source":
+                fname = "nuc"
+
+            if fname:
+                features[fname] = feat
 
         if feature_names is not None:
             for fe in feature_names:
                 if fe not in features:
                     print("Couldn't find gene {} in GFF or GenBank file".format(fe))
+
     return features
 
 def _read_genbank(reference, feature_names):

diff --git a/tests/functional/translate/cram/gff.t b/tests/functional/translate/cram/gff.t
@@ -0,0 +1,35 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
+  $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome"
+  $ export DATA="$TESTDIR/../data/simple-genome"
+
+These tests are intended to test variants of GFF formatting
+
+
+GFF file with no valid rows
+
+  $ head -n 3  $DATA/reference.source.gff > "reference.empty.gff"
+
+  $ ${AUGUR} translate \
+  >  --tree $ANC_DATA/tree.nwk \
+  >  --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \
+  >  --reference-sequence "reference.empty.gff" \
+  >  --output-node-data "aa_muts.json" > /dev/null
+  ERROR: Reference 'reference.empty.gff' contains no valid data rows. .+ (re)
+  [2]
+
+GFF file with an extra record
+
+  $ cp $DATA/reference.source.gff "reference.double.gff"
+
+  $ echo -e "additional\tRefSeq\tsource\t1\t10\t.\t+\t.\tID=additional" >> "reference.double.gff"
+
+  $ ${AUGUR} translate \
+  >  --tree $ANC_DATA/tree.nwk \
+  >  --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \
+  >  --reference-sequence "reference.double.gff" \
+  >  --output-node-data "aa_muts.json"
+  ERROR: Reference 'reference.double.gff' contains multiple seqids .+ (re)
+  [2]