io: Define potential strain column names in a new file

This avoids redefining these values at each point of use and prevents them from getting out of sync.
nextstrain · Jan 3, 2023 · 28001ac · 28001ac
1 parent f8f15ff
commit 28001ac
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 2 deletions.
diff --git a/augur/filter.py b/augur/filter.py
@@ -18,9 +18,11 @@
 from tempfile import NamedTemporaryFile
 from typing import Collection
 
+
 from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates, get_iso_year_week
 from .errors import AugurError
 from .index import index_sequences, index_vcf
+from .io.defaults import POTENTIAL_STRAIN_ID_COLUMNS
 from .io.file import open_file
 from .io.metadata import read_metadata
 from .io.sequences import read_sequences, write_sequences
@@ -48,7 +50,7 @@ def register_arguments(parser):
     input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
     input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
     input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
-    input_group.add_argument('--metadata-id-columns', default=["strain", "name"], nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'")
+    input_group.add_argument('--metadata-id-columns', default=POTENTIAL_STRAIN_ID_COLUMNS, nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'")
 
     metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
     metadata_filter_group.add_argument(

diff --git a/augur/io/defaults.py b/augur/io/defaults.py
@@ -0,0 +1,3 @@
+# For tabular files, accept the following column names to represent the unique
+# row ID, in order of preference.
+POTENTIAL_STRAIN_ID_COLUMNS = ("strain", "name")
diff --git a/augur/io/metadata.py b/augur/io/metadata.py
@@ -7,12 +7,13 @@
 from itertools import chain
 
 from augur.errors import AugurError
+from augur.io.defaults import POTENTIAL_STRAIN_ID_COLUMNS
 from augur.io.print import print_err
 from augur.types import DataErrorMethod
 from .file import open_file
 
 
-def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None):
+def read_metadata(metadata_file, id_columns=POTENTIAL_STRAIN_ID_COLUMNS, chunk_size=None):
     """Read metadata from a given filename and into a pandas `DataFrame` or
     `TextFileReader` object.