commit 242d67f

Merge branch 'trs/merge/polish'

tsibley committed Aug 21, 2024
2 parents d8faf01 + ddca760
Showing 4 changed files with 81 additions and 15 deletions.
augur/io/metadata.py (10 changes: 5 additions & 5 deletions)
@@ -69,7 +69,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
     >>> read_metadata("tests/functional/filter/data/metadata.tsv", id_columns=("Virus name",))
     Traceback (most recent call last):
     ...
-    Exception: None of the possible id columns (('Virus name',)) were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url')
+    Exception: None of the possible id columns ('Virus name') were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url')

     We also allow iterating through metadata in fixed chunk sizes.
@@ -110,7 +110,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id

     # If we couldn't find a valid index column in the metadata, alert the user.
     if not id_columns_present:
-        raise Exception(f"None of the possible id columns ({id_columns!r}) were found in the metadata's columns {tuple(chunk.columns)!r}")
+        raise Exception(f"None of the possible id columns ({', '.join(map(repr, id_columns))}) were found in the metadata's columns ({', '.join(map(repr, chunk.columns))})")
     else:
         index_col = id_columns_present[0]
@@ -599,19 +599,19 @@ def __init__(self, path: str, delimiters: Sequence[str], id_columns: Sequence[st
             raise AugurError(f"{self.path}: Expected a header row but it is empty.")

         # Infer the ID column.
-        self.id_column = self._find_first(id_columns)
+        self.id_column = self._find_id_column(id_columns)

     def open(self, **kwargs):
         """Open the file with auto-compression/decompression."""
         return open_file(self.path, newline='', **kwargs)

-    def _find_first(self, columns: Sequence[str]):
+    def _find_id_column(self, columns: Sequence[str]):
         """Return the first column in `columns` that is present in the metadata.
         """
         for column in columns:
             if column in self.columns:
                 return column
-        raise AugurError(f"{self.path}: None of ({columns!r}) are in the columns {tuple(self.columns)!r}.")
+        raise AugurError(f"{self.path}: None of the possible id columns ({', '.join(map(repr, columns))}) were found in the metadata's columns ({', '.join(map(repr, self.columns))}).")

     def rows(self, strict: bool = True):
         """Yield rows in a dictionary format. Empty lines are ignored.
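A quick standalone sketch (not part of the commit) of why the message format changed: `repr()` of a tuple nests the parentheses and adds a trailing comma for a 1-tuple, while joining each element's `repr()` reads naturally.

```python
id_columns = ("Virus name",)

# Old message interpolation: repr() of the whole tuple inside literal parens.
print(f"({id_columns!r})")                      # (('Virus name',))

# New message interpolation: repr() of each element, comma-joined.
print(f"({', '.join(map(repr, id_columns))})")  # ('Virus name')
```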
augur/merge.py (73 changes: 65 additions & 8 deletions)
@@ -32,7 +32,9 @@
 you want to use a version different from what's on PATH), set the SQLITE3
 environment variable to path of the desired sqlite3 executable.
 """
+import gettext
 import os
+import re
 import subprocess
 import sys
 from functools import reduce
@@ -43,7 +45,7 @@
 from textwrap import dedent
 from typing import Iterable, Tuple, TypeVar

-from augur.argparse_ import ExtendOverwriteDefault
+from augur.argparse_ import ExtendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP
 from augur.errors import AugurError
 from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, Metadata
 from augur.io.print import print_err, print_debug
@@ -53,6 +55,12 @@
 T = TypeVar('T')


+# Use ngettext() without a message catalog for its singular/plural handling so
+# we can make proper error messages. gettext() (no "n") is conventionally
+# aliased as "_", so alias ngettext() as "_n".
+_n = gettext.NullTranslations().ngettext
+
+
 class NamedMetadata(Metadata):
     name: str
     """User-provided descriptive name for this metadata file."""
@@ -73,14 +81,14 @@ def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("merge", help=first_line(__doc__))

     input_group = parser.add_argument_group("inputs", "options related to input")
-    input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="metadata files with assigned names")
+    input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)

-    input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
-    input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
+    input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help=f"Possible metadata column names containing identifiers, considered in the order given. Columns will be considered for all metadata tables. Only one ID column will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_ID_COLUMNS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
+    input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help=f"Possible field delimiters to use for reading metadata tables, considered in the order given. Delimiters will be considered for all metadata tables. Only one delimiter will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_DELIMITERS))})" + SKIP_AUTO_DEFAULT_IN_HELP)

     output_group = parser.add_argument_group("outputs", "options related to output")
-    output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="merged metadata as TSV")
-    output_group.add_argument('--quiet', action="store_true", default=False, help="suppress informational messages on stderr")
+    output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
+    output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)

     return parser
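A sketch of how those `(default: ...)` strings render, assuming augur with this commit is installed. The literal tuples below are assumptions standing in for `DEFAULT_ID_COLUMNS` and `DEFAULT_DELIMITERS`, whose actual values live in augur.io.metadata and aren't shown in this diff:

```python
from augur.merge import shquote_humanized  # helper added later in this commit

# Assumed defaults: DEFAULT_ID_COLUMNS = ("strain", "name"),
# DEFAULT_DELIMITERS = (",", "\t").
print(" ".join(map(shquote_humanized, ("strain", "name"))))  # strain name
print(" ".join(map(shquote_humanized, (",", "\t"))))         # , $'\t'
```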

@@ -96,7 +104,7 @@ def run(args):
         raise AugurError(dedent(f"""\
             All metadata inputs must be assigned a name, e.g. with NAME=FILE.

-            The following inputs were missing a name:
+            The following {_n("input was", "inputs were", len(unnamed))} missing a name:

              {indented_list(unnamed, ' ' + ' ')}
             """))
@@ -109,7 +117,7 @@ def run(args):
         raise AugurError(dedent(f"""\
             Metadata input names must be unique.

-            The following names were used more than once:
+            The following {_n("name was", "names were", len(duplicate_names))} used more than once:

              {indented_list(duplicate_names, ' ' + ' ')}
             """))
@@ -315,3 +323,52 @@ def count_unique(xs: Iterable[T]) -> Iterable[Tuple[T, int]]:

 def indented_list(xs, prefix):
     return f"\n{prefix}".join(xs)
+
+
+def shquote_humanized(x):
+    r"""
+    shquote for humans.
+
+    Use C-style escapes supported by shells (specifically, Bash) for characters
+    that humans would typically use C-style escapes for instead of quoted
+    literals.
+
+    <https://www.gnu.org/software/bash/manual/bash.html#ANSI_002dC-Quoting>
+
+    >>> shquote_humanized("abc")
+    'abc'
+
+    >>> shquote_humanized("\t")
+    "$'\\t'"
+
+    >>> shquote_humanized("abc def")
+    "'abc def'"
+
+    >>> shquote_humanized("abc\tdef")
+    "abc$'\\t'def"
+    """
+    escapes = {
+        '\a': r'\a',
+        '\b': r'\b',
+        '\f': r'\f',
+        '\n': r'\n',
+        '\r': r'\r',
+        '\t': r'\t',
+        '\v': r'\v',
+    }
+
+    def quote(s):
+        if s in escapes:
+            return f"$'{escapes[s]}'"
+        else:
+            # split leaves leading and trailing empty strings when its input is
+            # entirely (captured) separator. Avoid quoting every empty string
+            # *part* here…
+            return shquote(s) if s else ''
+
+    parts = re.split('([' + ''.join(escapes.values()) + '])', x)
+    quoted = ''.join(map(quote, parts))
+
+    # …and instead quote a final empty string down here if we're still empty
+    # after joining all our parts together.
+    return quoted if quoted else shquote('')
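The `re.split()` trick at the heart of `shquote_humanized()`, in isolation (a simplified two-character class standing in for the full escape set):

```python
import re

# A capture group makes re.split() keep the separators in the result.
print(re.split(r'([\t\n])', 'abc\tdef'))  # ['abc', '\t', 'def']

# Input that is entirely separator leaves empty strings at both ends,
# which is the edge case the function's comments call out.
print(re.split(r'([\t\n])', '\t'))        # ['', '\t', '']
```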
tests/functional/export_v2/cram/metadata-id-columns.t (2 changes: 1 addition & 1 deletion)
@@ -41,5 +41,5 @@ This should fail with a helpful error message.
   >   --auspice-config "$TESTDIR/../data/auspice_config1.json" \
   >   --maintainers "Nextstrain Team" \
   >   --output dataset.json > /dev/null
-  ERROR: None of the possible id columns (('strain', 'name')) were found in the metadata's columns ('invalid_id', 'div', 'mutation_length')
+  ERROR: None of the possible id columns ('strain', 'name') were found in the metadata's columns ('invalid_id', 'div', 'mutation_length')
   [1]
tests/functional/merge/cram/merge.t (11 changes: 10 additions & 1 deletion)
@@ -175,7 +175,7 @@ Metadata names must be unique.
   >   --output-metadata -
   ERROR: Metadata input names must be unique.

-  The following names were used more than once:
+  The following name was used more than once:

     'data'

@@ -197,6 +197,15 @@ Duplicates.
   ERROR: sqlite3 invocation failed
   [2]

+No id column found.
+
+  $ ${AUGUR} merge \
+  >   --metadata X=x-id-column.tsv Y=y.tsv \
+  >   --metadata-id-columns strain \
+  >   --output-metadata /dev/null
+  ERROR: x-id-column.tsv: None of the possible id columns ('strain') were found in the metadata's columns ('id', 'a', 'b', 'c').
+  [2]
+
 SQLITE3 env var can be used to override `sqlite3` location (and failure is
 handled).
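The fixture contents aren't shown in this diff; judging from the columns named in the error, `x-id-column.tsv` presumably looks something like the file written below (the data row is invented for illustration):

```python
# Hypothetical recreation of the test fixture; only the header columns
# ('id', 'a', 'b', 'c') are grounded in the error message above.
with open("x-id-column.tsv", "w") as f:
    f.write("id\ta\tb\tc\n")
    f.write("one\t1\t2\t3\n")
```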
