commit 242d67f

Merge branch 'trs/merge/polish'

tsibley committed Aug 21, 2024
2 parents d8faf01 + ddca760
Showing 4 changed files with 81 additions and 15 deletions.
augur/io/metadata.py (10 changes: 5 additions & 5 deletions)
@@ -69,7 +69,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id
     >>> read_metadata("tests/functional/filter/data/metadata.tsv", id_columns=("Virus name",))
     Traceback (most recent call last):
     ...
-    Exception: None of the possible id columns (('Virus name',)) were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url')
+    Exception: None of the possible id columns ('Virus name') were found in the metadata's columns ('strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url')

     We also allow iterating through metadata in fixed chunk sizes.
@@ -110,7 +110,7 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, columns=None, id

     # If we couldn't find a valid index column in the metadata, alert the user.
     if not id_columns_present:
-        raise Exception(f"None of the possible id columns ({id_columns!r}) were found in the metadata's columns {tuple(chunk.columns)!r}")
+        raise Exception(f"None of the possible id columns ({', '.join(map(repr, id_columns))}) were found in the metadata's columns ({', '.join(map(repr, chunk.columns))})")
     else:
         index_col = id_columns_present[0]
@@ -599,19 +599,19 @@ def __init__(self, path: str, delimiters: Sequence[str], id_columns: Sequence[st
             raise AugurError(f"{self.path}: Expected a header row but it is empty.")

         # Infer the ID column.
-        self.id_column = self._find_first(id_columns)
+        self.id_column = self._find_id_column(id_columns)

     def open(self, **kwargs):
         """Open the file with auto-compression/decompression."""
         return open_file(self.path, newline='', **kwargs)

-    def _find_first(self, columns: Sequence[str]):
+    def _find_id_column(self, columns: Sequence[str]):
         """Return the first column in `columns` that is present in the metadata.
         """
         for column in columns:
             if column in self.columns:
                 return column
-        raise AugurError(f"{self.path}: None of ({columns!r}) are in the columns {tuple(self.columns)!r}.")
+        raise AugurError(f"{self.path}: None of the possible id columns ({', '.join(map(repr, columns))}) were found in the metadata's columns ({', '.join(map(repr, self.columns))}).")

     def rows(self, strict: bool = True):
         """Yield rows in a dictionary format. Empty lines are ignored.
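A quick standalone sketch (not part of the commit) of why the message format changed: `repr()` of a tuple nests the parentheses and adds a trailing comma for a 1-tuple, while joining each element's `repr()` reads naturally.

```python
id_columns = ("Virus name",)

# Old message interpolation: repr() of the whole tuple inside literal parens.
print(f"({id_columns!r})")                      # (('Virus name',))

# New message interpolation: repr() of each element, comma-joined.
print(f"({', '.join(map(repr, id_columns))})")  # ('Virus name')
```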
augur/merge.py (73 changes: 65 additions & 8 deletions)
@@ -32,7 +32,9 @@
 you want to use a version different from what's on PATH), set the SQLITE3
 environment variable to path of the desired sqlite3 executable.
 """
+import gettext
 import os
+import re
 import subprocess
 import sys
 from functools import reduce
@@ -43,7 +45,7 @@
 from textwrap import dedent
 from typing import Iterable, Tuple, TypeVar

-from augur.argparse_ import ExtendOverwriteDefault
+from augur.argparse_ import ExtendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP
 from augur.errors import AugurError
 from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, Metadata
 from augur.io.print import print_err, print_debug
@@ -53,6 +55,12 @@
 T = TypeVar('T')


+# Use ngettext() without a message catalog for its singular/plural handling so
+# we can make proper error messages. gettext() (no "n") is conventionally
+# aliased as "_", so alias ngettext() as "_n".
+_n = gettext.NullTranslations().ngettext
+
+
 class NamedMetadata(Metadata):
     name: str
     """User-provided descriptive name for this metadata file."""
@@ -73,14 +81,14 @@ def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("merge", help=first_line(__doc__))

     input_group = parser.add_argument_group("inputs", "options related to input")
-    input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="metadata files with assigned names")
+    input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)

-    input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
-    input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
+    input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help=f"Possible metadata column names containing identifiers, considered in the order given. Columns will be considered for all metadata tables. Only one ID column will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_ID_COLUMNS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
+    input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help=f"Possible field delimiters to use for reading metadata tables, considered in the order given. Delimiters will be considered for all metadata tables. Only one delimiter will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_DELIMITERS))})" + SKIP_AUTO_DEFAULT_IN_HELP)

     output_group = parser.add_argument_group("outputs", "options related to output")
-    output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="merged metadata as TSV")
-    output_group.add_argument('--quiet', action="store_true", default=False, help="suppress informational messages on stderr")
+    output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
+    output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)

     return parser
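A sketch of how those `(default: ...)` strings render, assuming augur with this commit is installed. The literal tuples below are assumptions standing in for `DEFAULT_ID_COLUMNS` and `DEFAULT_DELIMITERS`, whose actual values live in augur.io.metadata and aren't shown in this diff:

```python
from augur.merge import shquote_humanized  # helper added later in this commit

# Assumed defaults: DEFAULT_ID_COLUMNS = ("strain", "name"),
# DEFAULT_DELIMITERS = (",", "\t").
print(" ".join(map(shquote_humanized, ("strain", "name"))))  # strain name
print(" ".join(map(shquote_humanized, (",", "\t"))))         # , $'\t'
```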

@@ -96,7 +104,7 @@ def run(args):
         raise AugurError(dedent(f"""\
             All metadata inputs must be assigned a name, e.g. with NAME=FILE.

-            The following inputs were missing a name:
+            The following {_n("input was", "inputs were", len(unnamed))} missing a name:

              {indented_list(unnamed, ' ' + ' ')}
             """))
@@ -109,7 +117,7 @@ def run(args):
         raise AugurError(dedent(f"""\
             Metadata input names must be unique.

-            The following names were used more than once:
+            The following {_n("name was", "names were", len(duplicate_names))} used more than once:

              {indented_list(duplicate_names, ' ' + ' ')}
             """))
@@ -315,3 +323,52 @@ def count_unique(xs: Iterable[T]) -> Iterable[Tuple[T, int]]:

 def indented_list(xs, prefix):
     return f"\n{prefix}".join(xs)
+
+
+def shquote_humanized(x):
+    r"""
+    shquote for humans.
+
+    Use C-style escapes supported by shells (specifically, Bash) for characters
+    that humans would typically use C-style escapes for instead of quoted
+    literals.
+
+    <https://www.gnu.org/software/bash/manual/bash.html#ANSI_002dC-Quoting>
+
+    >>> shquote_humanized("abc")
+    'abc'
+
+    >>> shquote_humanized("\t")
+    "$'\\t'"
+
+    >>> shquote_humanized("abc def")
+    "'abc def'"
+
+    >>> shquote_humanized("abc\tdef")
+    "abc$'\\t'def"
+    """
+    escapes = {
+        '\a': r'\a',
+        '\b': r'\b',
+        '\f': r'\f',
+        '\n': r'\n',
+        '\r': r'\r',
+        '\t': r'\t',
+        '\v': r'\v',
+    }
+
+    def quote(s):
+        if s in escapes:
+            return f"$'{escapes[s]}'"
+        else:
+            # split leaves leading and trailing empty strings when its input is
+            # entirely (captured) separator. Avoid quoting every empty string
+            # *part* here…
+            return shquote(s) if s else ''
+
+    parts = re.split('([' + ''.join(escapes.values()) + '])', x)
+    quoted = ''.join(map(quote, parts))
+
+    # …and instead quote a final empty string down here if we're still empty
+    # after joining all our parts together.
+    return quoted if quoted else shquote('')
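The `re.split()` trick at the heart of `shquote_humanized()`, in isolation (a simplified two-character class standing in for the full escape set):

```python
import re

# A capture group makes re.split() keep the separators in the result.
print(re.split(r'([\t\n])', 'abc\tdef'))  # ['abc', '\t', 'def']

# Input that is entirely separator leaves empty strings at both ends,
# which is the edge case the function's comments call out.
print(re.split(r'([\t\n])', '\t'))        # ['', '\t', '']
```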
tests/functional/export_v2/cram/metadata-id-columns.t (2 changes: 1 addition & 1 deletion)
@@ -41,5 +41,5 @@ This should fail with a helpful error message.
   >   --auspice-config "$TESTDIR/../data/auspice_config1.json" \
   >   --maintainers "Nextstrain Team" \
   >   --output dataset.json > /dev/null
-  ERROR: None of the possible id columns (('strain', 'name')) were found in the metadata's columns ('invalid_id', 'div', 'mutation_length')
+  ERROR: None of the possible id columns ('strain', 'name') were found in the metadata's columns ('invalid_id', 'div', 'mutation_length')
   [1]
tests/functional/merge/cram/merge.t (11 changes: 10 additions & 1 deletion)
@@ -175,7 +175,7 @@ Metadata names must be unique.
   >   --output-metadata -
   ERROR: Metadata input names must be unique.

-  The following names were used more than once:
+  The following name was used more than once:

     'data'

@@ -197,6 +197,15 @@ Duplicates.
   ERROR: sqlite3 invocation failed
   [2]

+No id column found.
+
+  $ ${AUGUR} merge \
+  >   --metadata X=x-id-column.tsv Y=y.tsv \
+  >   --metadata-id-columns strain \
+  >   --output-metadata /dev/null
+  ERROR: x-id-column.tsv: None of the possible id columns ('strain') were found in the metadata's columns ('id', 'a', 'b', 'c').
+  [2]
+
 SQLITE3 env var can be used to override `sqlite3` location (and failure is
 handled).
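The fixture contents aren't shown in this diff; judging from the columns named in the error, `x-id-column.tsv` presumably looks something like the file written below (the data row is invented for illustration):

```python
# Hypothetical recreation of the test fixture; only the header columns
# ('id', 'a', 'b', 'c') are grounded in the error message above.
with open("x-id-column.tsv", "w") as f:
    f.write("id\ta\tb\tc\n")
    f.write("one\t1\t2\t3\n")
```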
