From d99c3e5defc5c9d3d73f705073f7a4ed3b4a24b7 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:59:38 -0700 Subject: [PATCH 1/5] Remove default values from parse_sequence function header The only usage of this function supplies all paramters, so any defaults here are unused and are either duplicate or inaccurate. --- augur/parse.py | 2 +- tests/test_parse.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/augur/parse.py b/augur/parse.py index 5d3cd34a9..795966cfb 100644 --- a/augur/parse.py +++ b/augur/parse.py @@ -88,7 +88,7 @@ def prettify(x, trim=0, camelCase=False, etal=None, removeComma=False): return res -def parse_sequence(sequence, fields, strain_key="strain", separator="|", prettify_fields=None, fix_dates_format=None): +def parse_sequence(sequence, fields, strain_key, separator, prettify_fields, fix_dates_format): """Parse a single sequence record into a sequence record and associated metadata. Parameters diff --git a/tests/test_parse.py b/tests/test_parse.py index 2c159d1a7..04b6f62a3 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -71,7 +71,9 @@ def test_parse_sequence(self): sequence_record, fields=fields, strain_key="strain", - prettify_fields=["region"] + separator="|", + prettify_fields=["region"], + fix_dates_format=None, ) assert sequence.id == metadata["strain"] From f9de7106d33b76779481570d5dcbf04786966750 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:06:02 -0700 Subject: [PATCH 2/5] Add types to parse_sequence --- augur/parse.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/augur/parse.py b/augur/parse.py index 795966cfb..4e9f504f4 100644 --- a/augur/parse.py +++ b/augur/parse.py @@ -1,8 +1,10 @@ """ Parse delimited fields from FASTA sequence names into a TSV and FASTA file. """ +import Bio.SeqRecord import pandas as pd import sys +from typing import Dict, Sequence, Tuple from .io.file import open_file from .io.sequences import read_sequences, write_sequences @@ -88,27 +90,34 @@ def prettify(x, trim=0, camelCase=False, etal=None, removeComma=False): return res -def parse_sequence(sequence, fields, strain_key, separator, prettify_fields, fix_dates_format): +def parse_sequence( + sequence: Bio.SeqRecord.SeqRecord, + fields: Sequence[str], + strain_key: str, + separator: str, + prettify_fields: Sequence[str], + fix_dates_format: str, + ) -> Tuple[Bio.SeqRecord.SeqRecord, Dict[str, str]]: """Parse a single sequence record into a sequence record and associated metadata. Parameters ---------- - sequence : Bio.SeqRecord.SeqRecord + sequence a BioPython sequence record to parse with metadata stored in its description field. - fields : list or tuple + fields a list of names for fields expected in the given record's description. - strain_key : str + strain_key name of the field to use as the given sequence's unique id - separator : str + separator delimiter to split record description by. - prettify_fields : list or tuple + prettify_fields a list of field names for which the values in those fields should be prettified. - fix_dates_format : str + fix_dates_format parse "date" field into the requested canonical format ("dayfirst" or "monthfirst"). Returns From 6064fa2e521c55fa91bedec15a59c80767beb6ae Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:51:38 -0700 Subject: [PATCH 3/5] Prefer `strain` over `name` as sequence ID field Following through with deprecation. --- augur/parse.py | 4 +--- tests/functional/parse.t | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/augur/parse.py b/augur/parse.py index 4e9f504f4..80b909faa 100644 --- a/augur/parse.py +++ b/augur/parse.py @@ -11,7 +11,7 @@ from .dates import get_numerical_date_from_value from .errors import AugurError -PARSE_DEFAULT_ID_COLUMNS = ("name", "strain") +PARSE_DEFAULT_ID_COLUMNS = ("strain", "name") forbidden_characters = str.maketrans( {' ': None, @@ -187,8 +187,6 @@ def run(args): for possible_id in PARSE_DEFAULT_ID_COLUMNS: if possible_id in args.fields: strain_key = possible_id - if possible_id == "name" and "strain" in args.fields: - print("DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name').\nUsers who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name'", file=sys.stderr) break if not strain_key: strain_key = args.fields[0] diff --git a/tests/functional/parse.t b/tests/functional/parse.t index 6b62bbb5a..963afeeb7 100644 --- a/tests/functional/parse.t +++ b/tests/functional/parse.t @@ -69,10 +69,9 @@ Parse Zika sequences into sequences and metadata, preferred default ids is 'name > --output-sequences "$TMP/sequences.fasta" \ > --output-metadata "$TMP/metadata.tsv" \ > --fields strain virus name date region country division city db segment authors url title journal paper_url \ + > --output-id-field 'name' \ > --prettify-fields region country division city \ > --fix-dates monthfirst - DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name'). - Users who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name' $ diff -u "parse/sequences_other.fasta" "$TMP/sequences.fasta" $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv" From bec3b0fbc3b0c6f648a3f125c8488d8c96865e78 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:54:10 -0700 Subject: [PATCH 4/5] Update deprecation entry --- DEPRECATED.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/DEPRECATED.md b/DEPRECATED.md index 149e60f12..3283d61c7 100644 --- a/DEPRECATED.md +++ b/DEPRECATED.md @@ -10,9 +10,7 @@ available for backwards compatibility, but should not be used in new code. ## `augur parse` preference of `name` over `strain` as the sequence ID field -*Deprecated in version 24.2.0 (February 2024). Planned to be reordered June 2024 or after.* - -Currently, `augur parse` checks for a 'name' field and then a 'strain' field to use as a sequence ID. This order will be changed in favor of searching for a 'strain' and then a 'name' field to be more consistent with the rest of Augur. +*Deprecated in version 24.2.0 (February 2024). Reordered in version __NEXT__ (September 2024).* Users who have both 'name' and 'strain' fields in their data, and want to favor using the 'name' field should add the following `augur parse` parameter `--output-id-field 'name'`. From 5e0bc544c12725b91e27bdfc2e322a2b62a76f37 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:57:56 -0700 Subject: [PATCH 5/5] Update changelog --- CHANGES.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 12c49c938..715a5c0b7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,10 @@ ## __NEXT__ +### Major Changes + +* parse: When both `strain` and `name` fields are present, the `strain` field will now be used as the sequence ID field. [#1629][] (@victorlin) + ### Features * merge: Generated source columns (e.g. `__source_metadata_{NAME}`) may now have their name template changed with `--source-columns=TEMPLATE` or may be omitted entirely with `--no-source-columns`. [#1625][] (@tsibley) @@ -13,6 +17,7 @@ [#1588]: https://github.com/nextstrain/augur/issues/1588 [#1598]: https://github.com/nextstrain/augur/issues/1598 [#1625]: https://github.com/nextstrain/augur/issues/1625 +[#1629]: https://github.com/nextstrain/augur/pull/1629 ## 25.4.0 (3 September 2024)