From 435edb009429242e5de1d15bfa81545299c0dbc7 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 20 Jul 2022 11:07:49 -0700 Subject: [PATCH 1/2] Rename force inclusion filtering functions - include -> force_include_strains - include_by_include_where -> force_include_where I found the existing names a bit vague/confusing. Replacing with more descriptive names. --- augur/filter.py | 30 +++++++++---------- tests/functional/filter/data/filtered_log.tsv | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index f92d41b4a..59d710900 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -502,7 +502,7 @@ def filter_by_non_nucleotide(metadata, sequence_index): return set(filtered_sequence_index[no_invalid_nucleotides].index.values) -def include(metadata, include_file): +def force_include_strains(metadata, include_file): """Include strains in the given text file from the given metadata. Parameters @@ -521,7 +521,7 @@ def include(metadata, include_file): >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> with NamedTemporaryFile(delete=False) as include_file: ... characters_written = include_file.write(b'strain1') - >>> include(metadata, include_file.name) + >>> force_include_strains(metadata, include_file.name) {'strain1'} >>> os.unlink(include_file.name) @@ -530,7 +530,7 @@ def include(metadata, include_file): return set(metadata.index.values) & included_strains -def include_by_include_where(metadata, include_where): +def force_include_where(metadata, include_where): """Include all strains from the given metadata that match the given query. Unlike pandas query syntax, inclusion queries should follow the pattern of @@ -552,16 +552,16 @@ def include_by_include_where(metadata, include_where): >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) - >>> include_by_include_where(metadata, "region!=Europe") + >>> force_include_where(metadata, "region!=Europe") {'strain1'} - >>> include_by_include_where(metadata, "region=Europe") + >>> force_include_where(metadata, "region=Europe") {'strain2'} - >>> include_by_include_where(metadata, "region=europe") + >>> force_include_where(metadata, "region=europe") {'strain2'} If the column referenced in the given query does not exist, skip the filter. - >>> include_by_include_where(metadata, "missing_column=value") + >>> force_include_where(metadata, "missing_column=value") set() """ @@ -612,7 +612,7 @@ def construct_filters(args, sequence_index): # Collect the union of all given strains to include. for include_file in args.include: include_by.append(( - include, + force_include_strains, { "include_file": include_file, } @@ -622,7 +622,7 @@ def construct_filters(args, sequence_index): if args.include_where: for include_where in args.include_where: include_by.append(( - include_by_include_where, + force_include_where, { "include_where": include_where, } @@ -805,28 +805,28 @@ def apply_filters(metadata, exclude_by, include_by): >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-10-02"}, {"region": "North America", "date": "2020-01-01"}], index=["strain1", "strain2", "strain3"]) >>> exclude_by = [(filter_by_date, {"min_date": numeric_date("2020-04-01")})] - >>> include_by = [(include_by_include_where, {"include_where": "region=Africa"})] + >>> include_by = [(force_include_where, {"include_where": "region=Africa"})] >>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by) >>> strains_to_keep {'strain2'} >>> sorted(strains_to_exclude, key=lambda record: record["strain"]) [{'strain': 'strain1', 'filter': 'filter_by_date', 'kwargs': '[["min_date", 2020.25]]'}, {'strain': 'strain3', 'filter': 'filter_by_date', 'kwargs': '[["min_date", 2020.25]]'}] >>> strains_to_include - [{'strain': 'strain1', 'filter': 'include_by_include_where', 'kwargs': '[["include_where", "region=Africa"]]'}] + [{'strain': 'strain1', 'filter': 'force_include_where', 'kwargs': '[["include_where", "region=Africa"]]'}] We also want to filter by characteristics of the sequence data that we've annotated in a sequence index. >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": "strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}, {"strain": "strain3", "A": 1250, "C": 1250, "G": 1250, "T": 1250}]).set_index("strain") >>> exclude_by = [(filter_by_sequence_length, {"sequence_index": sequence_index, "min_length": 27000})] - >>> include_by = [(include_by_include_where, {"include_where": "region=Europe"})] + >>> include_by = [(force_include_where, {"include_where": "region=Europe"})] >>> strains_to_keep, strains_to_exclude, strains_to_include = apply_filters(metadata, exclude_by, include_by) >>> strains_to_keep {'strain1'} >>> sorted(strains_to_exclude, key=lambda record: record["strain"]) [{'strain': 'strain2', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}, {'strain': 'strain3', 'filter': 'filter_by_sequence_length', 'kwargs': '[["min_length", 27000]]'}] >>> strains_to_include - [{'strain': 'strain2', 'filter': 'include_by_include_where', 'kwargs': '[["include_where", "region=Europe"]]'}] + [{'strain': 'strain2', 'filter': 'force_include_where', 'kwargs': '[["include_where", "region=Europe"]]'}] """ strains_to_keep = set(metadata.index.values) @@ -1703,8 +1703,8 @@ def run(args): "filter_by_non_nucleotide": "{count} of these were dropped because they had non-nucleotide characters", "skip_group_by_with_ambiguous_year": "{count} were dropped during grouping due to ambiguous year information", "skip_group_by_with_ambiguous_month": "{count} were dropped during grouping due to ambiguous month information", - "include": "{count} strains were added back because they were in {include_file}", - "include_by_include_where": "{count} sequences were added back because of '{include_where}'", + "force_include_strains": "{count} strains were added back because they were in {include_file}", + "force_include_where": "{count} sequences were added back because of '{include_where}'", } for (filter_name, filter_kwargs), count in filter_counts.items(): if filter_kwargs: diff --git a/tests/functional/filter/data/filtered_log.tsv b/tests/functional/filter/data/filtered_log.tsv index 22938cb36..5e12a05a4 100644 --- a/tests/functional/filter/data/filtered_log.tsv +++ b/tests/functional/filter/data/filtered_log.tsv @@ -3,4 +3,4 @@ HND/2016/HU_ME59 filter_by_sequence_index [] COL/FLR_00008/2015 filter_by_query "[[""query"", ""country != 'Colombia'""]]" Colombia/2016/ZC204Se filter_by_query "[[""query"", ""country != 'Colombia'""]]" COL/FLR_00024/2015 filter_by_query "[[""query"", ""country != 'Colombia'""]]" -COL/FLR_00008/2015 include "[[""include_file"", ""filter/data/include.txt""]]" +COL/FLR_00008/2015 force_include_strains "[[""include_file"", ""filter/data/include.txt""]]" From 73a9ae5ee688066993b2cd1beb27af2d1e64304e Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 20 Jul 2022 14:42:27 -0700 Subject: [PATCH 2/2] Update changelog --- CHANGES.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 286be1655..52cab9d1d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -14,7 +14,12 @@ * `utils.HideAsFalseAction` -> `argparse_.HideAsFalseAction` * Subcommands must include a `register_parser` function to add their own parser instead of a `register_arguments` function [#1002][]. (@joverlee521) +### Bug Fixes + +* filter: Rename internal force inclusion filtering functions [#1006][] (@victorlin) + [#1002]: https://github.com/nextstrain/augur/pull/1002 +[#1006]: https://github.com/nextstrain/augur/pull/1006 ## 16.0.3 (6 July 2022)