From 6b1dbf672e37f9d23861889ca59def3350dc45d0 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 5 Jan 2024 16:59:02 -0800 Subject: [PATCH] export v2: Add `--metadata-columns` option Allows users to specify additional metadata columns to export as node attributes that are not specified as coloring options via the existing `--color-by-metadata` option or the Auspice config JSON. This allows the metadata columns to be visible in the tree in Auspice without polluting the color-by options. --- augur/export_v2.py | 28 +++- .../export_v2/cram/metadata-columns.t | 16 +++ ...aset-with-additional-metadata-columns.json | 121 ++++++++++++++++++ ...aset1_metadata_with_additional_columns.tsv | 7 + 4 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 tests/functional/export_v2/cram/metadata-columns.t create mode 100644 tests/functional/export_v2/data/dataset-with-additional-metadata-columns.json create mode 100644 tests/functional/export_v2/data/dataset1_metadata_with_additional_columns.tsv diff --git a/augur/export_v2.py b/augur/export_v2.py index 12378f31b..df2a5be92 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -734,7 +734,7 @@ def _recursively_set_data(node): _recursively_set_data(data_json["tree"]) -def set_node_attrs_on_tree(data_json, node_attrs): +def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns): ''' Assign desired colorings, metadata etc to the `node_attrs` of nodes in the tree @@ -743,10 +743,17 @@ def set_node_attrs_on_tree(data_json, node_attrs): data_json : dict node_attrs: dict keys: strain names. values: dict with keys -> all available metadata (even "excluded" keys), values -> data (string / numeric / bool) + additional_metadata_columns: list + Requested additional metadata columns to export ''' author_data = create_author_data(node_attrs) + def _transfer_additional_metadata_columns(node, raw_data): + for col in additional_metadata_columns: + if is_valid(raw_data.get(col, None)): + node["node_attrs"][col] = {"value": raw_data[col]} + def _transfer_vaccine_info(node, raw_data): if raw_data.get("vaccine"): node["node_attrs"]['vaccine'] = raw_data['vaccine'] @@ -798,6 +805,9 @@ def _transfer_author_data(node): def _recursively_set_data(node): # get all the available information for this particular node raw_data = node_attrs[node["name"]] + # transfer requested metadata columns first so that the "special cases" + # below can overwrite them as necessary + _transfer_additional_metadata_columns(node, raw_data) # transfer "special cases" _transfer_vaccine_info(node, raw_data) _transfer_hidden_flag(node, raw_data) @@ -877,6 +887,9 @@ def register_parser(parent_subparsers): help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") optional_inputs.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") + optional_inputs.add_argument('--metadata-columns', nargs="+", + help="Metadata columns to export in addition to columns provided by --color-by-metadata or --auspice-config. " + + "These columns will not be used as coloring options in Auspice but will be visible in the tree.") optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`") optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)") @@ -1142,6 +1155,17 @@ def run(args): parse_node_data_and_metadata(T, node_data_file, metadata_file) config = get_config(args) + # Check additional metadata columns requested exist + additional_metadata_columns = [] + if args.metadata_columns: + for col in args.metadata_columns: + # Match the column names corrected within parse_node_data_and_metadata + corrected_col = update_deprecated_names(col) + if corrected_col not in metadata_names: + print(f"WARNING: Requested metadata column {col!r} does not exist and will not be exported") + continue + additional_metadata_columns.append(corrected_col) + # set metadata data structures set_title(data_json, config, args.title) set_display_defaults(data_json, config) @@ -1169,7 +1193,7 @@ def run(args): # set tree structure data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs)) - set_node_attrs_on_tree(data_json, node_attrs) + set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns) set_branch_attrs_on_tree(data_json, branch_attrs) set_geo_resolutions(data_json, config, args.geo_resolutions, read_lat_longs(args.lat_longs), node_attrs) diff --git a/tests/functional/export_v2/cram/metadata-columns.t b/tests/functional/export_v2/cram/metadata-columns.t new file mode 100644 index 000000000..723c931d4 --- /dev/null +++ b/tests/functional/export_v2/cram/metadata-columns.t @@ -0,0 +1,16 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Run export with tree and metadata with additional columns. + + $ ${AUGUR} export v2 \ + > --tree "$TESTDIR/../data/tree.nwk" \ + > --metadata "$TESTDIR/../data/dataset1_metadata_with_additional_columns.tsv" \ + > --metadata-columns "field_A" "field_B" \ + > --maintainers "Nextstrain Team" \ + > --output dataset.json > /dev/null + + $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/dataset-with-additional-metadata-columns.json" dataset.json \ + > --exclude-paths "root['meta']['updated']" "root['meta']['maintainers']" + {} diff --git a/tests/functional/export_v2/data/dataset-with-additional-metadata-columns.json b/tests/functional/export_v2/data/dataset-with-additional-metadata-columns.json new file mode 100644 index 000000000..4022c5e3c --- /dev/null +++ b/tests/functional/export_v2/data/dataset-with-additional-metadata-columns.json @@ -0,0 +1,121 @@ +{ + "version": "v2", + "meta": { + "updated": "2024-01-05", + "maintainers": [ + { + "name": "Nextstrain Team" + } + ], + "colorings": [], + "filters": [], + "panels": [ + "tree" + ] + }, + "tree": { + "name": "ROOT", + "node_attrs": { + "div": 0 + }, + "branch_attrs": {}, + "children": [ + { + "name": "tipA", + "node_attrs": { + "div": 1.0, + "field_A": { + "value": "AA" + }, + "field_B": { + "value": "AAA" + } + }, + "branch_attrs": {} + }, + { + "name": "internalBC", + "node_attrs": { + "div": 2.0 + }, + "branch_attrs": {}, + "children": [ + { + "name": "tipB", + "node_attrs": { + "div": 3.0, + "field_A": { + "value": "BB" + }, + "field_B": { + "value": "BBB" + } + }, + "branch_attrs": {} + }, + { + "name": "tipC", + "node_attrs": { + "div": 3.0, + "field_A": { + "value": "CC" + }, + "field_B": { + "value": "CCC" + } + }, + "branch_attrs": {} + } + ] + }, + { + "name": "internalDEF", + "node_attrs": { + "div": 5.0 + }, + "branch_attrs": {}, + "children": [ + { + "name": "tipD", + "node_attrs": { + "div": 8.0, + "field_A": { + "value": "DD" + }, + "field_B": { + "value": "DDD" + } + }, + "branch_attrs": {} + }, + { + "name": "tipE", + "node_attrs": { + "div": 9.0, + "field_A": { + "value": "EE" + }, + "field_B": { + "value": "EEE" + } + }, + "branch_attrs": {} + }, + { + "name": "tipF", + "node_attrs": { + "div": 6.0, + "field_A": { + "value": "FF" + }, + "field_B": { + "value": "FFF" + } + }, + "branch_attrs": {} + } + ] + } + ] + } +} \ No newline at end of file diff --git a/tests/functional/export_v2/data/dataset1_metadata_with_additional_columns.tsv b/tests/functional/export_v2/data/dataset1_metadata_with_additional_columns.tsv new file mode 100644 index 000000000..0683b96aa --- /dev/null +++ b/tests/functional/export_v2/data/dataset1_metadata_with_additional_columns.tsv @@ -0,0 +1,7 @@ +strain field_A field_B +tipA AA AAA +tipB BB BBB +tipC CC CCC +tipD DD DDD +tipE EE EEE +tipF FF FFF