Skip to content

Commit

Permalink
export v2: Add --metadata-columns option
Browse files Browse the repository at this point in the history
Allows users to export additional metadata columns as node attributes,
even when those columns are not specified as coloring options via
the existing `--color-by-metadata` option or the Auspice config JSON.
This makes the metadata columns visible in the tree in Auspice
without polluting the color-by options.
  • Loading branch information
joverlee521 committed Jan 6, 2024
1 parent 9317485 commit 6b1dbf6
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 2 deletions.
28 changes: 26 additions & 2 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ def _recursively_set_data(node):
_recursively_set_data(data_json["tree"])


def set_node_attrs_on_tree(data_json, node_attrs):
def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns):
'''
Assign desired colorings, metadata etc to the `node_attrs` of nodes in the tree
Expand All @@ -743,10 +743,17 @@ def set_node_attrs_on_tree(data_json, node_attrs):
data_json : dict
node_attrs: dict
keys: strain names. values: dict with keys -> all available metadata (even "excluded" keys), values -> data (string / numeric / bool)
additional_metadata_columns: list
Requested additional metadata columns to export
'''

author_data = create_author_data(node_attrs)

def _transfer_additional_metadata_columns(node, raw_data):
    '''
    Copy each requested additional metadata column from `raw_data` onto the
    node's `node_attrs`, stored as `{"value": <data>}`.

    Columns whose value is absent from `raw_data`, or for which `is_valid`
    returns a falsy result, are left off the node.
    '''
    for column in additional_metadata_columns:
        value = raw_data.get(column)
        if not is_valid(value):
            continue
        node["node_attrs"][column] = {"value": value}

def _transfer_vaccine_info(node, raw_data):
if raw_data.get("vaccine"):
node["node_attrs"]['vaccine'] = raw_data['vaccine']
Expand Down Expand Up @@ -798,6 +805,9 @@ def _transfer_author_data(node):
def _recursively_set_data(node):
# get all the available information for this particular node
raw_data = node_attrs[node["name"]]
# transfer requested metadata columns first so that the "special cases"
# below can overwrite them as necessary
_transfer_additional_metadata_columns(node, raw_data)
# transfer "special cases"
_transfer_vaccine_info(node, raw_data)
_transfer_hidden_flag(node, raw_data)
Expand Down Expand Up @@ -877,6 +887,9 @@ def register_parser(parent_subparsers):
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
optional_inputs.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
optional_inputs.add_argument('--metadata-columns', nargs="+",
help="Metadata columns to export in addition to columns provided by --color-by-metadata or --auspice-config. " +
"These columns will not be used as coloring options in Auspice but will be visible in the tree.")
optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`")
optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)")

Expand Down Expand Up @@ -1142,6 +1155,17 @@ def run(args):
parse_node_data_and_metadata(T, node_data_file, metadata_file)
config = get_config(args)

# Check that the requested additional metadata columns exist
additional_metadata_columns = []
if args.metadata_columns:
for col in args.metadata_columns:
# Match the column names corrected within parse_node_data_and_metadata
corrected_col = update_deprecated_names(col)
if corrected_col not in metadata_names:
print(f"WARNING: Requested metadata column {col!r} does not exist and will not be exported")
continue
additional_metadata_columns.append(corrected_col)

# set metadata data structures
set_title(data_json, config, args.title)
set_display_defaults(data_json, config)
Expand Down Expand Up @@ -1169,7 +1193,7 @@ def run(args):

# set tree structure
data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs))
set_node_attrs_on_tree(data_json, node_attrs)
set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns)
set_branch_attrs_on_tree(data_json, branch_attrs)

set_geo_resolutions(data_json, config, args.geo_resolutions, read_lat_longs(args.lat_longs), node_attrs)
Expand Down
16 changes: 16 additions & 0 deletions tests/functional/export_v2/cram/metadata-columns.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Setup

$ source "$TESTDIR"/_setup.sh

Run export with tree and metadata with additional columns.

$ ${AUGUR} export v2 \
> --tree "$TESTDIR/../data/tree.nwk" \
> --metadata "$TESTDIR/../data/dataset1_metadata_with_additional_columns.tsv" \
> --metadata-columns "field_A" "field_B" \
> --maintainers "Nextstrain Team" \
> --output dataset.json > /dev/null

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" "$TESTDIR/../data/dataset-with-additional-metadata-columns.json" dataset.json \
> --exclude-paths "root['meta']['updated']" "root['meta']['maintainers']"
{}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
{
"version": "v2",
"meta": {
"updated": "2024-01-05",
"maintainers": [
{
"name": "Nextstrain Team"
}
],
"colorings": [],
"filters": [],
"panels": [
"tree"
]
},
"tree": {
"name": "ROOT",
"node_attrs": {
"div": 0
},
"branch_attrs": {},
"children": [
{
"name": "tipA",
"node_attrs": {
"div": 1.0,
"field_A": {
"value": "AA"
},
"field_B": {
"value": "AAA"
}
},
"branch_attrs": {}
},
{
"name": "internalBC",
"node_attrs": {
"div": 2.0
},
"branch_attrs": {},
"children": [
{
"name": "tipB",
"node_attrs": {
"div": 3.0,
"field_A": {
"value": "BB"
},
"field_B": {
"value": "BBB"
}
},
"branch_attrs": {}
},
{
"name": "tipC",
"node_attrs": {
"div": 3.0,
"field_A": {
"value": "CC"
},
"field_B": {
"value": "CCC"
}
},
"branch_attrs": {}
}
]
},
{
"name": "internalDEF",
"node_attrs": {
"div": 5.0
},
"branch_attrs": {},
"children": [
{
"name": "tipD",
"node_attrs": {
"div": 8.0,
"field_A": {
"value": "DD"
},
"field_B": {
"value": "DDD"
}
},
"branch_attrs": {}
},
{
"name": "tipE",
"node_attrs": {
"div": 9.0,
"field_A": {
"value": "EE"
},
"field_B": {
"value": "EEE"
}
},
"branch_attrs": {}
},
{
"name": "tipF",
"node_attrs": {
"div": 6.0,
"field_A": {
"value": "FF"
},
"field_B": {
"value": "FFF"
}
},
"branch_attrs": {}
}
]
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
strain field_A field_B
tipA AA AAA
tipB BB BBB
tipC CC CCC
tipD DD DDD
tipE EE EEE
tipF FF FFF

0 comments on commit 6b1dbf6

Please sign in to comment.