Skip to content

Commit

Permalink
augur clades allows attribute name to be specified
Browse files Browse the repository at this point in the history
Previously the `augur clades` command produced a node-data JSON
which stored clade membership as the node-attr "clade_membership"
and defined the basal nodes of each clade with the node-attr
"clade_annotation". `augur export v2` interpreted the latter as a
special-case and turned it into a branch label of the same name.

The previous commit allowed `augur export` to be supplied node-data
JSONs with a `branches` dictionary. Here we update `augur clades`
to export data in this structure, which allows the user to specify
the keys to use via the `--attribute-name` arg.

This commit breaks backwards compatibility for pipelines as the default
attribute name is "clade". This will result in dataset (auspice) JSONs
with the same branch labelling as before, but with a different node-attr
(was "clade_membership", now "clade"). As `augur export v2` will make
colorings for all node-attrs in node-data JSONs, this will be
exported as a "clade" coloring with no changes needed, however auspice
config JSONs may now refer to a non-existent "clade_membership" key.

`augur export v2` has been updated to no longer special-case
`clade_membership` or `clade_annotation` node attrs. We print a
warning if an auspice config JSON refers to `clade_membership` to
help users update their configs.

Functional tests for `augur clades` have been added.

Closes #720
  • Loading branch information
jameshadfield committed Jun 15, 2021
1 parent 695bfec commit 422084d
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 46 deletions.
37 changes: 22 additions & 15 deletions augur/clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,12 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
mapping of node to clades
'''

clade_membership = {}
# We use the following dictionaries to store which clade nodes belong to.
# All nodes in a clade should appear in `clade_membership` while only one node should
# appear in `basal_clade_nodes`
(clade_membership, basal_clade_nodes) = ({}, {})
parents = get_parent_name_by_child_name_for_tree(tree)

# first pass to set all nodes to unassigned as precaution to ensure attribute is set
for node in tree.find_clades(order = 'preorder'):
clade_membership[node.name] = {'clade_membership': 'unassigned'}

# count leaves
for node in tree.find_clades(order = 'postorder'):
node.leaf_count = 1 if node.is_terminal() else np.sum([c.leaf_count for c in node])
Expand Down Expand Up @@ -136,7 +135,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
node.sequences[gene][pos] = d


# second pass to assign 'clade_annotation' to basal nodes within each clade
# store names of basal nodes of each clade in `basal_clade_nodes` and `clade_membership` dicts.
# if multiple nodes match, assign annotation to largest
# otherwise occasional unwanted cousin nodes get assigned the annotation
for clade_name, clade_alleles in clade_designations.items():
Expand All @@ -147,16 +146,17 @@ def assign_clades(clade_designations, all_muts, tree, ref=None):
sorted_nodes = sorted(node_counts, key=lambda x: x.leaf_count, reverse=True)
if len(sorted_nodes) > 0:
target_node = sorted_nodes[0]
clade_membership[target_node.name] = {'clade_annotation': clade_name, 'clade_membership': clade_name}
basal_clade_nodes[target_node.name] = clade_name
clade_membership[target_node.name] = clade_name # basal nodes are members of the clade

# third pass to propagate 'clade_membership'
# propagate 'clade_membership' to children nodes
# don't propagate if encountering 'clade_annotation'
for node in tree.find_clades(order = 'preorder'):
for child in node:
if 'clade_annotation' not in clade_membership[child.name]:
clade_membership[child.name]['clade_membership'] = clade_membership[node.name]['clade_membership']

return clade_membership
# if the child doesn't define the start of its own clade, but the parent belongs to a clade, then inherit that membership
if child.name not in basal_clade_nodes and node.name in clade_membership:
clade_membership[child.name] = clade_membership[node.name]
return (basal_clade_nodes, clade_membership)


def get_reference_sequence_from_root_node(all_muts, root_name):
Expand All @@ -181,6 +181,7 @@ def register_arguments(parser):
parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
parser.add_argument('--attribute-name', type=str, default="clade", help="name to use for clade membership & branch labels", required=False)
parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to')


Expand All @@ -205,8 +206,14 @@ def run(args):

clade_designations = read_in_clade_definitions(args.clades)

clade_membership = assign_clades(clade_designations, all_muts, tree, ref)
(basal_clade_nodes, clade_membership) = assign_clades(clade_designations, all_muts, tree, ref)

# create node_data for export as a JSON
node_data = {
'nodes': {node: {args.attribute_name: clade} for node,clade in clade_membership.items()},
'branches': {node: {'labels': {args.attribute_name: clade}} for node,clade in basal_clade_nodes.items()}
}

out_name = get_json_name(args)
write_json({'nodes': clade_membership}, out_name)
print("clades written to", out_name, file=sys.stdout)
write_json(node_data, out_name)
print(f"clades written to {out_name} using attribute name {args.attribute_name}", file=sys.stdout)
46 changes: 15 additions & 31 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,17 +92,9 @@ def are_mutations_defined(node_attrs):
return True
return False


def are_clades_defined(node_attrs):
def is_node_attr_defined(node_attrs, attr_name):
for node, data in node_attrs.items():
if data.get("clade_membership") or data.get("clade_annotation"):
return True
return False


def are_dates_defined(node_attrs):
for node, data in node_attrs.items():
if data.get("num_date"):
if data.get(attr_name):
return True
return False

Expand Down Expand Up @@ -163,7 +155,7 @@ def set_colorings(data_json, config, command_line_colorings, metadata_names, nod
def _get_type(key, trait_values):
# for some keys we know what the type must be
known_types = {
"clade_membership": "categorical",
"clade": "categorical",
"gt": "categorical",
"author": "categorical",
"num_date": "continuous"
Expand Down Expand Up @@ -200,7 +192,7 @@ def _get_title(key):
return config_title

# hardcoded fallbacks:
if key == "clade_membership":
if key == "clade":
return "Clade"
if key == "gt":
return "Genotype"
Expand Down Expand Up @@ -310,6 +302,12 @@ def _is_valid(coloring):
if key == "gt" and not are_mutations_defined(node_attrs):
warn("[colorings] You asked for mutations (\"gt\"), but none are defined on the tree. They cannot be used as a coloring.")
return False
if key == "clade_membership" and not trait_values:
# augur 12 & below defined clades via the key "clade_membership", not "clade".
# If an auspice_config file specifies this, and it is not present in any node-data, we print a warning.
# (Note that if "clade" is present in node-data, we automatically include it as a colouring.)
warn("You asked for a color-by for 'clade_membership' but this is now called 'clade'. You should update your auspice config file.")
return False
if key != "gt" and not trait_values:
warn("You asked for a color-by for trait '{}', but it has no values on the tree. It has been ignored.".format(key))
return False
Expand Down Expand Up @@ -348,11 +346,10 @@ def _get_colorings():
# add in genotype as a special case if (a) not already set and (b) the data supports it
if "gt" not in explicitly_defined_colorings and are_mutations_defined(node_attrs):
colorings.insert(0,{'key':'gt'})
if "num_date" not in explicitly_defined_colorings and are_dates_defined(node_attrs):
if "num_date" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "num_date"):
colorings.insert(0,{'key':'num_date'})
if "clade_membership" not in explicitly_defined_colorings and are_clades_defined(node_attrs):
colorings.insert(0,{'key':'clade_membership'})

if "clade" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "clade"):
colorings.insert(0,{'key':'clade'})
return colorings


Expand Down Expand Up @@ -714,8 +711,6 @@ def node_data_prop_is_normal_trait(name):
# those traits / keys / attrs which are not "special" and can be exported
# as normal attributes on nodes
excluded = [
"clade_annotation", # Clade annotation is label, not colorby!
"clade_membership", # will be auto-detected if it is available
"authors", # authors are set as a node property, not a trait property
"author", # see above
"vaccine", # vaccine info is stored as a "special" node prop
Expand Down Expand Up @@ -914,16 +909,6 @@ def transfer_mutations_to_branches(node_attrs, branch_attrs):
else:
branch_attrs[node_name]["labels"] = { "aa": aa_lab }

def transfer_clade_annotation_to_branches(node_attrs, branch_attrs):
for node_name, raw_data in node_attrs.items():
if "clade_annotation" in raw_data and is_valid(raw_data["clade_annotation"]):
if node_name not in branch_attrs:
branch_attrs[node_name] = {}
if 'labels' in branch_attrs[node_name]:
branch_attrs[node_name]["labels"]['clade'] = raw_data["clade_annotation"]
else:
branch_attrs[node_name]["labels"] = { "clade": raw_data["clade_annotation"] }

def transfer_branch_data_to_branch_attrs(branches_node_data, branch_attrs):
"""
Transfers information stored in node-data JSONs under "branches" to the `branch_attrs`.
Expand Down Expand Up @@ -968,12 +953,11 @@ def parse_node_data_and_metadata(T, node_data_files, metadata_file):
node_attrs[name][corrected_key] = value
node_data_names.add(corrected_key)

# third pass: create `branch_attrs` which includes certain traits supplied in `node_attrs`
# (e.g. mutations are coverted to branch attrs, and `clade_annotation` is interpreted as a label)
# third pass: create `branch_attrs` which includes a few special-case traits from `node_attrs`
# (e.g. mutations are converted from node attrs to branch attrs)
# as well as any branch labels supplied in node-data files.
branch_attrs = {}
transfer_mutations_to_branches(node_attrs, branch_attrs)
transfer_clade_annotation_to_branches(node_attrs, branch_attrs)
transfer_branch_data_to_branch_attrs(node_data.get('branches', {}), branch_attrs)

return (node_data, node_attrs, node_data_names, metadata_names, branch_attrs)
Expand Down
34 changes: 34 additions & 0 deletions tests/functional/clades.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
Integration tests for augur clades.

$ pushd "$TESTDIR" > /dev/null
$ export AUGUR="../../bin/augur"

Run augur clades without --attribute-name. We expect the name to be "clade"

$ ${AUGUR} clades \
> --tree clades/tree.nwk \
> --clades clades/clades.tsv \
> --mutations clades/nt_muts.json \
> --output-node-data "$TMP/default.json" > /dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades/expected-output-default.json" "$TMP/default.json"
{}

Run augur clades with a custom --attribute-name

$ ${AUGUR} clades \
> --tree clades/tree.nwk \
> --clades clades/clades.tsv \
> --mutations clades/nt_muts.json \
> --attribute-name custom \
> --output-node-data "$TMP/custom-attr.json" > /dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades/expected-output-custom-attr.json" "$TMP/custom-attr.json"
{}

Ensure the only change between runs of `augur clades` is the attr name used
$ cat "$TMP/default.json" | sed "s/clade/custom/" > "$TMP/default-now-custom.json"
$ diff -u "$TMP/default-now-custom.json" "$TMP/custom-attr.json"

Cleanup
$ rm -f "$TMP/default.json" "$TMP/custom-attr.json" "$TMP/default-now-custom.json"
12 changes: 12 additions & 0 deletions tests/functional/clades/clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
clade gene site alt

# the 1b mutation appears only once, on the branch leading to tips B and C
# thus we expect the clade label to be on node `internalBC`
cladeCB nuc 1 B
# the 2c mutation appears twice -- on branch `internalBC` and `internalDEF`
# as the latter has 3 descendants, it is chosen over the former
cladeDEF nuc 2 C
# mutation 3e appears only on a terminal node (tipE)
# but we still expect both a branch label and a node_attr
# this means that tipE should be annotated "cladeE" and _not_ "cladeDEF"
cladeE nuc 3 E
46 changes: 46 additions & 0 deletions tests/functional/clades/expected-output-custom-attr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"branches": {
"internalBC": {
"labels": {
"custom": "cladeCB"
}
},
"internalDEF": {
"labels": {
"custom": "cladeDEF"
}
},
"tipE": {
"labels": {
"custom": "cladeE"
}
}
},
"generated_by": {
"program": "augur",
"version": "12.0.0"
},
"nodes": {
"internalBC": {
"custom": "cladeCB"
},
"internalDEF": {
"custom": "cladeDEF"
},
"tipB": {
"custom": "cladeCB"
},
"tipC": {
"custom": "cladeCB"
},
"tipD": {
"custom": "cladeDEF"
},
"tipE": {
"custom": "cladeE"
},
"tipF": {
"custom": "cladeDEF"
}
}
}
46 changes: 46 additions & 0 deletions tests/functional/clades/expected-output-default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"branches": {
"internalBC": {
"labels": {
"clade": "cladeCB"
}
},
"internalDEF": {
"labels": {
"clade": "cladeDEF"
}
},
"tipE": {
"labels": {
"clade": "cladeE"
}
}
},
"generated_by": {
"program": "augur",
"version": "12.0.0"
},
"nodes": {
"internalBC": {
"clade": "cladeCB"
},
"internalDEF": {
"clade": "cladeDEF"
},
"tipB": {
"clade": "cladeCB"
},
"tipC": {
"clade": "cladeCB"
},
"tipD": {
"clade": "cladeDEF"
},
"tipE": {
"clade": "cladeE"
},
"tipF": {
"clade": "cladeDEF"
}
}
}
19 changes: 19 additions & 0 deletions tests/functional/clades/nt_muts.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"nodes": {
"tipA": {"muts": [], "aa_muts": {}},
"tipB": {"muts": [], "aa_muts": {}},
"tipC": {"muts": [], "aa_muts": {}},
"tipD": {"muts": [], "aa_muts": {}},
"tipE": {"muts": ["A3E"], "aa_muts": {}},
"tipF": {"muts": [], "aa_muts": {}},
"internalBC": {
"muts": ["A1B", "A2C"],
"aa_muts": {}
},
"internalDEF": {
"muts": ["A2C"],
"aa_muts": {}
},
"ROOT":{"muts": [], "aa_muts": {}}
}
}
1 change: 1 addition & 0 deletions tests/functional/clades/tree.nwk
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(tipA:1,(tipB:1,tipC:1)internalBC:2,(tipD:3,tipE:4,tipF:1)internalDEF:5)ROOT:1;

0 comments on commit 422084d

Please sign in to comment.