Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

matUtils updates: new subcommand "fix"; improvements to MAT::Tree:move_node and mask --move-nodes #357

Merged
merged 3 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions src/matUtils/fix.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include "fix.hpp"

// Default: move node if it has at least 1 descendent
int default_min_descendent_count = 1;
int default_iterations = 1;

po::variables_map parse_fix_command(po::parsed_options parsed) {

po::variables_map vm;
po::options_description filt_desc("fix options");
filt_desc.add_options()
("input-mat,i", po::value<std::string>()->required(),
"Input mutation-annotated tree file [REQUIRED]")
("output-mat,o", po::value<std::string>()->required(),
"Path to output fixed mutation-annotated tree file [REQUIRED]")
("iterations,n", po::value<int>()->default_value(default_iterations),
"Number of iterations to run")
("min-descendent-count,c", po::value<int>()->default_value(default_min_descendent_count),
"Minimum number of descendents required to move a node")
("help,h", "Print help messages");
// Collect all the unrecognized options from the first pass. This will include the
// (positional) command name, so we need to erase that.
std::vector<std::string> opts = po::collect_unrecognized(parsed.options, po::include_positional);
opts.erase(opts.begin());

// Run the parser, with try/catch for help
try {
po::store(po::command_line_parser(opts)
.options(filt_desc)
.run(), vm);
po::notify(vm);
} catch(std::exception &e) {
std::cerr << filt_desc << std::endl;
// Return with error code 1 unless the user specifies help
if (vm.count("help"))
exit(0);
else
exit(1);
}
return vm;
}

static int fix_grandparent_reversions_r(MAT::Tree *T, MAT::Node *node, MAT::Node *ggp_node,
MAT::Node *gp_node, MAT::Node *p_node,
int min_descendent_count) {
// Recursively scan the tree for cases of grandparent-reversion, i.e. N > A > B > revA;
// when a case like that is found, move the revA node to be a child of N having mutation B.
int descendent_count = 0;
// First recursively descend to children, looking each one up by identifier in case it has
// been removed by the time we get to it:
std::vector<std::string> child_ids;
for (auto child: node->children) {
child_ids.push_back(child->identifier);
}
for (auto child_id: child_ids) {
MAT::Node *child = T->get_node(child_id);
if (child != NULL) {
descendent_count += fix_grandparent_reversions_r(T, child, gp_node, p_node, node,
min_descendent_count);
}
}
// Now determine whether this node has only one mutation that reverts its grandparent's only
// mutation; if so (and if parent has only one mut, othw parsimony score would increase),
// then move this node to be a child of its great-grandparent, with only its parent's mut.
if (ggp_node != NULL && node->mutations.size() == 1 && gp_node->mutations.size() == 1 &&
p_node->mutations.size() == 1) {
MAT::Mutation node_mut = node->mutations.front();
MAT::Mutation gp_mut = gp_node->mutations.front();
if (node_mut.position == gp_mut.position &&
node_mut.chrom == gp_mut.chrom &&
node_mut.mut_nuc == gp_mut.par_nuc &&
node_mut.par_nuc == gp_mut.mut_nuc &&
descendent_count >= min_descendent_count) {
fprintf(stderr, "Node %s mutation %s reverts grandparent %s's %s%s, moving to %s with %s (%d descendents)\n",
node->identifier.c_str(), node_mut.get_string().c_str(),
gp_node->identifier.c_str(),
(gp_mut.mut_nuc == gp_mut.ref_nuc ? "reversion " : ""),
gp_mut.get_string().c_str(),
ggp_node->identifier.c_str(),
p_node->mutations.front().get_string().c_str(), descendent_count);
node->mutations.clear();
node->mutations = std::vector<MAT::Mutation>(p_node->mutations);
T->move_node(node->identifier, ggp_node->identifier);
}
}
return descendent_count + 1;
}

static void fix_grandparent_reversions(MAT::Tree *T, int iterations, int min_descendent_count) {
// Find and fix nodes that reverse their grandparent's mutation.
// For each case of N > A > B > revA, move revA to N > B (if N > B already exists then move all
// children of revA to N > B).
int i;
for (i = 0; i < iterations; i++) {
fix_grandparent_reversions_r(T, T->root, NULL, NULL, NULL, min_descendent_count);
}
}

void fix_main(po::parsed_options parsed) {
po::variables_map vm = parse_fix_command(parsed);
std::string input_mat_filename = vm["input-mat"].as<std::string>();
std::string output_mat_filename = vm["output-mat"].as<std::string>();
int iterations = vm["iterations"].as<int>();
int min_descendent_count = vm["min-descendent-count"].as<int>();

// Load the input MAT
timer.Start();
fprintf(stderr, "Loading input MAT file %s.\n", input_mat_filename.c_str());
MAT::Tree T = MAT::load_mutation_annotated_tree(input_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());

// No need to uncondense the tree

timer.Start();
fprintf(stderr, "Fixing grandparent-reversions\n");
fix_grandparent_reversions(&T, iterations, min_descendent_count);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());

timer.Start();
fprintf(stderr, "Saving Final Tree\n");
MAT::save_mutation_annotated_tree(T, output_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}
3 changes: 3 additions & 0 deletions src/matUtils/fix.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#include "common.hpp"

void fix_main(po::parsed_options parsed);
8 changes: 6 additions & 2 deletions src/matUtils/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "extract.hpp"
#include "merge.hpp"
#include "introduce.hpp"
#include "fix.hpp"
#include "version.hpp"
#include <cstddef>

Expand All @@ -12,15 +13,15 @@ Timer timer;
int main (int argc, char** argv) {
po::options_description global("Command options");
global.add_options()
("command", po::value<std::string>(), "Command to execute. Valid options are annotate, mask, extract, uncertainty, and summary.")
("command", po::value<std::string>(), "Command to execute. Valid options are annotate, mask, extract, uncertainty, introduce, fix, merge, version, and summary.")
("subargs", po::value<std::vector<std::string> >(), "Command-specific arguments.");
po::positional_options_description pos;
pos.add("command",1 ).add("subargs", -1);
std::string cmd;
po::variables_map vm;
po::parsed_options parsed = po::command_line_parser(argc, argv).options(global).positional(pos).allow_unregistered().run();
//this help string shows up over and over, lets just define it once
std::string cnames[] = {"COMMAND","summary","extract","annotate","uncertainty","introduce", "merge", "mask", "version"};
std::string cnames[] = {"COMMAND","summary","extract","annotate","uncertainty","introduce", "merge", "mask", "fix", "version"};
std::string chelp[] = {
"DESCRIPTION\n\n",
"calculates basic statistics and counts samples, mutations, and clades in the input MAT\n\n",
Expand All @@ -30,6 +31,7 @@ int main (int argc, char** argv) {
"given sample region information, heuristically identifies points of geographic introduction along the phylogeny\n\n",
"merge all samples of two input MAT files into a single output MAT \n\n",
"masks the input samples\n\n",
"fixes grandparent-reversion structures\n\n",
"display version number\n\n"
};
try {
Expand Down Expand Up @@ -57,6 +59,8 @@ int main (int argc, char** argv) {
introduce_main(parsed);
} else if (cmd == "mask") {
mask_main(parsed);
} else if (cmd == "fix") {
fix_main(parsed);
} else if (cmd == "version") {
std::cerr << "matUtils (v" << PROJECT_VERSION << ")" << std::endl;
} else if (cmd == "help") {
Expand Down
155 changes: 114 additions & 41 deletions src/matUtils/mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,16 @@ void mask_main(po::parsed_options parsed) {
exit(1);
}
// Load input MAT and uncondense tree
fprintf(stderr, "Loading input MAT file %s.\n", input_mat_filename.c_str());
timer.Start();
MAT::Tree T = MAT::load_mutation_annotated_tree(input_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
//T here is the actual object.
if (T.condensed_nodes.size() > 0) {
fprintf(stderr, "Uncondensing condensed nodes.\n");
timer.Start();
T.uncondense_leaves();
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}

// If a restricted samples file was provided, perform masking procedure
Expand Down Expand Up @@ -103,12 +109,20 @@ void mask_main(po::parsed_options parsed) {

// Store final MAT to output file
if (output_mat_filename != "") {
fprintf(stderr, "Saving Final Tree\n");
if (recondense) {
timer.Start();
fprintf(stderr, "Collapsing tree...\n");
T.collapse_tree();
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
timer.Start();
fprintf(stderr, "Condensing leaves...\n");
T.condense_leaves();
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}
fprintf(stderr, "Saving Final Tree to %s\n", output_mat_filename.c_str());
timer.Start();
MAT::save_mutation_annotated_tree(T, output_mat_filename);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}
}

Expand Down Expand Up @@ -211,6 +225,8 @@ void restrictMutationsLocally (std::string mutations_filename, MAT::Tree* T, boo
delim = ',';
}
std::string rootid = T->root->identifier;
size_t total_masked = 0;
timer.Start();
while (std::getline(infile, line)) {
std::vector<std::string> words;
if (line[line.size()-1] == '\r') {
Expand All @@ -236,19 +252,27 @@ void restrictMutationsLocally (std::string mutations_filename, MAT::Tree* T, boo
}
// fprintf(stderr, "Masking mutation %s below node %s\n", ml.first.c_str(), ml.second.c_str());
for (auto n: T->depth_first_expansion(rn)) {
std::vector<MAT::Mutation> nmuts;
// The expected common case is to not match any mutations and have nothing to remove.
std::vector<MAT::Mutation> muts_to_remove;
for (auto& mut: n->mutations) {
if (match_mutations(mutobj, &mut)) {
instances_masked++;
} else {
nmuts.push_back(mut);
muts_to_remove.push_back(mut);
}
}
n->mutations = nmuts;
for (auto mut: muts_to_remove) {
auto iter = std::find(n->mutations.begin(), n->mutations.end(), mut);
n->mutations.erase(iter);
}
}
total_masked += instances_masked;
}
fprintf(stderr, "Completed in %ld msec \n", timer.Stop());
infile.close();
fprintf(stderr, "Masked a total of %lu mutations. Collapsing tree...\n", total_masked);
timer.Start();
T->collapse_tree();
fprintf(stderr, "Completed in %ld msec \n", timer.Stop());
}

void restrictSamples (std::string samples_filename, MAT::Tree& T) {
Expand Down Expand Up @@ -356,6 +380,43 @@ void restrictSamples (std::string samples_filename, MAT::Tree& T) {
}
}

std::unordered_set<std::string> mutation_set_from_node(MAT::Tree* T, MAT::Node* node, bool include_node, bool include_ancestors) {
std::unordered_set<std::string> mutations;
if (include_ancestors) {
for (auto an: T->rsearch(node->identifier, include_node)) {
for (auto mut: an->mutations) {
if (mut.is_masked()) {
continue;
}
MAT::Mutation mut_opposite = mut.copy();
//we check for whether this mutation is going to negate with something in the set
//by identifying its opposite and checking whether the opposite is already present on the traversal.
mut_opposite.par_nuc = mut.mut_nuc;
mut_opposite.mut_nuc = mut.par_nuc;
auto cml = mutations.find(mut_opposite.get_string());
if (cml != mutations.end()) {
mutations.erase(cml);
} else {
mutations.insert(mut.get_string());
}
}
}
} else if (include_node) {
for (auto mut: node->mutations) {
if (mut.is_masked()) {
continue;
}
mutations.insert(mut.get_string());
}
} else {
fprintf(stderr, "ERROR: mutation_set_from_node: at least one of include_node and include_ancestors should be true.\n");
exit(1);
}
return mutations;
}



void moveNodes (std::string node_filename, MAT::Tree* T) {
// Function to move nodes between two identical placement paths. That is, move the target node so that is a child of the indicated new parent node,
// but the current placement and new placement must involve exactly the same set of mutations for the move to be allowed.
Expand Down Expand Up @@ -394,49 +455,61 @@ void moveNodes (std::string node_filename, MAT::Tree* T) {
}
//accumulate the set of mutations belonging to the current and the new placement
//not counting mutations belonging to the target, but counting ones to the putative new parent.
std::unordered_set<std::string> curr_mutations;
std::unordered_set<std::string> new_mutations;
for (auto an: T->rsearch(mn->identifier, false)) {
for (auto mut: an->mutations) {
if (mut.is_masked()) {
continue;
}
MAT::Mutation mut_opposite = mut.copy();
//we check for whether this mutation is going to negate with something in the set
//by identifying its opposite and checking whether the opposite is already present on the traversal.
mut_opposite.par_nuc = mut.mut_nuc;
mut_opposite.mut_nuc = mut.par_nuc;
auto cml = curr_mutations.find(mut_opposite.get_string());
if (cml != curr_mutations.end()) {
curr_mutations.erase(cml);
std::unordered_set<std::string> curr_mutations = mutation_set_from_node(T, mn, false, true);
std::unordered_set<std::string> new_mutations = mutation_set_from_node(T, np, true, true);
if (curr_mutations == new_mutations) {
//we can now proceed with the move.
T->move_node(mn->identifier, np->identifier);
fprintf(stderr, "Move of node %s to node %s successful.\n", words[0].c_str(), words[1].c_str());
} else {
// Not quite the same; figure out whether we need to add a node under new parent.
fprintf(stderr, "The current (%s) and new (%s) node paths do not involve the same set of mutations.\n",
mn->identifier.c_str(), np->identifier.c_str());

std::unordered_set<std::string> extra_mutations;
size_t curr_in_new_count = 0;
for (auto mut: curr_mutations) {
if (new_mutations.find(mut) != new_mutations.end()) {
curr_in_new_count++;
} else {
curr_mutations.insert(mut.get_string());
extra_mutations.insert(mut);
}
}
}
for (auto an: T->rsearch(np->identifier, true)) {
for (auto mut: an->mutations) {
if (mut.is_masked()) {
continue;
if (extra_mutations.size() == 0 || curr_in_new_count != new_mutations.size()) {
fprintf(stderr, "ERROR: the new parent (%s) has mutations not found in the current node (%s); %ld in common, %ld in new\n",
np->identifier.c_str(), mn->identifier.c_str(), curr_in_new_count, new_mutations.size());
exit(1);
}
// Look for a child of np that already has extra_mutations. If there is such a child
// then move mn to that child. Otherwise add those mutations to mn and move it to np.
MAT::Node *child_with_muts = NULL;
for (auto child: np->children) {
std::unordered_set<std::string> mut_set = mutation_set_from_node(T, child, true, false);
if (mut_set == extra_mutations) {
child_with_muts = child;
fprintf(stderr, "Found child with extra_mutations: %s\n", child->identifier.c_str());
break;
}
MAT::Mutation mut_opposite = mut.copy();
mut_opposite.par_nuc = mut.mut_nuc;
mut_opposite.mut_nuc = mut.par_nuc;
auto cml = new_mutations.find(mut_opposite.get_string());
if (cml != new_mutations.end()) {
new_mutations.erase(cml);
} else {
new_mutations.insert(mut.get_string());
}
if (child_with_muts != NULL) {
T->move_node(mn->identifier, child_with_muts->identifier);
} else {
// Preserve chronological order expected by add_mutation by adding mn's mutations
// after extra_mutations instead of vice versa.
std::vector<MAT::Mutation>mn_mutations;
for (auto mut: mn->mutations) {
mn_mutations.push_back(mut);
}
mn->mutations.clear();
for (auto mut: extra_mutations) {
mn->add_mutation(*(MAT::mutation_from_string(mut)));
}
for (auto mut: mn_mutations) {
mn->add_mutation(mut);
}
T->move_node(mn->identifier, np->identifier);
}
}
if (curr_mutations != new_mutations) {
fprintf(stderr, "ERROR: The current and new placement paths do not involve the same set of mutations. Exiting\n");
exit(1);
}
//we can now proceed with the move.
T->move_node(mn->identifier, np->identifier);
fprintf(stderr, "Move of node %s to node %s successful.\n", words[0].c_str(), words[1].c_str());
}
fprintf(stderr, "All requested moves complete.\n");
}
Loading