Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Whitelist in matUtils extract + refactoring of introduce #198

Merged
merged 4 commits into from
Nov 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/matUtils/extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ po::variables_map parse_extract_command(po::parsed_options parsed) {
"Set to add to the sample selection the y nearest samples to each of your samples, without duplicates.")
("dump-metadata,Q", po::value<std::string>()->default_value(""),
"Set to write all final stored metadata to a tsv.")
("whitelist,L", po::value<std::string>()->default_value(""),
"Pass a list of samples, one per line, to always retain regardless of any other parameters.")
("threads,T", po::value<uint32_t>()->default_value(num_cores), num_threads_message.c_str())
("help,h", "Print help messages");
// Collect all the unrecognized options from the first pass. This will include the
Expand Down Expand Up @@ -128,6 +130,7 @@ void extract_main (po::parsed_options parsed) {
po::variables_map vm = parse_extract_command(parsed);
std::string input_mat_filename = vm["input-mat"].as<std::string>();
std::string input_samples_file = vm["samples"].as<std::string>();
std::string whitelist_samples_file = vm["whitelist"].as<std::string>();
std::string nearest_k = vm["nearest-k"].as<std::string>();
std::string nearest_k_batch_file = vm["nearest-k-batch"].as<std::string>();
std::string clade_choice = vm["clade"].as<std::string>();
Expand Down Expand Up @@ -411,6 +414,16 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) {
exit(1);
}
}
//make sure all whitelisted samples are included in the output after all other selection is performed.
if (whitelist_samples_file != "") {
fprintf(stderr, "Whitelisting samples...\n");
auto wsamples = read_sample_names(whitelist_samples_file);
for (auto w: wsamples) {
if (std::find(samples.begin(),samples.end(),w) == samples.end()) {
samples.push_back(w);
}
}
}
//before performing any other action, reroot the tree if requested.
if (reroot_node != "") {
reroot_tree(&T, reroot_node);
Expand Down
109 changes: 50 additions & 59 deletions src/matUtils/introduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,10 @@ std::unordered_map<std::string, float> get_assignments(MAT::Tree* T, std::unorde
*/

std::unordered_map<std::string, float> assignments;
auto dfs = T->depth_first_expansion();
for (auto n: dfs) {
std::unordered_map<std::string, size_t[4]> stored_params;
auto bfs = T->breadth_first_expansion();
std::reverse(bfs.begin(), bfs.end());
for (auto n: bfs) {
if (n->is_leaf()) {
//rule 1
if (sample_set.find(n->identifier) != sample_set.end()) {
Expand All @@ -292,84 +294,74 @@ std::unordered_map<std::string, float> get_assignments(MAT::Tree* T, std::unorde
assignments[n->identifier] = 0;
}
} else {
auto leaves = T->get_leaves_ids(n->identifier);
//to apply rules 2-3, we need to check the state of each leaf
std::unordered_set<std::string> in_leaves;
std::unordered_set<std::string> out_leaves;
for (auto l: leaves) {
if (sample_set.find(l) != sample_set.end()) {
in_leaves.insert(l);
size_t in_leaves = 0;
size_t out_leaves = 0;
//initialize these at large numbers.
size_t min_to_in = 10000000;
size_t min_to_out = 10000000;
for (auto c: n->children) {
if (!c->is_leaf()) {
//this should be present in the stored_params map, if traversal is working correctly
auto search = stored_params.find(c->identifier);
if (search != stored_params.end()) {
in_leaves += search->second[0];
out_leaves += search->second[1];
if (search->second[2] + c->mutations.size() < min_to_in) {
min_to_in = search->second[2] + c->mutations.size();
}
if (search->second[3] + c->mutations.size() < min_to_out) {
min_to_out = search->second[3] + c->mutations.size();
}
} else {
fprintf(stderr, "ERROR: traversal order failure, stored data unavailable\n");
exit(1);
}
} else {
out_leaves.insert(l);
if (sample_set.find(c->identifier) != sample_set.end()) {
in_leaves++;
if (c->mutations.size() < min_to_in) {
min_to_in = c->mutations.size();
}
} else {
out_leaves++;
if (c->mutations.size() < min_to_out) {
min_to_out = c->mutations.size();
}
}
}
}
if (out_leaves.size() == 0) {
stored_params[n->identifier][0] = in_leaves;
stored_params[n->identifier][1] = out_leaves;
stored_params[n->identifier][2] = min_to_in;
stored_params[n->identifier][3] = min_to_out;
// fprintf(stderr, "DEBUG: min %ld, mout %ld, ols %ld, ils %ld\n", min_to_in, min_to_out, out_leaves, in_leaves);
if (out_leaves == 0) {
//rule 2
assignments[n->identifier] = 1;
} else if (in_leaves.size() == 0) {
} else if (in_leaves == 0) {
//rule 3
assignments[n->identifier] = 0;
} else {
//rule 4...
//the best way to do this is to keep in mind that the nearest descendent leaf
//is going to be the next leaf encountered in DFS order
//so we're going to iterate over DFS until we encounter a leaf of each type and record those distances
size_t min_to_in = 0;
size_t min_to_out = 0;
for (auto d: T->depth_first_expansion(n)) {
if ((min_to_in > 0) & (min_to_out > 0)) {
//if both of these are assigned, we're done. break out of this loop and move on
break;
}
if (d->is_leaf()) {
if ((sample_set.find(d->identifier) != sample_set.end()) & (min_to_in == 0)) {
//found the nearest IN.
//rsearch back from it so that we're getting the direct path to the original node
size_t total_traveled = 0;
for (auto a: T->rsearch(d->identifier, true)) {
total_traveled += a->mutations.size();
if (a->identifier == n->identifier) {
//back to the original query, break here
min_to_in = total_traveled;
break;
}
}
} else if ((sample_set.find(d->identifier) == sample_set.end()) & (min_to_out == 0)) {
//found the nearest out
//same routine as above
size_t total_traveled = 0;
for (auto a: T->rsearch(d->identifier, true)) {
total_traveled += a->mutations.size();
if (a->identifier == n->identifier) {
//back to the original query, break here
min_to_out = total_traveled;
break;
}
}
}
}
}
//update to rule 4- 1 is IN, 0 is OUT, but we want to have in-between numbers to represent relative confidence.
//rule 4- 1 is IN, 0 is OUT, but we want to have in-between numbers to represent relative confidence.
//we calculate the balance by computing C=1/(1+((OUT_MD/OUT_LEAVES)/(IN_MD/IN_LEAVES)))
//C is near 0 when OUT is large, C is near 1 when IN is large, C is 0.5 when they are the same
//now we complete rule 4 by checking the balance.
if (min_to_in == 0) {
//this calculation is unnecessary in these cases.
//tiebreaker for both being 0 is IN with this ordering.
//identical IN sample, its IN.
//identical IN sample, its IN, because logically this ancestor did exist there at that time (just maybe elsewhere also)
assignments[n->identifier] = 1;
} else if (min_to_out == 0) {
assignments[n->identifier] = 0;
} else {
// fprintf(stderr, "DEBUG: min %ld, mout %ld, ols %ld, ils %ld\n", min_to_in, min_to_out, out_leaves.size(), in_leaves.size());
//unnecessary variable declarations because I was hitting floating point exceptions.
float vor = (static_cast<float>(min_to_out) / static_cast<float>(out_leaves.size()));
float vir = (static_cast<float>(min_to_in) / static_cast<float>(in_leaves.size()));
//not strictly necessary variable declarations, but makes debugging a bit easier
float vor = (static_cast<float>(min_to_out) / static_cast<float>(out_leaves));
float vir = (static_cast<float>(min_to_in) / static_cast<float>(in_leaves));
float r = (vir/vor);
float c = (1/(1+r));
if (isnan(c)) {
fprintf(stderr, "ERROR: Invalid introduction assignment calculation. Debug information follows.\n");
fprintf(stderr, "min %ld, mout %ld, ols %ld, ils %ld,", min_to_in, min_to_out, out_leaves.size(), in_leaves.size());
fprintf(stderr, "min %ld, mout %ld, ols %ld, ils %ld,", min_to_in, min_to_out, out_leaves, in_leaves);
fprintf(stderr, " vor %f, vir %f, r %f\n", vor, vir, r);
exit(1);
}
Expand Down Expand Up @@ -816,7 +808,6 @@ std::vector<std::string> find_introductions(MAT::Tree* T, std::unordered_map<std
continue;
}
ldatestr = boost::gregorian::to_simple_string(ldates.first) + "\t" + boost::gregorian::to_simple_string(ldates.second);
//diff = (ldates.second - ldates.first);
diff = (boost::gregorian::day_clock::universal_day() - ldates.first); //try weighting growth by current date to change top cluster display.
}
date_tracker[cs.first] = ldatestr;
Expand Down