Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

phylogeny: get_common_ancestors performance #41

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 50 additions & 54 deletions src/phylogeny/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::utils;
use color_eyre::eyre::{eyre, Report, Result, WrapErr};
use color_eyre::eyre::{eyre, ContextCompat, Report, Result, WrapErr};
use color_eyre::Help;
use itertools::Itertools;
use log::debug;
Expand All @@ -9,7 +9,6 @@ use petgraph::visit::{Dfs, IntoNodeReferences};
use petgraph::Direction;
use serde::{Deserialize, Serialize};
use serde_json;
use std::collections::HashMap;
use std::fs::File;
use std::io::Write;
use std::path::Path;
Expand Down Expand Up @@ -369,65 +368,62 @@ impl Phylogeny {
}

// NOTE(review): this span is a rendered diff whose +/- markers and indentation
// were stripped, so the removed and the added implementation are interleaved.
// Judging by the "50 additions & 54 deletions" header, the HashMap-based
// "Phase 1" / "Phase 2" sections are the removed side and the iterator-based
// sections ("mass pile ...", "get ancestors shared ...", "get the depths ...",
// "get the deepest ...") are the added side. Read each side on its own.
/// Identify the most recent common ancestor shared between all node names.
///
/// When `names` holds exactly one entry, that entry is returned unchanged.
/// Errors from `get_paths` are propagated with `?`.
// removed side: signature taking `&Vec<String>`
pub fn get_common_ancestor(&self, names: &Vec<String>) -> Result<String, Report> {
// added side: signature taking a slice; existing `&Vec<String>` call sites
// still compile through deref coercion.
pub fn get_common_ancestor(&self, names: &[String]) -> Result<String, Report> {
// if only one node name was provided, just return it
if names.len() == 1 {
let common_ancestor = names[0].clone();
return Ok(common_ancestor);
}

// ---- removed side begins ----
// Phase 1: Count up the ancestors shared between all named populations
// ancestor_counts: ancestor name -> names whose paths contain that ancestor
// ancestor_depths: ancestor name -> largest index seen for it within any path
let mut ancestor_counts: HashMap<String, Vec<String>> = HashMap::new();
let mut ancestor_depths: HashMap<String, isize> = HashMap::new();

for name in names {
// directly use the get_paths method over get_ancestors, because
// get_ancestors removes the self node name from the list,
// but some datasets have named internal nodes, so a listed
// node could be a common ancestor!
let ancestor_paths = self.get_paths("root", name, petgraph::Outgoing)?;

for ancestor_path in ancestor_paths {
// depth is the position of the node within the path
// (presumably 0 is "root" here; get_paths is defined outside this view)
for (depth, ancestor) in ancestor_path.iter().enumerate() {
let depth = depth as isize;
// add ancestor if first time encountered
ancestor_depths.entry(ancestor.clone()).or_insert(depth);

// recombinants can appear multiple times in ancestors, update
// depth map to use deepest one
if depth > ancestor_depths[ancestor] {
ancestor_depths.insert(ancestor.clone(), depth);
}
ancestor_counts
.entry(ancestor.clone())
.and_modify(|p| {
p.push(name.clone());
// NOTE(review): Vec::dedup only collapses *adjacent* duplicates. That
// covers one name reaching the same ancestor via several paths (those
// pushes are consecutive), but not a name repeated non-adjacently in
// `names`.
p.dedup();
})
.or_insert(vec![name.clone()]);
}
}
}
// ---- removed side pauses; added side begins ----
// mass pile of all ancestors of all named nodes
let ancestors: Vec<_> = names
.iter()
.map(|pop| {
// presumably every path between `pop` and "root" following incoming
// edges; get_paths is defined outside this view -- TODO confirm
let paths = self.get_paths(pop, "root", Direction::Incoming)?;
// flatten all paths and de-duplicate, so each node appears at most once
// per name; the count comparison below depends on this
let ancestors = paths.into_iter().flatten().unique().collect_vec();
debug!("{pop}: {ancestors:?}");
Ok(ancestors)
})
// stop at the first get_paths error
.collect::<Result<Vec<_>, Report>>()?
.into_iter()
.flatten()
.collect::<Vec<_>>();

// get ancestors shared by all sequences
let common_ancestors: Vec<_> = ancestors
.iter()
.unique()
.filter(|anc| {
// an ancestor is common when it occurs once per entry of `names`
// NOTE(review): this rescans the whole `ancestors` vector for every
// distinct ancestor.
let count = ancestors.iter().filter(|pop| pop == anc).count();
count == names.len()
})
.collect();

debug!("common_ancestors: {common_ancestors:?}");

// get the depths (distance to root) of the common ancestors
let depths = common_ancestors
.into_iter()
.map(|pop| {
// NOTE(review): get_paths is called a second time for each common ancestor
let paths = self.get_paths(pop, "root", Direction::Incoming)?;
// depth is the node count of the longest path; on equal lengths
// Iterator::max_by keeps the last one, and no paths at all yields an
// empty path, i.e. depth 0
let longest_path = paths
.into_iter()
.max_by(|a, b| a.len().cmp(&b.len()))
.unwrap_or_default();
let depth = longest_path.len();
debug!("{pop}: {depth}");
Ok((pop, depth))
})
.collect::<Result<Vec<_>, Report>>()?;

// ---- added side pauses; removed side resumes ----
// Phase 2: Find the highest depth ancestor shared between all
// "root" is the fallback when no shared ancestor has depth > 0
let mut common_ancestor = "root".to_string();
let mut max_depth = 0;

for (ancestor, populations) in ancestor_counts {
// Which ancestors were found in all populations?
if populations.len() == names.len() {
// Which ancestor has the max depth?

// NOTE(review): the message below is a plain string literal, so
// `{ancestor}` is printed verbatim rather than interpolated.
let depth = ancestor_depths
.get(&ancestor)
.cloned()
.expect("Ancestor {ancestor} was not found in ancestor depths.");
// strict comparison: HashMap iteration order decides between equally
// deep ancestors
if depth > max_depth {
max_depth = depth;
common_ancestor = ancestor;
}
}
}
// ---- removed side ends; added side resumes ----
// get the deepest (ie. most recent common ancestor)
// max_by yields None when `depths` is empty (e.g. `names` is empty or nothing
// is shared); `.context` turns that into an error instead of the removed
// side's silent "root" fallback. On equal depths the last element wins.
let deepest_ancestor = depths
.into_iter()
.max_by(|a, b| a.1.cmp(&b.1))
.context("Failed to get common ancestor.")?;

// tuple (population name, depth)
let common_ancestor = deepest_ancestor.0.to_string();

Ok(common_ancestor)
}
Expand Down
Loading