Skip to content

Commit

Permalink
remove threads that visit same exon multiple times
Browse files Browse the repository at this point in the history
  • Loading branch information
jeizenga committed Sep 12, 2024
1 parent 1a588ed commit 042fb16
Showing 1 changed file with 66 additions and 30 deletions.
96 changes: 66 additions & 30 deletions src/transcriptome.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1628,14 +1628,18 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri

assert(haplotype_index.bidirectional());

// the return value
list<EditedTranscriptPath> edited_transcript_paths;

// boundary nodes for the exons (we save them for later while walking out the haplotypes)
vector<pair<vg::id_t, vg::id_t> > exon_node_ids;
exon_node_ids.reserve(cur_transcript.exons.size());

// exon paths for each exon and the threads that follow them, which we will use in the phase of construcint the edited transcript pathsw
vector<pair<vector<exon_nodes_t>, thread_ids_t> > haplotypes;
// for each haplotype, the index of its exons in the haplotypes vector and the next expected exon (only used for constructing haplotypes)
multimap<int32_t, pair<int32_t, int32_t> > haplotype_id_index;

for (size_t exon_idx = 0; exon_idx < cur_transcript.exons.size(); ++exon_idx) {

const Exon & cur_exon = cur_transcript.exons.at(exon_idx);
Expand All @@ -1650,38 +1654,33 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
// first position in downstream intron).
auto exon_haplotypes = get_exon_haplotypes(exon_node_ids.back().first, exon_node_ids.back().second, haplotype_index,
reference_samples, expected_length);

if (haplotypes.empty()) {

for (auto & exon_haplotype: exon_haplotypes) {

haplotypes.emplace_back(vector<exon_nodes_t>(1, exon_haplotype.first), exon_haplotype.second);
haplotypes.back().first.reserve(cur_transcript.exons.size());

for (auto & haplotype_id: exon_haplotype.second) {

haplotype_id_index.emplace(haplotype_id, make_pair(haplotypes.size() - 1, exon_idx + 1));
}
}

} else {

} else {
for (auto & exon_haplotype: exon_haplotypes) {

assert(!exon_haplotype.first.empty());
spp::sparse_hash_map<int32_t, uint32_t> extended_haplotypes;

for (auto & haplotype_id: exon_haplotype.second) {

auto haplotype_id_index_it_range = haplotype_id_index.equal_range(haplotype_id);
auto haplotype_id_index_it = haplotype_id_index_it_range.first;

while (haplotype_id_index_it != haplotype_id_index_it_range.second) {

if (exon_idx != haplotype_id_index_it->second.second) {

assert(haplotype_id_index_it->second.second < exon_idx);

haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it);
continue;
}
Expand All @@ -1690,36 +1689,36 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
pair<vector<exon_nodes_t>, thread_ids_t> * cur_haplotype = &haplotypes.at(haplotype_id_index_it->second.first);

if (extended_haplotypes.find(haplotype_id_index_it->second.first) != extended_haplotypes.end()) {

// we have already started to extend the haplotypes from this haplotype's previous exon haplotype set
assert(cur_haplotype->first.size() == exon_idx + 1);
haplotypes.at(extended_haplotypes.at(haplotype_id_index_it->second.first)).second.emplace_back(haplotype_id);
haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first);

haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first);
} else if (cur_haplotype->first.size() == exon_idx) {

// the next expected exon is this one, so we extend it by this exon and mark the extension as conly containing this thread so far
cur_haplotype->first.emplace_back(exon_haplotype.first);
cur_haplotype->second = {haplotype_id};
assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotype_id_index_it->second.first).second);

} else if (cur_haplotype->first.size() == exon_idx + 1) {

} else if (cur_haplotype->first.size() == exon_idx + 1) {
// the next expected exon is the following one, so we must have already added this one, so we must need divide this
// haplotype group into two
haplotypes.emplace_back(vector<exon_nodes_t>(cur_haplotype->first.begin(), cur_haplotype->first.end() - 1), thread_ids_t(1, haplotype_id));
haplotypes.back().first.emplace_back(exon_haplotype.first);

assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotypes.size() - 1).second);
haplotype_id_index_it->second.first = haplotypes.size() - 1;

} else {
haplotype_id_index_it->second.first = haplotypes.size() - 1;

} else {
// we must have missed an exon, so we have to erase this haplotype from the results
haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it);
continue;
}
}

++haplotype_id_index_it;
}
}
}
}
}
}

for (auto & haplotype: haplotypes) {
Expand Down Expand Up @@ -1853,7 +1852,7 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT & haplotype_index, const unordered_set<string>& reference_samples, const int32_t expected_length) const {

assert(expected_length > 0);

// Calculate the expected upperbound of the length between the two
// nodes (number of nodes).
const int32_t expected_length_upperbound = 1.1 * expected_length;
Expand Down Expand Up @@ -1888,7 +1887,7 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// Stop current extension if end node is reached.
if (gbwt::Node::id(cur_exon_haplotype.first.back()) == end_node) {

exon_haplotypes.emplace_back(cur_exon_haplotype.first, haplotype_index.locate(cur_exon_haplotype.second));
assert(exon_haplotypes.back().second.size() <= cur_exon_haplotype.second.size());

Expand Down Expand Up @@ -1928,7 +1927,6 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
}

if (!has_relevant_haplotype) {

exon_haplotype_queue.pop();
continue;
}
Expand All @@ -1938,23 +1936,21 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// End current extension if no outgoing edges exist.
if (out_edges.empty()) {

exon_haplotype_queue.pop();
continue;
}

auto out_edges_it = out_edges.begin();
++out_edges_it;
++out_edges_it; // skip the first edge (we will follow it in-place on the queue after the following loop)

while (out_edges_it != out_edges.end()) {

// Do not extend haplotypes that end within the exon.
if (out_edges_it->first != gbwt::ENDMARKER) {

auto extended_search = haplotype_index.extend(cur_exon_haplotype.second, out_edges_it->first);

// Add new extension to queue if not empty (haplotypes found).
if (!extended_search.empty()) {
if (!extended_search.empty()) {

exon_haplotype_queue.push(make_pair(cur_exon_haplotype.first, extended_search));
exon_haplotype_queue.back().first.emplace_back(out_edges_it->first);
Expand All @@ -1966,18 +1962,58 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// Do not extend haplotypes that end within the exon.
if (out_edges.begin()->first != gbwt::ENDMARKER) {

cur_exon_haplotype.first.emplace_back(out_edges.begin()->first);
cur_exon_haplotype.second = haplotype_index.extend(cur_exon_haplotype.second, out_edges.begin()->first);

// End current extension if empty (no haplotypes found).
if (cur_exon_haplotype.second.empty()) { exon_haplotype_queue.pop(); }
if (cur_exon_haplotype.second.empty()) {
exon_haplotype_queue.pop(); }

} else {

exon_haplotype_queue.pop();
}
}

// remove haplotypes that visit this exon more than once
// FIXME: we should instead use the R-index to get reachability information in addition to thread IDs
{
unordered_map<gbwt::size_type, pair<size_t, size_t>> seen_position;
bool found_duplicate = false;
for (size_t i = 0; i < exon_haplotypes.size(); ++i) {

auto& exon_haplotype_set = exon_haplotypes[i];

for (size_t j = 0; j < exon_haplotype_set.second.size(); ++j) {
auto it = seen_position.find(exon_haplotype_set.second[j]);
if (it != seen_position.end()) {
// mark this and the previous instance of this haplotype for removal
exon_haplotypes[it->second.first].second[it->second.second] = -1;
exon_haplotype_set.second[j] = -1;
found_duplicate = true;
}
else {
// there is no previous instance of this haplotype
seen_position[exon_haplotype_set.second[j]] = make_pair(i, j);
}
}
}

if (found_duplicate) {
// remove any duplicate thread IDs
for (auto& exon_haplotype_set : exon_haplotypes) {
auto new_end = remove_if(exon_haplotype_set.second.begin(), exon_haplotype_set.second.end(),
[](gbwt::size_type thread_id) {
return thread_id == (gbwt::size_type) -1;
});
exon_haplotype_set.second.resize(new_end - exon_haplotype_set.second.begin());
}
// remove any haplotype sets that are now empty
auto new_end = remove_if(exon_haplotypes.begin(), exon_haplotypes.end(), [](const pair<exon_nodes_t, thread_ids_t>& a) {
return a.second.empty();
});
exon_haplotypes.resize(new_end - exon_haplotypes.begin());
}
}

return exon_haplotypes;
}
Expand Down

1 comment on commit 042fb16

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch rna-tandem-dup-exon. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17421 seconds

Please sign in to comment.