Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

--write-taxodium output changes for other pathogens #191

Merged
merged 6 commits into from
Nov 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 59 additions & 17 deletions src/matUtils/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,7 @@ void get_minimum_subtrees(MAT::Tree* T, std::vector<std::string> samples, size_t

// Helper function to format one attribute into taxodium encoding for a SingleValuePerNode metadata type
void populate_generic_metadata(int attribute_column, std::vector<std::string> &attributes, std::unordered_map<std::string, std::string> &seen_map, int &encoding_counter, Taxodium::MetadataSingleValuePerNode *single) {
if (attribute_column < (int) attributes.size() && attributes[attribute_column] != "") {
if (attributes[attribute_column] != "") {
std::string attr_val = attributes[attribute_column];
if (seen_map.find(attr_val) == seen_map.end()) {
encoding_counter++;
Expand All @@ -699,16 +699,20 @@ void populate_generic_metadata(int attribute_column, std::vector<std::string> &a

// Helper function to populate non-generic metadata types that have mapping encodings.
void populate_fixed_metadata(std::string name, int attribute_column, std::vector<std::string> &attributes, std::unordered_map<std::string, std::string> &seen_map, int &encoding_counter, Taxodium::AllData &all_data) {
if (seen_map.find(attributes[attribute_column]) == seen_map.end()) {
encoding_counter++;
std::string encoding_str = std::to_string(encoding_counter);
seen_map[attributes[attribute_column]] = encoding_str;
if (name == "date") { // only date for now
all_data.add_date_mapping(attributes[attribute_column]);
}
attributes[attribute_column] = encoding_str;
if (attributes[attribute_column] != "") {
if (seen_map.find(attributes[attribute_column]) == seen_map.end()) {
encoding_counter++;
std::string encoding_str = std::to_string(encoding_counter);
seen_map[attributes[attribute_column]] = encoding_str;
if (name == "date") { // only date for now
all_data.add_date_mapping(attributes[attribute_column]);
}
attributes[attribute_column] = encoding_str;
} else {
attributes[attribute_column] = seen_map[attributes[attribute_column]];
}
} else {
attributes[attribute_column] = seen_map[attributes[attribute_column]];
attributes[attribute_column] = "0";
}
}

Expand All @@ -721,6 +725,7 @@ std::unordered_map<std::string, std::vector<std::string>> read_metafiles_tax(std
* Additional fields to look for are specified with -F
*/


int32_t date_ct = 0;
all_data.add_date_mapping("");
std::unordered_map<std::string, std::string> seen_dates_map;
Expand All @@ -733,6 +738,7 @@ std::unordered_map<std::string, std::vector<std::string>> read_metafiles_tax(std

// First parse all files into metadata map
std::vector<std::string> header;
int additional_fields = 0; // Number of new fields in each metadata file
for (std::string f : filenames) {
std::ifstream infile(f);
if (!infile) {
Expand All @@ -757,12 +763,15 @@ std::unordered_map<std::string, std::vector<std::string>> read_metafiles_tax(std
}
MAT::string_split(line, delim, words);
if (first) { // header line
int field_count = 0;
for (int i = 0; i < (int) words.size(); i++) { // for each column name
header.push_back(words[i]);
field_count++;
if (words[i] == "strain") {
strain_column = i;
}
}
additional_fields = field_count;
first = false;
if (strain_column == -1) {
fprintf(stderr, "The column \"strain\" (sample ID) is missing from at least one metadata file.\n");
Expand All @@ -775,23 +784,57 @@ std::unordered_map<std::string, std::vector<std::string>> read_metafiles_tax(std
continue; // ignore duplicates in each metadata file
}
seen_in_this_file[key] = true;

int prev_header_size = header.size() - additional_fields;

if (metadata.find(key) == metadata.end()) {
// if we haven't seen this sample yet
metadata[key] = std::vector<std::string>();
while (metadata[key].size() < prev_header_size) {
metadata[key].push_back("");
}
}
for (auto word : words) {

for (int i = 0; i < words.size(); i++) {
// Check all metadata fields up to this one
// If the same field exists earlier, copy non-empty
// values into the first column of the field
std::string word = words[i];
metadata[key].push_back(word);
for (int j = 0; j < prev_header_size; j++) {
if (header[j] == header[prev_header_size + i]) {
if (words[i] != "") {
metadata[key][j] = words[i];
}
break;
}
}
}
if (metadata[key].size() == header.size() - 1) {
metadata[key].push_back(""); // the case where the last column is empty

// fills out empty columns
while(metadata[key].size() < header.size()) {
metadata[key].push_back("");
}

}
infile.close();
}
for(auto &v : metadata) {
// fill out empty columns
while(metadata[v.first].size() < header.size()) {
metadata[v.first].push_back(""); // handles empty metadata in the last columns
}
}
// Then use map to make taxodium encodings and check for defined/generic fields
// if multiple columns define the same field, the last occurrence is picked
// If the same column is present in multiple metadata files (or multiple times in a file),
// the non-empty values are condensed into the first column of that name.
std::unordered_map<std::string, bool> done_fields;
for (int i = 0; i < (int) header.size(); i++) {
std::string field = header[i];
if (done_fields.find(field) != done_fields.end()) {
continue; // already included this field
}
done_fields[field] = true;
if (field == "strain") {
columns.strain_column = i;
} else if (field == "genbank_accession") {
Expand Down Expand Up @@ -865,12 +908,11 @@ std::unordered_map<std::string, std::vector<std::string>> read_metafiles_tax(std
}



fprintf(stderr, "\nPerforming conversion.\n");

return metadata;
}
void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector<std::string> meta_filenames, std::string gtf_filename, std::string fasta_filename, std::string title, std::string description, std::vector<std::string> additional_meta_fields) {
void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector<std::string> meta_filenames, std::string gtf_filename, std::string fasta_filename, std::string title, std::string description, std::vector<std::string> additional_meta_fields, float x_scale) {

// These are the taxodium pb objects
Taxodium::AllNodeData *node_data = new Taxodium::AllNodeData();
Expand All @@ -892,7 +934,7 @@ void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector<
TIMEIT();

// Fill in the taxodium data while doing aa translations
translate_and_populate_node_data(&tree, gtf_filename, fasta_filename, node_data, &all_data, metadata, columns, generic_metadata);
translate_and_populate_node_data(&tree, gtf_filename, fasta_filename, node_data, &all_data, metadata, columns, generic_metadata, x_scale);
all_data.set_allocated_node_data(node_data);

// Boost library used to stream the contents to the output protobuf file in
Expand Down
2 changes: 1 addition & 1 deletion src/matUtils/convert.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector<
MAT::Tree load_mat_from_json(std::string json_filename);
void get_minimum_subtrees(MAT::Tree* T, std::vector<std::string> samples, size_t target_size, std::string output_dir, std::vector<std::unordered_map<std::string,std::unordered_map<std::string,std::string>>>* catmeta, std::string json_n, std::string newick_n, bool retain_original_branch_len = false);
std::vector<std::string> get_nearby (MAT::Tree* T, std::string sample_id, int number_to_get);
void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector<std::string> meta_filenames, std::string gtf_filename, std::string fasta_filename, std::string title, std::string description, std::vector<std::string> additional_meta_fields);
void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector<std::string> meta_filenames, std::string gtf_filename, std::string fasta_filename, std::string title, std::string description, std::vector<std::string> additional_meta_fields, float x_scale);
std::unordered_map<std::string, std::vector<std::string>> read_metafiles_tax(std::vector<std::string> filenames, Taxodium::AllData &all_data, Taxodium::AllNodeData *node_data, MetaColumns &columns, std::vector<GenericMetadata> &generic_metadata, std::vector<std::string> additional_meta_fields);
5 changes: 4 additions & 1 deletion src/matUtils/extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ po::variables_map parse_extract_command(po::parsed_options parsed) {
"Use to write a newick tree to the indicated file.")
("write-taxodium,l", po::value<std::string>()->default_value(""),
"Write protobuf in alternate format consumed by Taxodium.")
("x-scale,G", po::value<float>()->default_value(0.2),
"Specifies custom X-axis scaling value for Taxodium output. Not necessary for UShER SARS-CoV-2 trees.")
("title,B", po::value<std::string>()->default_value("mutation_annotated_tree"),
"Title of MAT to display in Taxodium or Auspice (used with --write-taxodium or -j).")
("description,D", po::value<std::string>()->default_value(""),
Expand Down Expand Up @@ -147,6 +149,7 @@ void extract_main (po::parsed_options parsed) {
bool limit_lca = vm["limit-to-lca"].as<bool>();
size_t add_random = vm["add-random"].as<size_t>();
size_t select_nearest = vm["select-nearest"].as<size_t>();
float x_scale = vm["x-scale"].as<float>();

boost::filesystem::path path(dir_prefix);
if (!boost::filesystem::exists(path)) {
Expand Down Expand Up @@ -743,7 +746,7 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) {
if (!resolve_polytomies) {
subtree.condense_leaves();
}
save_taxodium_tree(subtree, output_tax_filename, metav, gtf_filename, fasta_filename, tax_title, tax_description, additional_meta_fields);
save_taxodium_tree(subtree, output_tax_filename, metav, gtf_filename, fasta_filename, tax_title, tax_description, additional_meta_fields, x_scale);
fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop());
}
if (dump_metadata != dir_prefix) {
Expand Down
Loading