diff --git a/src/matUtils/convert.cpp b/src/matUtils/convert.cpp index cde799ac..520b699f 100644 --- a/src/matUtils/convert.cpp +++ b/src/matUtils/convert.cpp @@ -681,7 +681,7 @@ void get_minimum_subtrees(MAT::Tree* T, std::vector samples, size_t // Helper function to format one attribute into taxodium encoding for a SingleValuePerNode metadata type void populate_generic_metadata(int attribute_column, std::vector &attributes, std::unordered_map &seen_map, int &encoding_counter, Taxodium::MetadataSingleValuePerNode *single) { - if (attribute_column < (int) attributes.size() && attributes[attribute_column] != "") { + if (attributes[attribute_column] != "") { std::string attr_val = attributes[attribute_column]; if (seen_map.find(attr_val) == seen_map.end()) { encoding_counter++; @@ -699,16 +699,20 @@ void populate_generic_metadata(int attribute_column, std::vector &a // Helper function to populate non-generic metadata types that have mapping encodings. void populate_fixed_metadata(std::string name, int attribute_column, std::vector &attributes, std::unordered_map &seen_map, int &encoding_counter, Taxodium::AllData &all_data) { - if (seen_map.find(attributes[attribute_column]) == seen_map.end()) { - encoding_counter++; - std::string encoding_str = std::to_string(encoding_counter); - seen_map[attributes[attribute_column]] = encoding_str; - if (name == "date") { // only date for now - all_data.add_date_mapping(attributes[attribute_column]); - } - attributes[attribute_column] = encoding_str; + if (attributes[attribute_column] != "") { + if (seen_map.find(attributes[attribute_column]) == seen_map.end()) { + encoding_counter++; + std::string encoding_str = std::to_string(encoding_counter); + seen_map[attributes[attribute_column]] = encoding_str; + if (name == "date") { // only date for now + all_data.add_date_mapping(attributes[attribute_column]); + } + attributes[attribute_column] = encoding_str; + } else { + attributes[attribute_column] = seen_map[attributes[attribute_column]]; + } } else { - attributes[attribute_column] = seen_map[attributes[attribute_column]]; + attributes[attribute_column] = "0"; } } @@ -721,6 +725,7 @@ std::unordered_map> read_metafiles_tax(std * Additional fields to look for are specified with -F */ + int32_t date_ct = 0; all_data.add_date_mapping(""); std::unordered_map seen_dates_map; @@ -733,6 +738,7 @@ std::unordered_map> read_metafiles_tax(std // First parse all files into metadata map std::vector header; + int additional_fields = 0; // Number of new fields in each metadata file for (std::string f : filenames) { std::ifstream infile(f); if (!infile) { @@ -757,12 +763,15 @@ std::unordered_map> read_metafiles_tax(std } MAT::string_split(line, delim, words); if (first) { // header line + int field_count = 0; for (int i = 0; i < (int) words.size(); i++) { // for each column name header.push_back(words[i]); + field_count++; if (words[i] == "strain") { strain_column = i; } } + additional_fields = field_count; first = false; if (strain_column == -1) { fprintf(stderr, "The column \"strain\" (sample ID) is missing from at least one metadata file.\n"); @@ -775,23 +784,57 @@ std::unordered_map> read_metafiles_tax(std continue; // ignore duplicates in each metadata file } seen_in_this_file[key] = true; + + int prev_header_size = header.size() - additional_fields; + if (metadata.find(key) == metadata.end()) { // if we haven't seen this sample yet metadata[key] = std::vector(); + while (metadata[key].size() < prev_header_size) { + metadata[key].push_back(""); + } } - for (auto word : words) { + + for (int i = 0; i < words.size(); i++) { + // Check all metadata fields up to this one + // If the same field exists earlier, copy non-empty + // values into the first column of the field + std::string word = words[i]; metadata[key].push_back(word); + for (int j = 0; j < prev_header_size; j++) { + if (header[j] == header[prev_header_size + i]) { + if (words[i] != "") { + metadata[key][j] = words[i]; + } + break; + } + } } - if (metadata[key].size() == header.size() - 1) { - metadata[key].push_back(""); // the case where the last column is empty + + // fills out empty columns + while(metadata[key].size() < header.size()) { + metadata[key].push_back(""); } + } infile.close(); } + for(auto &v : metadata) { + // fill out empty columns + while(metadata[v.first].size() < header.size()) { + metadata[v.first].push_back(""); // handles empty metadata in the last columns + } + } // Then use map to make taxodium encodings and check for defined/generic fields - // if multiple columns define the same field, the last occurrence is picked + // If the same column is present in multiple metadata files (or multiple times in a file), + // the non-empty values are condensed into the first column of that name. + std::unordered_map done_fields; for (int i = 0; i < (int) header.size(); i++) { std::string field = header[i]; + if (done_fields.find(field) != done_fields.end()) { + continue; // already included this field + } + done_fields[field] = true; if (field == "strain") { columns.strain_column = i; } else if (field == "genbank_accession") { @@ -865,12 +908,11 @@ std::unordered_map> read_metafiles_tax(std } - fprintf(stderr, "\nPerforming conversion.\n"); return metadata; } -void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector meta_filenames, std::string gtf_filename, std::string fasta_filename, std::string title, std::string description, std::vector additional_meta_fields) { +void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector meta_filenames, std::string gtf_filename, std::string fasta_filename, std::string title, std::string description, std::vector additional_meta_fields, float x_scale) { // These are the taxodium pb objects Taxodium::AllNodeData *node_data = new Taxodium::AllNodeData(); @@ -892,7 +934,7 @@ void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector< TIMEIT(); // Fill in the taxodium data while doing aa translations - translate_and_populate_node_data(&tree, gtf_filename, fasta_filename, node_data, &all_data, metadata, columns, generic_metadata); + translate_and_populate_node_data(&tree, gtf_filename, fasta_filename, node_data, &all_data, metadata, columns, generic_metadata, x_scale); all_data.set_allocated_node_data(node_data); // Boost library used to stream the contents to the output protobuf file in diff --git a/src/matUtils/convert.hpp b/src/matUtils/convert.hpp index 4c578f9f..1ae2a270 100644 --- a/src/matUtils/convert.hpp +++ b/src/matUtils/convert.hpp @@ -6,5 +6,5 @@ void write_json_from_mat(MAT::Tree* T, std::string output_filename, std::vector< MAT::Tree load_mat_from_json(std::string json_filename); void get_minimum_subtrees(MAT::Tree* T, std::vector samples, size_t target_size, std::string output_dir, std::vector>>* catmeta, std::string json_n, std::string newick_n, bool retain_original_branch_len = false); std::vector get_nearby (MAT::Tree* T, std::string sample_id, int number_to_get); -void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector meta_filenames, std::string gtf_filename, std::string fasta_filename, std::string title, std::string description, std::vector additional_meta_fields); +void save_taxodium_tree (MAT::Tree &tree, std::string out_filename, std::vector meta_filenames, std::string gtf_filename, std::string fasta_filename, std::string title, std::string description, std::vector additional_meta_fields, float x_scale); std::unordered_map> read_metafiles_tax(std::vector filenames, Taxodium::AllData &all_data, Taxodium::AllNodeData *node_data, MetaColumns &columns, std::vector &generic_metadata, std::vector additional_meta_fields); diff --git a/src/matUtils/extract.cpp b/src/matUtils/extract.cpp index 9ca6a0f3..3dc27e1b 100644 --- a/src/matUtils/extract.cpp +++ b/src/matUtils/extract.cpp @@ -69,6 +69,8 @@ po::variables_map parse_extract_command(po::parsed_options parsed) { "Use to write a newick tree to the indicated file.") ("write-taxodium,l", po::value()->default_value(""), "Write protobuf in alternate format consumed by Taxodium.") + ("x-scale,G", po::value()->default_value(0.2), + "Specifies custom X-axis scaling value for Taxodium output. Not necessary for UShER SARS-CoV-2 trees.") ("title,B", po::value()->default_value("mutation_annotated_tree"), "Title of MAT to display in Taxodium or Auspice (used with --write-taxodium or -j).") ("description,D", po::value()->default_value(""), @@ -147,6 +149,7 @@ void extract_main (po::parsed_options parsed) { bool limit_lca = vm["limit-to-lca"].as(); size_t add_random = vm["add-random"].as(); size_t select_nearest = vm["select-nearest"].as(); + float x_scale = vm["x-scale"].as(); boost::filesystem::path path(dir_prefix); if (!boost::filesystem::exists(path)) { @@ -743,7 +746,7 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) { if (!resolve_polytomies) { subtree.condense_leaves(); } - save_taxodium_tree(subtree, output_tax_filename, metav, gtf_filename, fasta_filename, tax_title, tax_description, additional_meta_fields); + save_taxodium_tree(subtree, output_tax_filename, metav, gtf_filename, fasta_filename, tax_title, tax_description, additional_meta_fields, x_scale); fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); } if (dump_metadata != dir_prefix) { diff --git a/src/matUtils/translate.cpp b/src/matUtils/translate.cpp index c12f05ba..340112ae 100644 --- a/src/matUtils/translate.cpp +++ b/src/matUtils/translate.cpp @@ -29,7 +29,14 @@ std::string build_reference(std::ifstream &fasta_file) { return reference_output; } - +char complement(char nt) { + auto it = complement_map.find(nt); + if (it == complement_map.end()) { + return 'N'; // ambiguous, couldn't resolve nt + } else { + return it->second; + } +} // Maps a genomic coordinate to a list of codons it is part of std::unordered_map>> build_codon_map(std::ifstream >f_file, std::string reference) { @@ -37,10 +44,14 @@ std::unordered_map>> build_codon_map(std std::string gtf_line; std::vector gtf_lines; std::vector done; + while (std::getline(gtf_file, gtf_line)) { gtf_lines.push_back(gtf_line); } + int curr_line = -1; for (std::string line_outer : gtf_lines) { + curr_line += 1; + if (line_outer[0] == '#' || line_outer[0] == '\n') { continue; } @@ -54,6 +65,7 @@ std::unordered_map>> build_codon_map(std } std::string feature_outer = split_line_outer[2]; std::string gene_outer = split(split(split_line_outer[8], '\"')[1], '\"')[0]; + char strand_outer = split_line_outer[6][0]; if (feature_outer == "CDS") { bool found = (std::find(done.begin(), done.end(), gene_outer) != done.end()); @@ -66,77 +78,155 @@ std::unordered_map>> build_codon_map(std int first_cds_start = std::stoi(split_line_outer[3]); // expect the GTF is ordered by start position int first_cds_stop = std::stoi(split_line_outer[4]); int codon_counter = 0; // the number of codons we have added so far - for (int pos = first_cds_start - 1; pos < first_cds_stop; pos += 3) { - - char nt[3] = { - reference[pos], - reference[pos+1], - reference[pos+2] - }; - - // Coordinates are 0-based at this point - std::shared_ptr c(new Codon(gene_outer, codon_counter, pos, nt)); - codon_counter += 1; - - // The current pos and the next positions - // are associated with this codon - auto it = codon_map.find(pos); - if (it == codon_map.end()) { - codon_map.insert({pos, {c}}); - } else { - (it->second).push_back(c); - } + if (strand_outer == '+') { + for (int pos = first_cds_start - 1; pos < first_cds_stop; pos += 3) { + + char nt[3] = { + reference[pos], + reference[pos+1], + reference[pos+2] + }; + + // Coordinates are 0-based at this point + std::shared_ptr c(new Codon(gene_outer, codon_counter, pos, nt)); + codon_counter += 1; + + // The current pos and the next positions + // are associated with this codon + auto it = codon_map.find(pos); + if (it == codon_map.end()) { + codon_map.insert({pos, {c}}); + } else { + (it->second).push_back(c); + } - it = codon_map.find(pos+1); - if (it == codon_map.end()) { - codon_map.insert({pos+1, {c}}); - } else { - (it->second).push_back(c); + it = codon_map.find(pos+1); + if (it == codon_map.end()) { + codon_map.insert({pos+1, {c}}); + } else { + (it->second).push_back(c); + } + + it = codon_map.find(pos+2); + if (it == codon_map.end()) { + codon_map.insert({pos+2, {c}}); + } else { + (it->second).push_back(c); + } } + } else { + for (int pos = first_cds_stop - 1; pos > first_cds_start; pos -= 3) { + + char nt[3] = { + complement(reference[pos]), + complement(reference[pos-1]), + complement(reference[pos-2]) + }; + + // Coordinates are 0-based at this point + std::shared_ptr c(new Codon(gene_outer, codon_counter, pos, nt)); + codon_counter += 1; + + // The current pos and the next positions + // are associated with this codon + auto it = codon_map.find(pos); + if (it == codon_map.end()) { + codon_map.insert({pos, {c}}); + } else { + (it->second).push_back(c); + } - it = codon_map.find(pos+2); - if (it == codon_map.end()) { - codon_map.insert({pos+2, {c}}); - } else { - (it->second).push_back(c); + it = codon_map.find(pos-1); + if (it == codon_map.end()) { + codon_map.insert({pos-1, {c}}); + } else { + (it->second).push_back(c); + } + + it = codon_map.find(pos-2); + if (it == codon_map.end()) { + codon_map.insert({pos-2, {c}}); + } else { + (it->second).push_back(c); + } } } for (std::string line_inner : gtf_lines) { // find the rest of the CDS features, assuming they are in position order + + if (line_inner[0] == '#' || line_inner[0] == '\n') { + continue; + } std::vector split_line_inner = split(line_inner, '\t'); std::string feature_inner = split_line_inner[2]; std::string gene_inner = split(split(split_line_inner[8], '\"')[1], '\"')[0]; if (feature_inner == "CDS" && gene_outer == gene_inner) { int inner_cds_start = std::stoi(split_line_inner[3]); int inner_cds_stop = std::stoi(split_line_inner[4]); - if (inner_cds_start != first_cds_start) { - for (int pos = inner_cds_start - 1; pos < inner_cds_stop; pos += 3) { - char nt[3] = { - reference[pos], - reference[pos+1], - reference[pos+2] - }; - std::shared_ptr c(new Codon(gene_outer, codon_counter, pos, nt)); - codon_counter += 1; - - auto it = codon_map.find(pos); - if (it == codon_map.end()) { - codon_map.insert({pos, {c}}); - } else { - (it->second).push_back(c); - } - - it = codon_map.find(pos+1); - if (it == codon_map.end()) { - codon_map.insert({pos+1, {c}}); - } else { - (it->second).push_back(c); + char strand_inner = split_line_inner[6][0]; + if (strand_inner == '+') { + if (inner_cds_start != first_cds_start || strand_outer != strand_inner) { + for (int pos = inner_cds_start - 1; pos < inner_cds_stop; pos += 3) { + char nt[3] = { + reference[pos], + reference[pos+1], + reference[pos+2] + }; + std::shared_ptr c(new Codon(gene_outer, codon_counter, pos, nt)); + codon_counter += 1; + + auto it = codon_map.find(pos); + if (it == codon_map.end()) { + codon_map.insert({pos, {c}}); + } else { + (it->second).push_back(c); + } + + it = codon_map.find(pos+1); + if (it == codon_map.end()) { + codon_map.insert({pos+1, {c}}); + } else { + (it->second).push_back(c); + } + + it = codon_map.find(pos+2); + if (it == codon_map.end()) { + codon_map.insert({pos+2, {c}}); + } else { + (it->second).push_back(c); + } } - - it = codon_map.find(pos+2); - if (it == codon_map.end()) { - codon_map.insert({pos+2, {c}}); - } else { - (it->second).push_back(c); + } + } else { + if (inner_cds_start != first_cds_start || strand_outer != strand_inner) { + for (int pos = inner_cds_stop - 1; pos > inner_cds_start; pos -= 3) { + char nt[3] = { + complement(reference[pos]), + complement(reference[pos-1]), + complement(reference[pos-2]) + }; + std::shared_ptr c(new Codon(gene_outer, codon_counter, pos, nt)); + codon_counter += 1; + + auto it = codon_map.find(pos); + if (it == codon_map.end()) { + codon_map.insert({pos, {c}}); + } else { + (it->second).push_back(c); + } + + it = codon_map.find(pos-1); + if (it == codon_map.end()) { + codon_map.insert({pos-1, {c}}); + } else { + (it->second).push_back(c); + } + + it = codon_map.find(pos-2); + if (it == codon_map.end()) { + codon_map.insert({pos-2, {c}}); + } else { + (it->second).push_back(c); + } } } } @@ -202,7 +292,7 @@ void translate_main(MAT::Tree *T, std::string output_filename, std::string gtf_f } // This is used for taxodium output. It translates each node and saves metadata to node_data along the way -void translate_and_populate_node_data(MAT::Tree *T, std::string gtf_filename, std::string fasta_filename, Taxodium::AllNodeData *node_data, Taxodium::AllData *all_data, std::unordered_map> &metadata, MetaColumns fixed_columns, std::vector &generic_metadata) { +void translate_and_populate_node_data(MAT::Tree *T, std::string gtf_filename, std::string fasta_filename, Taxodium::AllNodeData *node_data, Taxodium::AllData *all_data, std::unordered_map> &metadata, MetaColumns fixed_columns, std::vector &generic_metadata, float x_scale) { std::ifstream fasta_file(fasta_filename); if (!fasta_file) { fprintf(stderr, "ERROR: Could not open the fasta file: %s!\n", fasta_filename.c_str()); @@ -304,7 +394,7 @@ void translate_and_populate_node_data(MAT::Tree *T, std::string gtf_filename, st } } - node_data->add_x(branch_length_map[node->identifier] * 0.2); + node_data->add_x(branch_length_map[node->identifier] * x_scale); node_data->add_y(0); // temp value, set later node_data->add_epi_isl_numbers(0); // not currently set node_data->add_num_tips(T->get_leaves(node->identifier).size()); diff --git a/src/matUtils/translate.hpp b/src/matUtils/translate.hpp index 5592ea6e..bf33f061 100644 --- a/src/matUtils/translate.hpp +++ b/src/matUtils/translate.hpp @@ -17,7 +17,7 @@ typedef struct { Taxodium::MetadataSingleValuePerNode *protobuf_data_ptr; } GenericMetadata; -static std::unordered_map translation_map= { +static std::unordered_map translation_map = { {"GCT", 'A'}, {"GCC", 'A'}, {"GCA", 'A'}, {"GCG", 'A'}, {"GCN", 'A'}, {"TGT", 'C'}, {"TGC", 'C'}, {"TGY", 'C'}, {"GAT", 'D'}, {"GAC", 'D'}, {"GAY", 'D'}, @@ -41,6 +41,13 @@ static std::unordered_map translation_map= { {"TAG", '*'}, {"TAA", '*'}, {"TGA", '*'} }; +static std::unordered_map complement_map = { + {'A', 'T'}, {'C', 'G'}, {'G', 'C'}, {'T', 'A'}, + {'M', 'K'}, {'R', 'Y'}, {'W', 'W'}, {'S', 'S'}, + {'Y', 'R'}, {'K', 'M'}, {'V', 'B'}, {'H', 'D'}, + {'D', 'H'}, {'B', 'V'}, {'N', 'N'} +}; + struct Codon { std::string orf_name; std::string nucleotides; @@ -62,7 +69,7 @@ struct Codon { // The nt to mutate is the difference between the // genomic coordinate of the mutated nt and the // starting coordinate of the codon - nucleotides[nuc_pos-start_position] = mutated_nuc; + nucleotides[abs(nuc_pos-start_position)] = mutated_nuc; protein = translate_codon(nucleotides); } @@ -88,6 +95,7 @@ struct Codon { std::string do_mutations(std::vector &mutations, std::unordered_map>> &codon_map, bool taxodium_format); void translate_main(MAT::Tree *T, std::string output_filename, std::string gff_filename, std::string fasta_filename); -void translate_and_populate_node_data(MAT::Tree *T, std::string gtf_filename, std::string fasta_filename, Taxodium::AllNodeData *node_data, Taxodium::AllData *all_data, std::unordered_map> &metadata, MetaColumns fixed_columns, std::vector &generic_metadata); +void translate_and_populate_node_data(MAT::Tree *T, std::string gtf_filename, std::string fasta_filename, Taxodium::AllNodeData *node_data, Taxodium::AllData *all_data, std::unordered_map> &metadata, MetaColumns fixed_columns, std::vector &generic_metadata, float x_scale); void cleanup_codon_map(std::unordered_map>> &codon_map); void undo_mutations(std::vector &mutations, std::unordered_map>> &codon_map); +char complement(char nt); \ No newline at end of file