From f8126ef90a55a05144bf5a6dab3a850a7b346006 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson"
Date: Tue, 1 Sep 2020 10:27:54 -0400
Subject: [PATCH] [cudapoa] Add optional graph output in GFA format.

Adds methods for outputting the graph structure generated during
partial order alignment in GFA format, and a corresponding optional
command line argument (-G/--gfa) to the cudapoa application. The
argument is parsed in application_parameters.cpp and consumed in
main.cpp. -d/--dot and -G/--gfa share one output path; when both are
given, the last one on the command line wins.
---
 .../genomeworks/utils/graph.hpp               | 46 +++++++++++++++++++
 cudapoa/src/application_parameters.cpp        | 11 ++++-
 cudapoa/src/application_parameters.hpp        |  1 +
 cudapoa/src/main.cpp                          | 10 +++-
 4 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/common/base/include/claraparabricks/genomeworks/utils/graph.hpp b/common/base/include/claraparabricks/genomeworks/utils/graph.hpp
index ba28bf23d..7868659c9 100644
--- a/common/base/include/claraparabricks/genomeworks/utils/graph.hpp
+++ b/common/base/include/claraparabricks/genomeworks/utils/graph.hpp
@@ -160,6 +160,20 @@ class Graph
         }
     }
 
+    /// \brief Serialize node labels to GFA (v1) S (segment) lines
+    ///
+    /// Writes one line per entry of node_labels_ in the form "S<TAB>id<TAB>label".
+    ///
+    /// \param gfa_str Output string stream to write S lines to
+    void node_labels_to_gfa(std::ostringstream& gfa_str) const
+    {
+        for (const auto& iter : node_labels_)
+        {
+            // '\n' rather than std::endl: no need to flush an in-memory stream per line.
+            gfa_str << "S\t" << iter.first << "\t" << iter.second << '\n';
+        }
+    }
+
     /// \brief Serialize edges to dot format
     ///
     /// \param dot_str Output string stream to serialize labels to
@@ -175,6 +189,23 @@ class Graph
         }
     }
 
+    /// \brief Serialize edges to GFA (v1) L (link) lines
+    ///
+    /// Every edge is written as a link between the "+" orientations of its
+    /// source and sink nodes, with the overlap field left unspecified ("*").
+    ///
+    /// \param gfa_str Output string stream to write L lines to
+    void edges_to_gfa(std::ostringstream& gfa_str) const
+    {
+        for (const auto& iter : edges_)
+        {
+            const edge_t& edge          = iter.first;
+            const node_id_t edge_source = edge.first;
+            const node_id_t edge_sink   = edge.second;
+            gfa_str << "L\t" << edge_source << "\t+\t" << edge_sink << "\t+\t*\n";
+        }
+    }
+
     /// List of adjacent nodes per node ID
     std::unordered_map> adjacent_nodes_;
 
@@ -214,6 +245,21 @@ class DirectedGraph : public Graph
         }
     }
 
+    /// \brief Serialize graph structure to GFA (v1) format
+    ///
+    /// The output consists of a header (H) line, followed by the segment (S)
+    /// lines and then the link (L) lines. Every line ends with a newline.
+    ///
+    /// \return A string encoding the graph in GFA format
+    std::string serialize_to_gfa() const
+    {
+        std::ostringstream gfa_str;
+        gfa_str << "H\tVN:Z:1.0\n";
+        node_labels_to_gfa(gfa_str);
+        edges_to_gfa(gfa_str);
+        return gfa_str.str();
+    }
+
     /// \brief Serialize graph structure to dot format
     ///
     /// \return A string encoding the graph in dot format
diff --git a/cudapoa/src/application_parameters.cpp b/cudapoa/src/application_parameters.cpp
index 085803764..18ca652d2 100644
--- a/cudapoa/src/application_parameters.cpp
+++ b/cudapoa/src/application_parameters.cpp
@@ -40,6 +40,7 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
         {"band-mode", required_argument, 0, 'b'},
         {"band-width", required_argument, 0, 'w'},
         {"dot", required_argument, 0, 'd'},
+        {"gfa", required_argument, 0, 'G'},
         {"max-groups", required_argument, 0, 'M'},
         {"gpu-mem-alloc", required_argument, 0, 'R'},
         {"match", required_argument, 0, 'm'},
@@ -49,7 +50,7 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
         {"help", no_argument, 0, 'h'},
     };
 
-    std::string optstring = "i:ab:w:d:M:R:m:n:g:vh";
+    std::string optstring = "i:ab:w:d:G:M:R:m:n:g:vh";
 
     int32_t argument = 0;
     while ((argument = getopt_long(argc, argv, optstring.c_str(), options, nullptr)) != -1)
@@ -75,6 +76,11 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
         case 'd':
             graph_output_path = std::string(optarg);
+            output_gfa        = false; // -d and -G share one output path: the last one given wins
             break;
+        case 'G':
+            graph_output_path = std::string(optarg);
+            output_gfa        = true;
+            break;
         case 'M':
             max_groups = std::stoi(optarg);
             break;
@@ -188,6 +194,9 @@ void ApplicationParameters::help(int32_t exit_code)
         -d, --dot
             output path for printing graph in DOT format [disabled])"
               << R"(
+        -G, --gfa
+            output path for printing graph in GFA format [disabled])"
+              << R"(
         -M, --max-groups
             maximum number of POA groups to create from file (-1 for all, > 0 for limited) [-1]
             repeats groups if less groups are present than specified)"
diff --git a/cudapoa/src/application_parameters.hpp b/cudapoa/src/application_parameters.hpp
index 9364785ea..70bad0cdc 100644
--- a/cudapoa/src/application_parameters.hpp
+++ b/cudapoa/src/application_parameters.hpp
@@ -39,6 +39,7 @@ class ApplicationParameters
 
     std::vector input_paths;
     std::string graph_output_path;
+    bool output_gfa     = false;
     bool all_fasta      = true;
     bool msa            = false; // consensus by default
     BandMode band_mode  = BandMode::adaptive_band;
diff --git a/cudapoa/src/main.cpp b/cudapoa/src/main.cpp
index 5d571268f..a6256c806 100644
--- a/cudapoa/src/main.cpp
+++ b/cudapoa/src/main.cpp
@@ -235,7 +235,15 @@ int main(int argc, char* argv[])
                 batch->get_graphs(graph, graph_status);
                 for (auto& g : graph)
                 {
-                    graph_output << g.serialize_to_dot() << std::endl;
+                    if (parameters.output_gfa)
+                    {
+                        // serialize_to_gfa() already ends its last record with a newline.
+                        graph_output << g.serialize_to_gfa();
+                    }
+                    else
+                    {
+                        graph_output << g.serialize_to_dot() << std::endl;
+                    }
                 }
             }
 