Merge branch 'dev-v0.6.0' of https://github.com/clara-parabricks/Geno…

…meWorks into paf-compare
NVIDIA-Genomics-Research · Sep 14, 2020 · cc1a835 · cc1a835
2 parents bd9be38 + 0e9a6f3
commit cc1a835
Show file tree

Hide file tree

Showing 25 changed files with 845 additions and 659 deletions.
diff --git a/cudapoa/include/claraparabricks/genomeworks/cudapoa/batch.hpp b/cudapoa/include/claraparabricks/genomeworks/cudapoa/batch.hpp
@@ -76,7 +76,7 @@ struct BatchConfig
     BandMode band_mode;
 
     /// constructor- set upper limit parameters based on max_seq_sz and band_width
-    BatchConfig(int32_t max_seq_sz = 1024, int32_t max_seq_per_poa = 100, int32_t band_width = 256, BandMode banding = BandMode::full_band);
+    BatchConfig(int32_t max_seq_sz = 1024, int32_t max_seq_per_poa = 100, int32_t band_width = 256, BandMode banding = BandMode::full_band, float adapive_storage_factor = 2.0);
 
     /// constructor- set all parameters separately
     BatchConfig(int32_t max_seq_sz, int32_t max_consensus_sz, int32_t max_nodes_per_w,

diff --git a/cudapoa/include/claraparabricks/genomeworks/cudapoa/utils.hpp b/cudapoa/include/claraparabricks/genomeworks/cudapoa/utils.hpp
@@ -58,6 +58,7 @@ void get_multi_batch_sizes(std::vector<BatchConfig>& list_of_batch_sizes,
                            bool msa_flag                       = false,
                            int32_t band_width                  = 256,
                            BandMode band_mode                  = BandMode::adaptive_band,
+                           float adaptive_storage_factor       = 2.0f,
                            std::vector<int32_t>* bins_capacity = nullptr,
                            float gpu_memory_usage_quota        = 0.9,
                            int32_t mismatch_score              = -6,

diff --git a/cudapoa/samples/sample_cudapoa.cpp b/cudapoa/samples/sample_cudapoa.cpp
@@ -286,13 +286,18 @@ int main(int argc, char** argv)
             if (status == StatusType::success)
             {
                 // Check if all sequences in POA group wre added successfully.
+                int32_t num_dropped_seq = 0;
                 for (const auto& s : seq_status)
                 {
                     if (s == StatusType::exceeded_maximum_sequence_size)
                     {
-                        std::cerr << "Dropping sequence because sequence exceeded maximum size" << std::endl;
+                        num_dropped_seq++;
                     }
                 }
+                if (num_dropped_seq > 0)
+                {
+                    std::cerr << "Dropping " << num_dropped_seq << " sequence(s) in POA group " << batch_group_ids[i] << " because it exceeded maximum size" << std::endl;
+                }
                 i++;
             }
 

diff --git a/cudapoa/src/allocate_block.hpp b/cudapoa/src/allocate_block.hpp
@@ -253,17 +253,10 @@ class BatchBlock
         offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->outgoing_edge_count) * max_nodes_per_window_ * max_poas_);
         graph_details_d->incoming_edge_weights = reinterpret_cast<decltype(graph_details_d->incoming_edge_weights)>(&block_data_d_[offset_d_]);
         offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->incoming_edge_weights) * max_nodes_per_window_ * CUDAPOA_MAX_NODE_EDGES * max_poas_);
-        graph_details_d->outgoing_edge_weights = reinterpret_cast<decltype(graph_details_d->outgoing_edge_weights)>(&block_data_d_[offset_d_]);
-        offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->outgoing_edge_weights) * max_nodes_per_window_ * CUDAPOA_MAX_NODE_EDGES * max_poas_);
         graph_details_d->sorted_poa = reinterpret_cast<decltype(graph_details_d->sorted_poa)>(&block_data_d_[offset_d_]);
         offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->sorted_poa) * max_nodes_per_window_ * max_poas_);
         graph_details_d->sorted_poa_node_map = reinterpret_cast<decltype(graph_details_d->sorted_poa_node_map)>(&block_data_d_[offset_d_]);
         offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->sorted_poa_node_map) * max_nodes_per_window_ * max_poas_);
-        if (variable_bands_)
-        {
-            graph_details_d->node_distance_to_head = reinterpret_cast<decltype(graph_details_d->node_distance_to_head)>(&block_data_d_[offset_d_]);
-            offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->node_distance_to_head) * max_nodes_per_window_ * max_poas_);
-        }
         graph_details_d->sorted_poa_local_edge_count = reinterpret_cast<decltype(graph_details_d->sorted_poa_local_edge_count)>(&block_data_d_[offset_d_]);
         offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->sorted_poa_local_edge_count) * max_nodes_per_window_ * max_poas_);
         if (output_mask_ & OutputType::consensus)
@@ -332,10 +325,8 @@ class BatchBlock
         device_size_per_poa += sizeof(*GraphDetails<SizeT>::outgoing_edges) * max_nodes_per_graph * CUDAPOA_MAX_NODE_EDGES;                                                              // graph_details_d_->outgoing_edges
         device_size_per_poa += sizeof(*GraphDetails<SizeT>::outgoing_edge_count) * max_nodes_per_graph;                                                                                  // graph_details_d_->outgoing_edge_count
         device_size_per_poa += sizeof(*GraphDetails<SizeT>::incoming_edge_weights) * max_nodes_per_graph * CUDAPOA_MAX_NODE_EDGES;                                                       // graph_details_d_->incoming_edge_weights
-        device_size_per_poa += sizeof(*GraphDetails<SizeT>::outgoing_edge_weights) * max_nodes_per_graph * CUDAPOA_MAX_NODE_EDGES;                                                       // graph_details_d_->outgoing_edge_weights
         device_size_per_poa += sizeof(*GraphDetails<SizeT>::sorted_poa) * max_nodes_per_graph;                                                                                           // graph_details_d_->sorted_poa
         device_size_per_poa += sizeof(*GraphDetails<SizeT>::sorted_poa_node_map) * max_nodes_per_graph;                                                                                  // graph_details_d_->sorted_poa_node_map
-        device_size_per_poa += variable_bands ? sizeof(*GraphDetails<SizeT>::node_distance_to_head) * max_nodes_per_graph : 0;                                                           // graph_details_d_->node_distance_to_head
         device_size_per_poa += sizeof(*GraphDetails<SizeT>::sorted_poa_local_edge_count) * max_nodes_per_graph;                                                                          // graph_details_d_->sorted_poa_local_edge_count
         device_size_per_poa += (!msa_flag) ? sizeof(*GraphDetails<SizeT>::consensus_scores) * max_nodes_per_graph : 0;                                                                   // graph_details_d_->consensus_scores
         device_size_per_poa += (!msa_flag) ? sizeof(*GraphDetails<SizeT>::consensus_predecessors) * max_nodes_per_graph : 0;                                                             // graph_details_d_->consensus_predecessors

diff --git a/cudapoa/src/application_parameters.cpp b/cudapoa/src/application_parameters.cpp
@@ -39,6 +39,7 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
         {"msa", no_argument, 0, 'a'},
         {"band-mode", required_argument, 0, 'b'},
         {"band-width", required_argument, 0, 'w'},
+        {"adaptive-storage", required_argument, 0, 's'},
         {"dot", required_argument, 0, 'd'},
         {"max-groups", required_argument, 0, 'M'},
         {"gpu-mem-alloc", required_argument, 0, 'R'},
@@ -49,7 +50,7 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
         {"help", no_argument, 0, 'h'},
     };
 
-    std::string optstring = "i:ab:w:d:M:R:m:n:g:vh";
+    std::string optstring = "i:ab:w:s:d:M:R:m:n:g:vh";
 
     int32_t argument = 0;
     while ((argument = getopt_long(argc, argv, optstring.c_str(), options, nullptr)) != -1)
@@ -72,6 +73,9 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
         case 'w':
             band_width = std::stoi(optarg);
             break;
+        case 's':
+            adaptive_storage = std::stof(optarg);
+            break;
         case 'd':
             graph_output_path = std::string(optarg);
             break;
@@ -185,6 +189,9 @@ void ApplicationParameters::help(int32_t exit_code)
         -w, --band-width <int>
             band-width for banded alignment (must be multiple of 128) [256])"
               << R"(
+        -s, --adaptive-storage  <float>
+            factor to accommodate extra memory for adaptive score matrix. The factor represents ratio of adaptive-banded score matrix to static-banded score matrix [2.0])"
+              << R"(
         -d, --dot <file>
             output path for printing graph in DOT format [disabled])"
               << R"(

diff --git a/cudapoa/src/application_parameters.hpp b/cudapoa/src/application_parameters.hpp
@@ -48,6 +48,7 @@ class ApplicationParameters
     int32_t gap_score         = -8;
     int32_t match_score       = 8;
     double gpu_mem_allocation = 0.9;
+    float adaptive_storage    = 2.0f;
 
 private:
     /// \brief verifies input file formats

diff --git a/cudapoa/src/batch.cu b/cudapoa/src/batch.cu
@@ -31,7 +31,8 @@ namespace cudapoa
 {
 
 /// constructor- set other parameters based on a minimum set of input arguments
-BatchConfig::BatchConfig(int32_t max_seq_sz /*= 1024*/, int32_t max_seq_per_poa /*= 100*/, int32_t band_width /*= 256*/, BandMode banding /*= BandMode::full_band*/)
+BatchConfig::BatchConfig(int32_t max_seq_sz /*= 1024*/, int32_t max_seq_per_poa /*= 100*/, int32_t band_width /*= 256*/,
+                         BandMode banding /*= BandMode::full_band*/, float adapive_storage_factor /*= 2.0*/)
     /// ensure a 4-byte boundary alignment for any allocated buffer
     : max_sequence_size(max_seq_sz)
     , max_consensus_size(2 * max_sequence_size)
@@ -58,7 +59,7 @@ BatchConfig::BatchConfig(int32_t max_seq_sz /*= 1024*/, int32_t max_seq_per_poa
         max_nodes_per_graph    = cudautils::align<int32_t, CELLS_PER_THREAD>(4 * max_sequence_size);
         matrix_graph_dimension = cudautils::align<int32_t, CELLS_PER_THREAD>(max_nodes_per_graph);
         // 8 = CUDAPOA_BANDED_MATRIX_RIGHT_PADDING, *2 is to reserve extra memory for cases with extended band-width
-        matrix_sequence_dimension = cudautils::align<int32_t, CELLS_PER_THREAD>(2 * (alignment_band_width + 8));
+        matrix_sequence_dimension = cudautils::align<int32_t, CELLS_PER_THREAD>(adapive_storage_factor * (alignment_band_width + 8));
     }
 
     throw_on_negative(max_seq_sz, "max_sequence_size cannot be negative.");

diff --git a/cudapoa/src/cudapoa_add_alignment.cuh b/cudapoa/src/cudapoa_add_alignment.cuh
@@ -45,7 +45,6 @@ namespace cudapoa
  * @param[in] outgoing_edges              Device buffer with outgoing edges per node
  * @param[in] outgoing_edges_count        Device buffer with number of outgoing edges per node
  * @param[in] incoming_edge_w             Device buffer with weight of incoming edges
- * @param[in] outgoing_edge_w             Device buffer with weight of outgoing edges
  * @param[in] alignment_length            Total length of new alignment
  * @param[in] graph                       Device scratch space with sorted graph
  * @param[in] alignment_graph             Device buffer with nodes from graph in alignment
@@ -63,7 +62,7 @@ namespace cudapoa
  * @return Status code for any errors encountered.
  */
 
-template <typename SizeT>
+template <typename SizeT, bool MSA = false>
 __device__
     uint8_t
     addAlignmentToGraph(SizeT& new_node_count,
@@ -72,7 +71,7 @@ __device__
                         SizeT* node_alignments, uint16_t* node_alignment_count,
                         SizeT* incoming_edges, uint16_t* incoming_edge_count,
                         SizeT* outgoing_edges, uint16_t* outgoing_edge_count,
-                        uint16_t* incoming_edge_w, uint16_t* /*outgoing_edge_w*/,
+                        uint16_t* incoming_edge_w,
                         SizeT alignment_length,
                         SizeT* /*graph*/,
                         SizeT* alignment_graph,
@@ -85,8 +84,7 @@ __device__
                         uint16_t* outgoing_edges_coverage_count,
                         uint16_t s,
                         uint32_t max_sequences_per_poa,
-                        uint32_t max_limit_nodes_per_window,
-                        bool msa = false)
+                        uint32_t max_limit_nodes_per_window)
 {
     //printf("Running addition for alignment %d\n", alignment_length);
     SizeT head_node_id   = -1;
@@ -152,7 +150,7 @@ __device__
                     //printf("aligned nodes are %d\n", num_aligned_node);
                     SizeT aligned_node_id = -1;
                     //printf("looping through alignments\n");
-                    for (uint16_t n = 0; n < num_aligned_node; n++)
+                    for (int32_t n = 0; n < num_aligned_node; n++)
                     {
                         SizeT aid = node_alignments[graph_node_id * CUDAPOA_MAX_NODE_ALIGNMENTS + n];
                         if (nodes[aid] == read_base)
@@ -187,7 +185,7 @@ __device__
                         node_coverage_counts[curr_node_id] = 0;
                         SizeT new_node_alignments          = 0;
 
-                        for (uint16_t n = 0; n < num_aligned_node; n++)
+                        for (int32_t n = 0; n < num_aligned_node; n++)
                         {
                             SizeT aid                                                                         = node_alignments[graph_node_id * CUDAPOA_MAX_NODE_ALIGNMENTS + n];
                             uint16_t aid_count                                                                = node_alignment_count[aid];
@@ -214,7 +212,7 @@ __device__
             }
 
             // for msa generation
-            if (msa && (read_pos == 0))
+            if (MSA && (read_pos == 0))
             {
                 //begin node of the sequence, add its node_id (curr_node_id) to sequence_begin_nodes_ids
                 *sequence_begin_nodes_ids = curr_node_id;
@@ -226,7 +224,7 @@ __device__
             {
                 bool edge_exists  = false;
                 uint16_t in_count = incoming_edge_count[curr_node_id];
-                for (uint16_t e = 0; e < in_count; e++)
+                for (int32_t e = 0; e < in_count; e++)
                 {
                     if (incoming_edges[curr_node_id * CUDAPOA_MAX_NODE_EDGES + e] == head_node_id)
                     {
@@ -242,7 +240,7 @@ __device__
                     incoming_edge_count[curr_node_id]                                 = in_count + 1;
                     uint16_t out_count                                                = outgoing_edge_count[head_node_id];
                     outgoing_edges[head_node_id * CUDAPOA_MAX_NODE_EDGES + out_count] = curr_node_id;
-                    if (msa)
+                    if (MSA)
                     {
                         outgoing_edges_coverage_count[head_node_id * CUDAPOA_MAX_NODE_EDGES + out_count]                     = 1;
                         outgoing_edges_coverage[(head_node_id * CUDAPOA_MAX_NODE_EDGES + out_count) * max_sequences_per_poa] = s;
@@ -256,10 +254,10 @@ __device__
                         //printf("exceeded max edge count\n");
                     }
                 }
-                else if (msa) //if edge exists and for msa generation
+                else if (MSA) //if edge exists and for msa generation
                 {
                     uint16_t out_count = outgoing_edge_count[head_node_id];
-                    for (uint16_t e = 0; e < out_count; e++)
+                    for (int32_t e = 0; e < out_count; e++)
                     {
                         if (outgoing_edges[head_node_id * CUDAPOA_MAX_NODE_EDGES + e] == curr_node_id)
                         {
@@ -293,7 +291,7 @@ __global__ void addAlignmentKernel(uint8_t* nodes,
                                    SizeT* node_alignments, uint16_t* node_alignment_count,
                                    SizeT* incoming_edges, uint16_t* incoming_edge_count,
                                    SizeT* outgoing_edges, uint16_t* outgoing_edge_count,
-                                   uint16_t* incoming_edge_w, uint16_t* outgoing_edge_w,
+                                   uint16_t* incoming_edge_w,
                                    SizeT* alignment_length,
                                    SizeT* graph,
                                    SizeT* alignment_graph,
@@ -315,7 +313,7 @@ __global__ void addAlignmentKernel(uint8_t* nodes,
                                node_alignments, node_alignment_count,
                                incoming_edges, incoming_edge_count,
                                outgoing_edges, outgoing_edge_count,
-                               incoming_edge_w, outgoing_edge_w,
+                               incoming_edge_w,
                                *alignment_length,
                                graph,
                                alignment_graph,
@@ -339,7 +337,7 @@ void addAlignment(uint8_t* nodes,
                   SizeT* node_alignments, uint16_t* node_alignment_count,
                   SizeT* incoming_edges, uint16_t* incoming_edge_count,
                   SizeT* outgoing_edges, uint16_t* outgoing_edge_count,
-                  uint16_t* incoming_edge_w, uint16_t* outgoing_edge_w,
+                  uint16_t* incoming_edge_w,
                   SizeT* alignment_length,
                   SizeT* graph,
                   SizeT* alignment_graph,
@@ -359,7 +357,7 @@ void addAlignment(uint8_t* nodes,
                                         node_alignments, node_alignment_count,
                                         incoming_edges, incoming_edge_count,
                                         outgoing_edges, outgoing_edge_count,
-                                        incoming_edge_w, outgoing_edge_w,
+                                        incoming_edge_w,
                                         alignment_length,
                                         graph,
                                         alignment_graph,

diff --git a/cudapoa/src/cudapoa_batch.cuh b/cudapoa/src/cudapoa_batch.cuh
@@ -300,7 +300,7 @@ public:
             {
                 output_status.emplace_back(genomeworks::cudapoa::StatusType::success);
                 uint16_t num_seqs = input_details_h_->window_details[poa].num_seqs;
-                for (uint16_t i = 0; i < num_seqs; i++)
+                for (int32_t i = 0; i < num_seqs; i++)
                 {
                     char* c = reinterpret_cast<char*>(&(output_details_h_->multiple_sequence_alignments[(poa * max_sequences_per_poa_ + i) * batch_size_.max_consensus_size]));
                     msa[poa].emplace_back(std::string(c));
@@ -379,7 +379,7 @@ public:
                     DirectedGraph::node_id_t sink = n;
                     graph.set_node_label(sink, std::string(1, static_cast<char>(nodes[n])));
                     uint16_t num_edges = graph_details_h_->incoming_edge_count[poa * max_nodes_per_window_ + n];
-                    for (uint16_t e = 0; e < num_edges; e++)
+                    for (int32_t e = 0; e < num_edges; e++)
                     {
                         int32_t idx                         = poa * max_nodes_per_window_ * CUDAPOA_MAX_NODE_EDGES + n * CUDAPOA_MAX_NODE_EDGES + e;
                         DirectedGraph::node_id_t src        = graph_details_h_->incoming_edges[idx];