From c209460343152a6bea96cb113f6dd35e5ecde495 Mon Sep 17 00:00:00 2001
From: ramin <rmafi@nvidia.com>
Date: Mon, 30 Nov 2020 19:31:29 -0500
Subject: [PATCH 01/11] [cudapoa] defined decode_error() to generate proper
 error message

---
 .../genomeworks/cudapoa/cudapoa.hpp           |  7 ++++
 cudapoa/src/cudapoa.cpp                       | 33 ++++++++++++++++
 cudapoa/src/cudapoa_batch.cuh                 | 38 ++-----------------
 3 files changed, 43 insertions(+), 35 deletions(-)
diff --git a/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp b/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
index 5df1e9249..2fc483117 100644
--- a/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
+++ b/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <string>
+
 namespace claraparabricks
 {
 
@@ -45,6 +47,11 @@ enum StatusType
     exceeded_maximum_predecessor_distance
 };
 
+/// Generate corresponding error message for a given error type
+/// \param [in] error_type input error code
+/// \return corresponding error message
+std::string decode_error(StatusType error_type);
+
 /// Banding mode used in Needleman-Wunsch algorithm
 /// - full_band performs computations on full scores matrix, highest accuracy
 /// - static_band performs computations on a fixed band along scores matrix diagonal, fastest implementation
diff --git a/cudapoa/src/cudapoa.cpp b/cudapoa/src/cudapoa.cpp
index 9699c663d..7c889d0cf 100644
--- a/cudapoa/src/cudapoa.cpp
+++ b/cudapoa/src/cudapoa.cpp
@@ -34,6 +34,39 @@ StatusType Init()
     return StatusType::success;
 }
 
+std::string decode_error(StatusType error_type)
+{
+    std::string error_message;
+    switch (error_type)
+    {
+    case genomeworks::cudapoa::StatusType::node_count_exceeded_maximum_graph_size:
+        error_message = "Kernel Error:: Node count exceeded maximum nodes per graph in batch";
+        break;
+    case genomeworks::cudapoa::StatusType::edge_count_exceeded_maximum_graph_size:
+        error_message = "Kernel Error:: Edge count exceeded maximum edges per graph in batch";
+        break;
+    case genomeworks::cudapoa::StatusType::seq_len_exceeded_maximum_nodes_per_window:
+        error_message = "Kernel Error:: Sequence length exceeded maximum nodes per window in batch";
+        break;
+    case genomeworks::cudapoa::StatusType::loop_count_exceeded_upper_bound:
+        error_message = "Kernel Error:: Loop count exceeded upper bound in nw algorithm in batch";
+        break;
+    case genomeworks::cudapoa::StatusType::exceeded_adaptive_banded_matrix_size:
+        error_message = "Kernel Error:: Band width set for adaptive matrix allocation is too small in batch";
+        break;
+    case genomeworks::cudapoa::StatusType::exceeded_maximum_sequence_size:
+        error_message = "Kernel Error:: Consensus/MSA sequence size exceeded max sequence size in batch";
+        break;
+    case genomeworks::cudapoa::StatusType::exceeded_maximum_predecessor_distance:
+        error_message = "Kernel Error:: Set value for maximum predecessor distance in traceback NW is too small";
+        break;
+    default:
+        error_message = "Kernel Error:: Unknown error in batch";
+        break;
+    }
+    return error_message;
+}
+
 } // namespace cudapoa
 
 } // namespace genomeworks
diff --git a/cudapoa/src/cudapoa_batch.cuh b/cudapoa/src/cudapoa_batch.cuh
index 690d1628e..3fd5887ad 100644
--- a/cudapoa/src/cudapoa_batch.cuh
+++ b/cudapoa/src/cudapoa_batch.cuh
@@ -440,41 +440,9 @@ protected:
     void decode_cudapoa_kernel_error(genomeworks::cudapoa::StatusType error_type,
                                      std::vector<StatusType>& output_status)
     {
-        switch (error_type)
-        {
-        case genomeworks::cudapoa::StatusType::node_count_exceeded_maximum_graph_size:
-            GW_LOG_WARN("Kernel Error:: Node count exceeded maximum nodes per graph in batch {}\n", bid_);
-            output_status.emplace_back(error_type);
-            break;
-        case genomeworks::cudapoa::StatusType::edge_count_exceeded_maximum_graph_size:
-            GW_LOG_WARN("Kernel Error:: Edge count exceeded maximum edges per graph in batch {}\n", bid_);
-            output_status.emplace_back(error_type);
-            break;
-        case genomeworks::cudapoa::StatusType::seq_len_exceeded_maximum_nodes_per_window:
-            GW_LOG_WARN("Kernel Error:: Sequence length exceeded maximum nodes per window in batch {}\n", bid_);
-            output_status.emplace_back(error_type);
-            break;
-        case genomeworks::cudapoa::StatusType::loop_count_exceeded_upper_bound:
-            GW_LOG_WARN("Kernel Error:: Loop count exceeded upper bound in nw algorithm in batch {}\n", bid_);
-            output_status.emplace_back(error_type);
-            break;
-        case genomeworks::cudapoa::StatusType::exceeded_adaptive_banded_matrix_size:
-            GW_LOG_WARN("Kernel Error:: Band width set for adaptive matrix allocation is too small in batch {}\n", bid_);
-            output_status.emplace_back(error_type);
-            break;
-        case genomeworks::cudapoa::StatusType::exceeded_maximum_sequence_size:
-            GW_LOG_WARN("Kernel Error:: Consensus/MSA sequence size exceeded max sequence size in batch {}\n", bid_);
-            output_status.emplace_back(error_type);
-            break;
-        case genomeworks::cudapoa::StatusType::exceeded_maximum_predecessor_distance:
-            GW_LOG_WARN("Kernel Error:: Set value for maximum predecessor distance in traceback NW is too small {}\n", bid_);
-            output_status.emplace_back(error_type);
-            break;
-        default:
-            GW_LOG_WARN("Kernel Error:: Unknown error in batch {}\n", bid_);
-            output_status.emplace_back(error_type);
-            break;
-        }
+        std::string error_message = decode_error(error_type) + " {}\n";
+        GW_LOG_WARN(error_message.c_str(), bid_);
+        output_status.emplace_back(error_type);
     }
 
     // Add new partial order alignment to batch.

From 5e4f217c3443e67a8ae45eadc8a4a3d450bf521a Mon Sep 17 00:00:00 2001
From: ramin <rmafi@nvidia.com>
Date: Tue, 1 Dec 2020 11:12:50 -0500
Subject: [PATCH 02/11] [cudapoa] improved error messages in decode_error()

---
 .../genomeworks/cudapoa/cudapoa.hpp           |  5 +-
 cudapoa/src/cudapoa.cpp                       | 54 ++++++++++++-------
 cudapoa/src/cudapoa_batch.cuh                 |  5 +-
 3 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp b/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
index 2fc483117..1caaae56e 100644
--- a/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
+++ b/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
@@ -49,8 +49,9 @@ enum StatusType
 
 /// Generate corresponding error message for a given error type
 /// \param [in] error_type input error code
-/// \return corresponding error message
-std::string decode_error(StatusType error_type);
+/// \param [out] error_message corresponding error message
+/// \param [out] error_hint possible hint to resolve the error
+void decode_error(StatusType error_type, std::string& error_message, std::string& error_hint);
 
 /// Banding mode used in Needleman-Wunsch algorithm
 /// - full_band performs computations on full scores matrix, highest accuracy
diff --git a/cudapoa/src/cudapoa.cpp b/cudapoa/src/cudapoa.cpp
index 7c889d0cf..216227cd6 100644
--- a/cudapoa/src/cudapoa.cpp
+++ b/cudapoa/src/cudapoa.cpp
@@ -34,37 +34,55 @@ StatusType Init()
     return StatusType::success;
 }
 
-std::string decode_error(StatusType error_type)
+void decode_error(StatusType error_type, std::string& error_message, std::string& error_hint)
 {
-    std::string error_message;
     switch (error_type)
     {
-    case genomeworks::cudapoa::StatusType::node_count_exceeded_maximum_graph_size:
-        error_message = "Kernel Error:: Node count exceeded maximum nodes per graph in batch";
+    case StatusType::exceeded_maximum_poas:
+        error_message = "Kernel Error: Number of groups per batch exceeded maximum POAs";
+        error_hint    = "Suggestion  : Evaluate maximum number of groups per batch using BatchBlock::estimate_max_poas()";
         break;
-    case genomeworks::cudapoa::StatusType::edge_count_exceeded_maximum_graph_size:
-        error_message = "Kernel Error:: Edge count exceeded maximum edges per graph in batch";
+    case StatusType::exceeded_maximum_sequence_size:
+        error_message = "Kernel Error: Input read length or output consensus/MSA sequence length exceeded max sequence size";
+        error_hint    = "Suggestion  : Check BatchConfig::max_sequence_size and BatchConfig::max_consensus_size, increase if necessary";
         break;
-    case genomeworks::cudapoa::StatusType::seq_len_exceeded_maximum_nodes_per_window:
-        error_message = "Kernel Error:: Sequence length exceeded maximum nodes per window in batch";
+    case StatusType::exceeded_maximum_sequences_per_poa:
+        error_message = "Kernel Error: Exceeded maximum number of reads per POA";
+        error_hint    = "Suggestion  : Check BatchConfig::max_sequences_per_poa and increase if necessary";
         break;
-    case genomeworks::cudapoa::StatusType::loop_count_exceeded_upper_bound:
-        error_message = "Kernel Error:: Loop count exceeded upper bound in nw algorithm in batch";
+    case StatusType::node_count_exceeded_maximum_graph_size:
+        error_message = "Kernel Error: Node count exceeded maximum nodes per POA graph";
+        error_hint    = "Suggestion  : Check BatchConfig::max_nodes_per_graph and increase if necessary";
         break;
-    case genomeworks::cudapoa::StatusType::exceeded_adaptive_banded_matrix_size:
-        error_message = "Kernel Error:: Band width set for adaptive matrix allocation is too small in batch";
+    case StatusType::edge_count_exceeded_maximum_graph_size:
+        error_message = "Kernel Error: Edge count exceeded maximum edges per graph";
+        error_hint    = "Suggestion  : Check default value of CUDAPOA_MAX_NODE_EDGES, note that increasing this macro would increase memory usage per POA";
         break;
-    case genomeworks::cudapoa::StatusType::exceeded_maximum_sequence_size:
-        error_message = "Kernel Error:: Consensus/MSA sequence size exceeded max sequence size in batch";
+    case StatusType::exceeded_adaptive_banded_matrix_size:
+        error_message = "Kernel Error: Allocated buffer for score/traceback matrix in adaptive banding is not large enough";
+        error_hint    = "Suggestion  : Check BatchConfig::matrix_sequence_dimension and increase if necessary";
         break;
-    case genomeworks::cudapoa::StatusType::exceeded_maximum_predecessor_distance:
-        error_message = "Kernel Error:: Set value for maximum predecessor distance in traceback NW is too small";
+    case StatusType::loop_count_exceeded_upper_bound:
+        error_message = "Kernel Error: Traceback in Needleman-Wunsch algorithm failed";
+        error_hint    = "Suggestion  : You may retry with a different banding mode";
+        break;
+    case StatusType::output_type_unavailable:
+        error_message = "Kernel Error: Output type not available";
+        error_hint    = "Suggestion  : Check MSA/Consensus selection for output type";
+        break;
+    case StatusType::exceeded_maximum_predecessor_distance:
+        error_message = "Kernel Error: Set value for maximum predecessor distance in Needleman-Wunsch algorithm with traceback buffer is not large enough";
+        error_hint    = "Suggestion  : Check BatchConfig::max_banded_pred_distance and increase if necessary";
+        break;
+    case StatusType::generic_error:
+        error_message = "Kernel Error: Unknown error";
+        error_hint    = "";
         break;
     default:
-        error_message = "Kernel Error:: Unknown error in batch";
+        error_message = "Kernel Error: Unknown error";
+        error_hint    = "";
         break;
     }
-    return error_message;
 }
 
 } // namespace cudapoa
diff --git a/cudapoa/src/cudapoa_batch.cuh b/cudapoa/src/cudapoa_batch.cuh
index 3fd5887ad..70d7ee956 100644
--- a/cudapoa/src/cudapoa_batch.cuh
+++ b/cudapoa/src/cudapoa_batch.cuh
@@ -440,7 +440,10 @@ protected:
     void decode_cudapoa_kernel_error(genomeworks::cudapoa::StatusType error_type,
                                      std::vector<StatusType>& output_status)
     {
-        std::string error_message = decode_error(error_type) + " {}\n";
+        std::string error_message;
+        std::string error_hint;
+        decode_error(error_type, error_message, error_hint);
+        error_message = error_message + " in batch {}\n" + error_hint;
         GW_LOG_WARN(error_message.c_str(), bid_);
         output_status.emplace_back(error_type);
     }

From 6fb1760633b20f9170b1cb6ea200ffa5c04e5db5 Mon Sep 17 00:00:00 2001
From: ramin <rmafi@nvidia.com>
Date: Tue, 1 Dec 2020 11:27:55 -0500
Subject: [PATCH 03/11] [cudapoa] removed obsolete
 seq_len_exceeded_maximum_nodes_per_window error code; update python code

---
 .../claraparabricks/genomeworks/cudapoa/cudapoa.hpp       | 5 ++---
 cudapoa/src/cudapoa.cpp                                   | 8 ++++----
 pygenomeworks/genomeworks/cudapoa/cudapoa.pxd             | 3 ++-
 pygenomeworks/genomeworks/cudapoa/cudapoa.pyx             | 6 ++++--
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp b/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
index 1caaae56e..24f632c5c 100644
--- a/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
+++ b/cudapoa/include/claraparabricks/genomeworks/cudapoa/cudapoa.hpp
@@ -40,11 +40,10 @@ enum StatusType
     node_count_exceeded_maximum_graph_size,
     edge_count_exceeded_maximum_graph_size,
     exceeded_adaptive_banded_matrix_size,
-    seq_len_exceeded_maximum_nodes_per_window,
+    exceeded_maximum_predecessor_distance,
     loop_count_exceeded_upper_bound,
     output_type_unavailable,
-    generic_error,
-    exceeded_maximum_predecessor_distance
+    generic_error
 };
 
 /// Generate corresponding error message for a given error type
diff --git a/cudapoa/src/cudapoa.cpp b/cudapoa/src/cudapoa.cpp
index 216227cd6..d239c1235 100644
--- a/cudapoa/src/cudapoa.cpp
+++ b/cudapoa/src/cudapoa.cpp
@@ -62,6 +62,10 @@ void decode_error(StatusType error_type, std::string& error_message, std::string
         error_message = "Kernel Error: Allocated buffer for score/traceback matrix in adaptive banding is not large enough";
         error_hint    = "Suggestion  : Check BatchConfig::matrix_sequence_dimension and increase if necessary";
         break;
+    case StatusType::exceeded_maximum_predecessor_distance:
+        error_message = "Kernel Error: Set value for maximum predecessor distance in Needleman-Wunsch algorithm with traceback buffer is not large enough";
+        error_hint    = "Suggestion  : Check BatchConfig::max_banded_pred_distance and increase if necessary";
+        break;
     case StatusType::loop_count_exceeded_upper_bound:
         error_message = "Kernel Error: Traceback in Needleman-Wunsch algorithm failed";
         error_hint    = "Suggestion  : You may retry with a different banding mode";
@@ -70,10 +74,6 @@ void decode_error(StatusType error_type, std::string& error_message, std::string
         error_message = "Kernel Error: Output type not available";
         error_hint    = "Suggestion  : Check MSA/Consensus selection for output type";
         break;
-    case StatusType::exceeded_maximum_predecessor_distance:
-        error_message = "Kernel Error: Set value for maximum predecessor distance in Needleman-Wunsch algorithm with traceback buffer is not large enough";
-        error_hint    = "Suggestion  : Check BatchConfig::max_banded_pred_distance and increase if necessary";
-        break;
     case StatusType::generic_error:
         error_message = "Kernel Error: Unknown error";
         error_hint    = "";
diff --git a/pygenomeworks/genomeworks/cudapoa/cudapoa.pxd b/pygenomeworks/genomeworks/cudapoa/cudapoa.pxd
index eff1b0310..5f7f5bcbc 100644
--- a/pygenomeworks/genomeworks/cudapoa/cudapoa.pxd
+++ b/pygenomeworks/genomeworks/cudapoa/cudapoa.pxd
@@ -47,7 +47,8 @@ cdef extern from "claraparabricks/genomeworks/cudapoa/cudapoa.hpp" namespace "cl
         exceeded_maximum_sequences_per_poa
         node_count_exceeded_maximum_graph_size
         edge_count_exceeded_maximum_graph_size
-        seq_len_exceeded_maximum_nodes_per_window
+        exceeded_adaptive_banded_matrix_size
+        exceeded_maximum_predecessor_distance
         loop_count_exceeded_upper_bound
         output_type_unavailable
         generic_error
diff --git a/pygenomeworks/genomeworks/cudapoa/cudapoa.pyx b/pygenomeworks/genomeworks/cudapoa/cudapoa.pyx
index c78a638a7..8d52f5061 100644
--- a/pygenomeworks/genomeworks/cudapoa/cudapoa.pyx
+++ b/pygenomeworks/genomeworks/cudapoa/cudapoa.pyx
@@ -52,8 +52,10 @@ def status_to_str(status):
         return "node_count_exceeded_maximum_graph_size"
     elif status == cudapoa.edge_count_exceeded_maximum_graph_size:
         return "edge_count_exceeded_maximum_graph_size"
-    elif status == cudapoa.seq_len_exceeded_maximum_nodes_per_window:
-        return "seq_len_exceeded_maximum_nodes_per_window"
+    elif status == cudapoa.exceeded_adaptive_banded_matrix_size:
+        return "exceeded_adaptive_banded_matrix_size"
+    elif status == cudapoa.exceeded_maximum_predecessor_distance:
+        return "exceeded_maximum_predecessor_distance"
     elif status == cudapoa.loop_count_exceeded_upper_bound:
         return "loop_count_exceeded_upper_bound"
     elif status == cudapoa.output_type_unavailable:

From b80d7dfed43ea6cee25eafe6de7fae2c20a0aee3 Mon Sep 17 00:00:00 2001
From: ramin <rmafi@nvidia.com>
Date: Tue, 1 Dec 2020 13:13:52 -0500
Subject: [PATCH 04/11] [cudapoa] updated error message output in cudapoa-bin
 and sample-cudapoa

---
 cudapoa/samples/sample_cudapoa.cpp | 29 ++++++++++++++++++++++++-----
 cudapoa/src/main.cpp               | 29 ++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/cudapoa/samples/sample_cudapoa.cpp b/cudapoa/samples/sample_cudapoa.cpp
index 116995f9e..773e8ee17 100644
--- a/cudapoa/samples/sample_cudapoa.cpp
+++ b/cudapoa/samples/sample_cudapoa.cpp
@@ -64,6 +64,7 @@ std::unique_ptr<Batch> initialize_batch(bool msa, const BatchConfig& batch_size)
 void process_batch(Batch* batch, bool msa_flag, bool print, std::vector<int32_t>& list_of_group_ids, int id_offset)
 {
     batch->generate_poa();
+    std::string error_message, error_hint;
 
     StatusType status = StatusType::success;
     if (msa_flag)
@@ -75,14 +76,20 @@ void process_batch(Batch* batch, bool msa_flag, bool print, std::vector<int32_t>
         status = batch->get_msa(msa, output_status);
         if (status != StatusType::success)
         {
-            std::cerr << "Could not generate MSA for batch : " << status << std::endl;
+            decode_error(status, error_message, error_hint);
+            std::cerr << "Could not generate MSA for batch : " << std::endl;
+            std::cerr << error_message << std::endl
+                      << error_hint << std::endl;
         }
 
         for (int32_t g = 0; g < get_size(msa); g++)
         {
             if (output_status[g] != StatusType::success)
             {
-                std::cerr << "Error generating  MSA for POA group " << list_of_group_ids[g + id_offset] << ". Error type " << output_status[g] << std::endl;
+                decode_error(output_status[g], error_message, error_hint);
+                std::cerr << "Error generating  MSA for POA group " << list_of_group_ids[g + id_offset] << std::endl;
+                std::cerr << error_message << std::endl
+                          << error_hint << std::endl;
             }
             else
             {
@@ -106,14 +113,20 @@ void process_batch(Batch* batch, bool msa_flag, bool print, std::vector<int32_t>
         status = batch->get_consensus(consensus, coverage, output_status);
         if (status != StatusType::success)
         {
-            std::cerr << "Could not generate consensus for batch : " << status << std::endl;
+            decode_error(status, error_message, error_hint);
+            std::cerr << "Could not generate consensus for batch : " << std::endl;
+            std::cerr << error_message << std::endl
+                      << error_hint << std::endl;
         }
 
         for (int32_t g = 0; g < get_size(consensus); g++)
         {
             if (output_status[g] != StatusType::success)
             {
-                std::cerr << "Error generating consensus for POA group " << list_of_group_ids[g + id_offset] << ". Error type " << output_status[g] << std::endl;
+                decode_error(output_status[g], error_message, error_hint);
+                std::cerr << "Error generating  consensus for POA group " << list_of_group_ids[g + id_offset] << std::endl;
+                std::cerr << error_message << std::endl
+                          << error_hint << std::endl;
             }
             else
             {
@@ -213,6 +226,9 @@ int main(int argc, char** argv)
         }
     }
 
+    // for error code message
+    std::string error_message, error_hint;
+
     // analyze the POA groups and create a minimal set of batches to process them all
     std::vector<BatchConfig> list_of_batch_sizes;
     std::vector<std::vector<int32_t>> list_of_groups_per_batch;
@@ -304,7 +320,10 @@ int main(int argc, char** argv)
 
             if (status != StatusType::exceeded_maximum_poas && status != StatusType::success)
             {
-                std::cout << "Could not add POA group " << batch_group_ids[i] << " to batch " << b << ". Error code " << status << std::endl;
+                decode_error(status, error_message, error_hint);
+                std::cerr << "Could not add POA group " << batch_group_ids[i] << " to batch " << b << std::endl;
+                std::cerr << error_message << std::endl
+                          << error_hint << std::endl;
                 i++;
             }
         }
diff --git a/cudapoa/src/main.cpp b/cudapoa/src/main.cpp
index e68d0d2f6..2765492e5 100644
--- a/cudapoa/src/main.cpp
+++ b/cudapoa/src/main.cpp
@@ -67,6 +67,7 @@ std::unique_ptr<Batch> initialize_batch(int32_t mismatch_score,
 void process_batch(Batch* batch, bool msa_flag, bool print, std::vector<int32_t>& list_of_group_ids, int id_offset)
 {
     batch->generate_poa();
+    std::string error_message, error_hint;
 
     StatusType status = StatusType::success;
     if (msa_flag)
@@ -78,14 +79,20 @@ void process_batch(Batch* batch, bool msa_flag, bool print, std::vector<int32_t>
         status = batch->get_msa(msa, output_status);
         if (status != StatusType::success)
         {
-            std::cerr << "Could not generate MSA for batch : " << status << std::endl;
+            decode_error(status, error_message, error_hint);
+            std::cerr << "Could not generate MSA for batch : " << std::endl;
+            std::cerr << error_message << std::endl
+                      << error_hint << std::endl;
         }
 
         for (int32_t g = 0; g < get_size(msa); g++)
         {
             if (output_status[g] != StatusType::success)
             {
-                std::cerr << "Error generating  MSA for POA group " << list_of_group_ids[g + id_offset] << ". Error type " << output_status[g] << std::endl;
+                decode_error(output_status[g], error_message, error_hint);
+                std::cerr << "Error generating  MSA for POA group " << list_of_group_ids[g + id_offset] << std::endl;
+                std::cerr << error_message << std::endl
+                          << error_hint << std::endl;
             }
             else
             {
@@ -109,14 +116,20 @@ void process_batch(Batch* batch, bool msa_flag, bool print, std::vector<int32_t>
         status = batch->get_consensus(consensus, coverage, output_status);
         if (status != StatusType::success)
         {
-            std::cerr << "Could not generate consensus for batch : " << status << std::endl;
+            decode_error(status, error_message, error_hint);
+            std::cerr << "Could not generate consensus for batch : " << std::endl;
+            std::cerr << error_message << std::endl
+                      << error_hint << std::endl;
         }
 
         for (int32_t g = 0; g < get_size(consensus); g++)
         {
             if (output_status[g] != StatusType::success)
             {
-                std::cerr << "Error generating consensus for POA group " << list_of_group_ids[g + id_offset] << ". Error type " << output_status[g] << std::endl;
+                decode_error(output_status[g], error_message, error_hint);
+                std::cerr << "Error generating  consensus for POA group " << list_of_group_ids[g + id_offset] << std::endl;
+                std::cerr << error_message << std::endl
+                          << error_hint << std::endl;
             }
             else
             {
@@ -158,6 +171,9 @@ int main(int argc, char* argv[])
         }
     }
 
+    // for error code message
+    std::string error_message, error_hint;
+
     // Create a vector of POA groups based on windows
     std::vector<Group> poa_groups(windows.size());
     for (int32_t i = 0; i < get_size(windows); ++i)
@@ -289,7 +305,10 @@ int main(int argc, char* argv[])
 
             if (status != StatusType::exceeded_maximum_poas && status != StatusType::success)
             {
-                std::cerr << "Could not add POA group " << batch_group_ids[i] << " to batch " << b << ". Error code " << status << std::endl;
+                decode_error(status, error_message, error_hint);
+                std::cerr << "Could not add POA group " << batch_group_ids[i] << " to batch " << b << std::endl;
+                std::cerr << error_message << std::endl
+                          << error_hint << std::endl;
                 i++;
             }
         }

From cb4142215648cdfc813e0aa30a2992d1b80335ec Mon Sep 17 00:00:00 2001
From: ramin <rmafi@nvidia.com>
Date: Wed, 2 Dec 2020 17:09:47 -0500
Subject: [PATCH 05/11] [cudapoa] minor fix: drop "Kernel Error:" prefix from
 unknown-error messages

---
 cudapoa/src/cudapoa.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cudapoa/src/cudapoa.cpp b/cudapoa/src/cudapoa.cpp
index d239c1235..d970b8789 100644
--- a/cudapoa/src/cudapoa.cpp
+++ b/cudapoa/src/cudapoa.cpp
@@ -75,11 +75,11 @@ void decode_error(StatusType error_type, std::string& error_message, std::string
         error_hint    = "Suggestion  : Check MSA/Consensus selection for output type";
         break;
     case StatusType::generic_error:
-        error_message = "Kernel Error: Unknown error";
+        error_message = "Unknown error";
         error_hint    = "";
         break;
     default:
-        error_message = "Kernel Error: Unknown error";
+        error_message = "Unknown error";
         error_hint    = "";
         break;
     }

From 9f5ed3d305e31d7b1123a355329b8bd3d62a14bd Mon Sep 17 00:00:00 2001
From: Joyjit Daw <jdaw@nvidia.com>
Date: Thu, 3 Dec 2020 09:35:35 -0500
Subject: [PATCH 06/11] [cudapoa] throw error for unknown error type

---
 cudapoa/src/cudapoa.cpp | 42 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/cudapoa/src/cudapoa.cpp b/cudapoa/src/cudapoa.cpp
index d970b8789..c16b08d73 100644
--- a/cudapoa/src/cudapoa.cpp
+++ b/cudapoa/src/cudapoa.cpp
@@ -39,49 +39,47 @@ void decode_error(StatusType error_type, std::string& error_message, std::string
     switch (error_type)
     {
     case StatusType::exceeded_maximum_poas:
-        error_message = "Kernel Error: Number of groups per batch exceeded maximum POAs";
-        error_hint    = "Suggestion  : Evaluate maximum number of groups per batch using BatchBlock::estimate_max_poas()";
+        error_message = "Kernel Error: Number of groups per batch exceeded maximum POAs.";
+        error_hint    = "Suggestion  : Evaluate maximum number of groups per batch using BatchBlock::estimate_max_poas().";
         break;
     case StatusType::exceeded_maximum_sequence_size:
-        error_message = "Kernel Error: Input read length or output consensus/MSA sequence length exceeded max sequence size";
-        error_hint    = "Suggestion  : Check BatchConfig::max_sequence_size and BatchConfig::max_consensus_size, increase if necessary";
+        error_message = "Kernel Error: Input read length or output consensus/MSA sequence length exceeded max sequence size.";
+        error_hint    = "Suggestion  : Check BatchConfig::max_sequence_size and BatchConfig::max_consensus_size, increase if necessary.";
         break;
     case StatusType::exceeded_maximum_sequences_per_poa:
-        error_message = "Kernel Error: Exceeded maximum number of reads per POA";
-        error_hint    = "Suggestion  : Check BatchConfig::max_sequences_per_poa and increase if necessary";
+        error_message = "Kernel Error: Exceeded maximum number of reads per POA.";
+        error_hint    = "Suggestion  : Check BatchConfig::max_sequences_per_poa and increase if necessary.";
         break;
     case StatusType::node_count_exceeded_maximum_graph_size:
-        error_message = "Kernel Error: Node count exceeded maximum nodes per POA graph";
-        error_hint    = "Suggestion  : Check BatchConfig::max_nodes_per_graph and increase if necessary";
+        error_message = "Kernel Error: Node count exceeded maximum nodes per POA graph.";
+        error_hint    = "Suggestion  : Check BatchConfig::max_nodes_per_graph and increase if necessary.";
         break;
     case StatusType::edge_count_exceeded_maximum_graph_size:
-        error_message = "Kernel Error: Edge count exceeded maximum edges per graph";
-        error_hint    = "Suggestion  : Check default value of CUDAPOA_MAX_NODE_EDGES, note that increasing this macro would increase memory usage per POA";
+        error_message = "Kernel Error: Edge count exceeded maximum edges per graph.";
+        error_hint    = "Suggestion  : Check default value of CUDAPOA_MAX_NODE_EDGES, note that increasing this macro would increase memory usage per POA.";
         break;
     case StatusType::exceeded_adaptive_banded_matrix_size:
-        error_message = "Kernel Error: Allocated buffer for score/traceback matrix in adaptive banding is not large enough";
-        error_hint    = "Suggestion  : Check BatchConfig::matrix_sequence_dimension and increase if necessary";
+        error_message = "Kernel Error: Allocated buffer for score/traceback matrix in adaptive banding is not large enough.";
+        error_hint    = "Suggestion  : Check BatchConfig::matrix_sequence_dimension and increase if necessary.";
         break;
     case StatusType::exceeded_maximum_predecessor_distance:
-        error_message = "Kernel Error: Set value for maximum predecessor distance in Needleman-Wunsch algorithm with traceback buffer is not large enough";
-        error_hint    = "Suggestion  : Check BatchConfig::max_banded_pred_distance and increase if necessary";
+        error_message = "Kernel Error: Set value for maximum predecessor distance in Needleman-Wunsch algorithm with traceback buffer is not large enough.";
+        error_hint    = "Suggestion  : Check BatchConfig::max_banded_pred_distance and increase if necessary.";
         break;
     case StatusType::loop_count_exceeded_upper_bound:
-        error_message = "Kernel Error: Traceback in Needleman-Wunsch algorithm failed";
-        error_hint    = "Suggestion  : You may retry with a different banding mode";
+        error_message = "Kernel Error: Traceback in Needleman-Wunsch algorithm failed.";
+        error_hint    = "Suggestion  : You may retry with a different banding mode.";
         break;
     case StatusType::output_type_unavailable:
-        error_message = "Kernel Error: Output type not available";
-        error_hint    = "Suggestion  : Check MSA/Consensus selection for output type";
+        error_message = "Kernel Error: Output type not available.";
+        error_hint    = "Suggestion  : Check MSA/Consensus selection for output type.";
         break;
     case StatusType::generic_error:
-        error_message = "Unknown error";
+        error_message = "Unknown error.";
         error_hint    = "";
         break;
     default:
-        error_message = "Unknown error";
-        error_hint    = "";
-        break;
+        throw std::runtime_error("Unknown error type detected.");
     }
 }
 

From f1ac5937ed5e021ed5c5bece523bd86270d04929 Mon Sep 17 00:00:00 2001
From: Joyjit Daw <jdaw@nvidia.com>
Date: Thu, 3 Dec 2020 14:05:22 -0500
Subject: [PATCH 07/11] [docs] refactor docs to clean up repo look

---
 README.md                   | 77 ++-----------------------------------
 cudaaligner/CMakeLists.txt  |  1 +
 cudaaligner/README.md       | 13 +++++++
 cudaextender/CMakeLists.txt |  1 +
 cudaextender/README.md      |  8 ++--
 cudamapper/CMakeLists.txt   |  1 +
 cudamapper/README.md        | 30 +++++++++++++++
 cudapoa/CMakeLists.txt      |  1 +
 cudapoa/README.md           | 21 ++++++++++
 9 files changed, 76 insertions(+), 77 deletions(-)
 create mode 100644 cudaaligner/README.md
 create mode 100644 cudamapper/README.md
 create mode 100644 cudapoa/README.md

diff --git a/README.md b/README.md
index fb8b027ba..f6a59b8d3 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,10 @@ GenomeWorks is a GPU-accelerated library for biological sequence analysis. This
 For more detailed API documentation please refer to the [documentation](#enable-doc-generation).
 
 * Modules
-    * [cudamapper](#cudamapper) - CUDA-accelerated sequence to sequence mapping
-    * [cudapoa](#cudapoa) - CUDA-accelerated partial order alignment
-    * [cudaaligner](#cudaaligner) - CUDA-accelerated pairwise sequence alignment
-    * [cudaextender](#cudaextender) - CUDA-accelerated seed extension
+    * [cudamapper](cudamapper/README.md) - CUDA-accelerated sequence to sequence mapping
+    * [cudapoa](cudapoa/README.md) - CUDA-accelerated partial order alignment
+    * [cudaaligner](cudaaligner/README.md) - CUDA-accelerated pairwise sequence alignment
+    * [cudaextender](cudaextender/README.md) - CUDA-accelerated seed extension
 * Setup GenomeWorks
     * [Clone GenomeWorks](#clone-genomeworks)
     * [System Requirements](#system-requirements)
@@ -17,75 +17,6 @@ For more detailed API documentation please refer to the [documentation](#enable-
 * [Python API](#genomeworks-python-api)
 * [Development Support](#development-support)
 
-### cudamapper
-
-The `cudamapper` package provides minimizer-based GPU-accelerated approximate mapping.
-
-#### Tool - *cudamapper*
-
-`cudamapper` is an end-to-end command line to for sequence to sequence mapping. `cudamapper` outputs
-mappings in the PAF format and is currently optimised for all-vs-all long read (ONT, Pacific Biosciences) sequences.
-
-To run all-vs all overlaps use the following command:
-
-`cudamapper in.fasta in.fasta`
-
-A query fasta can be mapped to a reference as follows:
-
-`cudamapper query.fasta target.fasta`
-
-To access more information about running cudamapper, run `cudamapper --help`.
-
-#### Library - *libcudamapper.so*
-
-* `Indexer` module to generate an index of minimizers from a list of sequences.
-* `Matcher` module to find locations of matching pairs of minimizers between sequences using minimizer indices.
-* `Overlapper` module to generate overlaps from sequence of minimizer matches generated by matcher.
-
-#### Sample - *sample_cudamapper*
-
-A prototypical binary highlighting the usage of `libcudamapper.so` APIs (indexer, matcher and overlapper) and
-techniques to tie them into an application.
-
-### cudapoa
-
-The `cudapoa` package provides a GPU-accelerated implementation of the [Partial Order Alignment](https://simpsonlab.github.io/2015/05/01/understanding-poa/)
-algorithm. It is heavily influenced by [SPOA](https://github.com/rvaser/spoa) and in many cases can be considered a GPU-accelerated replacement. Features include:
-
-#### Tool - *cudapoa*
-
-A command line tool for generating consensus and MSA from a list of `fasta`/`fastq` files. The tool
-is built on top of `libcudapoa.so` and showcases optimization strategies for writing high performance
-applications with `libcudapoa.so`.
-
-#### Library - *libcudapoa.so*
-
-* Generation of consensus sequences
-* Generation of multi-sequence alignments (MSAs)
-* Custom adaptive band implementation of POA
-* Support for long and short read sequences
-
-#### Sample - *sample_cudapoa*
-
-A prototypical binary to showcase the use of `libcudapoa.so` APIs.
-
-### cudaaligner
-
-The `cudaaligner` package provides GPU-accelerated global alignment. Features include:
-
-#### Library - *libcudaaligner.so*
-
-* Short and long read support
-* Banded implementation with configurable band width for flexible performance and accuracy trade-off
-
-#### Sample - *sample_cudaaligner*
-
-A prototypical binary to showcase the use of `libcudaaligner.so` APIs.
-
-### cudaextender
-The `cudaextender` package provides GPU-accelerated seed-extension. Details can be found in
-the package's readme.
-
 ## Clone GenomeWorks 
 
 ### Latest released version
diff --git a/cudaaligner/CMakeLists.txt b/cudaaligner/CMakeLists.txt
index d2a2f5712..3e75d8e7f 100644
--- a/cudaaligner/CMakeLists.txt
+++ b/cudaaligner/CMakeLists.txt
@@ -65,6 +65,7 @@ target_include_directories(${MODULE_NAME}
 target_compile_options(${MODULE_NAME} PRIVATE -Werror)
 
 add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/include)
+add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/README.md)
 
 # Add tests folder
 add_subdirectory(tests)
diff --git a/cudaaligner/README.md b/cudaaligner/README.md
new file mode 100644
index 000000000..8ccdec9ad
--- /dev/null
+++ b/cudaaligner/README.md
@@ -0,0 +1,13 @@
+# cudaaligner
+
+The `cudaaligner` package provides GPU-accelerated global alignment. Features include:
+
+## Library - libcudaaligner.so
+
+* Short and long read support
+* Banded implementation with configurable band width for flexible performance and accuracy trade-off
+
+## Sample - sample_cudaaligner
+
+A prototypical binary to showcase the use of `libcudaaligner.so` APIs.
+
diff --git a/cudaextender/CMakeLists.txt b/cudaextender/CMakeLists.txt
index adf51a0ff..8c00a1147 100644
--- a/cudaextender/CMakeLists.txt
+++ b/cudaextender/CMakeLists.txt
@@ -59,6 +59,7 @@ target_include_directories(${MODULE_NAME}
         )
 
 add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/include)
+add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/README.md)
 
 install(TARGETS ${MODULE_NAME}
         COMPONENT gwlogging
diff --git a/cudaextender/README.md b/cudaextender/README.md
index 078eb5376..5c049e025 100644
--- a/cudaextender/README.md
+++ b/cudaextender/README.md
@@ -7,21 +7,21 @@ Currently this module implements the ungapped X-drop algorithm, adapted from
 [SegAlign's](https://github.com/gsneha26/SegAlign) Ungapped Extender authored by 
 Sneha Goenka (gsneha@stanford.edu) and Yatish Turakhia (yturakhi@uscs.edu).
 
-### Encoded Input
+## Encoded Input
 `cudaextender` expects the input strands to be encoded as integer sequences. 
 This encoding scheme is documented here: [utils.hpp](include/claraparabricks/genomeworks/cudaextender/utils.hpp)
 file. The provided `encode_sequence()` helper function will encode the input strands on CPU with
 the expected scheme. 
 
-### API
+## API
 `cudaextender` provides host and device pointer APIs to enable ease of integration with other
 producer/consumer modules. The user is expected to handle all memory transactions and device
 sychronizations for the device pointer API. The host pointer API abstracts those operations away.
 Both APIs are documented here: [extender.hpp](include/claraparabricks/genomeworks/cudaextender/extender.hpp)
 
-### Library - *libcudaextender.so*
+## Library - libcudaextender.so
 Features:
 * Ungapped X-Drop extension
 
-### Sample - *[sample_cudaextender.cpp](samples/sample_cudaextender.cpp)*
+## Sample - sample_cudaextender
 Protoype to show the usage of host and device pointer APIs on FASTA sequences.
diff --git a/cudamapper/CMakeLists.txt b/cudamapper/CMakeLists.txt
index 7846d38ce..22868ecd3 100644
--- a/cudamapper/CMakeLists.txt
+++ b/cudamapper/CMakeLists.txt
@@ -83,6 +83,7 @@ if (gw_optimize_for_native_cpu)
 endif()
 
 add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/include)
+add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/README.md)
 
 cuda_add_executable(${MODULE_NAME}-bin
         src/main.cu
diff --git a/cudamapper/README.md b/cudamapper/README.md
new file mode 100644
index 000000000..19b6fd68a
--- /dev/null
+++ b/cudamapper/README.md
@@ -0,0 +1,30 @@
+# cudamapper
+
+The `cudamapper` package provides minimizer-based GPU-accelerated approximate mapping.
+
+## Tool - cudamapper
+
+`cudamapper` is an end-to-end command line to for sequence to sequence mapping. `cudamapper` outputs
+mappings in the PAF format and is currently optimised for all-vs-all long read (ONT, Pacific Biosciences) sequences.
+
+To run all-vs-all overlaps, use the following command:
+
+`cudamapper in.fasta in.fasta`
+
+A query fasta can be mapped to a reference as follows:
+
+`cudamapper query.fasta target.fasta`
+
+To access more information about running cudamapper, run `cudamapper --help`.
+
+## Library - libcudamapper.so
+
+* `Indexer` module to generate an index of minimizers from a list of sequences.
+* `Matcher` module to find locations of matching pairs of minimizers between sequences using minimizer indices.
+* `Overlapper` module to generate overlaps from sequence of minimizer matches generated by matcher.
+
+## Sample - sample_cudamapper
+
+A prototypical binary highlighting the usage of `libcudamapper.so` APIs (indexer, matcher and overlapper) and
+techniques to tie them into an application.
+
diff --git a/cudapoa/CMakeLists.txt b/cudapoa/CMakeLists.txt
index 53505f523..4ef5331a5 100644
--- a/cudapoa/CMakeLists.txt
+++ b/cudapoa/CMakeLists.txt
@@ -74,6 +74,7 @@ target_include_directories(${MODULE_NAME}
 )
 
 add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/include)
+add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/README.md)
 
 add_executable(${MODULE_NAME}-bin
         src/main.cpp
diff --git a/cudapoa/README.md b/cudapoa/README.md
new file mode 100644
index 000000000..d7b2e9ca6
--- /dev/null
+++ b/cudapoa/README.md
@@ -0,0 +1,21 @@
+# CUDAPOA
+
+The `cudapoa` package provides a GPU-accelerated implementation of the [Partial Order Alignment](https://simpsonlab.github.io/2015/05/01/understanding-poa/)
+algorithm. It is heavily influenced by [SPOA](https://github.com/rvaser/spoa) and in many cases can be considered a GPU-accelerated replacement. Features include:
+
+## Tool - cudapoa
+
+A command line tool for generating consensus and MSA from a list of `fasta`/`fastq` files. The tool
+is built on top of `libcudapoa.so` and showcases optimization strategies for writing high performance
+applications with `libcudapoa.so`.
+
+## Library - libcudapoa.so
+
+* Generation of consensus sequences
+* Generation of multi-sequence alignments (MSAs)
+* Custom adaptive band implementation of POA
+* Support for long and short read sequences
+
+## Sample - sample_cudapoa
+
+A prototypical binary to showcase the use of `libcudapoa.so` APIs.

From 04f784550402151439ad08847491c76a6e4036ec Mon Sep 17 00:00:00 2001
From: Joyjit Daw <jdaw@nvidia.com>
Date: Thu, 3 Dec 2020 14:15:13 -0500
Subject: [PATCH 08/11] [docs] added links to include folder and sample

---
 cudaaligner/README.md  |  8 +++++---
 cudaextender/README.md |  9 +++++----
 cudamapper/README.md   | 10 ++++++----
 cudapoa/README.md      | 10 ++++++----
 4 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/cudaaligner/README.md b/cudaaligner/README.md
index 8ccdec9ad..d642d6ad5 100644
--- a/cudaaligner/README.md
+++ b/cudaaligner/README.md
@@ -2,12 +2,14 @@
 
 The `cudaaligner` package provides GPU-accelerated global alignment. Features include:
 
-## Library - libcudaaligner.so
+## Library
+Built as `libcudaaligner.[so|a]`.
 
 * Short and long read support
 * Banded implementation with configurable band width for flexible performance and accuracy trade-off
 
-## Sample - sample_cudaaligner
+APIs documented in [include](include/claraparabricks/genomeworks/cudaaligner) folder.
 
-A prototypical binary to showcase the use of `libcudaaligner.so` APIs.
+## Sample
+[sample_cudaaligner](samples/sample_cudaaligner.cpp) - A prototypical binary to showcase the use of `libcudaaligner.so` APIs.
 
diff --git a/cudaextender/README.md b/cudaextender/README.md
index 5c049e025..9e718df4d 100644
--- a/cudaextender/README.md
+++ b/cudaextender/README.md
@@ -19,9 +19,10 @@ producer/consumer modules. The user is expected to handle all memory transaction
 sychronizations for the device pointer API. The host pointer API abstracts those operations away.
 Both APIs are documented here: [extender.hpp](include/claraparabricks/genomeworks/cudaextender/extender.hpp)
 
-## Library - libcudaextender.so
-Features:
+## Library
+Built as `libcudaextender.[so|a]`
+
 * Ungapped X-Drop extension
 
-## Sample - sample_cudaextender
-Protoype to show the usage of host and device pointer APIs on FASTA sequences.
+## Sample
+[sample_cudaextender](samples/sample_cudaextender.cpp) - Protoype to show the usage of host and device pointer APIs on FASTA sequences.
diff --git a/cudamapper/README.md b/cudamapper/README.md
index 19b6fd68a..5b2cdbd8b 100644
--- a/cudamapper/README.md
+++ b/cudamapper/README.md
@@ -2,7 +2,7 @@
 
 The `cudamapper` package provides minimizer-based GPU-accelerated approximate mapping.
 
-## Tool - cudamapper
+## Tool
 
 `cudamapper` is an end-to-end command line to for sequence to sequence mapping. `cudamapper` outputs
 mappings in the PAF format and is currently optimised for all-vs-all long read (ONT, Pacific Biosciences) sequences.
@@ -17,14 +17,16 @@ A query fasta can be mapped to a reference as follows:
 
 To access more information about running cudamapper, run `cudamapper --help`.
 
-## Library - libcudamapper.so
+## Library
+Built as `libcudamapper.[so|a]`
 
 * `Indexer` module to generate an index of minimizers from a list of sequences.
 * `Matcher` module to find locations of matching pairs of minimizers between sequences using minimizer indices.
 * `Overlapper` module to generate overlaps from sequence of minimizer matches generated by matcher.
 
-## Sample - sample_cudamapper
+APIs documented in [include](include/claraparabricks/genomeworks/cudamapper) folder.
 
-A prototypical binary highlighting the usage of `libcudamapper.so` APIs (indexer, matcher and overlapper) and
+## Sample
+[sample_cudamapper](samples/sample_cudamapper.cpp) - A prototypical binary highlighting the usage of `libcudamapper.so` APIs (indexer, matcher and overlapper) and
 techniques to tie them into an application.
 
diff --git a/cudapoa/README.md b/cudapoa/README.md
index d7b2e9ca6..e6aafb6c0 100644
--- a/cudapoa/README.md
+++ b/cudapoa/README.md
@@ -3,19 +3,21 @@
 The `cudapoa` package provides a GPU-accelerated implementation of the [Partial Order Alignment](https://simpsonlab.github.io/2015/05/01/understanding-poa/)
 algorithm. It is heavily influenced by [SPOA](https://github.com/rvaser/spoa) and in many cases can be considered a GPU-accelerated replacement. Features include:
 
-## Tool - cudapoa
+## Tool
 
 A command line tool for generating consensus and MSA from a list of `fasta`/`fastq` files. The tool
 is built on top of `libcudapoa.so` and showcases optimization strategies for writing high performance
 applications with `libcudapoa.so`.
 
-## Library - libcudapoa.so
+## Library
+Built as `libcudapoa.[so|a]`
 
 * Generation of consensus sequences
 * Generation of multi-sequence alignments (MSAs)
 * Custom adaptive band implementation of POA
 * Support for long and short read sequences
 
-## Sample - sample_cudapoa
+APIs documented in [include](include/claraparabricks/genomeworks/cudapoa) folder.
 
-A prototypical binary to showcase the use of `libcudapoa.so` APIs.
+## Sample
+[sample_cudapoa](samples/sample_cudapoa.cpp) - A prototypical binary to showcase the use of `libcudapoa.so` APIs.

From 714a2dd6338c272a5339b6b0c6c17f356f3f0c65 Mon Sep 17 00:00:00 2001
From: Joyjit Daw <jdaw@nvidia.com>
Date: Thu, 3 Dec 2020 16:31:25 -0500
Subject: [PATCH 09/11] [docs] improve organization of docs for
 cudapoa/cudaaligner

---
 cudaaligner/README.md  |  4 ++--
 cudaextender/README.md | 20 +++++++++-----------
 cudamapper/README.md   | 27 +++++++++++++--------------
 cudapoa/README.md      | 15 ++++++++-------
 4 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/cudaaligner/README.md b/cudaaligner/README.md
index d642d6ad5..270eb02fb 100644
--- a/cudaaligner/README.md
+++ b/cudaaligner/README.md
@@ -1,6 +1,6 @@
 # cudaaligner
 
-The `cudaaligner` package provides GPU-accelerated global alignment. Features include:
+The `cudaaligner` package provides GPU-accelerated global alignment.
 
 ## Library
 Built as `libcudaaligner.[so|a]`.
@@ -11,5 +11,5 @@ Built as `libcudaaligner.[so|a]`.
 APIs documented in [include](include/claraparabricks/genomeworks/cudaaligner) folder.
 
 ## Sample
-[sample_cudaaligner](samples/sample_cudaaligner.cpp) - A prototypical binary to showcase the use of `libcudaaligner.so` APIs.
+[sample_cudaaligner](samples/sample_cudaaligner.cpp) - A prototypical binary to showcase the use of `libcudaaligner` APIs.
 
diff --git a/cudaextender/README.md b/cudaextender/README.md
index 9e718df4d..6d03d61d2 100644
--- a/cudaextender/README.md
+++ b/cudaextender/README.md
@@ -1,28 +1,26 @@
 # cudaextender
 
-## Overview
 This package implements CUDA-accelerated seed-extension algorithms that use seed positions in 
 encoded input strands to extend and compute the alignment between the strands. 
 Currently this module implements the ungapped X-drop algorithm, adapted from 
 [SegAlign's](https://github.com/gsneha26/SegAlign) Ungapped Extender authored by 
 Sneha Goenka (gsneha@stanford.edu) and Yatish Turakhia (yturakhi@uscs.edu).
 
-## Encoded Input
-`cudaextender` expects the input strands to be encoded as integer sequences. 
-This encoding scheme is documented here: [utils.hpp](include/claraparabricks/genomeworks/cudaextender/utils.hpp)
-file. The provided `encode_sequence()` helper function will encode the input strands on CPU with
-the expected scheme. 
+## Library
+Built as `libcudaextender.[so|a]`
+
+* Ungapped X-Drop extension
 
-## API
 `cudaextender` provides host and device pointer APIs to enable ease of integration with other
 producer/consumer modules. The user is expected to handle all memory transactions and device
 sychronizations for the device pointer API. The host pointer API abstracts those operations away.
 Both APIs are documented here: [extender.hpp](include/claraparabricks/genomeworks/cudaextender/extender.hpp)
 
-## Library
-Built as `libcudaextender.[so|a]`
-
-* Ungapped X-Drop extension
+### Encoded Input
+`cudaextender` expects the input strands to be encoded as integer sequences. 
+This encoding scheme is documented in the [utils.hpp](include/claraparabricks/genomeworks/cudaextender/utils.hpp)
+file. The provided `encode_sequence()` helper function encodes the input strands on the CPU with
+the expected scheme. 
 
 ## Sample
 [sample_cudaextender](samples/sample_cudaextender.cpp) - Protoype to show the usage of host and device pointer APIs on FASTA sequences.
diff --git a/cudamapper/README.md b/cudamapper/README.md
index 5b2cdbd8b..85fe8be9c 100644
--- a/cudamapper/README.md
+++ b/cudamapper/README.md
@@ -2,6 +2,19 @@
 
 The `cudamapper` package provides minimizer-based GPU-accelerated approximate mapping.
 
+## Library
+Built as `libcudamapper.[so|a]`
+
+* `Indexer` module to generate an index of minimizers from a list of sequences.
+* `Matcher` module to find locations of matching pairs of minimizers between sequences using minimizer indices.
+* `Overlapper` module to generate overlaps from sequence of minimizer matches generated by matcher.
+
+APIs documented in [include](include/claraparabricks/genomeworks/cudamapper) folder.
+
+## Sample
+[sample_cudamapper](samples/sample_cudamapper.cpp) - A prototypical binary highlighting the usage of `libcudamapper` APIs (indexer, matcher and overlapper) and
+techniques to tie them into an application.
+
 ## Tool
 
 `cudamapper` is an end-to-end command line to for sequence to sequence mapping. `cudamapper` outputs
@@ -16,17 +29,3 @@ A query fasta can be mapped to a reference as follows:
 `cudamapper query.fasta target.fasta`
 
 To access more information about running cudamapper, run `cudamapper --help`.
-
-## Library
-Built as `libcudamapper.[so|a]`
-
-* `Indexer` module to generate an index of minimizers from a list of sequences.
-* `Matcher` module to find locations of matching pairs of minimizers between sequences using minimizer indices.
-* `Overlapper` module to generate overlaps from sequence of minimizer matches generated by matcher.
-
-APIs documented in [include](include/claraparabricks/genomeworks/cudamapper) folder.
-
-## Sample
-[sample_cudamapper](samples/sample_cudamapper.cpp) - A prototypical binary highlighting the usage of `libcudamapper.so` APIs (indexer, matcher and overlapper) and
-techniques to tie them into an application.
-
diff --git a/cudapoa/README.md b/cudapoa/README.md
index e6aafb6c0..ffe8b86c0 100644
--- a/cudapoa/README.md
+++ b/cudapoa/README.md
@@ -3,12 +3,6 @@
 The `cudapoa` package provides a GPU-accelerated implementation of the [Partial Order Alignment](https://simpsonlab.github.io/2015/05/01/understanding-poa/)
 algorithm. It is heavily influenced by [SPOA](https://github.com/rvaser/spoa) and in many cases can be considered a GPU-accelerated replacement. Features include:
 
-## Tool
-
-A command line tool for generating consensus and MSA from a list of `fasta`/`fastq` files. The tool
-is built on top of `libcudapoa.so` and showcases optimization strategies for writing high performance
-applications with `libcudapoa.so`.
-
 ## Library
 Built as `libcudapoa.[so|a]`
 
@@ -20,4 +14,11 @@ Built as `libcudapoa.[so|a]`
 APIs documented in [include](include/claraparabricks/genomeworks/cudapoa) folder.
 
 ## Sample
-[sample_cudapoa](samples/sample_cudapoa.cpp) - A prototypical binary to showcase the use of `libcudapoa.so` APIs.
+[sample_cudapoa](samples/sample_cudapoa.cpp) - A prototypical binary to showcase the use of `libcudapoa` APIs.
+
+## Tool
+
+A command line tool for generating consensus and MSA from a list of `fasta`/`fastq` files. The tool
+is built on top of `libcudapoa` and showcases optimization strategies for writing high performance
+applications with `libcudapoa`.
+

From 301c6418296a975a7c81cffb077bf9f285b38219 Mon Sep 17 00:00:00 2001
From: Joyjit Daw <jdaw@nvidia.com>
Date: Thu, 3 Dec 2020 17:06:28 -0500
Subject: [PATCH 10/11] [docs] update cudapoa string

---
 cudapoa/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cudapoa/README.md b/cudapoa/README.md
index ffe8b86c0..7b42694eb 100644
--- a/cudapoa/README.md
+++ b/cudapoa/README.md
@@ -1,7 +1,7 @@
 # CUDAPOA
 
 The `cudapoa` package provides a GPU-accelerated implementation of the [Partial Order Alignment](https://simpsonlab.github.io/2015/05/01/understanding-poa/)
-algorithm. It is heavily influenced by [SPOA](https://github.com/rvaser/spoa) and in many cases can be considered a GPU-accelerated replacement. Features include:
+algorithm. It is heavily influenced by [SPOA](https://github.com/rvaser/spoa) and in many cases can be considered a GPU-accelerated replacement.
 
 ## Library
 Built as `libcudapoa.[so|a]`

From 04185fc49f35784d4c9d0b76b0dfd5e8d64b18b7 Mon Sep 17 00:00:00 2001
From: Joyjit Daw <jdaw@nvidia.com>
Date: Thu, 3 Dec 2020 18:50:04 -0500
Subject: [PATCH 11/11] [cudapoa] add missing include in src file

---
 cudapoa/src/cudapoa.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cudapoa/src/cudapoa.cpp b/cudapoa/src/cudapoa.cpp
index c16b08d73..1069250c3 100644
--- a/cudapoa/src/cudapoa.cpp
+++ b/cudapoa/src/cudapoa.cpp
@@ -17,6 +17,8 @@
 #include <claraparabricks/genomeworks/cudapoa/cudapoa.hpp>
 #include <claraparabricks/genomeworks/logging/logging.hpp>
 
+#include <stdexcept>
+
 namespace claraparabricks
 {