Merge branch 'dev-v0.6.0' into poa-gfa

NVIDIA-Genomics-Research · Sep 1, 2020 · 18210af · 18210af
2 parents f8126ef + e6be3b0
commit 18210af
Show file tree

Hide file tree

Showing 2 changed files with 107 additions and 93 deletions.
diff --git a/cudamapper/src/index_gpu.cuh b/cudamapper/src/index_gpu.cuh
@@ -718,7 +718,7 @@ void IndexGPU<SketchElementImpl>::generate_index(const io::FastaParser& parser,
 
     std::uint64_t total_basepairs = 0;
     std::vector<ArrayBlock> read_id_to_basepairs_section_h;
-    std::vector<io::FastaSequence> fasta_reads;
+    std::vector<read_id_t> local_to_global_read_id;
 
     number_of_basepairs_in_longest_read_ = 0;
 
@@ -727,21 +727,21 @@ void IndexGPU<SketchElementImpl>::generate_index(const io::FastaParser& parser,
         GW_NVTX_RANGE(profiler, "IndexGPU::generate_index::count_basepairs");
         for (read_id_t read_id = first_read_id; read_id < past_the_last_read_id; ++read_id)
         {
-            fasta_reads.emplace_back(parser.get_sequence_by_id(read_id));
-            const std::string& read_basepairs = fasta_reads.back().seq;
-            const std::string& read_name      = fasta_reads.back().name;
+            const io::FastaSequence& sequence = parser.get_sequence_by_id(read_id);
+            const std::string& read_basepairs = sequence.seq;
             if (read_basepairs.length() >= window_size_ + kmer_size_ - 1)
             {
                 // TODO: make sure that no read is longer than what fits into position_in_read_t
-                read_id_to_basepairs_section_h.emplace_back(ArrayBlock{total_basepairs, static_cast<std::uint32_t>(read_basepairs.length())});
+                local_to_global_read_id.push_back(read_id);
+                read_id_to_basepairs_section_h.emplace_back(ArrayBlock{total_basepairs, get_size<std::uint32_t>(read_basepairs)});
                 total_basepairs += read_basepairs.length();
-                number_of_basepairs_in_longest_read_ = std::max(number_of_basepairs_in_longest_read_, static_cast<position_in_read_t>(read_basepairs.length()));
+                number_of_basepairs_in_longest_read_ = std::max(number_of_basepairs_in_longest_read_, get_size<position_in_read_t>(read_basepairs));
             }
             else
             {
                 // TODO: Implement this skipping in a correct manner
                 GW_LOG_INFO("Skipping read {}. It has {} basepairs, one window covers {} basepairs",
-                            read_name,
+                            sequence.name,
                             read_basepairs.length(),
                             window_size_ + kmer_size_ - 1);
             }
@@ -768,16 +768,14 @@ void IndexGPU<SketchElementImpl>::generate_index(const io::FastaParser& parser,
         GW_NVTX_RANGE(profiler, "IndexGPU::generate_index::merge_basepairs");
         // copy basepairs from each read into one big array
         // read_id starts from first_read_id which can have an arbitrary value, local_read_id always starts from 0
-        for (read_id_t local_read_id = 0; local_read_id < number_of_reads_; ++local_read_id)
+        for (read_id_t local_read_id = 0; local_read_id < local_to_global_read_id.size(); ++local_read_id)
         {
-            const std::string& read_basepairs = fasta_reads[local_read_id].seq;
+            const std::string& read_basepairs = parser.get_sequence_by_id(local_to_global_read_id[local_read_id]).seq;
             std::copy(std::begin(read_basepairs),
                       std::end(read_basepairs),
                       std::next(std::begin(merged_basepairs_h), read_id_to_basepairs_section_h[local_read_id].first_element_));
         }
     }
-    fasta_reads.clear();
-    fasta_reads.shrink_to_fit();
 
     // move basepairs to the device
     device_buffer<decltype(read_id_to_basepairs_section_h)::value_type> read_id_to_basepairs_section_d(read_id_to_basepairs_section_h.size(), allocator_, cuda_stream_);
@@ -798,7 +796,7 @@ void IndexGPU<SketchElementImpl>::generate_index(const io::FastaParser& parser,
 
     // sketch elements get generated here
     auto sketch_elements = SketchElementImpl::generate_sketch_elements(allocator_,
-                                                                       number_of_reads_,
+                                                                       local_to_global_read_id.size(), // number of valid reads
                                                                        kmer_size_,
                                                                        window_size_,
                                                                        first_read_id,

diff --git a/cudamapper/tests/Test_CudamapperIndexGPU.cu b/cudamapper/tests/Test_CudamapperIndexGPU.cu
@@ -1578,87 +1578,103 @@ TEST(TestCudamapperIndexGPU, CCCATACC_2_8)
                   expected_number_of_basepairs_in_longest_read);
 }
 
-// TODO: Cover this case as well
-//TEST(TestCudamapperIndexGPU, CATCAAG_AAGCTA_3_5)
-//{
-//    // *** One Read is shorter than one full window, the other is not ***
-//
-//    // >read_0
-//    // CATCAAG
-//    // >read_1
-//    // AAGCTA
-//
-//    // ** CATCAAG **
-//
-//    // kmer representation: forward, reverse
-//    // CAT:  103 <032>
-//    // ATC: <031> 203
-//    // TCA: <310> 320
-//    // CAA: <100> 332
-//    // AAG: <002> 133
-//
-//    // front end minimizers: representation, position_in_read, direction, read_id
-//    // CAT   : 032 0 R 0
-//    // CATC  : 031 1 F 0
-//    // CATCA : 031 1 F 0
-//    // CATCAA: 031 1 F 0
-//
-//    // central minimizers
-//    // CATCAAG: 002 4 F 0
-//
-//    // back end minimizers
-//    // ATCAAG: 002 4 F 0
-//    // TCAAG : 002 4 F 0
-//    // CAAG  : 002 4 F 0
-//    // AAG   : 002 4 F 0
-//
-//    // ** AAGCTA **
-//    // ** read does not fit one array **
-//
-//    // All minimizers: ATG(0r0), ATC(1f0), AAG(4f0)
-//
-//    // (2r1) means position 2, reverse direction, read 1
-//    // (1,2) means array block start at element 1 and has 2 elements
-//
-//    //              0         1         2
-//    // data arrays: AAG(4f0), ATC(1f0), ATG(0r0)
-//
-//    const std::string filename         = std::string(CUDAMAPPER_BENCHMARK_DATA_DIR) + "/catcaag_aagcta.fasta";
-//    const std::uint64_t minimizer_size = 3;
-//    const std::uint64_t window_size    = 5;
-//
-//    std::vector<representation_t> expected_representations;
-//    std::vector<position_in_read_t> expected_positions_in_reads;
-//    std::vector<read_id_t> expected_read_ids;
-//    std::vector<SketchElement::DirectionOfRepresentation> expected_directions_of_reads;
-//    expected_representations.push_back(0b000010); // AAG(4f0)
-//    expected_positions_in_reads.push_back(4);
-//    expected_read_ids.push_back(0);
-//    expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::FORWARD);
-//    expected_representations.push_back(0b001101); // ATC(1f0)
-//    expected_positions_in_reads.push_back(1);
-//    expected_read_ids.push_back(0);
-//    expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::FORWARD);
-//    expected_representations.push_back(0b001110); // ATG(0r0)
-//    expected_positions_in_reads.push_back(0);
-//    expected_read_ids.push_back(0);
-//    expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::REVERSE);
-//
-//    const read_id_t expected_number_of_reads                              = 1;
-//    const position_in_read_t expected_number_of_basepairs_in_longest_read = 7;
-//
-//    test_function(filename,
-//                  0,
-//                  2,
-//                  minimizer_size,
-//                  window_size,
-//                  expected_representations,
-//                  expected_positions_in_reads,
-//                  expected_read_ids,
-//                  expected_directions_of_reads,
-//                  expected_number_of_reads,
-//                  expected_number_of_basepairs_in_longest_read); // <- only one read goes into index, the other is too short
-//}
+TEST(TestCudamapperIndexGPU, CATCAAG_AAGCTA_3_5)
+{
+    // *** One read (read_1, 6 basepairs) is shorter than one full window, the other (read_0, 7 basepairs) is not ***
+
+    // >read_0
+    // CATCAAG
+    // >read_1
+    // AAGCTA
+
+    // ** CATCAAG **
+
+    // kmer representation: forward, reverse (<> marks the smaller of the two representations)
+    // CAT:  103 <032>
+    // ATC: <031> 203
+    // TCA: <310> 320
+    // CAA: <100> 332
+    // AAG: <002> 133
+
+    // front end minimizers: representation, position_in_read, direction, read_id
+    // CAT   : 032 0 R 0
+    // CATC  : 031 1 F 0
+    // CATCA : 031 1 F 0
+    // CATCAA: 031 1 F 0
+
+    // central minimizers
+    // CATCAAG: 002 4 F 0
+
+    // back end minimizers
+    // ATCAAG: 002 4 F 0
+    // TCAAG : 002 4 F 0
+    // CAAG  : 002 4 F 0
+    // AAG   : 002 4 F 0
+
+    // ** AAGCTA **
+    // ** read is shorter than one full window, so no minimizers are expected from it **
+
+    // All minimizers: ATG(0r0), ATC(1f0), AAG(4f0)
+
+    // (2r1) means position 2, reverse direction, read 1
+    // (1,2) means the array block starts at element 1 and has 2 elements
+
+    //              0         1         2
+    // data arrays: AAG(4f0), ATC(1f0), ATG(0r0)
+
+    const std::string filename         = std::string(CUDAMAPPER_BENCHMARK_DATA_DIR) + "/catcaag_aagcta.fasta";
+    const std::uint64_t minimizer_size = 3;
+    const std::uint64_t window_size    = 5;
+
+    std::vector<representation_t> expected_representations;
+    std::vector<position_in_read_t> expected_positions_in_reads;
+    std::vector<read_id_t> expected_read_ids;
+    std::vector<SketchElement::DirectionOfRepresentation> expected_directions_of_reads;
+    std::vector<representation_t> expected_unique_representations;
+    std::vector<std::uint32_t> expected_first_occurrence_of_representations;
+
+    expected_representations.push_back(0b000010); // AAG(4f0)
+    expected_positions_in_reads.push_back(4);
+    expected_read_ids.push_back(0);
+    expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::FORWARD);
+    expected_unique_representations.push_back(0b000010);
+    expected_first_occurrence_of_representations.push_back(0);
+    expected_representations.push_back(0b001101); // ATC(1f0)
+    expected_positions_in_reads.push_back(1);
+    expected_read_ids.push_back(0);
+    expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::FORWARD);
+    expected_unique_representations.push_back(0b001101);
+    expected_first_occurrence_of_representations.push_back(1);
+    expected_representations.push_back(0b001110); // ATG(0r0)
+    expected_positions_in_reads.push_back(0);
+    expected_read_ids.push_back(0);
+    expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::REVERSE);
+    expected_unique_representations.push_back(0b001110);
+    expected_first_occurrence_of_representations.push_back(2);
+
+    expected_first_occurrence_of_representations.push_back(3); // presumably the past-the-end index (3 sketch elements in total) - confirm against test_function
+
+    const read_id_t expected_number_of_reads                              = 2; // NOTE(review): equals the number of reads in the file, including the too-short read_1 - confirm this is intended
+    const read_id_t expected_smallest_read_id                             = 0;
+    const read_id_t expected_largest_read_id                              = 1;
+    const position_in_read_t expected_number_of_basepairs_in_longest_read = 7;
+
+    test_function(filename,
+                  0, // presumably the first read id to index - confirm against test_function
+                  2, // presumably the past-the-last read id to index - confirm against test_function
+                  expected_smallest_read_id,
+                  expected_largest_read_id,
+                  minimizer_size,
+                  window_size,
+                  expected_representations,
+                  expected_positions_in_reads,
+                  expected_read_ids,
+                  expected_directions_of_reads,
+                  expected_unique_representations,
+                  expected_first_occurrence_of_representations,
+                  expected_number_of_reads,
+                  expected_number_of_basepairs_in_longest_read);
+}
 
 TEST(TestCudamapperIndexGPU, CCCATACC_3_5)
 {