Skip to content

Commit

Permalink
Merge branch 'dev-v0.6.0' into poa-gfa
Browse files Browse the repository at this point in the history
  • Loading branch information
edawson authored Sep 1, 2020
2 parents f8126ef + e6be3b0 commit 18210af
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 93 deletions.
22 changes: 10 additions & 12 deletions cudamapper/src/index_gpu.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ void IndexGPU<SketchElementImpl>::generate_index(const io::FastaParser& parser,

std::uint64_t total_basepairs = 0;
std::vector<ArrayBlock> read_id_to_basepairs_section_h;
std::vector<io::FastaSequence> fasta_reads;
std::vector<read_id_t> local_to_global_read_id;

number_of_basepairs_in_longest_read_ = 0;

Expand All @@ -727,21 +727,21 @@ void IndexGPU<SketchElementImpl>::generate_index(const io::FastaParser& parser,
GW_NVTX_RANGE(profiler, "IndexGPU::generate_index::count_basepairs");
for (read_id_t read_id = first_read_id; read_id < past_the_last_read_id; ++read_id)
{
fasta_reads.emplace_back(parser.get_sequence_by_id(read_id));
const std::string& read_basepairs = fasta_reads.back().seq;
const std::string& read_name = fasta_reads.back().name;
const io::FastaSequence& sequence = parser.get_sequence_by_id(read_id);
const std::string& read_basepairs = sequence.seq;
if (read_basepairs.length() >= window_size_ + kmer_size_ - 1)
{
// TODO: make sure that no read is longer than what fits into position_in_read_t
read_id_to_basepairs_section_h.emplace_back(ArrayBlock{total_basepairs, static_cast<std::uint32_t>(read_basepairs.length())});
local_to_global_read_id.push_back(read_id);
read_id_to_basepairs_section_h.emplace_back(ArrayBlock{total_basepairs, get_size<std::uint32_t>(read_basepairs)});
total_basepairs += read_basepairs.length();
number_of_basepairs_in_longest_read_ = std::max(number_of_basepairs_in_longest_read_, static_cast<position_in_read_t>(read_basepairs.length()));
number_of_basepairs_in_longest_read_ = std::max(number_of_basepairs_in_longest_read_, get_size<position_in_read_t>(read_basepairs));
}
else
{
// TODO: Implement this skipping in a correct manner
GW_LOG_INFO("Skipping read {}. It has {} basepairs, one window covers {} basepairs",
read_name,
sequence.name,
read_basepairs.length(),
window_size_ + kmer_size_ - 1);
}
Expand All @@ -768,16 +768,14 @@ void IndexGPU<SketchElementImpl>::generate_index(const io::FastaParser& parser,
GW_NVTX_RANGE(profiler, "IndexGPU::generate_index::merge_basepairs");
// copy basepairs from each read into one big array
// read_id starts from first_read_id which can have an arbitrary value, local_read_id always starts from 0
for (read_id_t local_read_id = 0; local_read_id < number_of_reads_; ++local_read_id)
for (read_id_t local_read_id = 0; local_read_id < local_to_global_read_id.size(); ++local_read_id)
{
const std::string& read_basepairs = fasta_reads[local_read_id].seq;
const std::string& read_basepairs = parser.get_sequence_by_id(local_to_global_read_id[local_read_id]).seq;
std::copy(std::begin(read_basepairs),
std::end(read_basepairs),
std::next(std::begin(merged_basepairs_h), read_id_to_basepairs_section_h[local_read_id].first_element_));
}
}
fasta_reads.clear();
fasta_reads.shrink_to_fit();

// move basepairs to the device
device_buffer<decltype(read_id_to_basepairs_section_h)::value_type> read_id_to_basepairs_section_d(read_id_to_basepairs_section_h.size(), allocator_, cuda_stream_);
Expand All @@ -798,7 +796,7 @@ void IndexGPU<SketchElementImpl>::generate_index(const io::FastaParser& parser,

// sketch elements get generated here
auto sketch_elements = SketchElementImpl::generate_sketch_elements(allocator_,
number_of_reads_,
local_to_global_read_id.size(), // number of valid reads
kmer_size_,
window_size_,
first_read_id,
Expand Down
178 changes: 97 additions & 81 deletions cudamapper/tests/Test_CudamapperIndexGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1578,87 +1578,103 @@ TEST(TestCudamapperIndexGPU, CCCATACC_2_8)
expected_number_of_basepairs_in_longest_read);
}

// TODO: Cover this case as well
//TEST(TestCudamapperIndexGPU, CATCAAG_AAGCTA_3_5)
//{
// // *** One Read is shorter than one full window, the other is not ***
//
// // >read_0
// // CATCAAG
// // >read_1
// // AAGCTA
//
// // ** CATCAAG **
//
// // kmer representation: forward, reverse
// // CAT: 103 <032>
// // ATC: <031> 203
// // TCA: <310> 320
// // CAA: <100> 332
// // AAG: <002> 133
//
// // front end minimizers: representation, position_in_read, direction, read_id
// // CAT : 032 0 R 0
// // CATC : 031 1 F 0
// // CATCA : 031 1 F 0
// // CATCAA: 031 1 F 0
//
// // central minimizers
// // CATCAAG: 002 4 F 0
//
// // back end minimizers
// // ATCAAG: 002 4 F 0
// // TCAAG : 002 4 F 0
// // CAAG : 002 4 F 0
// // AAG : 002 4 F 0
//
// // ** AAGCTA **
// // ** read is shorter than one full window, so it is skipped **
//
// // All minimizers: ATG(0r0), ATC(1f0), AAG(4f0)
//
// // (2r1) means position 2, reverse direction, read 1
// // (1,2) means array block start at element 1 and has 2 elements
//
// // 0 1 2
// // data arrays: AAG(4f0), ATC(1f0), ATG(0r0)
//
// const std::string filename = std::string(CUDAMAPPER_BENCHMARK_DATA_DIR) + "/catcaag_aagcta.fasta";
// const std::uint64_t minimizer_size = 3;
// const std::uint64_t window_size = 5;
//
// std::vector<representation_t> expected_representations;
// std::vector<position_in_read_t> expected_positions_in_reads;
// std::vector<read_id_t> expected_read_ids;
// std::vector<SketchElement::DirectionOfRepresentation> expected_directions_of_reads;
// expected_representations.push_back(0b000010); // AAG(4f0)
// expected_positions_in_reads.push_back(4);
// expected_read_ids.push_back(0);
// expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::FORWARD);
// expected_representations.push_back(0b001101); // ATC(1f0)
// expected_positions_in_reads.push_back(1);
// expected_read_ids.push_back(0);
// expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::FORWARD);
// expected_representations.push_back(0b001110); // ATG(0r0)
// expected_positions_in_reads.push_back(0);
// expected_read_ids.push_back(0);
// expected_directions_of_reads.push_back(SketchElement::DirectionOfRepresentation::REVERSE);
//
// const read_id_t expected_number_of_reads = 1;
// const position_in_read_t expected_number_of_basepairs_in_longest_read = 7;
//
// test_function(filename,
// 0,
// 2,
// minimizer_size,
// window_size,
// expected_representations,
// expected_positions_in_reads,
// expected_read_ids,
// expected_directions_of_reads,
// expected_number_of_reads,
// expected_number_of_basepairs_in_longest_read); // <- only one read goes into index, the other is too short
//}
TEST(TestCudamapperIndexGPU, CATCAAG_AAGCTA_3_5)
{
    // Scenario: the input holds two reads, but only one of them is long enough
    // to cover a full window.
    //
    //   >read_0
    //   CATCAAG   (7 basepairs)
    //   >read_1
    //   AAGCTA    (6 basepairs)
    //
    // With minimizer_size = 3 and window_size = 5 one window covers
    // 5 + 3 - 1 = 7 basepairs, so read_1 contributes no sketch elements.
    //
    // --- read_0: CATCAAG ---
    //
    // Kmer representations (A=0, C=1, G=2, T=3); <..> marks the smaller of
    // the forward / reverse-complement value:
    //
    //   kmer   forward  reverse
    //   CAT      103     <032>
    //   ATC     <031>     203
    //   TCA     <310>     320
    //   CAA     <100>     332
    //   AAG     <002>     133
    //
    // Front end minimizers (representation, position_in_read, direction, read_id):
    //   CAT    : 032 0 R 0
    //   CATC   : 031 1 F 0
    //   CATCA  : 031 1 F 0
    //   CATCAA : 031 1 F 0
    //
    // Central minimizers:
    //   CATCAAG: 002 4 F 0
    //
    // Back end minimizers:
    //   ATCAAG : 002 4 F 0
    //   TCAAG  : 002 4 F 0
    //   CAAG   : 002 4 F 0
    //   AAG    : 002 4 F 0
    //
    // --- read_1: AAGCTA ---
    //   shorter than one window, nothing is generated for it
    //
    // Notation: (4f0) means position 4, forward direction, read 0.
    //
    // Distinct minimizers: ATG(0r0), ATC(1f0), AAG(4f0)
    //
    // Expected data arrays, ordered by representation:
    //   index:        0         1         2
    //   element:   AAG(4f0)  ATC(1f0)  ATG(0r0)

    const std::string fasta_path = std::string(CUDAMAPPER_BENCHMARK_DATA_DIR) + "/catcaag_aagcta.fasta";
    const std::uint64_t minimizer_size = 3;
    const std::uint64_t window_size    = 5;

    // One entry per expected sketch element:      AAG(4f0)  ATC(1f0)  ATG(0r0)
    std::vector<representation_t> expected_representations = {0b000010, 0b001101, 0b001110};
    std::vector<position_in_read_t> expected_positions_in_reads = {4, 1, 0};
    std::vector<read_id_t> expected_read_ids = {0, 0, 0};
    std::vector<SketchElement::DirectionOfRepresentation> expected_directions_of_reads = {
        SketchElement::DirectionOfRepresentation::FORWARD,
        SketchElement::DirectionOfRepresentation::FORWARD,
        SketchElement::DirectionOfRepresentation::REVERSE};

    // Every representation occurs exactly once, so the unique list mirrors the full list.
    std::vector<representation_t> expected_unique_representations = {0b000010, 0b001101, 0b001110};
    // Start index of each unique representation; the trailing 3 equals the total element count.
    std::vector<std::uint32_t> expected_first_occurrence_of_representations = {0, 1, 2, 3};

    const read_id_t expected_smallest_read_id                             = 0;
    const read_id_t expected_largest_read_id                              = 1;
    const read_id_t expected_number_of_reads                              = 2;
    const position_in_read_t expected_number_of_basepairs_in_longest_read = 7;

    test_function(fasta_path,
                  0, // first read id handed to the index
                  2, // past-the-last read id handed to the index
                  expected_smallest_read_id,
                  expected_largest_read_id,
                  minimizer_size,
                  window_size,
                  expected_representations,
                  expected_positions_in_reads,
                  expected_read_ids,
                  expected_directions_of_reads,
                  expected_unique_representations,
                  expected_first_occurrence_of_representations,
                  expected_number_of_reads,
                  expected_number_of_basepairs_in_longest_read);
}

TEST(TestCudamapperIndexGPU, CCCATACC_3_5)
{
Expand Down

0 comments on commit 18210af

Please sign in to comment.