Skip to content

Commit

Permalink
Merge pull request #552 from r-mafi/cudapoa_optimization_v2
Browse files Browse the repository at this point in the history
[cudapoa] improving cudaPOA performance
  • Loading branch information
Joyjit Daw authored Sep 14, 2020
2 parents 30e8f80 + 7d14cbe commit 0e9a6f3
Show file tree
Hide file tree
Showing 25 changed files with 845 additions and 659 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ struct BatchConfig
BandMode band_mode;

/// constructor- set upper limit parameters based on max_seq_sz and band_width
BatchConfig(int32_t max_seq_sz = 1024, int32_t max_seq_per_poa = 100, int32_t band_width = 256, BandMode banding = BandMode::full_band);
BatchConfig(int32_t max_seq_sz = 1024, int32_t max_seq_per_poa = 100, int32_t band_width = 256, BandMode banding = BandMode::full_band, float adapive_storage_factor = 2.0);

/// constructor- set all parameters separately
BatchConfig(int32_t max_seq_sz, int32_t max_consensus_sz, int32_t max_nodes_per_w,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ void get_multi_batch_sizes(std::vector<BatchConfig>& list_of_batch_sizes,
bool msa_flag = false,
int32_t band_width = 256,
BandMode band_mode = BandMode::adaptive_band,
float adaptive_storage_factor = 2.0f,
std::vector<int32_t>* bins_capacity = nullptr,
float gpu_memory_usage_quota = 0.9,
int32_t mismatch_score = -6,
Expand Down
7 changes: 6 additions & 1 deletion cudapoa/samples/sample_cudapoa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,13 +286,18 @@ int main(int argc, char** argv)
if (status == StatusType::success)
{
// Check if all sequences in POA group wre added successfully.
int32_t num_dropped_seq = 0;
for (const auto& s : seq_status)
{
if (s == StatusType::exceeded_maximum_sequence_size)
{
std::cerr << "Dropping sequence because sequence exceeded maximum size" << std::endl;
num_dropped_seq++;
}
}
if (num_dropped_seq > 0)
{
std::cerr << "Dropping " << num_dropped_seq << " sequence(s) in POA group " << batch_group_ids[i] << " because it exceeded maximum size" << std::endl;
}
i++;
}

Expand Down
9 changes: 0 additions & 9 deletions cudapoa/src/allocate_block.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,17 +253,10 @@ class BatchBlock
offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->outgoing_edge_count) * max_nodes_per_window_ * max_poas_);
graph_details_d->incoming_edge_weights = reinterpret_cast<decltype(graph_details_d->incoming_edge_weights)>(&block_data_d_[offset_d_]);
offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->incoming_edge_weights) * max_nodes_per_window_ * CUDAPOA_MAX_NODE_EDGES * max_poas_);
graph_details_d->outgoing_edge_weights = reinterpret_cast<decltype(graph_details_d->outgoing_edge_weights)>(&block_data_d_[offset_d_]);
offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->outgoing_edge_weights) * max_nodes_per_window_ * CUDAPOA_MAX_NODE_EDGES * max_poas_);
graph_details_d->sorted_poa = reinterpret_cast<decltype(graph_details_d->sorted_poa)>(&block_data_d_[offset_d_]);
offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->sorted_poa) * max_nodes_per_window_ * max_poas_);
graph_details_d->sorted_poa_node_map = reinterpret_cast<decltype(graph_details_d->sorted_poa_node_map)>(&block_data_d_[offset_d_]);
offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->sorted_poa_node_map) * max_nodes_per_window_ * max_poas_);
if (variable_bands_)
{
graph_details_d->node_distance_to_head = reinterpret_cast<decltype(graph_details_d->node_distance_to_head)>(&block_data_d_[offset_d_]);
offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->node_distance_to_head) * max_nodes_per_window_ * max_poas_);
}
graph_details_d->sorted_poa_local_edge_count = reinterpret_cast<decltype(graph_details_d->sorted_poa_local_edge_count)>(&block_data_d_[offset_d_]);
offset_d_ += cudautils::align<int64_t, 8>(sizeof(*graph_details_d->sorted_poa_local_edge_count) * max_nodes_per_window_ * max_poas_);
if (output_mask_ & OutputType::consensus)
Expand Down Expand Up @@ -332,10 +325,8 @@ class BatchBlock
device_size_per_poa += sizeof(*GraphDetails<SizeT>::outgoing_edges) * max_nodes_per_graph * CUDAPOA_MAX_NODE_EDGES; // graph_details_d_->outgoing_edges
device_size_per_poa += sizeof(*GraphDetails<SizeT>::outgoing_edge_count) * max_nodes_per_graph; // graph_details_d_->outgoing_edge_count
device_size_per_poa += sizeof(*GraphDetails<SizeT>::incoming_edge_weights) * max_nodes_per_graph * CUDAPOA_MAX_NODE_EDGES; // graph_details_d_->incoming_edge_weights
device_size_per_poa += sizeof(*GraphDetails<SizeT>::outgoing_edge_weights) * max_nodes_per_graph * CUDAPOA_MAX_NODE_EDGES; // graph_details_d_->outgoing_edge_weights
device_size_per_poa += sizeof(*GraphDetails<SizeT>::sorted_poa) * max_nodes_per_graph; // graph_details_d_->sorted_poa
device_size_per_poa += sizeof(*GraphDetails<SizeT>::sorted_poa_node_map) * max_nodes_per_graph; // graph_details_d_->sorted_poa_node_map
device_size_per_poa += variable_bands ? sizeof(*GraphDetails<SizeT>::node_distance_to_head) * max_nodes_per_graph : 0; // graph_details_d_->node_distance_to_head
device_size_per_poa += sizeof(*GraphDetails<SizeT>::sorted_poa_local_edge_count) * max_nodes_per_graph; // graph_details_d_->sorted_poa_local_edge_count
device_size_per_poa += (!msa_flag) ? sizeof(*GraphDetails<SizeT>::consensus_scores) * max_nodes_per_graph : 0; // graph_details_d_->consensus_scores
device_size_per_poa += (!msa_flag) ? sizeof(*GraphDetails<SizeT>::consensus_predecessors) * max_nodes_per_graph : 0; // graph_details_d_->consensus_predecessors
Expand Down
9 changes: 8 additions & 1 deletion cudapoa/src/application_parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
{"msa", no_argument, 0, 'a'},
{"band-mode", required_argument, 0, 'b'},
{"band-width", required_argument, 0, 'w'},
{"adaptive-storage", required_argument, 0, 's'},
{"dot", required_argument, 0, 'd'},
{"max-groups", required_argument, 0, 'M'},
{"gpu-mem-alloc", required_argument, 0, 'R'},
Expand All @@ -49,7 +50,7 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
{"help", no_argument, 0, 'h'},
};

std::string optstring = "i:ab:w:d:M:R:m:n:g:vh";
std::string optstring = "i:ab:w:s:d:M:R:m:n:g:vh";

int32_t argument = 0;
while ((argument = getopt_long(argc, argv, optstring.c_str(), options, nullptr)) != -1)
Expand All @@ -72,6 +73,9 @@ ApplicationParameters::ApplicationParameters(int argc, char* argv[])
case 'w':
band_width = std::stoi(optarg);
break;
case 's':
adaptive_storage = std::stof(optarg);
break;
case 'd':
graph_output_path = std::string(optarg);
break;
Expand Down Expand Up @@ -185,6 +189,9 @@ void ApplicationParameters::help(int32_t exit_code)
-w, --band-width <int>
band-width for banded alignment (must be multiple of 128) [256])"
<< R"(
-s, --adaptive-storage <float>
factor to accommodate extra memory for adaptive score matrix. The factor represents ratio of adaptive-banded score matrix to static-banded score matrix [2.0])"
<< R"(
-d, --dot <file>
output path for printing graph in DOT format [disabled])"
<< R"(
Expand Down
1 change: 1 addition & 0 deletions cudapoa/src/application_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class ApplicationParameters
int32_t gap_score = -8;
int32_t match_score = 8;
double gpu_mem_allocation = 0.9;
float adaptive_storage = 2.0f;

private:
/// \brief verifies input file formats
Expand Down
5 changes: 3 additions & 2 deletions cudapoa/src/batch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ namespace cudapoa
{

/// constructor- set other parameters based on a minimum set of input arguments
BatchConfig::BatchConfig(int32_t max_seq_sz /*= 1024*/, int32_t max_seq_per_poa /*= 100*/, int32_t band_width /*= 256*/, BandMode banding /*= BandMode::full_band*/)
BatchConfig::BatchConfig(int32_t max_seq_sz /*= 1024*/, int32_t max_seq_per_poa /*= 100*/, int32_t band_width /*= 256*/,
BandMode banding /*= BandMode::full_band*/, float adapive_storage_factor /*= 2.0*/)
/// ensure a 4-byte boundary alignment for any allocated buffer
: max_sequence_size(max_seq_sz)
, max_consensus_size(2 * max_sequence_size)
Expand All @@ -58,7 +59,7 @@ BatchConfig::BatchConfig(int32_t max_seq_sz /*= 1024*/, int32_t max_seq_per_poa
max_nodes_per_graph = cudautils::align<int32_t, CELLS_PER_THREAD>(4 * max_sequence_size);
matrix_graph_dimension = cudautils::align<int32_t, CELLS_PER_THREAD>(max_nodes_per_graph);
// 8 = CUDAPOA_BANDED_MATRIX_RIGHT_PADDING, *2 is to reserve extra memory for cases with extended band-width
matrix_sequence_dimension = cudautils::align<int32_t, CELLS_PER_THREAD>(2 * (alignment_band_width + 8));
matrix_sequence_dimension = cudautils::align<int32_t, CELLS_PER_THREAD>(adapive_storage_factor * (alignment_band_width + 8));
}

throw_on_negative(max_seq_sz, "max_sequence_size cannot be negative.");
Expand Down
30 changes: 14 additions & 16 deletions cudapoa/src/cudapoa_add_alignment.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ namespace cudapoa
* @param[in] outgoing_edges Device buffer with outgoing edges per node
* @param[in] outgoing_edges_count Device buffer with number of outgoing edges per node
* @param[in] incoming_edge_w Device buffer with weight of incoming edges
* @param[in] outgoing_edge_w Device buffer with weight of outgoing edges
* @param[in] alignment_length Total length of new alignment
* @param[in] graph Device scratch space with sorted graph
* @param[in] alignment_graph Device buffer with nodes from graph in alignment
Expand All @@ -63,7 +62,7 @@ namespace cudapoa
* @return Status code for any errors encountered.
*/

template <typename SizeT>
template <typename SizeT, bool MSA = false>
__device__
uint8_t
addAlignmentToGraph(SizeT& new_node_count,
Expand All @@ -72,7 +71,7 @@ __device__
SizeT* node_alignments, uint16_t* node_alignment_count,
SizeT* incoming_edges, uint16_t* incoming_edge_count,
SizeT* outgoing_edges, uint16_t* outgoing_edge_count,
uint16_t* incoming_edge_w, uint16_t* /*outgoing_edge_w*/,
uint16_t* incoming_edge_w,
SizeT alignment_length,
SizeT* /*graph*/,
SizeT* alignment_graph,
Expand All @@ -85,8 +84,7 @@ __device__
uint16_t* outgoing_edges_coverage_count,
uint16_t s,
uint32_t max_sequences_per_poa,
uint32_t max_limit_nodes_per_window,
bool msa = false)
uint32_t max_limit_nodes_per_window)
{
//printf("Running addition for alignment %d\n", alignment_length);
SizeT head_node_id = -1;
Expand Down Expand Up @@ -152,7 +150,7 @@ __device__
//printf("aligned nodes are %d\n", num_aligned_node);
SizeT aligned_node_id = -1;
//printf("looping through alignments\n");
for (uint16_t n = 0; n < num_aligned_node; n++)
for (int32_t n = 0; n < num_aligned_node; n++)
{
SizeT aid = node_alignments[graph_node_id * CUDAPOA_MAX_NODE_ALIGNMENTS + n];
if (nodes[aid] == read_base)
Expand Down Expand Up @@ -187,7 +185,7 @@ __device__
node_coverage_counts[curr_node_id] = 0;
SizeT new_node_alignments = 0;

for (uint16_t n = 0; n < num_aligned_node; n++)
for (int32_t n = 0; n < num_aligned_node; n++)
{
SizeT aid = node_alignments[graph_node_id * CUDAPOA_MAX_NODE_ALIGNMENTS + n];
uint16_t aid_count = node_alignment_count[aid];
Expand All @@ -214,7 +212,7 @@ __device__
}

// for msa generation
if (msa && (read_pos == 0))
if (MSA && (read_pos == 0))
{
//begin node of the sequence, add its node_id (curr_node_id) to sequence_begin_nodes_ids
*sequence_begin_nodes_ids = curr_node_id;
Expand All @@ -226,7 +224,7 @@ __device__
{
bool edge_exists = false;
uint16_t in_count = incoming_edge_count[curr_node_id];
for (uint16_t e = 0; e < in_count; e++)
for (int32_t e = 0; e < in_count; e++)
{
if (incoming_edges[curr_node_id * CUDAPOA_MAX_NODE_EDGES + e] == head_node_id)
{
Expand All @@ -242,7 +240,7 @@ __device__
incoming_edge_count[curr_node_id] = in_count + 1;
uint16_t out_count = outgoing_edge_count[head_node_id];
outgoing_edges[head_node_id * CUDAPOA_MAX_NODE_EDGES + out_count] = curr_node_id;
if (msa)
if (MSA)
{
outgoing_edges_coverage_count[head_node_id * CUDAPOA_MAX_NODE_EDGES + out_count] = 1;
outgoing_edges_coverage[(head_node_id * CUDAPOA_MAX_NODE_EDGES + out_count) * max_sequences_per_poa] = s;
Expand All @@ -256,10 +254,10 @@ __device__
//printf("exceeded max edge count\n");
}
}
else if (msa) //if edge exists and for msa generation
else if (MSA) //if edge exists and for msa generation
{
uint16_t out_count = outgoing_edge_count[head_node_id];
for (uint16_t e = 0; e < out_count; e++)
for (int32_t e = 0; e < out_count; e++)
{
if (outgoing_edges[head_node_id * CUDAPOA_MAX_NODE_EDGES + e] == curr_node_id)
{
Expand Down Expand Up @@ -293,7 +291,7 @@ __global__ void addAlignmentKernel(uint8_t* nodes,
SizeT* node_alignments, uint16_t* node_alignment_count,
SizeT* incoming_edges, uint16_t* incoming_edge_count,
SizeT* outgoing_edges, uint16_t* outgoing_edge_count,
uint16_t* incoming_edge_w, uint16_t* outgoing_edge_w,
uint16_t* incoming_edge_w,
SizeT* alignment_length,
SizeT* graph,
SizeT* alignment_graph,
Expand All @@ -315,7 +313,7 @@ __global__ void addAlignmentKernel(uint8_t* nodes,
node_alignments, node_alignment_count,
incoming_edges, incoming_edge_count,
outgoing_edges, outgoing_edge_count,
incoming_edge_w, outgoing_edge_w,
incoming_edge_w,
*alignment_length,
graph,
alignment_graph,
Expand All @@ -339,7 +337,7 @@ void addAlignment(uint8_t* nodes,
SizeT* node_alignments, uint16_t* node_alignment_count,
SizeT* incoming_edges, uint16_t* incoming_edge_count,
SizeT* outgoing_edges, uint16_t* outgoing_edge_count,
uint16_t* incoming_edge_w, uint16_t* outgoing_edge_w,
uint16_t* incoming_edge_w,
SizeT* alignment_length,
SizeT* graph,
SizeT* alignment_graph,
Expand All @@ -359,7 +357,7 @@ void addAlignment(uint8_t* nodes,
node_alignments, node_alignment_count,
incoming_edges, incoming_edge_count,
outgoing_edges, outgoing_edge_count,
incoming_edge_w, outgoing_edge_w,
incoming_edge_w,
alignment_length,
graph,
alignment_graph,
Expand Down
4 changes: 2 additions & 2 deletions cudapoa/src/cudapoa_batch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ public:
{
output_status.emplace_back(genomeworks::cudapoa::StatusType::success);
uint16_t num_seqs = input_details_h_->window_details[poa].num_seqs;
for (uint16_t i = 0; i < num_seqs; i++)
for (int32_t i = 0; i < num_seqs; i++)
{
char* c = reinterpret_cast<char*>(&(output_details_h_->multiple_sequence_alignments[(poa * max_sequences_per_poa_ + i) * batch_size_.max_consensus_size]));
msa[poa].emplace_back(std::string(c));
Expand Down Expand Up @@ -379,7 +379,7 @@ public:
DirectedGraph::node_id_t sink = n;
graph.set_node_label(sink, std::string(1, static_cast<char>(nodes[n])));
uint16_t num_edges = graph_details_h_->incoming_edge_count[poa * max_nodes_per_window_ + n];
for (uint16_t e = 0; e < num_edges; e++)
for (int32_t e = 0; e < num_edges; e++)
{
int32_t idx = poa * max_nodes_per_window_ * CUDAPOA_MAX_NODE_EDGES + n * CUDAPOA_MAX_NODE_EDGES + e;
DirectedGraph::node_id_t src = graph_details_h_->incoming_edges[idx];
Expand Down
Loading

0 comments on commit 0e9a6f3

Please sign in to comment.