diff --git a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh index aa9b2897075..ac8864d7e8f 100644 --- a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,9 +184,9 @@ template void decompress_matrix_partition_to_edgelist( raft::handle_t const& handle, matrix_partition_device_view_t const matrix_partition, - vertex_t* edgelist_majors /* [INOUT] */, - vertex_t* edgelist_minors /* [INOUT] */, - std::optional edgelist_weights /* [INOUT] */, + vertex_t* edgelist_majors /* [OUT] */, + vertex_t* edgelist_minors /* [OUT] */, + std::optional edgelist_weights /* [OUT] */, std::optional> const& segment_offsets) { auto number_of_edges = matrix_partition.get_number_of_edges(); diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh index 254744d11d9..8cd1eced921 100644 --- a/cpp/include/cugraph/detail/graph_utils.cuh +++ b/cpp/include/cugraph/detail/graph_utils.cuh @@ -86,5 +86,15 @@ struct is_first_in_run_t { } }; +template +struct is_first_in_run_pair_t { + vertex_t const* vertices0{nullptr}; + vertex_t const* vertices1{nullptr}; + __device__ bool operator()(size_t i) const + { + return (i == 0) || ((vertices0[i - 1] != vertices0[i]) || (vertices1[i - 1] != vertices1[i])); + } +}; + } // namespace detail } // namespace cugraph diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 6234acf5559..e4cc48dfd99 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -45,51 +45,66 @@ namespace cugraph { namespace { +template +struct is_not_lower_triangular_t { + __device__ bool operator()(EdgeTupleType e) const + { + return thrust::get<0>(e) < thrust::get<1>(e); + } +}; + +template +struct is_not_self_loop_t { + __device__ bool operator()(EdgeTupleType e) const + { + return thrust::get<0>(e) != thrust::get<1>(e); + } +}; + template -edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, +edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_majors /* [INOUT] */, + vertex_t* edgelist_minors /* [INOUT] */, std::optional edgelist_weights /* [INOUT] */, edge_t number_of_edges, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - auto pair_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors, edgelist_minors)); if (edgelist_weights) { thrust::sort_by_key( - rmm::exec_policy(stream), pair_first, pair_first + number_of_edges, *edgelist_weights); - - rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); - rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), - stream); - rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); - auto it = thrust::reduce_by_key( - rmm::exec_policy(stream), - pair_first, - pair_first + number_of_edges, - (*edgelist_weights), - thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), - tmp_edgelist_minor_vertices.begin())), - tmp_edgelist_weights.begin()); - auto ret = - static_cast(thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it))); - - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), - tmp_edgelist_minor_vertices.begin(), - tmp_edgelist_weights.begin())); - thrust::copy(rmm::exec_policy(stream), + rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges, *edgelist_weights); + + auto num_uniques = + thrust::count_if(rmm::exec_policy(stream_view), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(number_of_edges)), + detail::is_first_in_run_pair_t{edgelist_majors, edgelist_minors}); + + rmm::device_uvector tmp_edgelist_majors(num_uniques, stream_view); + rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream_view); + rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_majors.size(), stream_view); + thrust::reduce_by_key(rmm::exec_policy(stream_view), + pair_first, + pair_first + number_of_edges, + (*edgelist_weights), + thrust::make_zip_iterator(thrust::make_tuple( + tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin())), + tmp_edgelist_weights.begin()); + + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin(), tmp_edgelist_weights.begin())); + thrust::copy(rmm::exec_policy(stream_view), edge_first, - edge_first + ret, - thrust::make_zip_iterator(thrust::make_tuple( - edgelist_major_vertices, edgelist_minor_vertices, *edgelist_weights))); + edge_first + num_uniques, + thrust::make_zip_iterator( + thrust::make_tuple(edgelist_majors, edgelist_minors, *edgelist_weights))); - return ret; + return num_uniques; } else { - thrust::sort(rmm::exec_policy(stream), pair_first, pair_first + number_of_edges); + thrust::sort(rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges); return static_cast(thrust::distance( pair_first, - thrust::unique(rmm::exec_policy(stream), pair_first, pair_first + number_of_edges))); + thrust::unique(rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges))); } } @@ -106,34 +121,34 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( matrix_partition_device_view_t const matrix_partition, vertex_t const* major_label_first, AdjMatrixMinorLabelInputWrapper const minor_label_input, - std::optional> const& segment_offsets) + std::optional> const& segment_offsets, + bool lower_triangular_only) { static_assert(std::is_same_v); // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory - rmm::device_uvector edgelist_major_vertices(matrix_partition.get_number_of_edges(), - handle.get_stream()); - rmm::device_uvector edgelist_minor_vertices(edgelist_major_vertices.size(), - handle.get_stream()); + rmm::device_uvector edgelist_majors(matrix_partition.get_number_of_edges(), + handle.get_stream()); + rmm::device_uvector edgelist_minors(edgelist_majors.size(), handle.get_stream()); auto edgelist_weights = matrix_partition.get_weights() ? std::make_optional>( - edgelist_major_vertices.size(), handle.get_stream()) + edgelist_majors.size(), handle.get_stream()) : std::nullopt; detail::decompress_matrix_partition_to_edgelist( handle, matrix_partition, - edgelist_major_vertices.data(), - edgelist_minor_vertices.data(), + edgelist_majors.data(), + edgelist_minors.data(), edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, segment_offsets); - auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors.begin(), edgelist_minors.begin())); thrust::transform(handle.get_thrust_policy(), pair_first, - pair_first + edgelist_major_vertices.size(), + pair_first + edgelist_majors.size(), pair_first, [major_label_first, minor_label_input, @@ -144,24 +159,58 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( minor_label_input.get(thrust::get<1>(val) - minor_first)); }); + if (lower_triangular_only) { + if (edgelist_weights) { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + edgelist_majors.begin(), edgelist_minors.begin(), (*edgelist_weights).begin())); + edgelist_majors.resize( + thrust::distance( + edge_first, + thrust::remove_if( + handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_majors.size(), + is_not_lower_triangular_t>{})), + handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + (*edgelist_weights).resize(edgelist_majors.size(), handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_majors.begin(), edgelist_minors.begin())); + edgelist_majors.resize( + thrust::distance( + edge_first, + thrust::remove_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_majors.size(), + is_not_lower_triangular_t>{})), + handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + } + } + auto number_of_edges = groupby_e_and_coarsen_edgelist( - edgelist_major_vertices.data(), - edgelist_minor_vertices.data(), + edgelist_majors.data(), + edgelist_minors.data(), edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, - static_cast(edgelist_major_vertices.size()), + static_cast(edgelist_majors.size()), handle.get_stream()); - edgelist_major_vertices.resize(number_of_edges, handle.get_stream()); - edgelist_major_vertices.shrink_to_fit(handle.get_stream()); - edgelist_minor_vertices.resize(number_of_edges, handle.get_stream()); - edgelist_minor_vertices.shrink_to_fit(handle.get_stream()); + edgelist_majors.resize(number_of_edges, handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(number_of_edges, handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); if (edgelist_weights) { (*edgelist_weights).resize(number_of_edges, handle.get_stream()); (*edgelist_weights).shrink_to_fit(handle.get_stream()); } - return std::make_tuple(std::move(edgelist_major_vertices), - std::move(edgelist_minor_vertices), - std::move(edgelist_weights)); + return std::make_tuple( + std::move(edgelist_majors), std::move(edgelist_minors), std::move(edgelist_weights)); } } // namespace @@ -198,7 +247,11 @@ coarsen_graph( // currently, nothing to do } - // 1. construct coarsened edge list + // 1. construct coarsened edge lists from each local partition (if the input graph is symmetric, + // start with only the lower triangular edges after relabeling, this is to prevent edge weights in + // the coarsened graph becoming asymmmetric due to limited floatping point resolution) + + bool lower_triangular_only = graph_view.is_symmetric(); std::conditional_t< store_transposed, @@ -213,27 +266,16 @@ coarsen_graph( copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels); } - std::vector> coarsened_edgelist_major_vertices{}; - std::vector> coarsened_edgelist_minor_vertices{}; + std::vector> coarsened_edgelist_majors{}; + std::vector> coarsened_edgelist_minors{}; auto coarsened_edgelist_weights = graph_view.is_weighted() ? std::make_optional>>({}) : std::nullopt; - coarsened_edgelist_major_vertices.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); - coarsened_edgelist_minor_vertices.reserve(coarsened_edgelist_major_vertices.size()); + coarsened_edgelist_majors.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); + coarsened_edgelist_minors.reserve(coarsened_edgelist_majors.size()); if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights).reserve(coarsened_edgelist_major_vertices.size()); - } - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - coarsened_edgelist_major_vertices.emplace_back(0, handle.get_stream()); - coarsened_edgelist_minor_vertices.emplace_back(0, handle.get_stream()); - if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights).emplace_back(0, handle.get_stream()); - } + (*coarsened_edgelist_weights).reserve(coarsened_edgelist_majors.size()); } - // FIXME: we may compare performance/memory footprint with the hash_based approach especially when - // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst - // case). We may be able to limit the memory requirement close to the final coarsened edgelist - // with the hash based approach. for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { // 1-1. locally construct coarsened edge list @@ -248,110 +290,254 @@ coarsen_graph( static_cast(i), handle.get_stream()); - auto [edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights] = + auto [edgelist_majors, edgelist_minors, edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)), major_labels.data(), adj_matrix_minor_labels.device_view(), - graph_view.get_local_adj_matrix_partition_segment_offsets(i)); + graph_view.get_local_adj_matrix_partition_segment_offsets(i), + lower_triangular_only); // 1-2. globally shuffle - std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) = + std::tie(edgelist_majors, edgelist_minors, edgelist_weights) = cugraph::detail::shuffle_edgelist_by_gpu_id(handle, - std::move(edgelist_major_vertices), - std::move(edgelist_minor_vertices), + std::move(edgelist_majors), + std::move(edgelist_minors), std::move(edgelist_weights)); - // 1-3. append data to local adjacency matrix partitions + // 1-3. groupby and coarsen again - // FIXME: we can skip this if groupby_gpu_id_and_shuffle_values is updated to return sorted edge - // list based on the final matrix partition (maybe add - // groupby_adj_matrix_partition_and_shuffle_values). + auto coarsened_size = groupby_e_and_coarsen_edgelist( + edgelist_majors.data(), + edgelist_minors.data(), + edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, + edgelist_majors.size(), + handle.get_stream()); + edgelist_majors.resize(coarsened_size, handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + if (edgelist_weights) { + (*edgelist_weights).resize(edgelist_majors.size(), handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } - auto counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( - handle, edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights); + coarsened_edgelist_majors.push_back(std::move(edgelist_majors)); + coarsened_edgelist_minors.push_back(std::move(edgelist_minors)); + if (edgelist_weights) { (*coarsened_edgelist_weights).push_back(std::move(*edgelist_weights)); } + } - std::vector h_counts(counts.size()); - raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream()); - handle.sync_stream(); + // 2. concatenate and groupby and coarsen again (and if the input graph is symmetric, create a + // copy excluding self loops and globally shuffle) - std::vector h_displacements(h_counts.size(), size_t{0}); - std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_displacements.begin() + 1); + edge_t tot_count{0}; + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + tot_count += coarsened_edgelist_majors[i].size(); + } - for (int j = 0; j < col_comm_size; ++j) { - auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - edgelist_major_vertices.begin() + h_displacements[j], - edgelist_minor_vertices.begin() + h_displacements[j], - edgelist_weights ? std::optional{(*edgelist_weights).data() + h_displacements[j]} - : std::nullopt, - h_counts[j], - handle.get_stream()); + rmm::device_uvector concatenated_edgelist_majors(tot_count, handle.get_stream()); + size_t major_offset{0}; + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + coarsened_edgelist_majors[i].begin(), + coarsened_edgelist_majors[i].end(), + concatenated_edgelist_majors.begin() + major_offset); + major_offset += coarsened_edgelist_majors[i].size(); + coarsened_edgelist_majors[i].resize(0, handle.get_stream()); + coarsened_edgelist_majors[i].shrink_to_fit(handle.get_stream()); + } - auto cur_size = coarsened_edgelist_major_vertices[j].size(); - // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we - // can reserve address space to avoid expensive reallocation. - // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges, - handle.get_stream()); - coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices[j].size(), - handle.get_stream()); + rmm::device_uvector concatenated_edgelist_minors(tot_count, handle.get_stream()); + size_t minor_offset{0}; + for (size_t i = 0; i < coarsened_edgelist_minors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + coarsened_edgelist_minors[i].begin(), + coarsened_edgelist_minors[i].end(), + concatenated_edgelist_minors.begin() + minor_offset); + minor_offset += coarsened_edgelist_minors[i].size(); + coarsened_edgelist_minors[i].resize(0, handle.get_stream()); + coarsened_edgelist_minors[i].shrink_to_fit(handle.get_stream()); + } - if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[j].resize(coarsened_edgelist_major_vertices[j].size(), - handle.get_stream()); + std::optional> concatenated_edgelist_weights{std::nullopt}; + if (coarsened_edgelist_weights) { + concatenated_edgelist_weights = rmm::device_uvector(tot_count, handle.get_stream()); + size_t weight_offset{0}; + for (size_t i = 0; i < (*coarsened_edgelist_weights).size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + (*coarsened_edgelist_weights)[i].begin(), + (*coarsened_edgelist_weights)[i].end(), + (*concatenated_edgelist_weights).begin() + weight_offset); + weight_offset += (*coarsened_edgelist_weights)[i].size(); + (*coarsened_edgelist_weights)[i].resize(0, handle.get_stream()); + (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); + } + } - auto src_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), - edgelist_minor_vertices.begin(), - (*edgelist_weights).begin())) + - h_displacements[j]; - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), - coarsened_edgelist_minor_vertices[j].begin(), - (*coarsened_edgelist_weights)[j].begin())) + - cur_size; - thrust::copy(handle.get_thrust_policy(), - src_edge_first, - src_edge_first + number_of_partition_edges, - dst_edge_first); - } else { - auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple( - edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) + - h_displacements[j]; - auto dst_edge_first = thrust::make_zip_iterator( - thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), - coarsened_edgelist_minor_vertices[j].begin())) + - cur_size; - thrust::copy(handle.get_thrust_policy(), - src_edge_first, - src_edge_first + number_of_partition_edges, - dst_edge_first); - } + auto concatenated_and_coarsened_size = groupby_e_and_coarsen_edgelist( + concatenated_edgelist_majors.data(), + concatenated_edgelist_minors.data(), + concatenated_edgelist_weights + ? std::optional{(*concatenated_edgelist_weights).data()} + : std::nullopt, + concatenated_edgelist_majors.size(), + handle.get_stream()); + concatenated_edgelist_majors.resize(concatenated_and_coarsened_size, handle.get_stream()); + concatenated_edgelist_majors.shrink_to_fit(handle.get_stream()); + concatenated_edgelist_minors.resize(concatenated_edgelist_majors.size(), handle.get_stream()); + concatenated_edgelist_minors.shrink_to_fit(handle.get_stream()); + if (concatenated_edgelist_weights) { + (*concatenated_edgelist_weights) + .resize(concatenated_edgelist_majors.size(), handle.get_stream()); + (*concatenated_edgelist_weights).shrink_to_fit(handle.get_stream()); + } + + std::optional> reversed_edgelist_majors{std::nullopt}; + std::optional> reversed_edgelist_minors{std::nullopt}; + std::optional> reversed_edgelist_weights{std::nullopt}; + if (lower_triangular_only) { + if (concatenated_edgelist_weights) { + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(concatenated_edgelist_majors.begin(), + concatenated_edgelist_minors.begin(), + (*concatenated_edgelist_weights).begin())); + auto last = + thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + concatenated_edgelist_majors.size(), + is_not_self_loop_t>{}); + reversed_edgelist_majors = + rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + reversed_edgelist_weights = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), + edge_first, + edge_first + (*reversed_edgelist_majors).size(), + thrust::make_zip_iterator(thrust::make_tuple((*reversed_edgelist_minors).begin(), + (*reversed_edgelist_majors).begin(), + (*reversed_edgelist_weights).begin()))); + } else { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + concatenated_edgelist_majors.begin(), concatenated_edgelist_minors.begin())); + auto last = thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + concatenated_edgelist_majors.size(), + is_not_self_loop_t>{}); + reversed_edgelist_majors = + rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edge_first, + edge_first + (*reversed_edgelist_majors).size(), + thrust::make_zip_iterator(thrust::make_tuple( + (*reversed_edgelist_minors).begin(), (*reversed_edgelist_majors).begin()))); } + + std::tie(*reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights) = + cugraph::detail::shuffle_edgelist_by_gpu_id(handle, + std::move(*reversed_edgelist_majors), + std::move(*reversed_edgelist_minors), + std::move(reversed_edgelist_weights)); + } + + // 3. split concatenated edge list to local partitions + + auto concatenated_counts = + groupby_and_count_edgelist_by_local_partition_id(handle, + concatenated_edgelist_majors, + concatenated_edgelist_minors, + concatenated_edgelist_weights); + + std::vector h_concatenated_counts(concatenated_counts.size()); + raft::update_host(h_concatenated_counts.data(), + concatenated_counts.data(), + concatenated_counts.size(), + handle.get_stream()); + + std::optional> h_reversed_counts{std::nullopt}; + if (reversed_edgelist_majors) { + auto reversed_counts = groupby_and_count_edgelist_by_local_partition_id( + handle, *reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights); + + h_reversed_counts = std::vector(reversed_counts.size()); + raft::update_host((*h_reversed_counts).data(), + reversed_counts.data(), + reversed_counts.size(), + handle.get_stream()); + } + + handle.sync_stream(); + + std::vector h_concatenated_displacements(h_concatenated_counts.size(), size_t{0}); + std::partial_sum(h_concatenated_counts.begin(), + h_concatenated_counts.end() - 1, + h_concatenated_displacements.begin() + 1); + + std::optional> h_reversed_displacements{std::nullopt}; + if (h_reversed_counts) { + h_reversed_displacements = std::vector((*h_reversed_counts).size(), size_t{0}); + std::partial_sum((*h_reversed_counts).begin(), + (*h_reversed_counts).end() - 1, + (*h_reversed_displacements).begin() + 1); } - for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { - auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - coarsened_edgelist_major_vertices[i].data(), - coarsened_edgelist_minor_vertices[i].data(), - coarsened_edgelist_weights ? std::optional{(*coarsened_edgelist_weights)[i].data()} - : std::nullopt, - static_cast(coarsened_edgelist_major_vertices[i].size()), + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + coarsened_edgelist_majors[i].resize( + h_concatenated_counts[i] + (h_reversed_counts ? (*h_reversed_counts)[i] : size_t{0}), handle.get_stream()); - coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); - coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_minor_vertices[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minors[i].resize(coarsened_edgelist_majors[i].size(), handle.get_stream()); if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[i].resize(number_of_partition_edges, handle.get_stream()); - (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); + (*coarsened_edgelist_weights)[i].resize(coarsened_edgelist_majors[i].size(), + handle.get_stream()); + } + + thrust::copy(handle.get_thrust_policy(), + concatenated_edgelist_majors.begin() + h_concatenated_displacements[i], + concatenated_edgelist_majors.begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + coarsened_edgelist_majors[i].begin()); + thrust::copy(handle.get_thrust_policy(), + concatenated_edgelist_minors.begin() + h_concatenated_displacements[i], + concatenated_edgelist_minors.begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + coarsened_edgelist_minors[i].begin()); + if (coarsened_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*concatenated_edgelist_weights).begin() + h_concatenated_displacements[i], + (*concatenated_edgelist_weights).begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + (*coarsened_edgelist_weights)[i].begin()); + } + + if (reversed_edgelist_majors) { + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_majors).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_majors).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + coarsened_edgelist_majors[i].begin() + h_concatenated_counts[i]); + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_minors).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_minors).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + coarsened_edgelist_minors[i].begin() + h_concatenated_counts[i]); + if (coarsened_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_weights).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_weights).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + (*coarsened_edgelist_weights)[i].begin() + h_concatenated_counts[i]); + } } } - // 3. find unique labels for this GPU + // 4. find unique labels for this GPU rmm::device_uvector unique_labels(graph_view.get_number_of_local_vertices(), handle.get_stream()); @@ -373,18 +559,18 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - // 4. renumber + // 5. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); renumber_meta_t meta{}; { - std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); + std::vector major_ptrs(coarsened_edgelist_majors.size()); std::vector minor_ptrs(major_ptrs.size()); std::vector counts(major_ptrs.size()); - for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { - major_ptrs[i] = coarsened_edgelist_major_vertices[i].data(); - minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); - counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + major_ptrs[i] = coarsened_edgelist_majors[i].data(); + minor_ptrs[i] = coarsened_edgelist_minors[i].data(); + counts[i] = static_cast(coarsened_edgelist_majors[i].size()); } std::tie(renumber_map_labels, meta) = renumber_edgelist( handle, @@ -396,20 +582,20 @@ coarsen_graph( do_expensive_check); } - // 5. build a graph + // 6. build a graph std::vector> edgelists{}; edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); for (size_t i = 0; i < edgelists.size(); ++i) { - edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() - : coarsened_edgelist_major_vertices[i].data(); - edgelists[i].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices[i].data() - : coarsened_edgelist_minor_vertices[i].data(); + edgelists[i].p_src_vertices = + store_transposed ? coarsened_edgelist_minors[i].data() : coarsened_edgelist_majors[i].data(); + edgelists[i].p_dst_vertices = + store_transposed ? coarsened_edgelist_majors[i].data() : coarsened_edgelist_minors[i].data(); edgelists[i].p_edge_weights = coarsened_edgelist_weights ? std::optional{(*coarsened_edgelist_weights)[i].data()} : std::nullopt, - edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); + edgelists[i].number_of_edges = static_cast(coarsened_edgelist_majors[i].size()); } return std::make_tuple( @@ -445,16 +631,73 @@ coarsen_graph( // currently, nothing to do } - auto [coarsened_edgelist_major_vertices, - coarsened_edgelist_minor_vertices, - coarsened_edgelist_weights] = + bool lower_triangular_only = graph_view.is_symmetric(); + + auto [coarsened_edgelist_majors, coarsened_edgelist_minors, coarsened_edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, matrix_partition_device_view_t( graph_view.get_matrix_partition_view()), labels, detail::minor_properties_device_view_t(labels), - graph_view.get_local_adj_matrix_partition_segment_offsets(0)); + graph_view.get_local_adj_matrix_partition_segment_offsets(0), + lower_triangular_only); + + if (lower_triangular_only) { + if (coarsened_edgelist_weights) { + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors.begin(), + coarsened_edgelist_minors.begin(), + (*coarsened_edgelist_weights).begin())); + auto last = + thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + coarsened_edgelist_majors.size(), + is_not_self_loop_t>{}); + + auto cur_size = coarsened_edgelist_majors.size(); + auto reversed_size = static_cast(thrust::distance(edge_first, last)); + + coarsened_edgelist_majors.resize(cur_size + reversed_size, handle.get_stream()); + coarsened_edgelist_minors.resize(coarsened_edgelist_majors.size(), handle.get_stream()); + (*coarsened_edgelist_weights).resize(coarsened_edgelist_majors.size(), handle.get_stream()); + + edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors.begin(), + coarsened_edgelist_minors.begin(), + (*coarsened_edgelist_weights).begin())); + thrust::copy( + handle.get_thrust_policy(), + edge_first, + edge_first + reversed_size, + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_minors.begin(), + coarsened_edgelist_majors.begin(), + (*coarsened_edgelist_weights).begin())) + + cur_size); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_majors.begin(), coarsened_edgelist_minors.begin())); + auto last = thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + coarsened_edgelist_majors.size(), + is_not_self_loop_t>{}); + + auto cur_size = coarsened_edgelist_majors.size(); + auto reversed_size = static_cast(thrust::distance(edge_first, last)); + + coarsened_edgelist_majors.resize(cur_size + reversed_size, handle.get_stream()); + coarsened_edgelist_minors.resize(coarsened_edgelist_majors.size(), handle.get_stream()); + + edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_majors.begin(), coarsened_edgelist_minors.begin())); + thrust::copy(handle.get_thrust_policy(), + edge_first, + edge_first + reversed_size, + thrust::make_zip_iterator(thrust::make_tuple( + coarsened_edgelist_minors.begin(), coarsened_edgelist_majors.begin())) + + cur_size); + } + } rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -470,20 +713,20 @@ coarsen_graph( auto [renumber_map_labels, meta] = renumber_edgelist( handle, std::optional>{std::move(unique_labels)}, - coarsened_edgelist_major_vertices.data(), - coarsened_edgelist_minor_vertices.data(), - static_cast(coarsened_edgelist_major_vertices.size()), + coarsened_edgelist_majors.data(), + coarsened_edgelist_minors.data(), + static_cast(coarsened_edgelist_majors.size()), do_expensive_check); edgelist_t edgelist{}; - edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() - : coarsened_edgelist_major_vertices.data(); - edgelist.p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data() - : coarsened_edgelist_minor_vertices.data(); + edgelist.p_src_vertices = + store_transposed ? coarsened_edgelist_minors.data() : coarsened_edgelist_majors.data(); + edgelist.p_dst_vertices = + store_transposed ? coarsened_edgelist_majors.data() : coarsened_edgelist_minors.data(); edgelist.p_edge_weights = coarsened_edgelist_weights ? std::optional{(*coarsened_edgelist_weights).data()} : std::nullopt; - edgelist.number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + edgelist.number_of_edges = static_cast(coarsened_edgelist_majors.size()); return std::make_tuple( std::make_unique>( diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index b86cfdee5c6..364a0b8a68e 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -9,6 +9,7 @@ * */ #include +#include #include #include @@ -90,15 +91,28 @@ class Tests_Louvain auto [louvain_usecase, input_usecase] = param; raft::handle_t handle{}; + HighResClock hr_clock{}; // Can't currently check correctness if we renumber bool renumber = true; if (louvain_usecase.check_correctness_) renumber = false; + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + auto [graph, d_renumber_map_labels] = cugraph::test::construct_graph( handle, input_usecase, true, renumber); + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } + auto graph_view = graph.view(); // "FIXME": remove this check once we drop support for Pascal @@ -109,6 +123,11 @@ class Tests_Louvain cudaDeviceProp device_prop; RAFT_CUDA_TRY(cudaGetDeviceProperties(&device_prop, 0)); + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + if (device_prop.major < 7) { EXPECT_THROW(louvain(graph_view, graph_view.get_number_of_local_vertices(), @@ -123,6 +142,13 @@ class Tests_Louvain louvain_usecase.expected_level_, louvain_usecase.expected_modularity_); } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "Louvain took " << elapsed_time * 1e-6 << " s.\n"; + } } template diff --git a/cpp/tests/structure/coarsen_graph_test.cpp b/cpp/tests/structure/coarsen_graph_test.cpp index dedcb2a718d..dc9298813be 100644 --- a/cpp/tests/structure/coarsen_graph_test.cpp +++ b/cpp/tests/structure/coarsen_graph_test.cpp @@ -433,7 +433,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( // enable correctness checks ::testing::Values(CoarsenGraph_Usecase{0.2, false}, CoarsenGraph_Usecase{0.2, true}), - ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false), + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); INSTANTIATE_TEST_SUITE_P( file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with @@ -457,6 +458,7 @@ INSTANTIATE_TEST_SUITE_P( // disable correctness checks for large graphs ::testing::Values(CoarsenGraph_Usecase{0.2, false, false}, CoarsenGraph_Usecase{0.2, true, false}), - ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false), + cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false)))); CUGRAPH_TEST_PROGRAM_MAIN()