Reduce peak memory requirement in graph creation (part 1/2) #2070

Merged 13 commits on Feb 22, 2022
Changes from 4 commits
6 changes: 3 additions & 3 deletions cpp/include/cugraph/detail/shuffle_wrappers.hpp
@@ -76,8 +76,8 @@ rmm::device_uvector<vertex_t> shuffle_vertices_by_gpu_id(
* @param[in/out] d_edgelist_minors Vertex IDs for columns (if the graph adjacency matrix is stored
* as is) or rows (if the graph adjacency matrix is stored transposed)
* @param[in/out] d_edgelist_weights Optional edge weights
* @param[in] groupby_and_count_local_partition If set to true, groupby and count edges based on
* (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the
* @param[in] groupby_and_count_local_partition_by_minor If set to true, groupby and count edges
* based on (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the
* compute_gpu_id_from_vertex_t function to the minor vertex ID). If set to false, groupby and count
* edges by just local partition ID.
*
@@ -91,7 +91,7 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
rmm::device_uvector<vertex_t>& d_edgelist_majors,
rmm::device_uvector<vertex_t>& d_edgelist_minors,
std::optional<rmm::device_uvector<weight_t>>& d_edgelist_weights,
bool groupby_and_count_local_partition = false);
bool groupby_and_count_local_partition_by_minor = false);

} // namespace detail
} // namespace cugraph
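
As a usage illustration of the parameter documented above (not part of this PR): the two grouping keys that groupby_and_count_local_partition_by_minor selects between can be mimicked on the host. A plain modulo stands in for compute_gpu_id_from_vertex_t, and the local-partition formula is a placeholder, not cuGraph's actual 2D-partitioning mapping.

// Hedged, host-side sketch only; the mappings below are illustrative stand-ins.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

using vertex_t = int64_t;

// Stand-in for compute_gpu_id_from_vertex_t: any vertex -> GPU mapping works for the illustration.
int gpu_id_from_vertex(vertex_t v, int comm_size) { return static_cast<int>(v % comm_size); }

int main()
{
  int const comm_size     = 4;  // total number of GPUs
  int const col_comm_size = 2;  // edge partitions per GPU (illustrative)
  std::vector<std::pair<vertex_t, vertex_t>> edges{{0, 5}, {1, 6}, {2, 7}, {3, 4}, {8, 9}};

  std::map<int, std::size_t> by_partition;                          // flag == false: key is local partition ID only
  std::map<std::pair<int, int>, std::size_t> by_partition_and_gpu;  // flag == true: key is (local partition ID, GPU ID of minor)

  for (auto const& [major, minor] : edges) {
    int local_partition_id = gpu_id_from_vertex(major, comm_size) % col_comm_size;  // placeholder mapping
    int minor_gpu_id       = gpu_id_from_vertex(minor, comm_size);
    ++by_partition[local_partition_id];
    ++by_partition_and_gpu[{local_partition_id, minor_gpu_id}];
  }

  for (auto const& [key, count] : by_partition)
    std::cout << "local partition " << key << ": " << count << " edges\n";
  for (auto const& [key, count] : by_partition_and_gpu)
    std::cout << "(local partition " << key.first << ", minor GPU " << key.second << "): "
              << count << " edges\n";
  return 0;
}
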
@@ -482,7 +482,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
rmm::device_uvector<weight_t> rx_key_aggregated_edge_weights(0, handle.get_stream());
std::forward_as_tuple(
std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) =
groupby_gpuid_and_shuffle_values(
groupby_gpu_id_and_shuffle_values(
col_comm,
triplet_first,
triplet_first + tmp_major_vertices.size(),
@@ -505,7 +505,7 @@ transform_reduce_by_adj_matrix_row_col_key_e(
rmm::device_uvector<vertex_t> rx_unique_keys(0, handle.get_stream());
auto rx_value_for_unique_key_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
std::tie(rx_unique_keys, rx_value_for_unique_key_buffer, std::ignore) =
groupby_gpuid_and_shuffle_kv_pairs(
groupby_gpu_id_and_shuffle_kv_pairs(
comm,
tmp_keys.begin(),
tmp_keys.end(),
4 changes: 2 additions & 2 deletions cpp/include/cugraph/utilities/collect_comm.cuh
@@ -103,7 +103,7 @@ collect_values_for_keys(raft::comms::comms_t const& comm,
{
rmm::device_uvector<vertex_t> rx_unique_keys(0, stream_view);
std::vector<size_t> rx_value_counts{};
std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values(
std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values(
comm,
unique_keys.begin(),
unique_keys.end(),
@@ -228,7 +228,7 @@ collect_values_for_unique_keys(raft::comms::comms_t const& comm,
{
rmm::device_uvector<vertex_t> rx_unique_keys(0, stream_view);
std::vector<size_t> rx_value_counts{};
std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values(
std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values(
comm,
unique_keys.begin(),
unique_keys.end(),
2 changes: 1 addition & 1 deletion cpp/include/cugraph/utilities/cython.hpp
@@ -588,7 +588,7 @@ template <typename vertex_t, typename edge_t, typename weight_t>
std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
raft::handle_t const& handle,
vertex_t*
edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place
edgelist_major_vertices, // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place
vertex_t* edgelist_minor_vertices, // [IN / OUT]
weight_t* edgelist_weights, // [IN / OUT]
edge_t num_edgelist_edges);
266 changes: 231 additions & 35 deletions cpp/include/cugraph/utilities/shuffle_comm.cuh

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cpp/src/community/louvain.cuh
@@ -308,7 +308,7 @@ class Louvain {
thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin()));

std::forward_as_tuple(std::tie(rx_keys_v, rx_weights_v), std::ignore) =
groupby_gpuid_and_shuffle_values(
groupby_gpu_id_and_shuffle_values(
handle_.get_comms(),
pair_first,
pair_first + current_graph_view_.get_number_of_local_vertices(),
30 changes: 15 additions & 15 deletions cpp/src/components/weakly_connected_components_impl.cuh
@@ -371,17 +371,17 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
// with fewer than one root per GPU
if (std::reduce(first_candidate_degrees.begin(), first_candidate_degrees.end()) >
degree_sum_threshold * comm_size) {
std::vector<std::tuple<edge_t, int>> degree_gpuid_pairs(comm_size);
std::vector<std::tuple<edge_t, int>> degree_gpu_id_pairs(comm_size);
for (int i = 0; i < comm_size; ++i) {
degree_gpuid_pairs[i] = std::make_tuple(first_candidate_degrees[i], i);
degree_gpu_id_pairs[i] = std::make_tuple(first_candidate_degrees[i], i);
}
std::sort(degree_gpuid_pairs.begin(), degree_gpuid_pairs.end(), [](auto lhs, auto rhs) {
std::sort(degree_gpu_id_pairs.begin(), degree_gpu_id_pairs.end(), [](auto lhs, auto rhs) {
return std::get<0>(lhs) > std::get<0>(rhs);
});
edge_t sum{0};
for (size_t i = 0; i < degree_gpuid_pairs.size(); ++i) {
sum += std::get<0>(degree_gpuid_pairs[i]);
init_max_new_root_counts[std::get<1>(degree_gpuid_pairs[i])] = 1;
for (size_t i = 0; i < degree_gpu_id_pairs.size(); ++i) {
sum += std::get<0>(degree_gpu_id_pairs[i]);
init_max_new_root_counts[std::get<1>(degree_gpu_id_pairs[i])] = 1;
if (sum > degree_sum_threshold * comm_size) { break; }
}
}
@@ -390,18 +390,18 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
else if (level_graph_view.get_number_of_vertices() <=
static_cast<vertex_t>(handle.get_comms().get_size() *
ceil(1.0 / max_new_roots_ratio))) {
std::vector<int> gpuids{};
gpuids.reserve(
std::vector<int> gpu_ids{};
gpu_ids.reserve(
std::reduce(new_root_candidate_counts.begin(), new_root_candidate_counts.end()));
for (size_t i = 0; i < new_root_candidate_counts.size(); ++i) {
gpuids.insert(gpuids.end(), new_root_candidate_counts[i], static_cast<int>(i));
gpu_ids.insert(gpu_ids.end(), new_root_candidate_counts[i], static_cast<int>(i));
}
std::random_device rd{};
std::shuffle(gpuids.begin(), gpuids.end(), std::mt19937(rd()));
gpuids.resize(
std::max(static_cast<vertex_t>(gpuids.size() * max_new_roots_ratio), vertex_t{1}));
for (size_t i = 0; i < gpuids.size(); ++i) {
++init_max_new_root_counts[gpuids[i]];
std::shuffle(gpu_ids.begin(), gpu_ids.end(), std::mt19937(rd()));
gpu_ids.resize(
std::max(static_cast<vertex_t>(gpu_ids.size() * max_new_roots_ratio), vertex_t{1}));
for (size_t i = 0; i < gpu_ids.size(); ++i) {
++init_max_new_root_counts[gpu_ids[i]];
}
} else {
std::fill(init_max_new_root_counts.begin(),
@@ -678,7 +678,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
auto const col_comm_size = col_comm.get_size();

std::tie(edge_buffer, std::ignore) = cugraph::groupby_gpuid_and_shuffle_values(
std::tie(edge_buffer, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values(
comm,
get_dataframe_buffer_begin(edge_buffer),
get_dataframe_buffer_end(edge_buffer),
23 changes: 18 additions & 5 deletions cpp/src/detail/shuffle_wrappers.cu
@@ -51,7 +51,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
std::forward_as_tuple(
std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights),
std::ignore) =
cugraph::groupby_gpuid_and_shuffle_values(
cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
edge_first,
edge_first + d_edgelist_majors.size(),
@@ -67,7 +67,7 @@

std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors),
std::ignore) =
cugraph::groupby_gpuid_and_shuffle_values(
cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
edge_first,
edge_first + d_edgelist_majors.size(),
@@ -124,7 +124,7 @@ rmm::device_uvector<vertex_t> shuffle_vertices_by_gpu_id(raft::handle_t const& h
auto const comm_size = comm.get_size();

rmm::device_uvector<vertex_t> d_rx_vertices(0, handle.get_stream());
std::tie(d_rx_vertices, std::ignore) = cugraph::groupby_gpuid_and_shuffle_values(
std::tie(d_rx_vertices, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
d_vertices.begin(),
d_vertices.end(),
@@ -147,7 +147,7 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
rmm::device_uvector<vertex_t>& d_edgelist_majors,
rmm::device_uvector<vertex_t>& d_edgelist_minors,
std::optional<rmm::device_uvector<weight_t>>& d_edgelist_weights,
bool groupby_and_count_local_partition)
bool groupby_and_count_local_partition_by_minor)
{
auto& comm = handle.get_comms();
auto const comm_size = comm.get_size();
@@ -159,10 +159,19 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
auto const col_comm_size = col_comm.get_size();
auto const col_comm_rank = col_comm.get_rank();

auto total_global_mem = handle.get_device_properties().totalGlobalMem;
auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? sizeof(weight_t) : size_t{0});
auto mem_frugal =
  d_edgelist_majors.size() * element_size >=
  total_global_mem / 4;  // if the data size exceeds 1/4 of the device memory (1/4 is a tuning
                         // parameter); groupby_and_count requires a temporary buffer comparable to
                         // the input data size, and setting mem_frugal to true reduces the
                         // temporary buffer size by up to 50%

auto pair_first = thrust::make_zip_iterator(
thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin()));

if (groupby_and_count_local_partition) {
if (groupby_and_count_local_partition_by_minor) {
auto local_partition_id_gpu_id_pair_op =
[comm_size,
row_comm_size,
Expand All @@ -183,11 +192,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
d_edgelist_weights->begin(),
local_partition_id_gpu_id_pair_op,
comm_size,
mem_frugal,
handle.get_stream())
: cugraph::groupby_and_count(pair_first,
pair_first + d_edgelist_majors.size(),
local_partition_id_gpu_id_pair_op,
comm_size,
mem_frugal,
handle.get_stream());
} else {
auto local_partition_id_op =
@@ -203,11 +203,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
d_edgelist_weights->begin(),
local_partition_id_op,
col_comm_size,
mem_frugal,
handle.get_stream())
: cugraph::groupby_and_count(pair_first,
pair_first + d_edgelist_majors.size(),
local_partition_id_op,
col_comm_size,
mem_frugal,
handle.get_stream());
}
}
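
For context on the new mem_frugal argument (the shuffle_comm.cuh diff that implements it is not rendered above): with 64-bit vertex IDs and 64-bit weights an edge occupies 24 bytes, so on a 32 GB GPU the total_global_mem / 4 threshold is crossed at roughly 8 GB of edge data, about 330 million edges. Below is a minimal host-side sketch of the general idea, under the assumption that the frugal path splits the input around a pivot group and then groups each side separately; the function name and split strategy are illustrative, not the library's actual implementation.

// Hedged sketch (host-side, illustrative only) of a memory-frugal groupby_and_count:
// rather than grouping the whole array in one out-of-place pass that needs a scratch buffer the
// size of the input, split in place around a pivot group ID and then group each side on its own,
// so the scratch requirement peaks at the larger side.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Returns per-group counts for values keyed by key_op(value) in [0, num_groups).
template <typename T, typename KeyOp>
std::vector<std::size_t> groupby_and_count_frugal(std::vector<T>& values, KeyOp key_op, int num_groups)
{
  int const pivot = num_groups / 2;
  // Pass 1: in-place split, no large scratch buffer needed.
  auto mid =
    std::partition(values.begin(), values.end(), [&](T const& v) { return key_op(v) < pivot; });
  // Pass 2: group each side independently. A GPU implementation would run its sort (and the
  // sort's temporary storage) on one side at a time, which is where the peak-memory saving comes
  // from; std::sort here is just the host stand-in.
  std::sort(values.begin(), mid, [&](T const& a, T const& b) { return key_op(a) < key_op(b); });
  std::sort(mid, values.end(), [&](T const& a, T const& b) { return key_op(a) < key_op(b); });

  std::vector<std::size_t> counts(num_groups, 0);
  for (auto const& v : values) { ++counts[key_op(v)]; }
  return counts;
}

int main()
{
  std::vector<int> edges{7, 1, 3, 6, 2, 5, 0, 4, 7, 3};
  auto counts = groupby_and_count_frugal(edges, [](int e) { return e % 4; }, 4);
  for (int g = 0; g < 4; ++g) { std::cout << "group " << g << ": " << counts[g] << "\n"; }
  return 0;
}

When the two sides are balanced, the largest single grouping pass touches about half the data, which matches the "up to 50%" note in the comment above.
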
2 changes: 1 addition & 1 deletion cpp/src/structure/coarsen_graph_impl.cuh
@@ -267,7 +267,7 @@ coarsen_graph(

// 1-3. append data to local adjacency matrix partitions

// FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge
// FIXME: we can skip this if groupby_gpu_id_and_shuffle_values is updated to return sorted edge
// list based on the final matrix partition (maybe add
// groupby_adj_matrix_partition_and_shuffle_values).

4 changes: 2 additions & 2 deletions cpp/src/structure/relabel_impl.cuh
@@ -95,7 +95,7 @@ void relabel(raft::handle_t const& handle,
thrust::make_tuple(label_pair_old_labels.begin(), label_pair_new_labels.begin()));
std::forward_as_tuple(std::tie(rx_label_pair_old_labels, rx_label_pair_new_labels),
std::ignore) =
groupby_gpuid_and_shuffle_values(
groupby_gpu_id_and_shuffle_values(
handle.get_comms(),
pair_first,
pair_first + num_label_pairs,
@@ -136,7 +136,7 @@ void relabel(raft::handle_t const& handle,
{
rmm::device_uvector<vertex_t> rx_unique_old_labels(0, handle.get_stream());
std::vector<size_t> rx_value_counts{};
std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpuid_and_shuffle_values(
std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpu_id_and_shuffle_values(
handle.get_comms(),
unique_old_labels.begin(),
unique_old_labels.end(),
8 changes: 5 additions & 3 deletions cpp/src/utilities/cython.cu
@@ -1182,7 +1182,7 @@ template <typename vertex_t, typename edge_t, typename weight_t>
std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
raft::handle_t const& handle,
vertex_t*
edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place
edgelist_major_vertices, // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place
vertex_t* edgelist_minor_vertices, // [IN / OUT]
weight_t* edgelist_weights, // [IN / OUT]
edge_t num_edgelist_edges)
@@ -1204,7 +1204,7 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
std::forward_as_tuple(
std::tie(ptr_ret->get_major(), ptr_ret->get_minor(), ptr_ret->get_weights()),
std::ignore) =
cugraph::groupby_gpuid_and_shuffle_values(
cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
zip_edge,
zip_edge + num_edgelist_edges,
Expand All @@ -1220,7 +1220,7 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(

std::forward_as_tuple(std::tie(ptr_ret->get_major(), ptr_ret->get_minor()),
std::ignore) =
cugraph::groupby_gpuid_and_shuffle_values(
cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
zip_edge,
zip_edge + num_edgelist_edges,
@@ -1248,11 +1248,13 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
ptr_ret->get_weights().data(),
local_partition_id_op,
col_comm_size,
false,
handle.get_stream())
: cugraph::groupby_and_count(pair_first,
pair_first + ptr_ret->get_major().size(),
local_partition_id_op,
col_comm_size,
false,
handle.get_stream());

std::vector<size_t> h_edge_counts(edge_counts.size());