Reduce peak memory requirement in graph creation (part 1/2) #2070

Merged 13 commits on Feb 22, 2022
Changes from 4 commits
6 changes: 3 additions & 3 deletions cpp/include/cugraph/detail/shuffle_wrappers.hpp
@@ -76,8 +76,8 @@ rmm::device_uvector<vertex_t> shuffle_vertices_by_gpu_id(
* @param[in/out] d_edgelist_minors Vertex IDs for columns (if the graph adjacency matrix is stored
* as is) or rows (if the graph adjacency matrix is stored transposed)
* @param[in/out] d_edgelist_weights Optional edge weights
* @param[in] groupby_and_count_local_partition If set to true, groupby and count edges based on
* (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the
* @param[in] groupby_and_count_local_partition_by_minor If set to true, groupby and count edges
* based on (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the
* compute_gpu_id_from_vertex_t function to the minor vertex ID). If set to false, groupby and count
* edges by just local partition ID.
*
@@ -91,7 +91,7 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
rmm::device_uvector<vertex_t>& d_edgelist_majors,
rmm::device_uvector<vertex_t>& d_edgelist_minors,
std::optional<rmm::device_uvector<weight_t>>& d_edgelist_weights,
bool groupby_and_count_local_partition = false);
bool groupby_and_count_local_partition_by_minor = false);

} // namespace detail
} // namespace cugraph
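
As a usage illustration of the parameter documented above (not part of this PR): the two grouping keys that groupby_and_count_local_partition_by_minor selects between can be mimicked on the host. A plain modulo stands in for compute_gpu_id_from_vertex_t, and the local-partition formula is a placeholder, not cuGraph's actual 2D-partitioning mapping.

// Hedged, host-side sketch only; the mappings below are illustrative stand-ins.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

using vertex_t = int64_t;

// Stand-in for compute_gpu_id_from_vertex_t: any vertex -> GPU mapping works for the illustration.
int gpu_id_from_vertex(vertex_t v, int comm_size) { return static_cast<int>(v % comm_size); }

int main()
{
  int const comm_size     = 4;  // total number of GPUs
  int const col_comm_size = 2;  // edge partitions per GPU (illustrative)
  std::vector<std::pair<vertex_t, vertex_t>> edges{{0, 5}, {1, 6}, {2, 7}, {3, 4}, {8, 9}};

  std::map<int, std::size_t> by_partition;                          // flag == false: key is local partition ID only
  std::map<std::pair<int, int>, std::size_t> by_partition_and_gpu;  // flag == true: key is (local partition ID, GPU ID of minor)

  for (auto const& [major, minor] : edges) {
    int local_partition_id = gpu_id_from_vertex(major, comm_size) % col_comm_size;  // placeholder mapping
    int minor_gpu_id       = gpu_id_from_vertex(minor, comm_size);
    ++by_partition[local_partition_id];
    ++by_partition_and_gpu[{local_partition_id, minor_gpu_id}];
  }

  for (auto const& [key, count] : by_partition)
    std::cout << "local partition " << key << ": " << count << " edges\n";
  for (auto const& [key, count] : by_partition_and_gpu)
    std::cout << "(local partition " << key.first << ", minor GPU " << key.second << "): "
              << count << " edges\n";
  return 0;
}
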
@@ -482,7 +482,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
rmm::device_uvector<weight_t> rx_key_aggregated_edge_weights(0, handle.get_stream());
std::forward_as_tuple(
std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) =
groupby_gpuid_and_shuffle_values(
groupby_gpu_id_and_shuffle_values(
col_comm,
triplet_first,
triplet_first + tmp_major_vertices.size(),
@@ -505,7 +505,7 @@ transform_reduce_by_adj_matrix_row_col_key_e(
rmm::device_uvector<vertex_t> rx_unique_keys(0, handle.get_stream());
auto rx_value_for_unique_key_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
std::tie(rx_unique_keys, rx_value_for_unique_key_buffer, std::ignore) =
groupby_gpuid_and_shuffle_kv_pairs(
groupby_gpu_id_and_shuffle_kv_pairs(
comm,
tmp_keys.begin(),
tmp_keys.end(),
4 changes: 2 additions & 2 deletions cpp/include/cugraph/utilities/collect_comm.cuh
@@ -103,7 +103,7 @@ collect_values_for_keys(raft::comms::comms_t const& comm,
{
rmm::device_uvector<vertex_t> rx_unique_keys(0, stream_view);
std::vector<size_t> rx_value_counts{};
std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values(
std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values(
comm,
unique_keys.begin(),
unique_keys.end(),
@@ -228,7 +228,7 @@ collect_values_for_unique_keys(raft::comms::comms_t const& comm,
{
rmm::device_uvector<vertex_t> rx_unique_keys(0, stream_view);
std::vector<size_t> rx_value_counts{};
std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values(
std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values(
comm,
unique_keys.begin(),
unique_keys.end(),
2 changes: 1 addition & 1 deletion cpp/include/cugraph/utilities/cython.hpp
@@ -588,7 +588,7 @@ template <typename vertex_t, typename edge_t, typename weight_t>
std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
raft::handle_t const& handle,
vertex_t*
edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place
edgelist_major_vertices, // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place
vertex_t* edgelist_minor_vertices, // [IN / OUT]
weight_t* edgelist_weights, // [IN / OUT]
edge_t num_edgelist_edges);
266 changes: 231 additions & 35 deletions cpp/include/cugraph/utilities/shuffle_comm.cuh

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cpp/src/community/louvain.cuh
@@ -308,7 +308,7 @@ class Louvain {
thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin()));

std::forward_as_tuple(std::tie(rx_keys_v, rx_weights_v), std::ignore) =
groupby_gpuid_and_shuffle_values(
groupby_gpu_id_and_shuffle_values(
handle_.get_comms(),
pair_first,
pair_first + current_graph_view_.get_number_of_local_vertices(),
30 changes: 15 additions & 15 deletions cpp/src/components/weakly_connected_components_impl.cuh
@@ -371,17 +371,17 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
// with fewer than one root per GPU
if (std::reduce(first_candidate_degrees.begin(), first_candidate_degrees.end()) >
degree_sum_threshold * comm_size) {
std::vector<std::tuple<edge_t, int>> degree_gpuid_pairs(comm_size);
std::vector<std::tuple<edge_t, int>> degree_gpu_id_pairs(comm_size);
for (int i = 0; i < comm_size; ++i) {
degree_gpuid_pairs[i] = std::make_tuple(first_candidate_degrees[i], i);
degree_gpu_id_pairs[i] = std::make_tuple(first_candidate_degrees[i], i);
}
std::sort(degree_gpuid_pairs.begin(), degree_gpuid_pairs.end(), [](auto lhs, auto rhs) {
std::sort(degree_gpu_id_pairs.begin(), degree_gpu_id_pairs.end(), [](auto lhs, auto rhs) {
return std::get<0>(lhs) > std::get<0>(rhs);
});
edge_t sum{0};
for (size_t i = 0; i < degree_gpuid_pairs.size(); ++i) {
sum += std::get<0>(degree_gpuid_pairs[i]);
init_max_new_root_counts[std::get<1>(degree_gpuid_pairs[i])] = 1;
for (size_t i = 0; i < degree_gpu_id_pairs.size(); ++i) {
sum += std::get<0>(degree_gpu_id_pairs[i]);
init_max_new_root_counts[std::get<1>(degree_gpu_id_pairs[i])] = 1;
if (sum > degree_sum_threshold * comm_size) { break; }
}
}
@@ -390,18 +390,18 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
else if (level_graph_view.get_number_of_vertices() <=
static_cast<vertex_t>(handle.get_comms().get_size() *
ceil(1.0 / max_new_roots_ratio))) {
std::vector<int> gpuids{};
gpuids.reserve(
std::vector<int> gpu_ids{};
gpu_ids.reserve(
std::reduce(new_root_candidate_counts.begin(), new_root_candidate_counts.end()));
for (size_t i = 0; i < new_root_candidate_counts.size(); ++i) {
gpuids.insert(gpuids.end(), new_root_candidate_counts[i], static_cast<int>(i));
gpu_ids.insert(gpu_ids.end(), new_root_candidate_counts[i], static_cast<int>(i));
}
std::random_device rd{};
std::shuffle(gpuids.begin(), gpuids.end(), std::mt19937(rd()));
gpuids.resize(
std::max(static_cast<vertex_t>(gpuids.size() * max_new_roots_ratio), vertex_t{1}));
for (size_t i = 0; i < gpuids.size(); ++i) {
++init_max_new_root_counts[gpuids[i]];
std::shuffle(gpu_ids.begin(), gpu_ids.end(), std::mt19937(rd()));
gpu_ids.resize(
std::max(static_cast<vertex_t>(gpu_ids.size() * max_new_roots_ratio), vertex_t{1}));
for (size_t i = 0; i < gpu_ids.size(); ++i) {
++init_max_new_root_counts[gpu_ids[i]];
}
} else {
std::fill(init_max_new_root_counts.begin(),
@@ -678,7 +678,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
auto const col_comm_size = col_comm.get_size();

std::tie(edge_buffer, std::ignore) = cugraph::groupby_gpuid_and_shuffle_values(
std::tie(edge_buffer, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values(
comm,
get_dataframe_buffer_begin(edge_buffer),
get_dataframe_buffer_end(edge_buffer),
23 changes: 18 additions & 5 deletions cpp/src/detail/shuffle_wrappers.cu
@@ -51,7 +51,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
std::forward_as_tuple(
std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights),
std::ignore) =
cugraph::groupby_gpuid_and_shuffle_values(
cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
edge_first,
edge_first + d_edgelist_majors.size(),
@@ -67,7 +67,7 @@

std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors),
std::ignore) =
cugraph::groupby_gpuid_and_shuffle_values(
cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
edge_first,
edge_first + d_edgelist_majors.size(),
@@ -124,7 +124,7 @@ rmm::device_uvector<vertex_t> shuffle_vertices_by_gpu_id(raft::handle_t const& h
auto const comm_size = comm.get_size();

rmm::device_uvector<vertex_t> d_rx_vertices(0, handle.get_stream());
std::tie(d_rx_vertices, std::ignore) = cugraph::groupby_gpuid_and_shuffle_values(
std::tie(d_rx_vertices, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
d_vertices.begin(),
d_vertices.end(),
@@ -147,7 +147,7 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
rmm::device_uvector<vertex_t>& d_edgelist_majors,
rmm::device_uvector<vertex_t>& d_edgelist_minors,
std::optional<rmm::device_uvector<weight_t>>& d_edgelist_weights,
bool groupby_and_count_local_partition)
bool groupby_and_count_local_partition_by_minor)
{
auto& comm = handle.get_comms();
auto const comm_size = comm.get_size();
@@ -159,10 +159,19 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
auto const col_comm_size = col_comm.get_size();
auto const col_comm_rank = col_comm.get_rank();

auto total_global_mem = handle.get_device_properties().totalGlobalMem;
auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? sizeof(weight_t) : size_t{0});
auto mem_frugal =
  d_edgelist_majors.size() * element_size >=
  total_global_mem / 4;  // if the data size exceeds 1/4 of the device memory (1/4 is a tuning
                         // parameter); groupby_and_count requires a temporary buffer comparable to
                         // the input data size, and setting mem_frugal to true reduces the
                         // temporary buffer size by up to 50%

auto pair_first = thrust::make_zip_iterator(
thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin()));

if (groupby_and_count_local_partition) {
if (groupby_and_count_local_partition_by_minor) {
auto local_partition_id_gpu_id_pair_op =
[comm_size,
row_comm_size,
Expand All @@ -183,11 +192,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
d_edgelist_weights->begin(),
local_partition_id_gpu_id_pair_op,
comm_size,
mem_frugal,
handle.get_stream())
: cugraph::groupby_and_count(pair_first,
pair_first + d_edgelist_majors.size(),
local_partition_id_gpu_id_pair_op,
comm_size,
mem_frugal,
handle.get_stream());
} else {
auto local_partition_id_op =
@@ -203,11 +203,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
d_edgelist_weights->begin(),
local_partition_id_op,
col_comm_size,
mem_frugal,
handle.get_stream())
: cugraph::groupby_and_count(pair_first,
pair_first + d_edgelist_majors.size(),
local_partition_id_op,
col_comm_size,
mem_frugal,
handle.get_stream());
}
}
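
For context on the new mem_frugal argument (the shuffle_comm.cuh diff that implements it is not rendered above): with 64-bit vertex IDs and 64-bit weights an edge occupies 24 bytes, so on a 32 GB GPU the total_global_mem / 4 threshold is crossed at roughly 8 GB of edge data, about 330 million edges. Below is a minimal host-side sketch of the general idea, under the assumption that the frugal path splits the input around a pivot group and then groups each side separately; the function name and split strategy are illustrative, not the library's actual implementation.

// Hedged sketch (host-side, illustrative only) of a memory-frugal groupby_and_count:
// rather than grouping the whole array in one out-of-place pass that needs a scratch buffer the
// size of the input, split in place around a pivot group ID and then group each side on its own,
// so the scratch requirement peaks at the larger side.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Returns per-group counts for values keyed by key_op(value) in [0, num_groups).
template <typename T, typename KeyOp>
std::vector<std::size_t> groupby_and_count_frugal(std::vector<T>& values, KeyOp key_op, int num_groups)
{
  int const pivot = num_groups / 2;
  // Pass 1: in-place split, no large scratch buffer needed.
  auto mid =
    std::partition(values.begin(), values.end(), [&](T const& v) { return key_op(v) < pivot; });
  // Pass 2: group each side independently. A GPU implementation would run its sort (and the
  // sort's temporary storage) on one side at a time, which is where the peak-memory saving comes
  // from; std::sort here is just the host stand-in.
  std::sort(values.begin(), mid, [&](T const& a, T const& b) { return key_op(a) < key_op(b); });
  std::sort(mid, values.end(), [&](T const& a, T const& b) { return key_op(a) < key_op(b); });

  std::vector<std::size_t> counts(num_groups, 0);
  for (auto const& v : values) { ++counts[key_op(v)]; }
  return counts;
}

int main()
{
  std::vector<int> edges{7, 1, 3, 6, 2, 5, 0, 4, 7, 3};
  auto counts = groupby_and_count_frugal(edges, [](int e) { return e % 4; }, 4);
  for (int g = 0; g < 4; ++g) { std::cout << "group " << g << ": " << counts[g] << "\n"; }
  return 0;
}

When the two sides are balanced, the largest single grouping pass touches about half the data, which matches the "up to 50%" note in the comment above.
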
2 changes: 1 addition & 1 deletion cpp/src/structure/coarsen_graph_impl.cuh
@@ -267,7 +267,7 @@ coarsen_graph(

// 1-3. append data to local adjacency matrix partitions

// FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge
// FIXME: we can skip this if groupby_gpu_id_and_shuffle_values is updated to return sorted edge
// list based on the final matrix partition (maybe add
// groupby_adj_matrix_partition_and_shuffle_values).

4 changes: 2 additions & 2 deletions cpp/src/structure/relabel_impl.cuh
@@ -95,7 +95,7 @@ void relabel(raft::handle_t const& handle,
thrust::make_tuple(label_pair_old_labels.begin(), label_pair_new_labels.begin()));
std::forward_as_tuple(std::tie(rx_label_pair_old_labels, rx_label_pair_new_labels),
std::ignore) =
groupby_gpuid_and_shuffle_values(
groupby_gpu_id_and_shuffle_values(
handle.get_comms(),
pair_first,
pair_first + num_label_pairs,
@@ -136,7 +136,7 @@ void relabel(raft::handle_t const& handle,
{
rmm::device_uvector<vertex_t> rx_unique_old_labels(0, handle.get_stream());
std::vector<size_t> rx_value_counts{};
std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpuid_and_shuffle_values(
std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpu_id_and_shuffle_values(
handle.get_comms(),
unique_old_labels.begin(),
unique_old_labels.end(),
8 changes: 5 additions & 3 deletions cpp/src/utilities/cython.cu
@@ -1182,7 +1182,7 @@ template <typename vertex_t, typename edge_t, typename weight_t>
std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
raft::handle_t const& handle,
vertex_t*
edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place
edgelist_major_vertices, // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place
vertex_t* edgelist_minor_vertices, // [IN / OUT]
weight_t* edgelist_weights, // [IN / OUT]
edge_t num_edgelist_edges)
@@ -1204,7 +1204,7 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
std::forward_as_tuple(
std::tie(ptr_ret->get_major(), ptr_ret->get_minor(), ptr_ret->get_weights()),
std::ignore) =
cugraph::groupby_gpuid_and_shuffle_values(
cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
zip_edge,
zip_edge + num_edgelist_edges,
Expand All @@ -1220,7 +1220,7 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(

std::forward_as_tuple(std::tie(ptr_ret->get_major(), ptr_ret->get_minor()),
std::ignore) =
cugraph::groupby_gpuid_and_shuffle_values(
cugraph::groupby_gpu_id_and_shuffle_values(
comm, // handle.get_comms(),
zip_edge,
zip_edge + num_edgelist_edges,
@@ -1248,11 +1248,13 @@ std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
ptr_ret->get_weights().data(),
local_partition_id_op,
col_comm_size,
false,
handle.get_stream())
: cugraph::groupby_and_count(pair_first,
pair_first + ptr_ret->get_major().size(),
local_partition_id_op,
col_comm_size,
false,
handle.get_stream());

std::vector<size_t> h_edge_counts(edge_counts.size());