Merge pull request #3752 from rapidsai/branch-23.08
Forward-merge branch-23.08 to branch-23.10
GPUtester authored Jul 27, 2023
2 parents 8a85705 + ad74817 commit 4951f04
Showing 15 changed files with 2,048 additions and 61 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -270,6 +270,7 @@ set(CUGRAPH_SOURCES
src/traversal/bfs_sg.cu
src/traversal/bfs_mg.cu
src/traversal/sssp_sg.cu
src/traversal/od_shortest_distances_sg.cu
src/traversal/sssp_mg.cu
src/link_analysis/hits_sg.cu
src/link_analysis/hits_mg.cu
39 changes: 39 additions & 0 deletions cpp/include/cugraph/algorithms.hpp
@@ -1179,6 +1179,45 @@ void sssp(raft::handle_t const& handle,
weight_t cutoff = std::numeric_limits<weight_t>::max(),
bool do_expensive_check = false);

/**
* @brief Compute the shortest distances from the given origins to all the given destinations.
*
* This algorithm is designed for large-diameter graphs. For small-diameter graphs, running the
* cugraph::sssp function in a sequentially executed loop might be faster. This algorithm currently
* works only for single-GPU (we are not aware of large-diameter graphs that won't fit in a single
* GPU).
*
* @throws cugraph::logic_error on erroneous input arguments.
*
* @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
* @tparam edge_t Type of edge identifiers. Needs to be an integral type.
* @tparam weight_t Type of edge weights. Needs to be a floating point type.
* @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
* or multi-GPU (true).
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param graph_view Graph view object.
* @param edge_weight_view View object holding edge weights for @p graph_view.
* @param origins An array of origins (starting vertices) to find shortest distances. There should
* be no duplicates in @p origins.
* @param destinations An array of destinations (end vertices) to find shortest distances. There
* should be no duplicates in @p destinations.
* @param cutoff Any destinations farther than @p cutoff will be marked as unreachable.
* @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
* @return A vector of size @p origins.size() * @p destinations.size(). The i'th element of the
* returned vector is the shortest distance from the (i / @p destinations.size())'th origin to the
* (i % @p destinations.size())'th destination.
*/
template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
rmm::device_uvector<weight_t> od_shortest_distances(
raft::handle_t const& handle,
graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
edge_property_view_t<edge_t, weight_t const*> edge_weight_view,
raft::device_span<vertex_t const> origins,
raft::device_span<vertex_t const> destinations,
weight_t cutoff = std::numeric_limits<weight_t>::max(),
bool do_expensive_check = false);
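
A minimal calling sketch for the new API (not part of this diff; handle, graph_view, edge_weight_view, and the device vectors d_origins/d_destinations are assumed to already exist):

// Sketch only: compute all origin/destination shortest distances.
auto od_distances = cugraph::od_shortest_distances(
  handle,
  graph_view,
  edge_weight_view,
  raft::device_span<vertex_t const>(d_origins.data(), d_origins.size()),
  raft::device_span<vertex_t const>(d_destinations.data(), d_destinations.size()));
// Row-major layout over origins: the distance from the i'th origin to the j'th
// destination is od_distances[i * d_destinations.size() + j] (still in device memory).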

/**
* @brief Compute PageRank scores.
*
44 changes: 26 additions & 18 deletions cpp/include/cugraph/edge_partition_device_view.cuh
@@ -156,13 +156,15 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
{
}

size_t compute_number_of_edges(raft::device_span<vertex_t const> majors,
template <typename MajorIterator>
size_t compute_number_of_edges(MajorIterator major_first,
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
majors.begin(),
majors.end(),
major_first,
major_last,
detail::local_degree_op_t<
vertex_t,
edge_t,
@@ -176,8 +178,8 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
thrust::plus<size_t>())
: thrust::transform_reduce(
rmm::exec_policy(stream),
majors.begin(),
majors.end(),
major_first,
major_last,
detail::local_degree_op_t<
vertex_t,
edge_t,
@@ -217,15 +219,17 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
return local_degrees;
}

rmm::device_uvector<edge_t> compute_local_degrees(raft::device_span<vertex_t const> majors,
template <typename MajorIterator>
rmm::device_uvector<edge_t> compute_local_degrees(MajorIterator major_first,
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(majors.size(), stream);
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(rmm::exec_policy(stream),
majors.begin(),
majors.end(),
major_first,
major_last,
local_degrees.begin(),
detail::local_degree_op_t<vertex_t, edge_t, edge_t, multi_gpu, true>{
this->offsets_,
Expand All @@ -235,8 +239,8 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
} else {
thrust::transform(
rmm::exec_policy(stream),
majors.begin(),
majors.end(),
major_first,
major_last,
local_degrees.begin(),
detail::local_degree_op_t<vertex_t, edge_t, edge_t, multi_gpu, false>{
this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */});
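
Templating compute_number_of_edges and compute_local_degrees on MajorIterator lets callers pass any random-access iterator (a raw pointer, or a thrust counting/transform iterator) instead of first materializing a raft::device_span of majors. A hedged sketch of a call site (accessor names such as major_range_first()/major_range_last() are assumed here, not shown in this diff):

// Sketch only: count the edges covered by every major in this partition without
// building an intermediate array of vertex ids.
auto num_edges = edge_partition.compute_number_of_edges(
  thrust::make_counting_iterator(edge_partition.major_range_first()),
  thrust::make_counting_iterator(edge_partition.major_range_last()),
  handle.get_stream());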
@@ -349,13 +353,15 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
{
}

size_t compute_number_of_edges(raft::device_span<vertex_t const> majors,
template <typename MajorIterator>
size_t compute_number_of_edges(MajorIterator major_first,
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
return thrust::transform_reduce(
rmm::exec_policy(stream),
majors.begin(),
majors.end(),
major_first,
major_last,
detail::local_degree_op_t<vertex_t,
edge_t,
size_t /* no limit on majors.size(), so edge_t can overflow */,
@@ -383,13 +389,15 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
return local_degrees;
}

rmm::device_uvector<edge_t> compute_local_degrees(raft::device_span<vertex_t const> majors,
template <typename MajorIterator>
rmm::device_uvector<edge_t> compute_local_degrees(MajorIterator major_first,
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(majors.size(), stream);
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
thrust::transform(rmm::exec_policy(stream),
majors.begin(),
majors.end(),
major_first,
major_last,
local_degrees.begin(),
detail::local_degree_op_t<vertex_t, edge_t, edge_t, multi_gpu, false>{
this->offsets_,
16 changes: 16 additions & 0 deletions cpp/include/cugraph/utilities/dataframe_buffer.hpp
@@ -99,6 +99,22 @@ auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_
std::make_index_sequence<tuple_size>(), buffer_size, stream_view);
}

template <typename BufferType>
void reserve_dataframe_buffer(BufferType& buffer,
size_t new_buffer_capacity,
rmm::cuda_stream_view stream_view)
{
static_assert(is_std_tuple_of_arithmetic_vectors<std::remove_cv_t<BufferType>>::value ||
is_arithmetic_vector<std::remove_cv_t<BufferType>, rmm::device_uvector>::value);
if constexpr (is_std_tuple_of_arithmetic_vectors<std::remove_cv_t<BufferType>>::value) {
std::apply([new_buffer_capacity, stream_view](
auto&&... args) { (args.reserve(new_buffer_capacity, stream_view), ...); },
buffer);
} else {
buffer.reserve(new_buffer_capacity, stream_view);
}
}
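
The new reserve_dataframe_buffer mirrors the existing allocate/resize helpers: for a tuple-of-vectors buffer it reserves capacity on every column, and for a single rmm::device_uvector it forwards to reserve. A small usage sketch (context assumed, not part of this diff; handle is an existing raft::handle_t and vertex_t is the vertex id type):

// Sketch only: pre-reserve capacity so later growth does not trigger repeated reallocations.
auto buffer =
  allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(size_t{0}, handle.get_stream());
reserve_dataframe_buffer(buffer, size_t{1} << 20, handle.get_stream());  // capacity for ~1M rows
resize_dataframe_buffer(buffer, size_t{1} << 20, handle.get_stream());   // grows without reallocating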

template <typename BufferType>
void resize_dataframe_buffer(BufferType& buffer,
size_t new_buffer_size,
4 changes: 2 additions & 2 deletions cpp/src/components/weakly_connected_components_impl.cuh
@@ -223,7 +223,7 @@ struct v_op_t {
decltype(thrust::make_zip_iterator(thrust::make_tuple(
static_cast<vertex_type*>(nullptr), static_cast<vertex_type*>(nullptr)))) edge_buffer_first{};
// FIXME: we can use cuda::atomic instead but currently on a system with x86 + GPU, this requires
// placing the atomic barrier on managed memory and this adds additional complication.
// placing the atomic variable on managed memory and this adds additional complication.
size_t* num_edge_inserts{};
size_t bucket_idx_next{};
size_t bucket_idx_conflict{}; // relevant only if GraphViewType::is_multi_gpu is true
@@ -501,7 +501,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle,
auto edge_buffer =
allocate_dataframe_buffer<thrust::tuple<vertex_t, vertex_t>>(0, handle.get_stream());
// FIXME: we can use cuda::atomic instead but currently on a system with x86 + GPU, this
// requires placing the atomic variable on managed memory and this make it less attractive.
// requires placing the atomic variable on managed memory and this adds additional complication.
rmm::device_scalar<size_t> num_edge_inserts(size_t{0}, handle.get_stream());

auto edge_dst_components =
4 changes: 1 addition & 3 deletions cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -877,9 +877,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,

auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
auto max_pushes = edge_partition.compute_number_of_edges(
raft::device_span<vertex_t const>(edge_partition_frontier_major_first,
edge_partition_frontier_major_last),
handle.get_stream());
edge_partition_frontier_major_first, edge_partition_frontier_major_last, handle.get_stream());

auto new_buffer_size = buffer_idx.value(handle.get_stream()) + max_pushes;
resize_optional_dataframe_buffer<output_key_t>(