From 5a18cdea9ebea13ca9930a3e3ec4498cdbb88c7c Mon Sep 17 00:00:00 2001
From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com>
Date: Tue, 13 Jun 2023 23:02:22 -0500
Subject: [PATCH 01/10] Adds `fail_on_nonconvergence` option to `pagerank` to provide pagerank results even on non-convergence (#3639)

closes #3613

Prior to this PR, `pagerank` will raise a `RuntimeError` if it fails to converge, often because the `max_iter` param is set too small (intentionally or otherwise). This PR adds the optional parameter `fail_on_nonconvergence`, which defaults to `True` (i.e., the current behavior, to ensure backwards compatibility) and allows a caller to run `pagerank` and get results even if it did not converge. When `fail_on_nonconvergence` is `False`, `pagerank` will return a tuple containing the pagerank results and a bool indicating whether the results converged.

Authors:
   - Rick Ratzel (https://github.com/rlratzel)
   - Chuck Hastings (https://github.com/ChuckHastings)
   - Brad Rees (https://github.com/BradReesWork)

Approvers:
   - Brad Rees (https://github.com/BradReesWork)
   - Vibhu Jawa (https://github.com/VibhuJawa)
   - Naim (https://github.com/naimnv)
   - Seunghwa Kang (https://github.com/seunghwak)

URL: https://github.com/rapidsai/cugraph/pull/3639
---
 cpp/include/cugraph/algorithms.hpp            |  71 +++++
 cpp/include/cugraph_c/centrality_algorithms.h | 132 ++++++++
 cpp/src/c_api/centrality_result.cpp           |  14 +-
 cpp/src/c_api/centrality_result.hpp           |   4 +-
 cpp/src/c_api/pagerank.cpp                    | 210 +++++++++++--
 cpp/src/link_analysis/pagerank_impl.cuh       | 284 +++++++++++-------
 cpp/src/link_analysis/pagerank_mg.cu          |  80 ++++-
 cpp/src/link_analysis/pagerank_sg.cu          |  80 ++++-
 cpp/tests/c_api/mg_pagerank_test.c            | 248 +++++++++++++++
 cpp/tests/c_api/pagerank_test.c               | 239 +++++++++++++++
 cpp/tests/link_analysis/mg_pagerank_test.cpp  |  39 +--
 cpp/tests/link_analysis/pagerank_test.cpp     |  17 +-
 python/cugraph/cugraph/__init__.py            |   1 +
 .../cugraph/dask/link_analysis/pagerank.py    | 144 ++++++---
 python/cugraph/cugraph/exceptions.py          |  26 ++
 .../cugraph/cugraph/link_analysis/pagerank.py | 127 +++++---
 .../tests/link_analysis/test_pagerank.py      |  46 +++
 .../tests/link_analysis/test_pagerank_mg.py   |  80 ++++-
 python/pylibcugraph/pylibcugraph/__init__.py  |   2 +
 .../_cugraph_c/centrality_algorithms.pxd      |  46 ++-
 .../pylibcugraph/pylibcugraph/exceptions.py   |  26 ++
 python/pylibcugraph/pylibcugraph/pagerank.pyx |  94 ++++--
 .../pylibcugraph/personalized_pagerank.pyx    | 117 +++++---
 23 files changed, 1776 insertions(+), 351 deletions(-)
 create mode 100644 python/cugraph/cugraph/exceptions.py
 create mode 100644 python/pylibcugraph/pylibcugraph/exceptions.py

diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 3bb98ce4150..51212d9d568 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -1181,6 +1181,9 @@ void sssp(raft::handle_t const& handle,
 /**
  * @brief Compute PageRank scores.
  *
+ * @deprecated This API will be deprecated and replaced by the new version below
+ * that returns metadata about the algorithm.
+ *
  * This function computes general (if @p personalization_vertices is `nullptr`) or personalized (if
  * @p personalization_vertices is not `nullptr`.) PageRank scores.
 *
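As background for the hunks that follow: convergence here means the sum of absolute per-vertex differences between two consecutive iterations falls below the number of vertices times `epsilon`. The next hunk adds an overload of `cugraph::pagerank` that reports convergence metadata instead of throwing. The following minimal C++ sketch of the intended call pattern is illustrative only and not part of the diff; the `handle`, `graph_view`, and parameter values are assumed, not taken from this patch:

    #include <cugraph/algorithms.hpp>

    // Hypothetical single-GPU call site; `handle` is a raft::handle_t and
    // `graph_view` a cugraph::graph_view_t<int32_t, int32_t, true, false>
    // built elsewhere.
    auto [pageranks, metadata] =
      cugraph::pagerank<int32_t, int32_t, float, float, false>(
        handle,
        graph_view,
        std::nullopt,  // edge_weight_view: treat all edge weights as 1.0
        std::nullopt,  // precomputed_vertex_out_weight_sums
        std::nullopt,  // personalization (std::nullopt -> general PageRank)
        std::nullopt,  // initial_pageranks: start from 1/|V|
        float{0.85},   // alpha, the damping factor
        float{1e-6},   // epsilon, the convergence tolerance
        size_t{20},    // max_iterations, possibly too small to converge
        false);        // do_expensive_check

    // Unlike the deprecated overload above, hitting max_iterations is not an
    // error; the caller inspects the returned metadata instead.
    if (!metadata.converged_) {
      // pageranks holds the values after metadata.number_of_iterations_ iterations.
    }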
@@ -1236,6 +1239,74 @@ void pagerank(raft::handle_t const& handle,
               bool has_initial_guess   = false,
               bool do_expensive_check  = false);
 
+/**
+ * @brief Metadata about the execution of one of the centrality algorithms
+ */
+// FIXME: This structure should be propagated to other algorithms that converge
+//        (eigenvector centrality, hits and katz centrality)
+//
+struct centrality_algorithm_metadata_t {
+  size_t number_of_iterations_{};
+  bool converged_{};
+};
+
+/**
+ * @brief Compute PageRank scores.
+ *
+ * This function computes general (if @p personalization_vertices is `nullptr`) or personalized (if
+ * @p personalization_vertices is not `nullptr`.) PageRank scores.
+ *
+ * @throws cugraph::logic_error on erroneous input arguments or if it fails to converge before @p
+ * max_iterations.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam result_t Type of PageRank scores.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view. If @p
+ * edge_weight_view.has_value() == false, edge weights are assumed to be 1.0.
+ * @param precomputed_vertex_out_weight_sums Pointer to an array storing sums of out-going edge
+ * weights for the vertices (for re-use) or `std::nullopt`. If `std::nullopt`, these values are
+ * freshly computed. Computing these values outside this function reduces the number of memory
+ * allocations/deallocations and redundant computation when a user repeatedly computes PageRank
+ * scores using the same graph with different personalization vectors.
+ * @param personalization Optional tuple containing device spans of vertex identifiers and
+ * personalization values for the vertices (compute personalized PageRank) or `std::nullopt`
+ * (compute general PageRank).
+ * @param initial_pageranks Optional device span containing initial PageRank values. If
+ * specified, this array will be used as the initial values and the PageRank values will be
+ * updated in place. If not specified, the initial values will be set to 1.0 divided by
+ * the number of vertices in the graph and the return value will contain an `rmm::device_uvector`
+ * containing the resulting PageRank values.
+ * @param alpha PageRank damping factor.
+ * @param epsilon Error tolerance to check convergence. Convergence is assumed if the sum of the
+ * differences in PageRank values between two consecutive iterations is less than the number of
+ * vertices in the graph multiplied by @p epsilon.
+ * @param max_iterations Maximum number of PageRank iterations.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple containing the optional pagerank results (populated if @p initial_pageranks is
+ * set to `std::nullopt`) and a metadata structure indicating how many iterations were run and
+ * whether the algorithm converged.
+ */
+template <typename vertex_t, typename edge_t, typename weight_t, typename result_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<result_t>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, true, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<raft::device_span<weight_t const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<vertex_t const>, raft::device_span<result_t const>>>
+    personalization,
+  std::optional<raft::device_span<result_t const>> initial_pageranks,
+  result_t alpha,
+  result_t epsilon,
+  size_t max_iterations   = 500,
+  bool do_expensive_check = false);
+
 /**
  * @brief Compute Eigenvector Centrality scores.
  *
diff --git a/cpp/include/cugraph_c/centrality_algorithms.h b/cpp/include/cugraph_c/centrality_algorithms.h
index 5fa3520a9cb..ca60d3bfad4 100644
--- a/cpp/include/cugraph_c/centrality_algorithms.h
+++ b/cpp/include/cugraph_c/centrality_algorithms.h
@@ -56,6 +56,22 @@ cugraph_type_erased_device_array_view_t* cugraph_centrality_result_get_vertices(
 cugraph_type_erased_device_array_view_t* cugraph_centrality_result_get_values(
   cugraph_centrality_result_t* result);
 
+/**
+ * @brief     Get the number of iterations executed from the algorithm metadata
+ *
+ * @param [in]   result   The result from a centrality algorithm
+ * @return the number of iterations
+ */
+size_t cugraph_centrality_result_get_num_iterations(cugraph_centrality_result_t* result);
+
+/**
+ * @brief     Returns true if the centrality algorithm converged
+ *
+ * @param [in]   result   The result from a centrality algorithm
+ * @return True if the centrality algorithm converged, false otherwise
+ */
+bool_t cugraph_centrality_result_converged(cugraph_centrality_result_t* result);
+
 /**
  * @brief     Free centrality result
  *
@@ -114,9 +130,68 @@ cugraph_error_code_t cugraph_pagerank(
   cugraph_centrality_result_t** result,
   cugraph_error_t** error);
 
+/**
+ * @brief     Compute pagerank
+ *
+ * @deprecated This version of pagerank should be dropped in favor
+ *             of the cugraph_pagerank_allow_nonconvergence version.
+ *             Eventually that version will be renamed to this version.
+ *
+ * @param [in]  handle      Handle for accessing resources
+ * @param [in]  graph       Pointer to graph
+ * @param [in]  precomputed_vertex_out_weight_vertices
+ *                          Optionally send in precomputed sum of vertex out weights
+ *                          (a performance optimization).  This defines the vertices.
+ *                          Set to NULL if no value is passed.
+ * @param [in]  precomputed_vertex_out_weight_sums
+ *                          Optionally send in precomputed sum of vertex out weights
+ *                          (a performance optimization).  Set to NULL if
+ *                          no value is passed.
+ * @param [in]  initial_guess_vertices
+ *                          Optionally send in an initial guess of the pagerank values
+ *                          (a performance optimization).  This defines the vertices.
+ *                          Set to NULL if no value is passed.  If NULL, initial PageRank
+ *                          values are set to 1.0 divided by the number of vertices in
+ *                          the graph.
+ * @param [in]  initial_guess_values
+ *                          Optionally send in an initial guess of the pagerank values
+ *                          (a performance optimization).  Set to NULL if
+ *                          no value is passed.  If NULL, initial PageRank values are set
+ *                          to 1.0 divided by the number of vertices in the graph.
+ * @param [in]  alpha       PageRank damping factor.
+ * @param [in]  epsilon     Error tolerance to check convergence. Convergence is assumed
+ *                          if the sum of the differences in PageRank values between two
+ *                          consecutive iterations is less than the number of vertices
+ *                          in the graph multiplied by @p epsilon.
+ * @param [in]  max_iterations Maximum number of PageRank iterations.
+ * @param [in]  do_expensive_check A flag to run expensive checks for input arguments (if set to
+ *                                 `true`).
+ * @param [out] result      Opaque pointer to pagerank results
+ * @param [out] error       Pointer to an error object storing details of any error.  Will
+ *                          be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_pagerank_allow_nonconvergence(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_vertices,
+  const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_sums,
+  const cugraph_type_erased_device_array_view_t* initial_guess_vertices,
+  const cugraph_type_erased_device_array_view_t* initial_guess_values,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool_t do_expensive_check,
+  cugraph_centrality_result_t** result,
+  cugraph_error_t** error);
+
 /**
  * @brief     Compute personalized pagerank
  *
+ * @deprecated This version of personalized pagerank should be dropped in favor
+ *             of the cugraph_personalized_pagerank_allow_nonconvergence version.
+ *             Eventually that version will be renamed to this version.
+ *
  * @param [in]  handle      Handle for accessing resources
 * @param [in]  graph       Pointer to graph
 * @param [in]  precomputed_vertex_out_weight_vertices
@@ -171,6 +246,63 @@ cugraph_error_code_t cugraph_personalized_pagerank(
   cugraph_centrality_result_t** result,
   cugraph_error_t** error);
 
+/**
+ * @brief     Compute personalized pagerank
+ *
+ * @param [in]  handle      Handle for accessing resources
+ * @param [in]  graph       Pointer to graph
+ * @param [in]  precomputed_vertex_out_weight_vertices
+ *                          Optionally send in precomputed sum of vertex out weights
+ *                          (a performance optimization).  This defines the vertices.
+ *                          Set to NULL if no value is passed.
+ * @param [in]  precomputed_vertex_out_weight_sums
+ *                          Optionally send in precomputed sum of vertex out weights
+ *                          (a performance optimization).  Set to NULL if
+ *                          no value is passed.
+ * @param [in]  initial_guess_vertices
+ *                          Optionally send in an initial guess of the pagerank values
+ *                          (a performance optimization).  This defines the vertices.
+ *                          Set to NULL if no value is passed.  If NULL, initial PageRank
+ *                          values are set to 1.0 divided by the number of vertices in
+ *                          the graph.
+ * @param [in]  initial_guess_values
+ *                          Optionally send in an initial guess of the pagerank values
+ *                          (a performance optimization).  Set to NULL if
+ *                          no value is passed.  If NULL, initial PageRank values are set
+ *                          to 1.0 divided by the number of vertices in the graph.
+ * @param [in]  personalization_vertices Pointer to an array storing personalization vertex
+ *                          identifiers (compute personalized PageRank).
+ * @param [in]  personalization_values Pointer to an array storing personalization values for the
+ *                          vertices in the personalization set.
+ * @param [in]  alpha       PageRank damping factor.
+ * @param [in]  epsilon     Error tolerance to check convergence. Convergence is assumed
+ *                          if the sum of the differences in PageRank values between two
+ *                          consecutive iterations is less than the number of vertices
+ *                          in the graph multiplied by @p epsilon.
+ * @param [in]  max_iterations Maximum number of PageRank iterations.
+ * @param [in]  do_expensive_check A flag to run expensive checks for input arguments (if set to
+ *                                 `true`).
+ * @param [out] result      Opaque pointer to pagerank results
+ * @param [out] error       Pointer to an error object storing details of any error.  Will
+ *                          be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_personalized_pagerank_allow_nonconvergence(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_vertices,
+  const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_sums,
+  const cugraph_type_erased_device_array_view_t* initial_guess_vertices,
+  const cugraph_type_erased_device_array_view_t* initial_guess_values,
+  const cugraph_type_erased_device_array_view_t* personalization_vertices,
+  const cugraph_type_erased_device_array_view_t* personalization_values,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool_t do_expensive_check,
+  cugraph_centrality_result_t** result,
+  cugraph_error_t** error);
+
 /**
  * @brief     Compute eigenvector centrality
  *
diff --git a/cpp/src/c_api/centrality_result.cpp b/cpp/src/c_api/centrality_result.cpp
index c3ded9fbd89..08e7c0341f2 100644
--- a/cpp/src/c_api/centrality_result.cpp
+++ b/cpp/src/c_api/centrality_result.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,6 +34,18 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_centrality_result_ge
     internal_pointer->values_->view());
 }
 
+size_t cugraph_centrality_result_get_num_iterations(cugraph_centrality_result_t* result)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_centrality_result_t*>(result);
+  return internal_pointer->num_iterations_;
+}
+
+bool_t cugraph_centrality_result_converged(cugraph_centrality_result_t* result)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_centrality_result_t*>(result);
+  return internal_pointer->converged_ ? bool_t::TRUE : bool_t::FALSE;
+}
+
 extern "C" void cugraph_centrality_result_free(cugraph_centrality_result_t* result)
 {
   auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_centrality_result_t*>(result);
diff --git a/cpp/src/c_api/centrality_result.hpp b/cpp/src/c_api/centrality_result.hpp
index e39db686152..068dd838c93 100644
--- a/cpp/src/c_api/centrality_result.hpp
+++ b/cpp/src/c_api/centrality_result.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
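A minimal sketch of how a caller might consume the two getters implemented above; it assumes `result` was produced by one of the `_allow_nonconvergence` entry points, and the wrapper function name is hypothetical:

    #include <cugraph_c/centrality_algorithms.h>
    #include <stdio.h>

    static void report_convergence(cugraph_centrality_result_t* result)
    {
      /* number of PageRank iterations actually executed */
      size_t num_iterations = cugraph_centrality_result_get_num_iterations(result);
      /* TRUE only if the iteration stopped because the tolerance was met */
      bool_t converged = cugraph_centrality_result_converged(result);
      printf("pagerank ran %zu iterations, converged: %d\n",
             num_iterations,
             (int)converged);
    }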
@@ -24,6 +24,8 @@ namespace c_api {
 struct cugraph_centrality_result_t {
   cugraph_type_erased_device_array_t* vertex_ids_{};
   cugraph_type_erased_device_array_t* values_{};
+  size_t num_iterations_{0};
+  bool converged_{false};
 };
 
 struct cugraph_edge_centrality_result_t {
diff --git a/cpp/src/c_api/pagerank.cpp b/cpp/src/c_api/pagerank.cpp
index 2565a1aebe2..50eda152c67 100644
--- a/cpp/src/c_api/pagerank.cpp
+++ b/cpp/src/c_api/pagerank.cpp
@@ -120,9 +120,7 @@ struct pagerank_functor : public cugraph::c_api::abstract_functor {
       auto number_map = reinterpret_cast<rmm::device_uvector<vertex_t>*>(graph_->number_map_);
 
-      rmm::device_uvector<weight_t> pageranks(graph_view.local_vertex_partition_range_size(),
-                                              handle_.get_stream());
-
+      rmm::device_uvector<weight_t> initial_pageranks(0, handle_.get_stream());
       rmm::device_uvector<vertex_t> personalization_vertices(0, handle_.get_stream());
       rmm::device_uvector<weight_t> personalization_values(0, handle_.get_stream());
 
@@ -201,7 +199,7 @@ struct pagerank_functor : public cugraph::c_api::abstract_functor {
                                                        initial_guess_values.size(),
                                                        handle_.get_stream());
 
-        pageranks = cugraph::detail::
+        initial_pageranks = cugraph::detail::
           collect_local_vertex_values_from_ext_vertex_value_pairs(
             handle_,
             std::move(initial_guess_vertices),
@@ -213,25 +211,30 @@ struct pagerank_functor : public cugraph::c_api::abstract_functor {
             do_expensive_check_);
       }
 
-      cugraph::pagerank<vertex_t, edge_t, weight_t, weight_t, multi_gpu>(
-        handle_,
-        graph_view,
-        (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
-        precomputed_vertex_out_weight_sums_
-          ? std::make_optional(precomputed_vertex_out_weight_sums.data())
-          : std::nullopt,
-        personalization_vertices_ ? std::make_optional(personalization_vertices.data())
-                                  : std::nullopt,
-        personalization_values_ ? std::make_optional(personalization_values.data()) : std::nullopt,
-        personalization_vertices_
-          ? std::make_optional(static_cast<vertex_t>(personalization_vertices.size()))
-          : std::nullopt,
-        pageranks.data(),
-        static_cast<weight_t>(alpha_),
-        static_cast<weight_t>(epsilon_),
-        max_iterations_,
-        initial_guess_values_ != nullptr,
-        do_expensive_check_);
+      auto [pageranks, metadata] =
+        cugraph::pagerank<vertex_t, edge_t, weight_t, weight_t, multi_gpu>(
+          handle_,
+          graph_view,
+          (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+          precomputed_vertex_out_weight_sums_
+            ? std::make_optional(
+                raft::device_span<weight_t const>{precomputed_vertex_out_weight_sums.data(),
+                                                  precomputed_vertex_out_weight_sums.size()})
+            : std::nullopt,
+          personalization_vertices_
+            ? std::make_optional(
+                std::make_tuple(raft::device_span<vertex_t const>{personalization_vertices.data(),
+                                                                  personalization_vertices.size()},
+                                raft::device_span<weight_t const>{personalization_values.data(),
+                                                                  personalization_values.size()}))
+            : std::nullopt,
+          initial_guess_values_ != nullptr ? std::make_optional(raft::device_span<weight_t const>{
+                                               initial_pageranks.data(), initial_pageranks.size()})
+                                           : std::nullopt,
+          static_cast<weight_t>(alpha_),
+          static_cast<weight_t>(epsilon_),
+          max_iterations_,
+          do_expensive_check_);
 
       rmm::device_uvector<vertex_t> vertex_ids(graph_view.local_vertex_partition_range_size(),
                                                handle_.get_stream());
@@ -239,7 +242,9 @@ struct pagerank_functor : public cugraph::c_api::abstract_functor {
 
       result_ = new cugraph::c_api::cugraph_centrality_result_t{
        new cugraph::c_api::cugraph_type_erased_device_array_t(vertex_ids, graph_->vertex_type_),
-        new cugraph::c_api::cugraph_type_erased_device_array_t(pageranks, graph_->weight_type_)};
+        new cugraph::c_api::cugraph_type_erased_device_array_t(pageranks, graph_->weight_type_),
+        metadata.number_of_iterations_,
+        metadata.converged_};
     }
   }
 };
@@ -305,6 +310,75 @@ extern "C" cugraph_error_code_t cugraph_pagerank(
                            max_iterations,
                            do_expensive_check);
 
-  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+  auto return_value = cugraph::c_api::run_algorithm(graph, functor, result, error);
+
+  CAPI_EXPECTS(cugraph_centrality_result_converged(*result) == bool_t::TRUE,
+               CUGRAPH_UNKNOWN_ERROR,
+               "PageRank failed to converge.",
+               *error);
+
+  return return_value;
+}
+
+extern "C" cugraph_error_code_t cugraph_pagerank_allow_nonconvergence(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_vertices,
+  const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_sums,
+  const cugraph_type_erased_device_array_view_t* initial_guess_vertices,
+  const cugraph_type_erased_device_array_view_t* initial_guess_values,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool_t do_expensive_check,
+  cugraph_centrality_result_t** result,
+  cugraph_error_t** error)
+{
+  if (precomputed_vertex_out_weight_vertices != nullptr) {
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     precomputed_vertex_out_weight_vertices)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and precomputed_vertex_out_weight_vertices must match",
+                 *error);
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->weight_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     precomputed_vertex_out_weight_sums)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and precomputed_vertex_out_weight_sums must match",
+                 *error);
+  }
+  if (initial_guess_vertices != nullptr) {
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     initial_guess_vertices)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and initial_guess_vertices must match",
+                 *error);
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->weight_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     initial_guess_values)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and initial_guess_values must match",
+                 *error);
+  }
+  pagerank_functor functor(handle,
+                           graph,
+                           precomputed_vertex_out_weight_vertices,
+                           precomputed_vertex_out_weight_sums,
+                           initial_guess_vertices,
+                           initial_guess_values,
+                           nullptr,
+                           nullptr,
+                           alpha,
+                           epsilon,
+                           max_iterations,
+                           do_expensive_check);
 
   return cugraph::c_api::run_algorithm(graph, functor, result, error);
 }
@@ -373,6 +447,94 @@ extern "C" cugraph_error_code_t cugraph_personalized_pagerank(
                  *error);
   }
 
+  pagerank_functor functor(handle,
+                           graph,
+                           precomputed_vertex_out_weight_vertices,
+                           precomputed_vertex_out_weight_sums,
+                           initial_guess_vertices,
+                           initial_guess_values,
+                           personalization_vertices,
+                           personalization_values,
+                           alpha,
+                           epsilon,
+                           max_iterations,
+                           do_expensive_check);
+
+  auto return_value = cugraph::c_api::run_algorithm(graph, functor, result, error);
+
+  CAPI_EXPECTS(cugraph_centrality_result_converged(*result) == bool_t::TRUE,
+               CUGRAPH_UNKNOWN_ERROR,
+               "PageRank failed to converge.",
+               *error);
+
+  return return_value;
+}
+
+extern "C" cugraph_error_code_t cugraph_personalized_pagerank_allow_nonconvergence(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_vertices,
+  const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_sums,
+  const cugraph_type_erased_device_array_view_t* initial_guess_vertices,
+  const cugraph_type_erased_device_array_view_t* initial_guess_values,
+  const cugraph_type_erased_device_array_view_t* personalization_vertices,
+  const cugraph_type_erased_device_array_view_t* personalization_values,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool_t do_expensive_check,
+  cugraph_centrality_result_t** result,
+  cugraph_error_t** error)
+{
+  if (precomputed_vertex_out_weight_vertices != nullptr) {
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     precomputed_vertex_out_weight_vertices)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and precomputed_vertex_out_weight_vertices must match",
+                 *error);
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->weight_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     precomputed_vertex_out_weight_sums)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and precomputed_vertex_out_weight_sums must match",
+                 *error);
+  }
+  if (initial_guess_vertices != nullptr) {
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     initial_guess_vertices)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and initial_guess_vertices must match",
+                 *error);
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->weight_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     initial_guess_values)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and initial_guess_values must match",
+                 *error);
+  }
+  if (personalization_vertices != nullptr) {
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     personalization_vertices)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and personalization_vector must match",
+                 *error);
+    CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->weight_type_ ==
+                   reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                     personalization_values)
+                     ->type_,
+                 CUGRAPH_INVALID_INPUT,
+                 "vertex type of graph and personalization_vector must match",
+                 *error);
+  }
+  pagerank_functor functor(handle,
+                           graph,
+                           precomputed_vertex_out_weight_vertices,
+                           precomputed_vertex_out_weight_sums,
+                           initial_guess_vertices,
+                           initial_guess_values,
+                           personalization_vertices,
+                           personalization_values,
+                           alpha,
+                           epsilon,
+                           max_iterations,
+                           do_expensive_check);
+
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
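The C entry points above run the same functor and differ only in how non-convergence is surfaced: `cugraph_pagerank()` fails with `CUGRAPH_UNKNOWN_ERROR` when the converged flag is `FALSE`, while `cugraph_pagerank_allow_nonconvergence()` returns `CUGRAPH_SUCCESS` and leaves the flag for the caller to inspect. A hypothetical caller-side sketch of the difference (`handle` and `graph` are assumed to exist; `max_iterations` is deliberately small):

    cugraph_centrality_result_t* result = NULL;
    cugraph_error_t* error = NULL;

    /* Old behavior: non-convergence surfaces as an error code. */
    cugraph_error_code_t code = cugraph_pagerank(
      handle, graph, NULL, NULL, NULL, NULL,
      0.85, 1e-6, 2 /* max_iterations */, FALSE, &result, &error);
    /* code == CUGRAPH_UNKNOWN_ERROR, "PageRank failed to converge." */

    /* New behavior: the call succeeds and the metadata records the outcome. */
    code = cugraph_pagerank_allow_nonconvergence(
      handle, graph, NULL, NULL, NULL, NULL,
      0.85, 1e-6, 2, FALSE, &result, &error);
    /* code == CUGRAPH_SUCCESS and
       cugraph_centrality_result_converged(result) == FALSE */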
diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh
index 49d1a3eabb9..3a84cdedfda 100644
--- a/cpp/src/link_analysis/pagerank_impl.cuh
+++ b/cpp/src/link_analysis/pagerank_impl.cuh
@@ -46,22 +46,19 @@ namespace cugraph {
 namespace detail {
 
-// FIXME: personalization_vector_size is confusing in OPG (local or aggregate?)
 template <typename GraphViewType, typename result_t>
-void pagerank(
+centrality_algorithm_metadata_t pagerank(
   raft::handle_t const& handle,
   GraphViewType const& pull_graph_view,
   std::optional<edge_property_view_t<typename GraphViewType::edge_type, result_t const*>>
     edge_weight_view,
-  std::optional<result_t const*> precomputed_vertex_out_weight_sums,
-  std::optional<typename GraphViewType::vertex_type const*> personalization_vertices,
-  std::optional<result_t const*> personalization_values,
-  std::optional<typename GraphViewType::vertex_type> personalization_vector_size,
-  result_t* pageranks,
+  std::optional<raft::device_span<result_t const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<typename GraphViewType::vertex_type const>,
+                           raft::device_span<result_t const>>>
+    personalization,
+  raft::device_span<result_t> pageranks,
   result_t alpha,
   result_t epsilon,
   size_t max_iterations,
-  bool has_initial_guess,
   bool do_expensive_check)
 {
   using vertex_t = typename GraphViewType::vertex_type;
 
   static_assert(GraphViewType::is_storage_transposed,
                 "GraphViewType should support the pull model.");
 
   auto const num_vertices = pull_graph_view.number_of_vertices();
-  if (num_vertices == 0) { return; }
+  if (num_vertices == 0) { return centrality_algorithm_metadata_t{0, true}; }
 
   auto aggregate_personalization_vector_size =
-    personalization_vertices ? GraphViewType::is_multi_gpu
-                                 ? host_scalar_allreduce(handle.get_comms(),
-                                                         *personalization_vector_size,
-                                                         raft::comms::op_t::SUM,
-                                                         handle.get_stream())
-                                 : *personalization_vector_size
-                             : vertex_t{0};
+    personalization ? GraphViewType::is_multi_gpu
+                        ? host_scalar_allreduce(handle.get_comms(),
+                                                std::get<0>(*personalization).size(),
+                                                raft::comms::op_t::SUM,
+                                                handle.get_stream())
+                        : std::get<0>(*personalization).size()
+                    : vertex_t{0};
 
   // 1. check input arguments
 
-  CUGRAPH_EXPECTS((personalization_vertices.has_value() == false) ||
-                    (personalization_values.has_value() && personalization_vector_size.has_value()),
-                  "Invalid input argument: if personalization_vertices.has_value() is true, "
-                  "personalization_values.has_value() and personalization_vector_size.has_value() "
-                  "should be true as well.");
+  CUGRAPH_EXPECTS((personalization.has_value() == false) ||
+                    (std::get<0>(*personalization).size() == std::get<1>(*personalization).size()),
+                  "Invalid input argument: if personalization.has_value() is true, the size of "
+                  "vertices and values should match");
   CUGRAPH_EXPECTS(
-    (personalization_vertices.has_value() == false) || (aggregate_personalization_vector_size > 0),
-    "Invalid input argument: if personalization_vertices.has_value() is true, the input "
+    (personalization.has_value() == false) || (aggregate_personalization_vector_size > 0),
+    "Invalid input argument: if personalization.has_value() is true, the input "
     "personalization vector size should not be 0.");
   CUGRAPH_EXPECTS((alpha >= 0.0) && (alpha <= 1.0),
                   "Invalid input argument: alpha should be in [0.0, 1.0].");
@@ -106,7 +102,7 @@ void pagerank(
     auto num_negative_precomputed_vertex_out_weight_sums =
       count_if_v(handle,
                  pull_graph_view,
-                 *precomputed_vertex_out_weight_sums,
+                 precomputed_vertex_out_weight_sums->data(),
                  [] __device__(auto, auto val) { return val < result_t{0.0}; });
     CUGRAPH_EXPECTS(
       num_negative_precomputed_vertex_out_weight_sums == 0,
@@ -126,17 +122,10 @@ void pagerank(
         "Invalid input argument: input edge weights should have non-negative values.");
     }
 
-    if (has_initial_guess) {
-      auto num_negative_values = count_if_v(
-        handle, pull_graph_view, pageranks, [] __device__(auto, auto val) { return val < 0.0; });
-      CUGRAPH_EXPECTS(num_negative_values == 0,
-                      "Invalid input argument: initial guess values should be non-negative.");
-    }
-
     if constexpr (GraphViewType::is_multi_gpu) {
       auto num_gpus_with_valid_personalization_vector =
         host_scalar_allreduce(handle.get_comms(),
-                              personalization_vertices ? int{1} : int{0},
+                              personalization ? int{1} : int{0},
                               raft::comms::op_t::SUM,
                              handle.get_stream());
      CUGRAPH_EXPECTS(
@@ -151,8 +140,8 @@ void pagerank(
           pull_graph_view.local_vertex_partition_view());
       auto num_invalid_vertices =
         thrust::count_if(handle.get_thrust_policy(),
-                         *personalization_vertices,
-                         *personalization_vertices + *personalization_vector_size,
+                         std::get<0>(*personalization).begin(),
+                         std::get<0>(*personalization).end(),
                          [vertex_partition] __device__(auto val) {
                            return !(vertex_partition.is_valid_vertex(val) &&
                                     vertex_partition.in_local_vertex_partition_range_nocheck(val));
@@ -163,17 +152,36 @@ void pagerank(
       }
       CUGRAPH_EXPECTS(num_invalid_vertices == 0,
                       "Invalid input argument: personalization vertices have invalid vertex IDs.");
-      auto num_negative_values =
-        thrust::count_if(handle.get_thrust_policy(),
-                         *personalization_values,
-                         *personalization_values + *personalization_vector_size,
-                         [] __device__(auto val) { return val < 0.0; });
+      auto num_negative_values = thrust::count_if(handle.get_thrust_policy(),
+                                                  std::get<1>(*personalization).begin(),
+                                                  std::get<1>(*personalization).end(),
+                                                  [] __device__(auto val) { return val < 0.0; });
       if constexpr (GraphViewType::is_multi_gpu) {
         num_negative_values = host_scalar_allreduce(
           handle.get_comms(), num_negative_values, raft::comms::op_t::SUM, handle.get_stream());
       }
       CUGRAPH_EXPECTS(num_negative_values == 0,
                       "Invalid input argument: personalization values should be non-negative.");
+
+      rmm::device_uvector<vertex_t> check_for_duplicates(std::get<0>(*personalization).size(),
+                                                         handle.get_stream());
+      thrust::copy(handle.get_thrust_policy(),
+                   std::get<0>(*personalization).begin(),
+                   std::get<0>(*personalization).end(),
+                   check_for_duplicates.begin());
+
+      thrust::sort(
+        handle.get_thrust_policy(), check_for_duplicates.begin(), check_for_duplicates.end());
+
+      auto num_uniques =
+        thrust::count_if(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator(size_t{0}),
+                         thrust::make_counting_iterator(check_for_duplicates.size()),
+                         detail::is_first_in_run_t<vertex_t const*>{check_for_duplicates.data()});
+
+      CUGRAPH_EXPECTS(
+        static_cast<size_t>(num_uniques) == check_for_duplicates.size(),
+        "Invalid input argument: personalization vertices should not contain duplicate entries.");
     }
   }
 
   auto vertex_out_weight_sums = precomputed_vertex_out_weight_sums
-                                  ? *precomputed_vertex_out_weight_sums
+                                  ? (*precomputed_vertex_out_weight_sums).data()
                                   : (*tmp_vertex_out_weight_sums).data();
 
-  // 3. initialize pagerank values
-
-  if (has_initial_guess) {
-    auto sum = reduce_v(handle, pull_graph_view, pageranks, result_t{0.0});
-    CUGRAPH_EXPECTS(sum > 0.0,
-                    "Invalid input argument: sum of the PageRank initial "
-                    "guess values should be positive.");
-    thrust::transform(handle.get_thrust_policy(),
-                      pageranks,
-                      pageranks + pull_graph_view.local_vertex_partition_range_size(),
-                      pageranks,
-                      [sum] __device__(auto val) { return val / sum; });
-  } else {
-    thrust::fill(handle.get_thrust_policy(),
-                 pageranks,
-                 pageranks + pull_graph_view.local_vertex_partition_range_size(),
-                 result_t{1.0} / static_cast<result_t>(num_vertices));
-  }
-
-  // 4. sum the personalization values
+  // 3. sum the personalization values
 
   result_t personalization_sum{0.0};
   if (aggregate_personalization_vector_size > 0) {
     personalization_sum = thrust::reduce(handle.get_thrust_policy(),
-                                         *personalization_values,
-                                         *personalization_values + *personalization_vector_size,
+                                         std::get<1>(*personalization).begin(),
+                                         std::get<1>(*personalization).end(),
                                          result_t{0.0});
     if constexpr (GraphViewType::is_multi_gpu) {
       personalization_sum = host_scalar_allreduce(
@@ -243,18 +232,13 @@ void pagerank(
   edge_src_property_t<GraphViewType, result_t> edge_src_pageranks(handle, pull_graph_view);
   size_t iter{0};
   while (true) {
-    thrust::copy(handle.get_thrust_policy(),
-                 pageranks,
-                 pageranks + pull_graph_view.local_vertex_partition_range_size(),
-                 old_pageranks.data());
-
-    auto vertex_val_first =
-      thrust::make_zip_iterator(thrust::make_tuple(pageranks, vertex_out_weight_sums));
+    thrust::copy(
+      handle.get_thrust_policy(), pageranks.begin(), pageranks.end(), old_pageranks.data());
 
     auto dangling_sum = transform_reduce_v(
       handle,
       pull_graph_view,
-      vertex_val_first,
+      thrust::make_zip_iterator(pageranks.begin(), vertex_out_weight_sums),
      [] __device__(auto, auto val) {
        auto const pagerank       = thrust::get<0>(val);
        auto const out_weight_sum = thrust::get<1>(val);
@@ -262,19 +246,21 @@ void pagerank(
      },
      result_t{0.0});
 
-    thrust::transform(handle.get_thrust_policy(),
-                      vertex_val_first,
-                      vertex_val_first + pull_graph_view.local_vertex_partition_range_size(),
-                      pageranks,
-                      [] __device__(auto val) {
-                        auto const pagerank       = thrust::get<0>(val);
-                        auto const out_weight_sum = thrust::get<1>(val);
-                        auto const divisor =
-                          out_weight_sum == result_t{0.0} ? result_t{1.0} : out_weight_sum;
-                        return pagerank / divisor;
-                      });
+    thrust::transform(
+      handle.get_thrust_policy(),
+      thrust::make_zip_iterator(pageranks.begin(), vertex_out_weight_sums),
+      thrust::make_zip_iterator(
+        pageranks.end(),
+        vertex_out_weight_sums + pull_graph_view.local_vertex_partition_range_size()),
+      pageranks.begin(),
+      [] __device__(auto val) {
+        auto const pagerank       = thrust::get<0>(val);
+        auto const out_weight_sum = thrust::get<1>(val);
+        auto const divisor = out_weight_sum == result_t{0.0} ? result_t{1.0} : out_weight_sum;
+        return pagerank / divisor;
+      });
 
-    update_edge_src_property(handle, pull_graph_view, pageranks, edge_src_pageranks);
+    update_edge_src_property(handle, pull_graph_view, pageranks.data(), edge_src_pageranks);
 
    auto unvarying_part = aggregate_personalization_vector_size == 0
                            ? (dangling_sum * alpha + static_cast<result_t>(1.0 - alpha)) /
@@ -293,7 +279,7 @@ void pagerank(
        },
        unvarying_part,
        reduce_op::plus<result_t>{},
-        pageranks);
+        pageranks.begin());
    } else {
      per_v_transform_reduce_incoming_e(
        handle,
@@ -306,20 +292,23 @@ void pagerank(
        },
        unvarying_part,
        reduce_op::plus<result_t>{},
-        pageranks);
+        pageranks.begin());
    }
 
    if (aggregate_personalization_vector_size > 0) {
      auto vertex_partition = vertex_partition_device_view_t<vertex_t, GraphViewType::is_multi_gpu>(
        pull_graph_view.local_vertex_partition_view());
-      auto val_first = thrust::make_zip_iterator(
-        thrust::make_tuple(*personalization_vertices, *personalization_values));
      thrust::for_each(
        handle.get_thrust_policy(),
-        val_first,
-        val_first + *personalization_vector_size,
-        [vertex_partition, pageranks, dangling_sum, personalization_sum, alpha] __device__(
-          auto val) {
+        thrust::make_zip_iterator(thrust::make_tuple(std::get<0>(*personalization).begin(),
+                                                     std::get<1>(*personalization).begin())),
+        thrust::make_zip_iterator(thrust::make_tuple(std::get<0>(*personalization).end(),
+                                                     std::get<1>(*personalization).end())),
+        [vertex_partition,
+         pageranks = pageranks.data(),
+         dangling_sum,
+         personalization_sum,
+         alpha] __device__(auto val) {
          auto v     = thrust::get<0>(val);
          auto value = thrust::get<1>(val);
          *(pageranks + vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v)) +=
@@ -331,7 +320,7 @@ void pagerank(
    auto diff_sum = transform_reduce_v(
      handle,
      pull_graph_view,
-      thrust::make_zip_iterator(thrust::make_tuple(pageranks, old_pageranks.data())),
+      thrust::make_zip_iterator(thrust::make_tuple(pageranks.begin(), old_pageranks.begin())),
      [] __device__(auto, auto val) { return std::abs(thrust::get<0>(val) - thrust::get<1>(val)); },
      result_t{0.0});
@@ -340,9 +329,11 @@ void pagerank(
    if (diff_sum < epsilon) {
      break;
    } else if (iter >= max_iterations) {
-      CUGRAPH_FAIL("PageRank failed to converge.");
+      break;
    }
  }
+
+  return centrality_algorithm_metadata_t{iter, (iter < max_iterations)};
 }
 
 } // namespace detail
@@ -364,19 +355,102 @@ void pagerank(raft::handle_t const& handle,
 {
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
-  detail::pagerank(handle,
-                   graph_view,
-                   edge_weight_view,
-                   precomputed_vertex_out_weight_sums,
-                   personalization_vertices,
-                   personalization_values,
-                   personalization_vector_size,
-                   pageranks,
-                   alpha,
-                   epsilon,
-                   max_iterations,
-                   has_initial_guess,
-                   do_expensive_check);
+  CUGRAPH_EXPECTS((personalization_vertices.has_value() == false) ||
+                    (personalization_values.has_value() && personalization_vector_size.has_value()),
+                  "Invalid input argument: if personalization_vertices.has_value() is true, "
+                  "personalization_values.has_value() and personalization_vector_size.has_value() "
+                  "should be true as well.");
+
+  // initialize pagerank values
+  if (has_initial_guess) {
+    if (do_expensive_check) {
+      auto num_negative_values = count_if_v(
+        handle, graph_view, pageranks, [] __device__(auto, auto val) { return val < 0.0; });
+      CUGRAPH_EXPECTS(num_negative_values == 0,
+                      "Invalid input argument: initial guess values should be non-negative.");
+    }
+
+    auto sum = reduce_v(handle, graph_view, pageranks, result_t{0.0});
+    CUGRAPH_EXPECTS(sum > 0.0,
+                    "Invalid input argument: sum of the PageRank initial "
+                    "guess values should be positive.");
+    thrust::transform(handle.get_thrust_policy(),
+                      pageranks,
+                      pageranks + graph_view.local_vertex_partition_range_size(),
+                      pageranks,
+                      [sum] __device__(auto val) { return val / sum; });
+  } else {
+    thrust::fill(handle.get_thrust_policy(),
+                 pageranks,
+                 pageranks + graph_view.local_vertex_partition_range_size(),
+                 result_t{1.0} / static_cast<result_t>(graph_view.number_of_vertices()));
+  }
+
+  auto metadata = detail::pagerank(
+    handle,
+    graph_view,
+    edge_weight_view,
+    std::make_optional(raft::device_span<weight_t const>{
+      *precomputed_vertex_out_weight_sums,
+      static_cast<size_t>(graph_view.local_vertex_partition_range_size())}),
+    personalization_vertices
+      ? std::make_optional(std::make_tuple(
+          raft::device_span<vertex_t const>{*personalization_vertices,
+                                            static_cast<size_t>(*personalization_vector_size)},
+          raft::device_span<result_t const>{*personalization_values,
+                                            static_cast<size_t>(*personalization_vector_size)}))
+      : std::nullopt,
+    raft::device_span<result_t>{
+      pageranks, static_cast<size_t>(graph_view.local_vertex_partition_range_size())},
+    alpha,
+    epsilon,
+    max_iterations,
+    do_expensive_check);
+
+  CUGRAPH_EXPECTS(metadata.converged_, "PageRank failed to converge.");
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, typename result_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<result_t>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, true, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<raft::device_span<weight_t const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<vertex_t const>, raft::device_span<result_t const>>>
+    personalization,
+  std::optional<raft::device_span<result_t const>> initial_pageranks,
+  result_t alpha,
+  result_t epsilon,
+  size_t max_iterations,
+  bool do_expensive_check)
+{
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
+  rmm::device_uvector<result_t> local_pageranks(graph_view.local_vertex_partition_range_size(),
+                                                handle.get_stream());
+  if (!initial_pageranks) {
+    thrust::fill(handle.get_thrust_policy(),
+                 local_pageranks.begin(),
+                 local_pageranks.end(),
+                 result_t{1.0} / graph_view.number_of_vertices());
+  } else {
+    thrust::copy(handle.get_thrust_policy(),
+                 initial_pageranks->begin(),
+                 initial_pageranks->end(),
+                 local_pageranks.begin());
+  }
+
+  auto metadata =
+    detail::pagerank(handle,
+                     graph_view,
+                     edge_weight_view,
+                     precomputed_vertex_out_weight_sums,
+                     personalization,
+                     raft::device_span<result_t>{local_pageranks.data(), local_pageranks.size()},
+                     alpha,
+                     epsilon,
+                     max_iterations,
+                     do_expensive_check);
+
+  return std::make_tuple(std::move(local_pageranks), metadata);
+}
 
 } // namespace cugraph
diff --git a/cpp/src/link_analysis/pagerank_mg.cu b/cpp/src/link_analysis/pagerank_mg.cu
index d6dd5f60544..dc9892f69a8 100644
--- a/cpp/src/link_analysis/pagerank_mg.cu
+++ b/cpp/src/link_analysis/pagerank_mg.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
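Besides starting from 1/|V|, the new overload defined above in pagerank_impl.cuh accepts `initial_pageranks`, which allows warm-starting from an earlier result. A minimal hypothetical sketch, with `prev` assumed to be an `rmm::device_uvector<float>` returned by a previous call on the same graph:

    // Resume iterating from a previous solution rather than from 1/|V|.
    auto [pageranks, metadata] =
      cugraph::pagerank<int32_t, int32_t, float, float, false>(
        handle,
        graph_view,
        std::nullopt,
        std::nullopt,
        std::nullopt,
        std::make_optional(raft::device_span<float const>{prev.data(), prev.size()}),
        float{0.85},
        float{1e-6},
        size_t{500},
        false);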
@@ -102,4 +102,82 @@ template void pagerank(raft::handle_t const& handle,
                        bool has_initial_guess,
                        bool do_expensive_check);
 
+template std::tuple<rmm::device_uvector<float>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, true, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<raft::device_span<float const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<float const>>>
+    personalization,
+  std::optional<raft::device_span<float const>> initial_pageranks,
+  float alpha,
+  float epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<double>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, true, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<raft::device_span<double const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<double const>>>
+    personalization,
+  std::optional<raft::device_span<double const>> initial_pageranks,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<float>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, true, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<raft::device_span<float const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<float const>>>
+    personalization,
+  std::optional<raft::device_span<float const>> initial_pageranks,
+  float alpha,
+  float epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<double>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, true, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<raft::device_span<double const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<double const>>>
+    personalization,
+  std::optional<raft::device_span<double const>> initial_pageranks,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<float>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, true, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<raft::device_span<float const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int64_t const>, raft::device_span<float const>>>
+    personalization,
+  std::optional<raft::device_span<float const>> initial_pageranks,
+  float alpha,
+  float epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<double>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, true, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<raft::device_span<double const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int64_t const>, raft::device_span<double const>>>
+    personalization,
+  std::optional<raft::device_span<double const>> initial_pageranks,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
 } // namespace cugraph
diff --git a/cpp/src/link_analysis/pagerank_sg.cu b/cpp/src/link_analysis/pagerank_sg.cu
index 3dc0adc45df..51d123fe337 100644
--- a/cpp/src/link_analysis/pagerank_sg.cu
+++ b/cpp/src/link_analysis/pagerank_sg.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -102,4 +102,82 @@ template void pagerank(raft::handle_t const& handle,
                        bool has_initial_guess,
                        bool do_expensive_check);
 
+template std::tuple<rmm::device_uvector<float>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, true, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<raft::device_span<float const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<float const>>>
+    personalization,
+  std::optional<raft::device_span<float const>> initial_pageranks,
+  float alpha,
+  float epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<double>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, true, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<raft::device_span<double const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<double const>>>
+    personalization,
+  std::optional<raft::device_span<double const>> initial_pageranks,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<float>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, true, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<raft::device_span<float const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<float const>>>
+    personalization,
+  std::optional<raft::device_span<float const>> initial_pageranks,
+  float alpha,
+  float epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<double>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, true, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<raft::device_span<double const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<double const>>>
+    personalization,
+  std::optional<raft::device_span<double const>> initial_pageranks,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<float>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, true, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<raft::device_span<float const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int64_t const>, raft::device_span<float const>>>
+    personalization,
+  std::optional<raft::device_span<float const>> initial_pageranks,
+  float alpha,
+  float epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<double>, centrality_algorithm_metadata_t> pagerank(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, true, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<raft::device_span<double const>> precomputed_vertex_out_weight_sums,
+  std::optional<std::tuple<raft::device_span<int64_t const>, raft::device_span<double const>>>
+    personalization,
+  std::optional<raft::device_span<double const>> initial_pageranks,
+  double alpha,
+  double epsilon,
+  size_t max_iterations,
+  bool do_expensive_check);
+
 } // namespace cugraph
diff --git a/cpp/tests/c_api/mg_pagerank_test.c b/cpp/tests/c_api/mg_pagerank_test.c
index 09925b9ac4e..9c142236808 100644
--- a/cpp/tests/c_api/mg_pagerank_test.c
+++ b/cpp/tests/c_api/mg_pagerank_test.c
@@ -100,6 +100,81 @@ int generic_pagerank_test(const cugraph_resource_handle_t* handle,
   return test_ret_value;
 }
 
+int generic_pagerank_nonconverging_test(const cugraph_resource_handle_t* handle,
+                                        vertex_t* h_src,
+                                        vertex_t* h_dst,
+                                        weight_t* h_wgt,
+                                        weight_t* h_result,
+                                        size_t num_vertices,
+                                        size_t num_edges,
+                                        bool_t store_transposed,
+                                        double alpha,
+                                        double epsilon,
+                                        size_t max_iterations)
+{
+  int test_ret_value = 0;
+
+  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error;
+
+  cugraph_graph_t* p_graph            = NULL;
+  cugraph_centrality_result_t* p_result = NULL;
+
+  ret_code = create_mg_test_graph(
+    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, &p_graph, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, 
"create_mg_test_graph failed."); + + ret_code = cugraph_pagerank_allow_nonconvergence(handle, + p_graph, + NULL, + NULL, + NULL, + NULL, + alpha, + epsilon, + max_iterations, + FALSE, + &p_result, + &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_pagerank failed."); + + // NOTE: Because we get back vertex ids and pageranks, we can simply compare + // the returned values with the expected results for the entire + // graph. Each GPU will have a subset of the total vertices, so + // they will do a subset of the comparisons. + cugraph_type_erased_device_array_view_t* vertices; + cugraph_type_erased_device_array_view_t* pageranks; + + vertices = cugraph_centrality_result_get_vertices(p_result); + pageranks = cugraph_centrality_result_get_values(p_result); + + size_t num_local_vertices = cugraph_type_erased_device_array_view_size(vertices); + + vertex_t h_vertices[num_local_vertices]; + weight_t h_pageranks[num_local_vertices]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_vertices, vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_pageranks, pageranks, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + for (int i = 0; (i < num_local_vertices) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + nearlyEqual(h_result[h_vertices[i]], h_pageranks[i], 0.001), + "pagerank results don't match"); + } + + cugraph_centrality_result_free(p_result); + cugraph_mg_graph_free(p_graph); + cugraph_error_free(ret_error); + + return test_ret_value; +} + int generic_personalized_pagerank_test(const cugraph_resource_handle_t* handle, vertex_t* h_src, vertex_t* h_dst, @@ -209,6 +284,115 @@ int generic_personalized_pagerank_test(const cugraph_resource_handle_t* handle, return test_ret_value; } +int generic_personalized_pagerank_nonconverging_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + weight_t* h_result, + vertex_t* h_personalization_vertices, + weight_t* h_personalization_values, + size_t num_vertices, + size_t num_edges, + size_t num_personalization_vertices, + bool_t store_transposed, + double alpha, + double epsilon, + size_t max_iterations) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error; + + cugraph_graph_t* p_graph = NULL; + cugraph_centrality_result_t* p_result = NULL; + cugraph_type_erased_device_array_t* personalization_vertices = NULL; + cugraph_type_erased_device_array_t* personalization_values = NULL; + cugraph_type_erased_device_array_view_t* personalization_vertices_view = NULL; + cugraph_type_erased_device_array_view_t* personalization_values_view = NULL; + + data_type_id_t vertex_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + ret_code = create_mg_test_graph( + handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, &p_graph, &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + if (cugraph_resource_handle_get_rank(handle) != 0) { num_personalization_vertices = 0; } + + ret_code = cugraph_type_erased_device_array_create( + handle, num_personalization_vertices, vertex_tid, &personalization_vertices, &ret_error); + TEST_ASSERT( + test_ret_value, 
ret_code == CUGRAPH_SUCCESS, "personalization_vertices create failed."); + + ret_code = cugraph_type_erased_device_array_create( + handle, num_personalization_vertices, weight_tid, &personalization_values, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "personalization_values create failed."); + + personalization_vertices_view = cugraph_type_erased_device_array_view(personalization_vertices); + personalization_values_view = cugraph_type_erased_device_array_view(personalization_values); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, personalization_vertices_view, (byte_t*)h_personalization_vertices, &ret_error); + TEST_ASSERT( + test_ret_value, ret_code == CUGRAPH_SUCCESS, "personalization_vertices copy_from_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, personalization_values_view, (byte_t*)h_personalization_values, &ret_error); + TEST_ASSERT( + test_ret_value, ret_code == CUGRAPH_SUCCESS, "personalization_values copy_from_host failed."); + + ret_code = cugraph_personalized_pagerank_allow_nonconvergence(handle, + p_graph, + NULL, + NULL, + NULL, + NULL, + personalization_vertices_view, + personalization_values_view, + alpha, + epsilon, + max_iterations, + FALSE, + &p_result, + &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_personalized_pagerank failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "cugraph_personalized_pagerank failed."); + + cugraph_type_erased_device_array_view_t* vertices; + cugraph_type_erased_device_array_view_t* pageranks; + + vertices = cugraph_centrality_result_get_vertices(p_result); + pageranks = cugraph_centrality_result_get_values(p_result); + + size_t num_local_vertices = cugraph_type_erased_device_array_view_size(vertices); + + vertex_t h_vertices[num_local_vertices]; + weight_t h_pageranks[num_local_vertices]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_vertices, vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_pageranks, pageranks, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + for (int i = 0; (i < num_local_vertices) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + nearlyEqual(h_result[h_vertices[i]], h_pageranks[i], 0.001), + "pagerank results don't match"); + } + + cugraph_centrality_result_free(p_result); + cugraph_mg_graph_free(p_graph); + cugraph_error_free(ret_error); + + return test_ret_value; +} + int test_pagerank(const cugraph_resource_handle_t* handle) { size_t num_edges = 8; @@ -323,6 +507,34 @@ int test_pagerank_4_with_transpose(const cugraph_resource_handle_t* handle) max_iterations); } +int test_pagerank_non_convergence(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 8; + size_t num_vertices = 6; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + weight_t h_result[] = {0.0776471, 0.167637, 0.0639699, 0.220202, 0.140046, 0.330498}; + + double alpha = 0.95; + double epsilon = 0.0001; + size_t max_iterations = 2; + + // Pagerank wants store_transposed = TRUE + return generic_pagerank_nonconverging_test(handle, + h_src, + h_dst, + h_wgt, + h_result, + num_vertices, + num_edges, + TRUE, + alpha, + epsilon, + 
max_iterations); +} + int test_personalized_pagerank(const cugraph_resource_handle_t* handle) { size_t num_edges = 3; @@ -356,6 +568,40 @@ int test_personalized_pagerank(const cugraph_resource_handle_t* handle) max_iterations); } +int test_personalized_pagerank_non_convergence(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 3; + size_t num_vertices = 4; + + vertex_t h_src[] = {0, 1, 2}; + vertex_t h_dst[] = {1, 2, 3}; + weight_t h_wgt[] = {1.f, 1.f, 1.f}; + weight_t h_result[] = { 0.03625, 0.285, 0.32125, 0.3575 }; + + + vertex_t h_personalized_vertices[] = {0, 1, 2, 3}; + weight_t h_personalized_values[] = {0.1, 0.2, 0.3, 0.4}; + + double alpha = 0.85; + double epsilon = 1.0e-6; + size_t max_iterations = 1; + + return generic_personalized_pagerank_nonconverging_test(handle, + h_src, + h_dst, + h_wgt, + h_result, + h_personalized_vertices, + h_personalized_values, + num_vertices, + num_edges, + num_vertices, + FALSE, + alpha, + epsilon, + max_iterations); +} + /******************************************************************************/ int main(int argc, char** argv) @@ -368,7 +614,9 @@ int main(int argc, char** argv) result |= RUN_MG_TEST(test_pagerank_with_transpose, handle); result |= RUN_MG_TEST(test_pagerank_4, handle); result |= RUN_MG_TEST(test_pagerank_4_with_transpose, handle); + result |= RUN_MG_TEST(test_pagerank_non_convergence, handle); result |= RUN_MG_TEST(test_personalized_pagerank, handle); + result |= RUN_MG_TEST(test_personalized_pagerank_non_convergence, handle); cugraph_free_resource_handle(handle); free_mg_raft_handle(raft_handle); diff --git a/cpp/tests/c_api/pagerank_test.c b/cpp/tests/c_api/pagerank_test.c index 048750da06c..e12021cd16d 100644 --- a/cpp/tests/c_api/pagerank_test.c +++ b/cpp/tests/c_api/pagerank_test.c @@ -67,6 +67,82 @@ int generic_pagerank_test(vertex_t* h_src, &p_result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_pagerank failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + cugraph_type_erased_device_array_view_t* vertices; + cugraph_type_erased_device_array_view_t* pageranks; + + vertices = cugraph_centrality_result_get_vertices(p_result); + pageranks = cugraph_centrality_result_get_values(p_result); + + vertex_t h_vertices[num_vertices]; + weight_t h_pageranks[num_vertices]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + p_handle, (byte_t*)h_vertices, vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + p_handle, (byte_t*)h_pageranks, pageranks, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + for (int i = 0; (i < num_vertices) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + nearlyEqual(h_result[h_vertices[i]], h_pageranks[i], 0.001), + "pagerank results don't match"); + } + + cugraph_centrality_result_free(p_result); + cugraph_sg_graph_free(p_graph); + cugraph_free_resource_handle(p_handle); + cugraph_error_free(ret_error); + + return test_ret_value; +} + +int generic_pagerank_nonconverging_test(vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + weight_t* h_result, + size_t num_vertices, + size_t num_edges, + bool_t store_transposed, + double alpha, + double epsilon, + size_t max_iterations) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error; + + 
cugraph_resource_handle_t* p_handle = NULL; + cugraph_graph_t* p_graph = NULL; + cugraph_centrality_result_t* p_result = NULL; + + p_handle = cugraph_create_resource_handle(NULL); + TEST_ASSERT(test_ret_value, p_handle != NULL, "resource handle creation failed."); + + ret_code = create_test_graph( + p_handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, FALSE, &p_graph, &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + ret_code = cugraph_pagerank_allow_nonconvergence(p_handle, + p_graph, + NULL, + NULL, + NULL, + NULL, + alpha, + epsilon, + max_iterations, + FALSE, + &p_result, + &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_pagerank failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); cugraph_type_erased_device_array_view_t* vertices; cugraph_type_erased_device_array_view_t* pageranks; @@ -208,6 +284,115 @@ int generic_personalized_pagerank_test(vertex_t* h_src, return test_ret_value; } +int generic_personalized_pagerank_nonconverging_test(vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + weight_t* h_result, + vertex_t* h_personalization_vertices, + weight_t* h_personalization_values, + size_t num_vertices, + size_t num_edges, + size_t num_personalization_vertices, + bool_t store_transposed, + double alpha, + double epsilon, + size_t max_iterations) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error; + + cugraph_resource_handle_t* p_handle = NULL; + cugraph_graph_t* p_graph = NULL; + cugraph_centrality_result_t* p_result = NULL; + cugraph_type_erased_device_array_t* personalization_vertices = NULL; + cugraph_type_erased_device_array_t* personalization_values = NULL; + cugraph_type_erased_device_array_view_t* personalization_vertices_view = NULL; + cugraph_type_erased_device_array_view_t* personalization_values_view = NULL; + + data_type_id_t vertex_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + p_handle = cugraph_create_resource_handle(NULL); + TEST_ASSERT(test_ret_value, p_handle != NULL, "resource handle creation failed."); + + ret_code = create_test_graph( + p_handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, FALSE, &p_graph, &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + ret_code = cugraph_type_erased_device_array_create( + p_handle, num_personalization_vertices, vertex_tid, &personalization_vertices, &ret_error); + TEST_ASSERT( + test_ret_value, ret_code == CUGRAPH_SUCCESS, "personalization_vertices create failed."); + + ret_code = cugraph_type_erased_device_array_create( + p_handle, num_personalization_vertices, weight_tid, &personalization_values, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "personalization_values create failed."); + + personalization_vertices_view = cugraph_type_erased_device_array_view(personalization_vertices); + personalization_values_view = cugraph_type_erased_device_array_view(personalization_values); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + p_handle, personalization_vertices_view, (byte_t*)h_personalization_vertices, &ret_error); + TEST_ASSERT( + test_ret_value, ret_code == CUGRAPH_SUCCESS, "personalization_vertices copy_from_host 
failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + p_handle, personalization_values_view, (byte_t*)h_personalization_values, &ret_error); + TEST_ASSERT( + test_ret_value, ret_code == CUGRAPH_SUCCESS, "personalization_values copy_from_host failed."); + + ret_code = cugraph_personalized_pagerank_allow_nonconvergence(p_handle, + p_graph, + NULL, + NULL, + NULL, + NULL, + personalization_vertices_view, + personalization_values_view, + alpha, + epsilon, + max_iterations, + FALSE, + &p_result, + &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_personalized_pagerank failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "cugraph_personalized_pagerank failed."); + + cugraph_type_erased_device_array_view_t* vertices; + cugraph_type_erased_device_array_view_t* pageranks; + + vertices = cugraph_centrality_result_get_vertices(p_result); + pageranks = cugraph_centrality_result_get_values(p_result); + + vertex_t h_vertices[num_vertices]; + weight_t h_pageranks[num_vertices]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + p_handle, (byte_t*)h_vertices, vertices, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + p_handle, (byte_t*)h_pageranks, pageranks, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + for (int i = 0; (i < num_vertices) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + nearlyEqual(h_result[h_vertices[i]], h_pageranks[i], 0.001), + "pagerank results don't match"); + } + + cugraph_centrality_result_free(p_result); + cugraph_sg_graph_free(p_graph); + cugraph_free_resource_handle(p_handle); + cugraph_error_free(ret_error); + + return test_ret_value; +} + int test_pagerank() { size_t num_edges = 8; @@ -286,6 +471,25 @@ int test_pagerank_4_with_transpose() h_src, h_dst, h_wgt, h_result, num_vertices, num_edges, TRUE, alpha, epsilon, max_iterations); } +int test_pagerank_non_convergence() +{ + size_t num_edges = 8; + size_t num_vertices = 6; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + weight_t h_result[] = {0.0776471, 0.167637, 0.0639699, 0.220202, 0.140046, 0.330498}; + + double alpha = 0.95; + double epsilon = 0.0001; + size_t max_iterations = 2; + + // Pagerank wants store_transposed = TRUE + return generic_pagerank_nonconverging_test( + h_src, h_dst, h_wgt, h_result, num_vertices, num_edges, TRUE, alpha, epsilon, max_iterations); +} + int test_personalized_pagerank() { size_t num_edges = 3; @@ -318,6 +522,39 @@ int test_personalized_pagerank() max_iterations); } +int test_personalized_pagerank_non_convergence() +{ + size_t num_edges = 3; + size_t num_vertices = 4; + + vertex_t h_src[] = {0, 1, 2}; + vertex_t h_dst[] = {1, 2, 3}; + weight_t h_wgt[] = {1.f, 1.f, 1.f}; + weight_t h_result[] = { 0.03625, 0.285, 0.32125, 0.3575 }; + + + vertex_t h_personalized_vertices[] = {0, 1, 2, 3}; + weight_t h_personalized_values[] = {0.1, 0.2, 0.3, 0.4}; + + double alpha = 0.85; + double epsilon = 1.0e-6; + size_t max_iterations = 1; + + return generic_personalized_pagerank_nonconverging_test(h_src, + h_dst, + h_wgt, + h_result, + h_personalized_vertices, + h_personalized_values, + num_vertices, + num_edges, + num_vertices, + FALSE, + alpha, + epsilon, + max_iterations); +} + 
/******************************************************************************/ int main(int argc, char** argv) @@ -327,6 +564,8 @@ int main(int argc, char** argv) result |= RUN_TEST(test_pagerank_with_transpose); result |= RUN_TEST(test_pagerank_4); result |= RUN_TEST(test_pagerank_4_with_transpose); + result |= RUN_TEST(test_pagerank_non_convergence); result |= RUN_TEST(test_personalized_pagerank); + result |= RUN_TEST(test_personalized_pagerank_non_convergence); return result; } diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index b3d9e0271d0..922a6ff2781 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -120,30 +120,25 @@ class Tests_MGPageRank result_t constexpr alpha{0.85}; result_t constexpr epsilon{1e-6}; - rmm::device_uvector d_mg_pageranks(mg_graph_view.local_vertex_partition_range_size(), - handle_->get_stream()); - if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); hr_timer.start("MG PageRank"); } - cugraph::pagerank( + auto [d_mg_pageranks, metadata] = cugraph::pagerank( *handle_, mg_graph_view, mg_edge_weight_view, std::nullopt, d_mg_personalization_vertices - ? std::optional{(*d_mg_personalization_vertices).data()} - : std::nullopt, - d_mg_personalization_values - ? std::optional{(*d_mg_personalization_values).data()} - : std::nullopt, - d_mg_personalization_vertices - ? std::optional{static_cast((*d_mg_personalization_vertices).size())} + ? std::make_optional(std::make_tuple( + raft::device_span{d_mg_personalization_vertices->data(), + d_mg_personalization_vertices->size()}, + raft::device_span{d_mg_personalization_values->data(), + d_mg_personalization_values->size()})) : std::nullopt, - d_mg_pageranks.data(), + std::optional>{std::nullopt}, alpha, epsilon, std::numeric_limits::max(), @@ -211,25 +206,19 @@ class Tests_MGPageRank ASSERT_EQ(mg_graph_view.number_of_vertices(), sg_graph_view.number_of_vertices()); - rmm::device_uvector d_sg_pageranks(sg_graph_view.number_of_vertices(), - handle_->get_stream()); - - cugraph::pagerank( + auto [d_sg_pageranks, sg_metadata] = cugraph::pagerank( *handle_, sg_graph_view, sg_edge_weight_view, std::nullopt, d_mg_aggregate_personalization_vertices - ? std::optional{(*d_mg_aggregate_personalization_vertices).data()} - : std::nullopt, - d_mg_aggregate_personalization_values - ? std::optional{(*d_mg_aggregate_personalization_values).data()} - : std::nullopt, - d_mg_aggregate_personalization_vertices - ? std::optional{static_cast( - (*d_mg_aggregate_personalization_vertices).size())} + ? 
std::make_optional(std::make_tuple( + raft::device_span{d_mg_aggregate_personalization_vertices->data(), + d_mg_aggregate_personalization_vertices->size()}, + raft::device_span{d_mg_aggregate_personalization_values->data(), + d_mg_aggregate_personalization_values->size()})) : std::nullopt, - d_sg_pageranks.data(), + std::optional>{std::nullopt}, alpha, epsilon, std::numeric_limits::max(), // max_iterations diff --git a/cpp/tests/link_analysis/pagerank_test.cpp b/cpp/tests/link_analysis/pagerank_test.cpp index adb4ea2fa54..0354b69b8a8 100644 --- a/cpp/tests/link_analysis/pagerank_test.cpp +++ b/cpp/tests/link_analysis/pagerank_test.cpp @@ -206,30 +206,27 @@ class Tests_PageRank result_t constexpr alpha{0.85}; result_t constexpr epsilon{1e-6}; - rmm::device_uvector d_pageranks(graph_view.number_of_vertices(), handle.get_stream()); - if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement hr_timer.start("PageRank"); } - cugraph::pagerank( + auto [d_pageranks, metadata] = cugraph::pagerank( handle, graph_view, edge_weight_view, std::nullopt, d_personalization_vertices - ? std::optional{(*d_personalization_vertices).data()} + ? std::make_optional( + std::make_tuple(raft::device_span{d_personalization_vertices->data(), + d_personalization_vertices->size()}, + raft::device_span{d_personalization_values->data(), + d_personalization_values->size()})) : std::nullopt, - d_personalization_values ? std::optional{(*d_personalization_values).data()} - : std::nullopt, - d_personalization_vertices ? std::optional{(*d_personalization_vertices).size()} - : std::nullopt, - d_pageranks.data(), + std::optional>{std::nullopt}, alpha, epsilon, std::numeric_limits::max(), - false, false); if (cugraph::test::g_perf) { diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index 8ed49ccdd1b..3b9c4e007e2 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -118,5 +118,6 @@ from cugraph import gnn +from cugraph import exceptions __version__ = "23.08.00" diff --git a/python/cugraph/cugraph/dask/link_analysis/pagerank.py b/python/cugraph/cugraph/dask/link_analysis/pagerank.py index 4aba5725c1b..2dfd25fa522 100644 --- a/python/cugraph/cugraph/dask/link_analysis/pagerank.py +++ b/python/cugraph/cugraph/dask/link_analysis/pagerank.py @@ -13,31 +13,41 @@ # limitations under the License. # +import warnings + +import dask from dask.distributed import wait, default_client -import cugraph.dask.comms.comms as Comms import dask_cudf import cudf import numpy as np -import warnings -from cugraph.dask.common.input_utils import get_distributed_data - from pylibcugraph import ( + pagerank as plc_pagerank, + personalized_pagerank as plc_p_pagerank, + exceptions as plc_exceptions, ResourceHandle, - pagerank as pylibcugraph_pagerank, - personalized_pagerank as pylibcugraph_p_pagerank, ) +import cugraph.dask.comms.comms as Comms +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.exceptions import FailedToConvergeError + -def convert_to_cudf(cp_arrays): +def convert_to_return_tuple(plc_pr_retval): """ - Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper + Using the PLC pagerank return tuple, creates a cudf DataFrame from the cupy + arrays and extracts the (optional) bool. 
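    The PLC tuple is (vertices, pageranks) when pagerank was called with
    fail_on_nonconvergence=True, or (vertices, pageranks, converged) when
    called with False; in the two-element case the code below assumes
    convergence.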
""" - cupy_vertices, cupy_pagerank = cp_arrays + if len(plc_pr_retval) == 3: + cupy_vertices, cupy_pagerank, converged = plc_pr_retval + else: + cupy_vertices, cupy_pagerank = plc_pr_retval + converged = True + df = cudf.DataFrame() df["vertex"] = cupy_vertices df["pagerank"] = cupy_pagerank - return df + return (df, converged) # FIXME: Move this function to the utility module so that it can be @@ -99,20 +109,26 @@ def _call_plc_pagerank( epsilon, max_iterations, do_expensive_check, + fail_on_nonconvergence, ): - - return pylibcugraph_pagerank( - resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), - graph=mg_graph_x, - precomputed_vertex_out_weight_vertices=pre_vtx_o_wgt_vertices, - precomputed_vertex_out_weight_sums=pre_vtx_o_wgt_sums, - initial_guess_vertices=initial_guess_vertices, - initial_guess_values=initial_guess_values, - alpha=alpha, - epsilon=epsilon, - max_iterations=max_iterations, - do_expensive_check=do_expensive_check, - ) + try: + return plc_pagerank( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + precomputed_vertex_out_weight_vertices=pre_vtx_o_wgt_vertices, + precomputed_vertex_out_weight_sums=pre_vtx_o_wgt_sums, + initial_guess_vertices=initial_guess_vertices, + initial_guess_values=initial_guess_values, + alpha=alpha, + epsilon=epsilon, + max_iterations=max_iterations, + do_expensive_check=do_expensive_check, + fail_on_nonconvergence=fail_on_nonconvergence, + ) + # Re-raise this as a cugraph exception so users trying to catch this do not + # have to know to import another package. + except plc_exceptions.FailedToConvergeError as exc: + raise FailedToConvergeError from exc def _call_plc_personalized_pagerank( @@ -127,23 +143,30 @@ def _call_plc_personalized_pagerank( epsilon, max_iterations, do_expensive_check, + fail_on_nonconvergence, ): personalization_vertices = data_personalization["vertex"] personalization_values = data_personalization["values"] - return pylibcugraph_p_pagerank( - resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), - graph=mg_graph_x, - precomputed_vertex_out_weight_vertices=pre_vtx_o_wgt_vertices, - precomputed_vertex_out_weight_sums=pre_vtx_o_wgt_sums, - personalization_vertices=personalization_vertices, - personalization_values=personalization_values, - initial_guess_vertices=initial_guess_vertices, - initial_guess_values=initial_guess_values, - alpha=alpha, - epsilon=epsilon, - max_iterations=max_iterations, - do_expensive_check=do_expensive_check, - ) + try: + return plc_p_pagerank( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + precomputed_vertex_out_weight_vertices=pre_vtx_o_wgt_vertices, + precomputed_vertex_out_weight_sums=pre_vtx_o_wgt_sums, + personalization_vertices=personalization_vertices, + personalization_values=personalization_values, + initial_guess_vertices=initial_guess_vertices, + initial_guess_values=initial_guess_values, + alpha=alpha, + epsilon=epsilon, + max_iterations=max_iterations, + do_expensive_check=do_expensive_check, + fail_on_nonconvergence=fail_on_nonconvergence, + ) + # Re-raise this as a cugraph exception so users trying to catch this do not + # have to know to import another package. + except plc_exceptions.FailedToConvergeError as exc: + raise FailedToConvergeError from exc def pagerank( @@ -154,6 +177,7 @@ def pagerank( max_iter=100, tol=1.0e-5, nstart=None, + fail_on_nonconvergence=True, ): """ Find the PageRank values for each vertex in a graph using multiple GPUs. 
@@ -222,8 +246,18 @@ def pagerank( nstart['values'] : cudf.Series Pagerank values for vertices + fail_on_nonconvergence : bool (default=True) + If the solver does not reach convergence, raise an exception if + fail_on_nonconvergence is True. If fail_on_nonconvergence is False, + the return value is a tuple of (pagerank, converged) where pagerank is + a cudf.DataFrame as described below, and converged is a boolean + indicating if the solver converged (True) or not (False). + Returns ------- + The return value varies based on the value of the fail_on_nonconvergence + paramter. If fail_on_nonconvergence is True: + PageRank : dask_cudf.DataFrame GPU data frame containing two dask_cudf.Series of size V: the vertex identifiers and the corresponding PageRank values. @@ -244,6 +278,12 @@ def pagerank( ddf['pagerank'] : dask_cudf.Series Contains the PageRank score + If fail_on_nonconvergence is False: + + (PageRank, converged) : tuple of (dask_cudf.DataFrame, bool) + PageRank is the GPU dataframe described above, converged is a bool + indicating if the solver converged (True) or not (False). + Examples -------- >>> import cugraph.dask as dcg @@ -328,6 +368,7 @@ def pagerank( tol, max_iter, do_expensive_check, + fail_on_nonconvergence, workers=[w], allow_other_workers=False, ) @@ -347,6 +388,7 @@ def pagerank( tol, max_iter, do_expensive_check, + fail_on_nonconvergence, workers=[w], allow_other_workers=False, ) @@ -355,17 +397,35 @@ def pagerank( wait(result) - cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + + # Have each worker convert tuple of arrays and bool from PLC to cudf + # DataFrames and bools. This will be a list of futures. + result_tuples = [ + client.submit(convert_to_return_tuple, cp_arrays) for cp_arrays in result + ] - wait(cudf_result) + # Convert the futures to dask delayed objects so the tuples can be + # split. nout=2 is passed since each tuple/iterable is a fixed length of 2. + result_tuples = [dask.delayed(r, nout=2) for r in result_tuples] + + # Create the ddf and get the converged bool from the delayed objs. Use a + # meta DataFrame to pass the expected dtypes for the DataFrame to prevent + # another compute to determine them automatically. + meta = cudf.DataFrame(columns=["vertex", "pagerank"]) + meta = meta.astype({"pagerank": "float64", "vertex": vertex_dtype}) + ddf = dask_cudf.from_delayed([t[0] for t in result_tuples], meta=meta).persist() + converged = all(dask.compute(*[t[1] for t in result_tuples])) - ddf = dask_cudf.from_delayed(cudf_result).persist() wait(ddf) # Wait until the inactive futures are released - wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + wait([(r.release(), c_r.release()) for r, c_r in zip(result, result_tuples)]) if input_graph.renumbered: ddf = input_graph.unrenumber(ddf, "vertex") - return ddf + if fail_on_nonconvergence: + return ddf + else: + return (ddf, converged) diff --git a/python/cugraph/cugraph/exceptions.py b/python/cugraph/cugraph/exceptions.py new file mode 100644 index 00000000000..64280603112 --- /dev/null +++ b/python/cugraph/cugraph/exceptions.py @@ -0,0 +1,26 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exception classes for cugraph. +""" + + +class FailedToConvergeError(Exception): + """ + Raised when an algorithm fails to converge within a predetermined set of + constraints which vary based on the algorithm, and may or may not be + user-configurable. + """ + + pass diff --git a/python/cugraph/cugraph/link_analysis/pagerank.py b/python/cugraph/cugraph/link_analysis/pagerank.py index 6696512dcf0..d2b827fa7c8 100644 --- a/python/cugraph/cugraph/link_analysis/pagerank.py +++ b/python/cugraph/cugraph/link_analysis/pagerank.py @@ -11,20 +11,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_score_to_dictionary, -) +import warnings + import cudf import numpy as np -import warnings from pylibcugraph import ( - pagerank as pylibcugraph_pagerank, - personalized_pagerank as pylibcugraph_p_pagerank, + pagerank as plc_pagerank, + personalized_pagerank as plc_p_pagerank, + exceptions as plc_exceptions, ResourceHandle, ) +from cugraph.utilities import ( + ensure_cugraph_obj_for_nx, + df_score_to_dictionary, +) +from cugraph.exceptions import FailedToConvergeError + def renumber_vertices(input_graph, input_df): if len(input_graph.renumber_map.implementation.col_names) > 1: @@ -86,9 +90,9 @@ def pagerank( nstart=None, weight=None, dangling=None, + fail_on_nonconvergence=True, ): - """ - Find the PageRank score for every vertex in a graph. cuGraph computes an + """Find the PageRank score for every vertex in a graph. cuGraph computes an approximation of the Pagerank eigenvector using the power method. The number of iterations depends on the properties of the network itself; it increases when the tolerance descreases and/or alpha increases toward the @@ -163,8 +167,18 @@ def pagerank( dangling : dict, optional (default=None) This parameter is here for NetworkX compatibility and ignored + fail_on_nonconvergence : bool (default=True) + If the solver does not reach convergence, raise an exception if + fail_on_nonconvergence is True. If fail_on_nonconvergence is False, + the return value is a tuple of (pagerank, converged) where pagerank is + a cudf.DataFrame as described below, and converged is a boolean + indicating if the solver converged (True) or not (False). + Returns ------- + The return value varies based on the value of the fail_on_nonconvergence + paramter. If fail_on_nonconvergence is True: + PageRank : cudf.DataFrame GPU data frame containing two cudf.Series of size V: the vertex identifiers and the corresponding PageRank values. @@ -185,6 +199,12 @@ def pagerank( df['pagerank'] : cudf.Series Contains the PageRank score + If fail_on_nonconvergence is False: + + (PageRank, converged) : tuple of (cudf.DataFrame, bool) + PageRank is the GPU dataframe described above, converged is a bool + indicating if the solver converged (True) or not (False). 
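        A typical call in this mode (illustrative):
        (df, converged) = cugraph.pagerank(G, fail_on_nonconvergence=False)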
+ Examples -------- >>> from cugraph.experimental.datasets import karate @@ -226,47 +246,55 @@ def pagerank( pre_vtx_o_wgt_vertices = precomputed_vertex_out_weight["vertex"] pre_vtx_o_wgt_sums = precomputed_vertex_out_weight["sums"] - if personalization is not None: - if not isinstance(personalization, cudf.DataFrame): - raise NotImplementedError( - "personalization other than a cudf dataframe " "currently not supported" + try: + if personalization is not None: + if not isinstance(personalization, cudf.DataFrame): + raise NotImplementedError( + "personalization other than a cudf dataframe currently not " + "supported" + ) + if G.renumbered is True: + personalization = renumber_vertices(G, personalization) + + personalization = ensure_valid_dtype(G, personalization, "personalization") + + result_tuple = plc_p_pagerank( + resource_handle=ResourceHandle(), + graph=G._plc_graph, + precomputed_vertex_out_weight_vertices=pre_vtx_o_wgt_vertices, + precomputed_vertex_out_weight_sums=pre_vtx_o_wgt_sums, + personalization_vertices=personalization["vertex"], + personalization_values=personalization["values"], + initial_guess_vertices=initial_guess_vertices, + initial_guess_values=initial_guess_values, + alpha=alpha, + epsilon=tol, + max_iterations=max_iter, + do_expensive_check=do_expensive_check, + fail_on_nonconvergence=fail_on_nonconvergence, ) - if G.renumbered is True: - personalization = renumber_vertices(G, personalization) - - personalization = ensure_valid_dtype(G, personalization, "personalization") - - vertex, pagerank_values = pylibcugraph_p_pagerank( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - precomputed_vertex_out_weight_vertices=pre_vtx_o_wgt_vertices, - precomputed_vertex_out_weight_sums=pre_vtx_o_wgt_sums, - personalization_vertices=personalization["vertex"], - personalization_values=personalization["values"], - initial_guess_vertices=initial_guess_vertices, - initial_guess_values=initial_guess_values, - alpha=alpha, - epsilon=tol, - max_iterations=max_iter, - do_expensive_check=do_expensive_check, - ) - else: - vertex, pagerank_values = pylibcugraph_pagerank( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - precomputed_vertex_out_weight_vertices=pre_vtx_o_wgt_vertices, - precomputed_vertex_out_weight_sums=pre_vtx_o_wgt_sums, - initial_guess_vertices=initial_guess_vertices, - initial_guess_values=initial_guess_values, - alpha=alpha, - epsilon=tol, - max_iterations=max_iter, - do_expensive_check=do_expensive_check, - ) + else: + result_tuple = plc_pagerank( + resource_handle=ResourceHandle(), + graph=G._plc_graph, + precomputed_vertex_out_weight_vertices=pre_vtx_o_wgt_vertices, + precomputed_vertex_out_weight_sums=pre_vtx_o_wgt_sums, + initial_guess_vertices=initial_guess_vertices, + initial_guess_values=initial_guess_values, + alpha=alpha, + epsilon=tol, + max_iterations=max_iter, + do_expensive_check=do_expensive_check, + fail_on_nonconvergence=fail_on_nonconvergence, + ) + # Re-raise this as a cugraph exception so users trying to catch this do not + # have to know to import another package. 
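    # (`raise ... from exc` chains the original pylibcugraph error as
    # __cause__, so the low-level traceback remains visible for debugging.)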
+ except plc_exceptions.FailedToConvergeError as exc: + raise FailedToConvergeError from exc df = cudf.DataFrame() - df["vertex"] = vertex - df["pagerank"] = pagerank_values + df["vertex"] = result_tuple[0] + df["pagerank"] = result_tuple[1] if G.renumbered: df = G.unrenumber(df, "vertex") @@ -274,4 +302,7 @@ def pagerank( if isNx is True: df = df_score_to_dictionary(df, "pagerank") - return df + if fail_on_nonconvergence: + return df + else: + return (df, result_tuple[2]) diff --git a/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py b/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py index ba136963b60..b7487ae329c 100644 --- a/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py +++ b/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py @@ -432,3 +432,49 @@ def test_pagerank_transposed_false(): with pytest.warns(UserWarning, match=warning_msg): cugraph.pagerank(G) + + +@pytest.mark.sg +def test_pagerank_non_convergence(): + G = karate.get_graph(create_using=cugraph.Graph(directed=True)) + + # Not enough allowed iterations, should not converge + with pytest.raises(cugraph.exceptions.FailedToConvergeError): + df = cugraph.pagerank(G, max_iter=1, fail_on_nonconvergence=True) + + # Not enough allowed iterations, should not converge but do not consider + # that an error + (df, converged) = cugraph.pagerank(G, max_iter=1, fail_on_nonconvergence=False) + assert type(df) is cudf.DataFrame + assert type(converged) is bool + assert converged is False + + # The default max_iter value should allow convergence for this graph + (df, converged) = cugraph.pagerank(G, fail_on_nonconvergence=False) + assert type(df) is cudf.DataFrame + assert type(converged) is bool + assert converged is True + + # Test personalized pagerank the same way + personalization = cudf.DataFrame() + personalization["vertex"] = [17, 26] + personalization["values"] = [0.5, 0.75] + + with pytest.raises(cugraph.exceptions.FailedToConvergeError): + df = cugraph.pagerank( + G, max_iter=1, personalization=personalization, fail_on_nonconvergence=True + ) + + (df, converged) = cugraph.pagerank( + G, max_iter=1, personalization=personalization, fail_on_nonconvergence=False + ) + assert type(df) is cudf.DataFrame + assert type(converged) is bool + assert converged is False + + (df, converged) = cugraph.pagerank( + G, personalization=personalization, fail_on_nonconvergence=False + ) + assert type(df) is cudf.DataFrame + assert type(converged) is bool + assert converged is True diff --git a/python/cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py b/python/cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py index 941974eea4f..14a512c59e5 100644 --- a/python/cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py +++ b/python/cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py @@ -48,6 +48,25 @@ def personalize(vertices, personalization_perc): return cu_personalization, personalization +def create_distributed_karate_graph(store_transposed=True): + input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() + + chunksize = dcg.get_chunksize(input_data_path) + + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=True) + dg.from_dask_cudf_edgelist(ddf, "src", "dst", store_transposed=store_transposed) + + return dg + + # ============================================================================= # Parameters # 
============================================================================= @@ -197,20 +216,7 @@ def test_pagerank_invalid_personalization_dtype(dask_client): @pytest.mark.mg def test_dask_pagerank_transposed_false(dask_client): - input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() - - chunksize = dcg.get_chunksize(input_data_path) - - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=True) - dg.from_dask_cudf_edgelist(ddf, "src", "dst", store_transposed=False) + dg = create_distributed_karate_graph(store_transposed=False) warning_msg = ( "Pagerank expects the 'store_transposed' " @@ -220,3 +226,49 @@ def test_dask_pagerank_transposed_false(dask_client): with pytest.warns(UserWarning, match=warning_msg): dcg.pagerank(dg) + + +@pytest.mark.mg +def test_pagerank_non_convergence(dask_client): + dg = create_distributed_karate_graph() + + # Not enough allowed iterations, should not converge + with pytest.raises(cugraph.exceptions.FailedToConvergeError): + ddf = dcg.pagerank(dg, max_iter=1, fail_on_nonconvergence=True) + + # Not enough allowed iterations, should not converge but do not consider + # that an error + (ddf, converged) = dcg.pagerank(dg, max_iter=1, fail_on_nonconvergence=False) + assert type(ddf) is dask_cudf.DataFrame + assert type(converged) is bool + assert converged is False + + # The default max_iter value should allow convergence for this graph + (ddf, converged) = dcg.pagerank(dg, fail_on_nonconvergence=False) + assert type(ddf) is dask_cudf.DataFrame + assert type(converged) is bool + assert converged is True + + # Test personalized pagerank the same way + personalization = cudf.DataFrame() + personalization["vertex"] = [17, 26] + personalization["values"] = [0.5, 0.75] + + with pytest.raises(cugraph.exceptions.FailedToConvergeError): + df = dcg.pagerank( + dg, max_iter=1, personalization=personalization, fail_on_nonconvergence=True + ) + + (df, converged) = dcg.pagerank( + dg, max_iter=1, personalization=personalization, fail_on_nonconvergence=False + ) + assert type(df) is dask_cudf.DataFrame + assert type(converged) is bool + assert converged is False + + (df, converged) = dcg.pagerank( + dg, personalization=personalization, fail_on_nonconvergence=False + ) + assert type(df) is dask_cudf.DataFrame + assert type(converged) is bool + assert converged is True diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index e0d7b6797d4..5c03d8f98cc 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -81,4 +81,6 @@ from pylibcugraph.select_random_vertices import select_random_vertices +from pylibcugraph import exceptions + __version__ = "23.08.00" diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/centrality_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/centrality_algorithms.pxd index 06838256f30..6cd02ed6f17 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/centrality_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/centrality_algorithms.pxd @@ -47,6 +47,16 @@ cdef extern from "cugraph_c/centrality_algorithms.h": cugraph_centrality_result_t* result ) + cdef size_t \ + cugraph_centrality_result_get_num_iterations( + cugraph_centrality_result_t* result + ) + + cdef bool_t \ + cugraph_centrality_result_converged( + cugraph_centrality_result_t* result + ) + cdef void 
\ cugraph_centrality_result_free( cugraph_centrality_result_t* result @@ -68,6 +78,22 @@ cdef extern from "cugraph_c/centrality_algorithms.h": cugraph_error_t** error ) + cdef cugraph_error_code_t \ + cugraph_pagerank_allow_nonconvergence( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_vertices, + const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_sums, + const cugraph_type_erased_device_array_view_t* initial_guess_vertices, + const cugraph_type_erased_device_array_view_t* initial_guess_values, + double alpha, + double epsilon, + size_t max_iterations, + bool_t do_expensive_check, + cugraph_centrality_result_t** result, + cugraph_error_t** error + ) + cdef cugraph_error_code_t \ cugraph_personalized_pagerank( const cugraph_resource_handle_t* handle, @@ -86,6 +112,24 @@ cdef extern from "cugraph_c/centrality_algorithms.h": cugraph_error_t** error ) + cdef cugraph_error_code_t \ + cugraph_personalized_pagerank_allow_nonconvergence( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_vertices, + const cugraph_type_erased_device_array_view_t* precomputed_vertex_out_weight_sums, + const cugraph_type_erased_device_array_view_t* initial_guess_vertices, + const cugraph_type_erased_device_array_view_t* initial_guess_values, + const cugraph_type_erased_device_array_view_t* personalization_vertices, + const cugraph_type_erased_device_array_view_t* personalization_values, + double alpha, + double epsilon, + size_t max_iterations, + bool_t do_expensive_check, + cugraph_centrality_result_t** result, + cugraph_error_t** error + ) + ########################################################################### # eigenvector centrality cdef cugraph_error_code_t \ @@ -167,4 +211,4 @@ cdef extern from "cugraph_c/centrality_algorithms.h": bool_t do_expensive_check, cugraph_centrality_result_t** result, cugraph_error_t** error - ) \ No newline at end of file + ) diff --git a/python/pylibcugraph/pylibcugraph/exceptions.py b/python/pylibcugraph/pylibcugraph/exceptions.py new file mode 100644 index 00000000000..54b58d840b3 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/exceptions.py @@ -0,0 +1,26 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exception classes for pylibcugraph. +""" + + +class FailedToConvergeError(Exception): + """ + Raised when an algorithm fails to converge within a predetermined set of + constraints which vary based on the algorithm, and may or may not be + user-configurable. + """ + + pass diff --git a/python/pylibcugraph/pylibcugraph/pagerank.pyx b/python/pylibcugraph/pylibcugraph/pagerank.pyx index 7d8f7807ead..a5022072b4c 100644 --- a/python/pylibcugraph/pylibcugraph/pagerank.pyx +++ b/python/pylibcugraph/pylibcugraph/pagerank.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. 
+# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -35,7 +35,8 @@ from pylibcugraph._cugraph_c.graph cimport ( ) from pylibcugraph._cugraph_c.centrality_algorithms cimport ( cugraph_centrality_result_t, - cugraph_pagerank, + cugraph_pagerank_allow_nonconvergence, + cugraph_centrality_result_converged, cugraph_centrality_result_get_vertices, cugraph_centrality_result_get_values, cugraph_centrality_result_free, @@ -53,6 +54,7 @@ from pylibcugraph.utils cimport ( get_c_type_from_numpy_type, create_cugraph_type_erased_device_array_view_from_py_obj, ) +from pylibcugraph.exceptions import FailedToConvergeError def pagerank(ResourceHandle resource_handle, @@ -64,7 +66,8 @@ def pagerank(ResourceHandle resource_handle, double alpha, double epsilon, size_t max_iterations, - bool_t do_expensive_check): + bool_t do_expensive_check, + fail_on_nonconvergence=True): """ Find the PageRank score for every vertex in a graph by computing an approximation of the Pagerank eigenvector using the power method. The @@ -123,13 +126,29 @@ def pagerank(ResourceHandle resource_handle, If True, performs more extensive tests on the inputs to ensure validitity, at the expense of increased run time. + fail_on_nonconvergence : bool (default=True) + If the solver does not reach convergence, raise an exception if + fail_on_nonconvergence is True. If fail_on_nonconvergence is False, + the return value is a tuple of (pagerank, converged) where pagerank is + a cudf.DataFrame as described below, and converged is a boolean + indicating if the solver converged (True) or not (False). + Returns ------- - A tuple of device arrays, where the first item in the tuple is a device - array containing the vertex identifiers, and the second item is a device - array containing the pagerank values for the corresponding vertices. For - example, the vertex identifier at the ith element of the vertex array has - the pagerank value of the ith element in the pagerank array. + The return value varies based on the value of the fail_on_nonconvergence + paramter. If fail_on_nonconvergence is True: + + A tuple of device arrays, where the first item in the tuple is a device + array containing the vertex identifiers, and the second item is a device + array containing the pagerank values for the corresponding vertices. For + example, the vertex identifier at the ith element of the vertex array + has the pagerank value of the ith element in the pagerank array. + + If fail_on_nonconvergence is False: + + A three-tuple where the first two items are the device arrays described + above, and the third is a bool indicating if the solver converged (True) + or not (False). 
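    A typical call in that mode unpacks three values (illustrative):
    vertices, values, converged = pagerank(..., fail_on_nonconvergence=False)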
Examples -------- @@ -195,30 +214,35 @@ def pagerank(ResourceHandle resource_handle, cdef cugraph_centrality_result_t* result_ptr cdef cugraph_error_code_t error_code cdef cugraph_error_t* error_ptr + cdef bool_t converged + cdef cugraph_type_erased_device_array_view_t* vertices_ptr + cdef cugraph_type_erased_device_array_view_t* pageranks_ptr + + error_code = cugraph_pagerank_allow_nonconvergence( + c_resource_handle_ptr, + c_graph_ptr, + precomputed_vertex_out_weight_vertices_view_ptr, + precomputed_vertex_out_weight_sums_view_ptr, + initial_guess_vertices_view_ptr, + initial_guess_values_view_ptr, + alpha, + epsilon, + max_iterations, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_pagerank_allow_nonconvergence") + + converged = cugraph_centrality_result_converged(result_ptr) - error_code = cugraph_pagerank(c_resource_handle_ptr, - c_graph_ptr, - precomputed_vertex_out_weight_vertices_view_ptr, - precomputed_vertex_out_weight_sums_view_ptr, - initial_guess_vertices_view_ptr, - initial_guess_values_view_ptr, - alpha, - epsilon, - max_iterations, - do_expensive_check, - &result_ptr, - &error_ptr) - assert_success(error_code, error_ptr, "cugraph_pagerank") - - # Extract individual device array pointers from result and copy to cupy - # arrays for returning. - cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \ - cugraph_centrality_result_get_vertices(result_ptr) - cdef cugraph_type_erased_device_array_view_t* pageranks_ptr = \ - cugraph_centrality_result_get_values(result_ptr) - - cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr) - cupy_pageranks = copy_to_cupy_array(c_resource_handle_ptr, pageranks_ptr) + # Only extract results if necessary + if (fail_on_nonconvergence is False) or (converged is True): + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. + vertices_ptr = cugraph_centrality_result_get_vertices(result_ptr) + pageranks_ptr = cugraph_centrality_result_get_values(result_ptr) + cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr) + cupy_pageranks = copy_to_cupy_array(c_resource_handle_ptr, pageranks_ptr) # Free all pointers cugraph_centrality_result_free(result_ptr) @@ -231,4 +255,10 @@ def pagerank(ResourceHandle resource_handle, if precomputed_vertex_out_weight_sums is not None: cugraph_type_erased_device_array_view_free(precomputed_vertex_out_weight_sums_view_ptr) - return (cupy_vertices, cupy_pageranks) + if fail_on_nonconvergence is False: + return (cupy_vertices, cupy_pageranks, bool(converged)) + else: + if converged is True: + return (cupy_vertices, cupy_pageranks) + else: + raise FailedToConvergeError diff --git a/python/pylibcugraph/pylibcugraph/personalized_pagerank.pyx b/python/pylibcugraph/pylibcugraph/personalized_pagerank.pyx index 89b57f139a1..e60e7fa2c3e 100644 --- a/python/pylibcugraph/pylibcugraph/personalized_pagerank.pyx +++ b/python/pylibcugraph/pylibcugraph/personalized_pagerank.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -35,7 +35,8 @@ from pylibcugraph._cugraph_c.graph cimport ( ) from pylibcugraph._cugraph_c.centrality_algorithms cimport ( cugraph_centrality_result_t, - cugraph_personalized_pagerank, + cugraph_personalized_pagerank_allow_nonconvergence, + cugraph_centrality_result_converged, cugraph_centrality_result_get_vertices, cugraph_centrality_result_get_values, cugraph_centrality_result_free, @@ -53,6 +54,7 @@ from pylibcugraph.utils cimport ( get_c_type_from_numpy_type, create_cugraph_type_erased_device_array_view_from_py_obj, ) +from pylibcugraph.exceptions import FailedToConvergeError def personalized_pagerank(ResourceHandle resource_handle, @@ -66,7 +68,8 @@ def personalized_pagerank(ResourceHandle resource_handle, double alpha, double epsilon, size_t max_iterations, - bool_t do_expensive_check): + bool_t do_expensive_check, + fail_on_nonconvergence=True): """ Find the PageRank score for every vertex in a graph by computing an approximation of the Pagerank eigenvector using the power method. The @@ -85,27 +88,21 @@ def personalized_pagerank(ResourceHandle resource_handle, precomputed_vertex_out_weight_vertices: device array type Subset of vertices of graph for precomputed_vertex_out_weight - (a performance optimization) precomputed_vertex_out_weight_sums : device array type Corresponding precomputed sum of outgoing vertices weight - (a performance optimization) - + initial_guess_vertices : device array type Subset of vertices of graph for initial guess for pagerank values - (a performance optimization) - + initial_guess_values : device array type Pagerank values for vertices - (a performance optimization) - + personalization_vertices : device array type Subset of vertices of graph for personalization - (a performance optimization) - + personalization_values : device array type Personalization values for vertices - (a performance optimization) alpha : double The damping factor alpha represents the probability to follow an @@ -133,13 +130,29 @@ def personalized_pagerank(ResourceHandle resource_handle, If True, performs more extensive tests on the inputs to ensure validitity, at the expense of increased run time. + fail_on_nonconvergence : bool (default=True) + If the solver does not reach convergence, raise an exception if + fail_on_nonconvergence is True. If fail_on_nonconvergence is False, + the return value is a tuple of (pagerank, converged) where pagerank is + a cudf.DataFrame as described below, and converged is a boolean + indicating if the solver converged (True) or not (False). + Returns ------- - A tuple of device arrays, where the first item in the tuple is a device - array containing the vertex identifiers, and the second item is a device - array containing the pagerank values for the corresponding vertices. For - example, the vertex identifier at the ith element of the vertex array has - the pagerank value of the ith element in the pagerank array. + The return value varies based on the value of the fail_on_nonconvergence + paramter. If fail_on_nonconvergence is True: + + A tuple of device arrays, where the first item in the tuple is a device + array containing the vertex identifiers, and the second item is a device + array containing the pagerank values for the corresponding vertices. For + example, the vertex identifier at the ith element of the vertex array has + the pagerank value of the ith element in the pagerank array. 
+ + If fail_on_nonconvergence is False: + + A three-tuple where the first two items are the device arrays described + above, and the third is a bool indicating if the solver converged (True) + or not (False). Examples -------- @@ -207,12 +220,12 @@ def personalized_pagerank(ResourceHandle resource_handle, precomputed_vertex_out_weight_sums_view_ptr = \ create_cugraph_type_erased_device_array_view_from_py_obj( precomputed_vertex_out_weight_sums) - + cdef cugraph_type_erased_device_array_view_t* \ personalization_vertices_view_ptr = \ create_cugraph_type_erased_device_array_view_from_py_obj( personalization_vertices) - + cdef cugraph_type_erased_device_array_view_t* \ personalization_values_view_ptr = \ create_cugraph_type_erased_device_array_view_from_py_obj( @@ -221,32 +234,38 @@ def personalized_pagerank(ResourceHandle resource_handle, cdef cugraph_centrality_result_t* result_ptr cdef cugraph_error_code_t error_code cdef cugraph_error_t* error_ptr + cdef bool_t converged + cdef cugraph_type_erased_device_array_view_t* vertices_ptr + cdef cugraph_type_erased_device_array_view_t* pageranks_ptr + + error_code = cugraph_personalized_pagerank_allow_nonconvergence( + c_resource_handle_ptr, + c_graph_ptr, + precomputed_vertex_out_weight_vertices_view_ptr, + precomputed_vertex_out_weight_sums_view_ptr, + initial_guess_vertices_view_ptr, + initial_guess_values_view_ptr, + personalization_vertices_view_ptr, + personalization_values_view_ptr, + alpha, + epsilon, + max_iterations, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success( + error_code, error_ptr, "cugraph_personalized_pagerank_allow_nonconvergence") + + converged = cugraph_centrality_result_converged(result_ptr) - error_code = cugraph_personalized_pagerank(c_resource_handle_ptr, - c_graph_ptr, - precomputed_vertex_out_weight_vertices_view_ptr, - precomputed_vertex_out_weight_sums_view_ptr, - initial_guess_vertices_view_ptr, - initial_guess_values_view_ptr, - personalization_vertices_view_ptr, - personalization_values_view_ptr, - alpha, - epsilon, - max_iterations, - do_expensive_check, - &result_ptr, - &error_ptr) - assert_success(error_code, error_ptr, "cugraph_personalized_pagerank") - - # Extract individual device array pointers from result and copy to cupy - # arrays for returning. - cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \ - cugraph_centrality_result_get_vertices(result_ptr) - cdef cugraph_type_erased_device_array_view_t* pageranks_ptr = \ - cugraph_centrality_result_get_values(result_ptr) - - cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr) - cupy_pageranks = copy_to_cupy_array(c_resource_handle_ptr, pageranks_ptr) + # Only extract results if necessary + if (fail_on_nonconvergence is False) or (converged is True): + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
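        # (When fail_on_nonconvergence is True and the solver did not
        # converge, this extraction is skipped and the exception path at the
        # end of the function runs instead, avoiding pointless copies.)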
+ vertices_ptr = cugraph_centrality_result_get_vertices(result_ptr) + pageranks_ptr = cugraph_centrality_result_get_values(result_ptr) + cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr) + cupy_pageranks = copy_to_cupy_array(c_resource_handle_ptr, pageranks_ptr) # Free all pointers cugraph_centrality_result_free(result_ptr) @@ -263,4 +282,10 @@ def personalized_pagerank(ResourceHandle resource_handle, if personalization_values is not None: cugraph_type_erased_device_array_view_free(personalization_values_view_ptr) - return (cupy_vertices, cupy_pageranks) + if fail_on_nonconvergence is False: + return (cupy_vertices, cupy_pageranks, bool(converged)) + else: + if converged is True: + return (cupy_vertices, cupy_pageranks) + else: + raise FailedToConvergeError From 49378a355d39332afa9490f0a87760062e473faa Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 21 Jun 2023 16:38:31 -0700 Subject: [PATCH 02/10] [REVIEW] Optimize bulk sampling (#3661) This PR optimizes bulk sampling by persisting the samples to prevent redundant computations . I see a 50% boost and consistent perf with this PR on `13.5 s` vs `9s` on https://github.com/rapidsai/cugraph/pull/3628 Authors: - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3661 --- python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py index 0257a56ba08..33de5fdc185 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py @@ -172,6 +172,8 @@ def flush(self) -> None: if self.size == 0: return self.__batches.reset_index(drop=True) + if isinstance(self.__batches, dask_cudf.DataFrame): + self.__batches = self.__batches.persist() min_batch_id = self.__batches[self.batch_col_name].min() if isinstance(self.__batches, dask_cudf.DataFrame): @@ -213,6 +215,9 @@ def flush(self) -> None: ) self.__batches = self.__batches[~batch_id_filter] + if isinstance(self.__batches, dask_cudf.DataFrame): + self.__batches = self.__batches.persist() + self.__write(samples, offsets) if self.size > 0: From c7a23b828db7a8b1a45bf72b2cc78953bfeebe7d Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Thu, 22 Jun 2023 12:58:09 -0400 Subject: [PATCH 03/10] Add Benchmark for Bulk Sampling (#3628) Adds the benchmark for bulk sampling. 
Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3628 --- .../standalone/bulk_sampling/README.md | 116 +++ .../bulk_sampling/cugraph_bulk_sampling.py | 736 ++++++++++++++++++ python/cugraph/cugraph/testing/mg_utils.py | 11 + 3 files changed, 863 insertions(+) create mode 100644 benchmarks/cugraph/standalone/bulk_sampling/README.md create mode 100644 benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py diff --git a/benchmarks/cugraph/standalone/bulk_sampling/README.md b/benchmarks/cugraph/standalone/bulk_sampling/README.md new file mode 100644 index 00000000000..f48eea5c556 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/README.md @@ -0,0 +1,116 @@ +# cuGraph Bulk Sampling + +## Overview +The `cugraph_bulk_sampling.py` script runs the bulk sampler for a variety of datasets, including +both generated (rmat) datasets and disk (ogbn_papers100M, etc.) datasets. It can also load +replicas of these datasets to create a larger benchmark (i.e. ogbn_papers100M x2). + +## Arguments +The script takes a variety of arguments to control sampling behavior. +Required: + --output_root + The output root directory. File/folder names are auto-generated. + For instance, if the output root directory is /home/samples, + the samples will be written to a new folder in /home/samples that + contains information about the sampling run as well as the time + of the run. + + --dataset_root + The folder where datasets are stored. Uses the format described + in the input format section. + + --datasets + Comma-separated list of datasets; can specify ogb or rmat (i.e. ogb_papers100M[2],rmat_22_16). + For ogb datasets, can provide replication factor using brackets. + Will attempt to read from dataset_root/. + +Optional: + --fanouts + Comma-separated list of fanout values (i.e. [10, 25]). + The default fanout is [10, 25]. + + --batch_sizes + Comma-separated list of batch sizes (i.e. 500, 1000). + Defaults to "512,1024" + + --seeds_per_call_opts + Comma-separated list of seeds per call. Controls the number of input seed vertices processed + in a single sampling call. + Defaults to 524288 + + --reverse_edges + Whether to reverse the edges of the input edgelist. Should be set to False for PyG and True for DGL. + Defaults to False (PyG). + + --dask_worker_devices + Comma-separated list of the GPUs to assign to dask (i.e. "0,1,2"). + Defaults to just the default GPU (0). + Changing this is strongly recommended in order to take advantage of all GPUs on the system. + + --random_seed + Seed for random number generation. + Defaults to '62' + + --persist + Whether to aggressively use persist() in dask to make the ETL steps (NOT PART OF SAMPLING) faster. + Will probably make this script finish sooner at the expense of memory usage, but won't affect + sampling time. + Changing this is not recommended unless you know what you are doing. + Defaults to False. + +## Input Format +The script expects its input data in the following format: +``` + +| +|------ meta.json +|------ parquet +|------ |---------- +|------ |---------- |---------------------------- [node_label.parquet] +|------ |---------- +|------ |---------- |---------------------------- [node_label.parquet] +... 
+|------ |---------- +|------ |---------- |------------------------------------------ edge_index.parquet +|------ |---------- +|------ |---------- |------------------------------------------ edge_index.parquet +... + +``` + +`node_label.parquet` only needs to be present for vertex types that have labeled +nodes. It consists of two columns, "node" which contains node ids, and "label", +which contains the labeled class of the node. + +`edge_index.parquet` is required for all edge types. It has two columns, `src` +and `dst`, representing the source and destination vertices of the edges in that +edge type's COO edge index. + +`meta.json` is a json file containing metadata needed to properly process +the parquet files. It must have the following format: +``` +{ + "num_nodes": { + ": <# nodes of node type 0>, + ": <# nodes of node type 1>, + ... + }, + "num_edges": { + : <# edges of edge type 0>, + : <# edges of edge type 1>, + ... + } +} +``` + +## Output Meta +The script, in addition to the samples, will also output a file named `output_meta.json`. +This file contains various statistics about the sampling run, including the runtime, +as well as information about the dataset and system that the samples were produced from. + +This metadata file can be used to gather the results from the sampling and training stages +together. + +## Other Notes +For rmat datasets, you will need to generate your own bogus features in the training stage. +Since that is trivial, that is not done in this sampling script. diff --git a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py new file mode 100644 index 00000000000..3cfd39afc98 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py @@ -0,0 +1,736 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import warnings +import argparse +import traceback + +from cugraph.testing.mg_utils import ( + generate_edgelist_rmat, + # get_allocation_counts_dask_persist, + get_allocation_counts_dask_lazy, + sizeof_fmt, + get_peak_output_ratio_across_workers, + restart_client, + start_dask_client, + stop_dask_client, + enable_spilling, +) + +from cugraph.structure.symmetrize import symmetrize +from cugraph.experimental.gnn import BulkSampler + +import cugraph + +from datetime import datetime + +import json +import re +import os +import gc +from time import sleep, perf_counter +from math import ceil + +import pandas as pd +import numpy as np +import cupy +import cudf + +import dask_cudf +import dask.dataframe as ddf +from dask.distributed import default_client + +from typing import Optional, Union, Dict + + +def construct_graph(dask_dataframe): + """ + Args: + dask_dataframe: + dask_dataframe contains weighted and undirected edges with self + loops. Multiple edges will likely be present as well. + directed: + If True, the graph will be directed. + renumber: + If True, the graph will be renumbered. 
+    assert dask_dataframe['src'].dtype == 'int64'
+    assert dask_dataframe['dst'].dtype == 'int64'
+
+    if 'etp' in dask_dataframe.columns:
+        assert dask_dataframe['etp'].dtype == 'int32'
+
+    G = cugraph.MultiGraph(directed=True)
+    G.from_dask_cudf_edgelist(
+        dask_dataframe,
+        source="src",
+        destination="dst",
+        edge_type='etp' if 'etp' in dask_dataframe.columns else None,
+        renumber=False
+    )
+    return G
+
+
+def symmetrize_ddf(dask_dataframe):
+    source_col, dest_col = symmetrize(
+        dask_dataframe,
+        'src',
+        'dst',
+        multi=True,
+        symmetrize=True,
+    )
+
+    new_ddf = source_col.to_frame()
+    new_ddf['dst'] = dest_col
+
+    return new_ddf
+
+
+def renumber_ddf(dask_df, persist=False):
+    vertices = dask_cudf.concat([dask_df['src'], dask_df['dst']]).unique().reset_index(drop=True)
+    if persist:
+        vertices = vertices.persist()
+
+    vertices.name = 'v'
+    vertices = vertices.reset_index().set_index('v').rename(columns={'index': 'm'})
+    if persist:
+        vertices = vertices.persist()
+
+    src = dask_df.merge(vertices, left_on='src', right_on='v', how='left').m.rename('src')
+    dst = dask_df.merge(vertices, left_on='dst', right_on='v', how='left').m.rename('dst')
+    df = src.to_frame()
+    df['dst'] = dst
+
+    return df.reset_index(drop=True)
+
+
+def _make_batch_ids(bdf: cudf.DataFrame, batch_size: int, num_workers: int, partition_info: Optional[Union[dict, str]] = None):
+    # Required by dask; need to skip dummy partitions.
+    if partition_info is None:
+        return cudf.DataFrame({
+            'batch': cudf.Series(dtype='int32'),
+            'start': cudf.Series(dtype='int64')
+        })
+
+    partition = partition_info['number']
+    if partition is None:
+        raise ValueError('Could not determine the partition number')
+
+    num_batches = int(ceil(len(bdf) / batch_size))
+
+    # Assign globally-unique batch ids: partition p owns ids in
+    # [num_batches * p, num_batches * (p + 1)).
+    batch_ids = cupy.repeat(
+        cupy.arange(num_batches * partition, num_batches * (partition + 1), dtype='int32'),
+        batch_size
+    )[:len(bdf)]
+
+    bdf = bdf.reset_index(drop=True)
+    bdf['batch'] = cudf.Series(batch_ids)
+
+    return bdf
+
+
+def _replicate_df(df: cudf.DataFrame, replication_factor: int, col_item_counts: Dict[str, int], partition_info: Optional[Union[dict, str]] = None):
+    # Required by dask; need to skip dummy partitions.
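+    # (When dask infers output metadata, map_partitions is called once with an
+    # empty dummy dataframe and partition_info=None; returning an empty frame
+    # with the expected dtypes keeps that metadata pass correct.)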
+    if partition_info is None:
+        return cudf.DataFrame({
+            col: cudf.Series(dtype=df[col].dtype) for col in col_item_counts.keys()
+        })
+
+    original_df = df.copy()
+
+    if replication_factor > 1:
+        for r in range(1, replication_factor):
+            # Copy so that the offsets do not accumulate in original_df
+            # across iterations.
+            df_replicated = original_df.copy()
+            for col, offset in col_item_counts.items():
+                df_replicated[col] += offset * r
+
+            df = cudf.concat([df, df_replicated], ignore_index=True)
+
+    return df
+
+
+@get_allocation_counts_dask_lazy(return_allocations=True, logging=True)
+def sample_graph(G, label_df, output_path, seed=42, batch_size=500, seeds_per_call=200000, batches_per_partition=100, fanout=[5, 5, 5], persist=False):
+    cupy.random.seed(seed)
+
+    sampler = BulkSampler(
+        batch_size=batch_size,
+        output_path=output_path,
+        graph=G,
+        fanout_vals=fanout,
+        with_replacement=False,
+        random_state=seed,
+        seeds_per_call=seeds_per_call,
+        batches_per_partition=batches_per_partition,
+    )
+
+    n_workers = len(default_client().scheduler_info()['workers'])
+
+    meta = cudf.DataFrame({
+        'node': cudf.Series(dtype='int64'),
+        'batch': cudf.Series(dtype='int32')
+    })
+
+    batch_df = label_df.map_partitions(_make_batch_ids, batch_size, n_workers, meta=meta)
+
+    # should always persist the batch dataframe or performance may be suboptimal
+    batch_df = batch_df.persist()
+
+    del label_df
+    print('created batches')
+
+    start_time = perf_counter()
+    sampler.add_batches(batch_df, start_col_name='node', batch_col_name='batch')
+    sampler.flush()
+    end_time = perf_counter()
+    print('flushed all batches')
+    return (end_time - start_time)
+
+
+def assign_offsets_pyg(node_counts: Dict[str, int], replication_factor: int = 1):
+    # cuGraph-PyG assigns offsets based on lexicographic order
+    node_offsets = {}
+    node_offsets_replicated = {}
+    count = 0
+    count_replicated = 0
+    for node_type in sorted(node_counts.keys()):
+        node_offsets[node_type] = count
+        node_offsets_replicated[node_type] = count_replicated
+
+        count += node_counts[node_type]
+        count_replicated += node_counts[node_type] * replication_factor
+
+    return node_offsets, node_offsets_replicated, count_replicated
+
+
+def generate_rmat_dataset(dataset, seed=62, labeled_percentage=0.01, num_labels=256, reverse_edges=False, persist=False, add_edge_types=False):
+    """
+    Generates an rmat dataset. Currently does not support heterogeneous datasets.
+
+    Parameters
+    ----------
+    dataset: The specifier of the rmat dataset (e.g. rmat_20_16)
+    seed: The seed to use for random number generation
+    labeled_percentage: The fraction of nodes that will be labeled (defaults to 0.01)
+    num_labels: The number of classes for the labeled nodes
+    reverse_edges: Whether to reverse the edges in the edgelist (should be True for DGL, False for PyG)
+    persist: Whether to aggressively persist intermediate dask results
+    add_edge_types: Whether to add a constant edge type column ('etp')
+    """
+
+    dataset = dataset.split('_')
+    scale = int(dataset[1])
+    edgefactor = int(dataset[2])
+
+    dask_edgelist_df = generate_edgelist_rmat(
+        scale=scale, edgefactor=edgefactor, seed=seed, unweighted=True, mg=True,
+    )
+    dask_edgelist_df = dask_edgelist_df.astype("int64")
+    dask_edgelist_df = dask_edgelist_df.reset_index(drop=True)
+
+    dask_edgelist_df = renumber_ddf(dask_edgelist_df).persist()
+    if persist:
+        dask_edgelist_df = dask_edgelist_df.persist()
+
+    dask_edgelist_df = symmetrize_ddf(dask_edgelist_df).persist()
+    if persist:
+        dask_edgelist_df = dask_edgelist_df.persist()
+
+    if add_edge_types:
+        dask_edgelist_df['etp'] = cupy.int32(0)  # value is arbitrary; only the column's presence matters
+
+    # generator = np.random.default_rng(seed=seed)
+    num_labeled_nodes = int(2**(scale + 1) * labeled_percentage)
+    label_df = pd.DataFrame({
+        'node': np.arange(num_labeled_nodes),
+        # 'label': generator.integers(0, num_labels - 1, num_labeled_nodes).astype('float32')
+    })
+
+    n_workers = len(default_client().scheduler_info()['workers'])
+    dask_label_df = ddf.from_pandas(label_df, npartitions=n_workers * 2)
+    del label_df
+    gc.collect()
+
+    dask_label_df = dask_cudf.from_dask_dataframe(dask_label_df)
+
+    node_offsets = {'paper': 0}
+    edge_offsets = {('paper', 'cites', 'paper'): 0}
+    total_num_nodes = int(dask_cudf.concat([dask_edgelist_df.src, dask_edgelist_df.dst]).nunique().compute())
+
+    if reverse_edges:
+        dask_edgelist_df = dask_edgelist_df.rename(columns={'src': 'dst', 'dst': 'src'})
+
+    return dask_edgelist_df, dask_label_df, node_offsets, edge_offsets, total_num_nodes
+
+
+def load_disk_dataset(dataset, dataset_dir='.', reverse_edges=True, replication_factor=1, persist=False, add_edge_types=False):
+    from pathlib import Path
+    path = Path(dataset_dir) / dataset
+    parquet_path = path / 'parquet'
+
+    with open(os.path.join(path, 'meta.json')) as meta_file:
+        meta = json.load(meta_file)
+
+    node_offsets, node_offsets_replicated, total_num_nodes = \
+        assign_offsets_pyg(meta['num_nodes'], replication_factor=replication_factor)
+
+    edge_index_dict = {}
+    for edge_type in meta['num_edges'].keys():
+        print(f'Loading edge index for edge type {edge_type}')
+
+        can_edge_type = tuple(edge_type.split('__'))
+        edge_index_dict[can_edge_type] = dask_cudf.read_parquet(os.path.join(os.path.join(parquet_path, edge_type), 'edge_index.parquet'))
+
+        edge_index_dict[can_edge_type]['src'] += node_offsets_replicated[can_edge_type[0]]
+        edge_index_dict[can_edge_type]['dst'] += node_offsets_replicated[can_edge_type[-1]]
+
+        if persist:
+            # Persist the per-edge-type dataframe (the dict itself is not a dask object).
+            edge_index_dict[can_edge_type] = edge_index_dict[can_edge_type].persist()
+
+        if replication_factor > 1:
+            edge_index_dict[can_edge_type] = edge_index_dict[can_edge_type].map_partitions(
+                _replicate_df,
+                replication_factor,
+                {
+                    'src': meta['num_nodes'][can_edge_type[0]],
+                    'dst': meta['num_nodes'][can_edge_type[2]],
+                },
+                meta=cudf.DataFrame({'src': cudf.Series(dtype='int64'), 'dst': cudf.Series(dtype='int64')})
+            )
+
+            if persist:
+                edge_index_dict[can_edge_type] = edge_index_dict[can_edge_type].persist()
+
+        gc.collect()
+
+        if reverse_edges:
+            edge_index_dict[can_edge_type] = edge_index_dict[can_edge_type].rename(columns={'src': 'dst', 'dst': 'src'})
+
+        if persist:
+            edge_index_dict[can_edge_type] = 
edge_index_dict[can_edge_type].persist() + + # Assign numeric edge type ids based on lexicographic order + edge_offsets = {} + edge_count = 0 + for num_edge_type, can_edge_type in enumerate(sorted(edge_index_dict.keys())): + if add_edge_types: + edge_index_dict[can_edge_type]['etp'] = cupy.int32(num_edge_type) + edge_offsets[can_edge_type] = edge_count + edge_count += len(edge_index_dict[can_edge_type]) + + all_edges_df = dask_cudf.concat( + list(edge_index_dict.values()) + ) + + if persist: + all_edges_df = all_edges_df.persist() + + del edge_index_dict + gc.collect() + + node_labels = {} + for node_type, offset in node_offsets_replicated.items(): + print(f'Loading node labels for node type {node_type} (offset={offset})') + node_label_path = os.path.join(os.path.join(parquet_path, node_type), 'node_label.parquet') + if os.path.exists(node_label_path): + node_labels[node_type] = dask_cudf.read_parquet(node_label_path).drop('label',axis=1).persist() + node_labels[node_type]['node'] += offset + node_labels[node_type] = node_labels[node_type].persist() + + if replication_factor > 1: + node_labels[node_type] = node_labels[node_type].map_partitions( + _replicate_df, + replication_factor, + { + 'node': meta['num_nodes'][node_type] + }, + meta=cudf.DataFrame({'node':cudf.Series(dtype='int64')}) + ) + + if persist: + node_labels[node_type] = node_labels[node_type].persist() + + gc.collect() + + node_labels_df = dask_cudf.concat( + list(node_labels.values()) + ) + + if persist: + node_labels_df = node_labels_df.persist() + + del node_labels + gc.collect() + + return all_edges_df, node_labels_df, node_offsets_replicated, edge_offsets, total_num_nodes + + +def benchmark_cugraph_bulk_sampling( + dataset, + output_path, + seed, + batch_size, + seeds_per_call, + fanout, + reverse_edges=True, + dataset_dir='.', + replication_factor=1, + num_labels=256, + labeled_percentage=0.001, + persist=False, + add_edge_types=False): + """ + Entry point for the benchmark. + + Parameters + ---------- + dataset: str + The dataset to sample. Can be rmat_{scale}_{edgefactor}, or the name of an ogb dataset. + output_path: str + The output path, where samples and metadata will be stored. + seed: int + The random seed. + batch_size: int + The batch size (number of input seeds in a single sampling batch). + seeds_per_call: int + The number of input seeds in a single sampling call. + fanout: list[int] + The fanout. + reverse_edges: bool + Whether to reverse edges when constructing the graph. + dataset_dir: str + The directory where datasets are stored (only for ogb datasets) + replication_factor: int + The number of times to replicate the dataset. + num_labels: int + The number of random labels to generate (only for rmat datasets) + labeled_percentage: float + The percentage of the data that is labeled (only for rmat datasets) + Defaults to 0.001 to match papers100M + persist: bool + Whether to aggressively persist data in dask in attempt to speed up ETL. + Defaults to False. + add_edge_types: bool + Whether to add edge types to the edgelist. + Defaults to False. 
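+
+    Returns
+    -------
+    tuple of (num_input_edges, input_to_peak_ratio, output_to_peak_ratio,
+    input_memory_per_worker, peak_allocation_across_workers)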
+ """ + print(dataset) + if dataset[0:4] == 'rmat': + dask_edgelist_df, dask_label_df, node_offsets, edge_offsets, total_num_nodes = \ + generate_rmat_dataset( + dataset, + reverse_edges=reverse_edges, + seed=seed, + labeled_percentage=labeled_percentage, + num_labels=num_labels, + persist=persist, + add_edge_types=add_edge_types + ) + + else: + dask_edgelist_df, dask_label_df, node_offsets, edge_offsets, total_num_nodes = \ + load_disk_dataset( + dataset, + dataset_dir=dataset_dir, + reverse_edges=reverse_edges, + replication_factor=replication_factor, + persist=persist, + add_edge_types=add_edge_types + ) + + num_input_edges = len(dask_edgelist_df) + print( + f"Number of input edges = {num_input_edges:,}" + ) + + G = construct_graph( + dask_edgelist_df + ) + del dask_edgelist_df + print('constructed graph') + + input_memory = G.edgelist.edgelist_df.memory_usage().sum().compute() + print(f'input memory: {input_memory}') + + output_subdir = os.path.join(output_path, f'{dataset}[{replication_factor}]_b{batch_size}_f{fanout}') + os.makedirs(output_subdir) + + output_sample_path = os.path.join(output_subdir, 'samples') + os.makedirs(output_sample_path) + + batches_per_partition = 200_000 // batch_size + execution_time, allocation_counts = sample_graph( + G, + dask_label_df, + output_sample_path, + seed=seed, + batch_size=batch_size, + seeds_per_call=seeds_per_call, + batches_per_partition=batches_per_partition, + fanout=fanout, + persist=persist, + ) + + output_meta = { + 'dataset': dataset, + 'dataset_dir': dataset_dir, + 'seed': seed, + 'node_offsets': node_offsets, + 'edge_offsets': {'__'.join(k): v for k, v in edge_offsets.items()}, + 'total_num_nodes': total_num_nodes, + 'total_num_edges': num_input_edges, + 'batch_size': batch_size, + 'seeds_per_call': seeds_per_call, + 'batches_per_partition': batches_per_partition, + 'fanout': fanout, + 'replication_factor': replication_factor, + 'num_sampling_gpus': len(G._plc_graph), + 'execution_time': execution_time, + } + + with open(os.path.join(output_subdir, 'output_meta.json'), 'w') as f: + json.dump( + output_meta, + f, + indent='\t' + ) + + print('allocation counts b:') + print(allocation_counts.values()) + + ( + input_to_peak_ratio, + output_to_peak_ratio, + input_memory_per_worker, + peak_allocation_across_workers, + ) = get_memory_statistics( + allocation_counts=allocation_counts, input_memory=input_memory + ) + print(f"Number of edges in final graph = {G.number_of_edges():,}") + print("-" * 80) + return ( + num_input_edges, + input_to_peak_ratio, + output_to_peak_ratio, + input_memory_per_worker, + peak_allocation_across_workers, + ) + + +def get_memory_statistics(allocation_counts, input_memory): + """ + Get memory statistics for the benchmark. 
+ """ + output_to_peak_ratio = get_peak_output_ratio_across_workers(allocation_counts) + peak_allocation_across_workers = max( + [a["peak_bytes"] for a in allocation_counts.values()] + ) + input_memory_per_worker = input_memory / len(allocation_counts.keys()) + input_to_peak_ratio = peak_allocation_across_workers / input_memory_per_worker + print(f"Edge List Memory = {sizeof_fmt(input_memory_per_worker)}") + print(f"Peak Memory across workers = {sizeof_fmt(peak_allocation_across_workers)}") + print(f"Max Peak to output graph ratio across workers = {output_to_peak_ratio:.2f}") + print( + f"Max Peak to avg input graph ratio across workers = {input_to_peak_ratio:.2f}" + ) + return ( + input_to_peak_ratio, + output_to_peak_ratio, + input_memory_per_worker, + peak_allocation_across_workers, + ) + + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + '--output_root', + type=str, + help='The output root directory. File/folder names are auto-generated.', + required=True, + ) + + parser.add_argument( + '--dataset_root', + type=str, + help='The dataset root directory containing ogb datasets.', + required=True, + ) + + parser.add_argument( + '--datasets', + type=str, + help=( + 'Comma separated list of datasets; can specify ogb or rmat (i.e. ogb_papers100M[2],rmat_22_16).' + ' For ogb datasets, can provide replication factor using brackets.' + ), + required=True, + ) + + parser.add_argument( + '--fanouts', + type=str, + help='Comma separated list of fanouts (i.e. 10_25,5_5_5)', + required=False, + default='10_25', + ) + + parser.add_argument( + '--batch_sizes', + type=str, + help='Comma separated list of batch sizes (i.e. 500,1000)', + required=False, + default='512,1024' + ) + + parser.add_argument( + '--seeds_per_call_opts', + type=str, + help='Comma separated list of seeds per call (i.e. 1000000,2000000)', + required=False, + default='524288', + ) + + parser.add_argument( + '--reverse_edges', + action='store_true', + help='Whether to reverse the edges for DGL (defaults to False). Should be True for DGL, False for PyG.', + required=False, + default=False, + ) + + parser.add_argument( + '--dask_worker_devices', + type=str, + help='Comma separated list of dask worker devices', + required=False, + default="0" + ) + + parser.add_argument( + '--random_seed', + type=int, + help='Random seed', + required=False, + default=62 + ) + + parser.add_argument( + '--persist', + action='store_true', + help='Will add additional persist() calls to speed up ETL. Does not affect sampling runtime.', + required=False, + default=False, + ) + + parser.add_argument( + '--add_edge_types', + action='store_true', + help='Adds edge types to the edgelist. 
Required for PyG if not providing edge ids.',
+        required=False,
+        default=False,
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    logging.basicConfig()
+
+    args = get_args()
+    fanouts = [[int(f) for f in fanout.split('_')] for fanout in args.fanouts.split(',')]
+    datasets = args.datasets.split(',')
+    batch_sizes = [int(b) for b in args.batch_sizes.split(',')]
+    seeds_per_call_opts = [int(s) for s in args.seeds_per_call_opts.split(',')]
+    dask_worker_devices = [int(d) for d in args.dask_worker_devices.split(',')]
+
+    client, cluster = start_dask_client(dask_worker_devices=dask_worker_devices, jit_unspill=False, rmm_pool_size=28e9, rmm_async=True)
+    enable_spilling()
+    stats_ls = []
+    client.run(enable_spilling)
+    for dataset in datasets:
+        # Match a bracketed replication factor, e.g. "ogb_papers100M[2]".
+        m = re.match(r'([A-Za-z0-9_]+)\[([0-9]+)\]', dataset)
+        if m:
+            replication_factor = int(m.group(2))
+            dataset = m.group(1)
+        else:
+            replication_factor = 1
+
+        for fanout in fanouts:
+            for batch_size in batch_sizes:
+                for seeds_per_call in seeds_per_call_opts:
+                    print(f'dataset: {dataset}')
+                    print(f'batch size: {batch_size}')
+                    print(f'fanout: {fanout}')
+                    print(f'seeds_per_call: {seeds_per_call}')
+
+                    try:
+                        stats_d = {}
+                        (
+                            num_input_edges,
+                            input_to_peak_ratio,
+                            output_to_peak_ratio,
+                            input_memory_per_worker,
+                            peak_allocation_across_workers,
+                        ) = benchmark_cugraph_bulk_sampling(
+                            dataset=dataset,
+                            output_path=args.output_root,
+                            seed=args.random_seed,
+                            batch_size=batch_size,
+                            seeds_per_call=seeds_per_call,
+                            fanout=fanout,
+                            dataset_dir=args.dataset_root,
+                            reverse_edges=args.reverse_edges,
+                            replication_factor=replication_factor,
+                            persist=args.persist,
+                            add_edge_types=args.add_edge_types,
+                        )
+                        stats_d["dataset"] = dataset
+                        stats_d["num_input_edges"] = num_input_edges
+                        stats_d["batch_size"] = batch_size
+                        stats_d["fanout"] = fanout
+                        stats_d["seeds_per_call"] = seeds_per_call
+                        stats_d["input_memory_per_worker"] = sizeof_fmt(input_memory_per_worker)
+                        stats_d["peak_allocation_across_workers"] = sizeof_fmt(
+                            peak_allocation_across_workers
+                        )
+                        stats_d["input_to_peak_ratio"] = input_to_peak_ratio
+                        stats_d["output_to_peak_ratio"] = output_to_peak_ratio
+                        stats_ls.append(stats_d)
+                    except Exception as e:
+                        warnings.warn('An Exception Occurred!')
+                        print(e)
+                        traceback.print_exc()
+                        restart_client(client)
+                        sleep(10)
+
+        stats_df = pd.DataFrame(
+            stats_ls,
+            columns=[
+                "dataset",
+                "num_input_edges",
+                "input_memory_per_worker",
+                "peak_allocation_across_workers",
+                "input_to_peak_ratio",
+                "output_to_peak_ratio",
+            ],
+        )
+        stats_df.to_csv("cugraph_sampling_stats.csv")
+        print("-" * 40 + f"dataset = {dataset} completed" + "-" * 40)
+
+    # Cleanup Dask Cluster
+    stop_dask_client(client, cluster)
diff --git a/python/cugraph/cugraph/testing/mg_utils.py b/python/cugraph/cugraph/testing/mg_utils.py
index 1e1a481e4d6..bd165ba3db5 100644
--- a/python/cugraph/cugraph/testing/mg_utils.py
+++ b/python/cugraph/cugraph/testing/mg_utils.py
@@ -29,6 +29,7 @@ def start_dask_client(
     protocol=None,
+    rmm_async=False,
     rmm_pool_size=None,
     dask_worker_devices=None,
     jit_unspill=False,
@@ -137,6 +138,7 @@ def start_dask_client(
         local_directory=local_directory,
         protocol=protocol,
         rmm_pool_size=rmm_pool_size,
+        rmm_async=rmm_async,
         CUDA_VISIBLE_DEVICES=dask_worker_devices,
         jit_unspill=jit_unspill,
         device_memory_limit=device_memory_limit,
@@ -287,6 +289,15 @@ def persist_dask_object(arg):
 # Function to convert bytes into human readable format
 def sizeof_fmt(num, suffix="B"):
+    if 
isinstance(num, str): + if num[-2:] == "GB": + return num[:-2] + "G" + elif num[-2:] == "MB": + return num[:-2] + "M" + elif num[-2:] == "KB": + return num[:-2] + "K" + else: + raise ValueError("unknown unit") for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: if abs(num) < 1024.0: return "%3.1f%s%s" % (num, unit, suffix) From 5bce94a659143b512d79c2cd3d102ad18709a58b Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Tue, 27 Jun 2023 11:12:56 -0400 Subject: [PATCH 04/10] Implement C++ Edge Betweenness Centrality (#3602) This PR contains the implementation for edge betweenness centrality. Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Naim (https://github.com/naimnv) - Joseph Nke (https://github.com/jnke2016) - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/3602 --- cpp/include/cugraph/algorithms.hpp | 5 +- .../detail/decompress_edge_partition.cuh | 12 +- cpp/include/cugraph/graph_functions.hpp | 10 +- cpp/include/cugraph_c/centrality_algorithms.h | 9 + cpp/src/c_api/betweenness_centrality.cpp | 24 +- cpp/src/c_api/centrality_result.hpp | 1 + .../betweenness_centrality_impl.cuh | 337 +++++++++++++++--- .../centrality/betweenness_centrality_mg.cu | 18 +- .../centrality/betweenness_centrality_sg.cu | 18 +- ...t_nbr_intersection_of_e_endpoints_by_v.cuh | 2 + cpp/src/structure/coarsen_graph_impl.cuh | 3 + .../structure/decompress_to_edgelist_impl.cuh | 72 +++- .../structure/decompress_to_edgelist_mg.cu | 50 ++- .../structure/decompress_to_edgelist_sg.cu | 50 ++- cpp/src/structure/symmetrize_graph_impl.cuh | 14 +- cpp/src/structure/transpose_graph_impl.cuh | 14 +- .../transpose_graph_storage_impl.cuh | 14 +- cpp/tests/CMakeLists.txt | 2 + ...y.c => edge_betweenness_centrality_test.c} | 92 +++-- ... => mg_edge_betweenness_centrality_test.c} | 83 ++++- .../betweenness_centrality_reference.hpp | 36 +- .../edge_betweenness_centrality_test.cpp | 41 +-- .../eigenvector_centrality_test.cpp | 9 +- .../mg_edge_betweenness_centrality_test.cpp | 77 ++-- cpp/tests/community/egonet_validate.cu | 17 +- cpp/tests/cores/k_core_validate.cu | 19 +- cpp/tests/prims/mg_transform_e.cu | 3 +- cpp/tests/sampling/random_walks_check.cuh | 17 +- cpp/tests/structure/mg_symmetrize_test.cpp | 12 +- .../structure/mg_transpose_storage_test.cpp | 12 +- cpp/tests/structure/mg_transpose_test.cpp | 12 +- cpp/tests/structure/symmetrize_test.cpp | 41 ++- .../structure/transpose_storage_test.cpp | 29 +- cpp/tests/structure/transpose_test.cpp | 26 +- cpp/tests/utilities/test_utilities.hpp | 14 + cpp/tests/utilities/test_utilities_impl.cuh | 89 ++++- cpp/tests/utilities/test_utilities_mg.cu | 96 +++++ cpp/tests/utilities/test_utilities_sg.cu | 96 +++++ 38 files changed, 1184 insertions(+), 292 deletions(-) rename cpp/tests/c_api/{edge_betweenness_centrality.c => edge_betweenness_centrality_test.c} (51%) rename cpp/tests/c_api/{mg_edge_betweenness_centrality.c => mg_edge_betweenness_centrality_test.c} (54%) diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 51212d9d568..cf9cba2af4d 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -378,10 +378,11 @@ rmm::device_uvector betweenness_centrality( * @param normalized A flag indicating whether or not to normalize the result * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). * - * @return device vector containing the centralities. 
+ * @return edge_property_t containing the centralities. */ template -rmm::device_uvector edge_betweenness_centrality( +edge_property_t, weight_t> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh index 81ece768edb..cd8739114f2 100644 --- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -190,9 +190,12 @@ void decompress_edge_partition_to_edgelist( edge_partition_device_view_t edge_partition, std::optional> edge_partition_weight_view, + std::optional> + edge_partition_id_view, vertex_t* edgelist_majors /* [OUT] */, vertex_t* edgelist_minors /* [OUT] */, std::optional edgelist_weights /* [OUT] */, + std::optional edgelist_ids /* [OUT] */, std::optional> const& segment_offsets) { auto number_of_edges = edge_partition.number_of_edges(); @@ -203,6 +206,13 @@ void decompress_edge_partition_to_edgelist( edge_partition.indices(), edge_partition.indices() + number_of_edges, edgelist_minors); + if (edge_partition_id_view) { + assert(edgelist_ids.has_value()); + thrust::copy(handle.get_thrust_policy(), + (*edge_partition_id_view).value_first(), + (*edge_partition_id_view).value_first() + number_of_edges, + (*edgelist_ids)); + } if (edge_partition_weight_view) { assert(edgelist_weights.has_value()); thrust::copy(handle.get_thrust_policy(), diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 1c01568ae17..017b32d0470 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -350,12 +350,14 @@ void renumber_local_ext_vertices(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Graph view object of the graph to be decompressed. + * @param edge_id_view Optional view object holding edge ids for @p graph_view. * @param edge_weight_view Optional view object holding edge weights for @p graph_view. * @param renumber_map If valid, return the renumbered edge list based on the provided @p * renumber_map * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). - * @return Tuple of edge sources, destinations, and (optional) edge weights (if @p - * edge_weight_view.has_value() is true). + * @return Tuple of edge sources, destinations, (optional) edge weights (if + * @p edge_weight_view.has_value() is true) and (optional) edge ids (if + * @p edge_id_view.has_value() is true). 
*/ template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check = false); diff --git a/cpp/include/cugraph_c/centrality_algorithms.h b/cpp/include/cugraph_c/centrality_algorithms.h index ca60d3bfad4..0ac0e58540f 100644 --- a/cpp/include/cugraph_c/centrality_algorithms.h +++ b/cpp/include/cugraph_c/centrality_algorithms.h @@ -426,6 +426,15 @@ cugraph_type_erased_device_array_view_t* cugraph_edge_centrality_result_get_src_ cugraph_type_erased_device_array_view_t* cugraph_edge_centrality_result_get_dst_vertices( cugraph_edge_centrality_result_t* result); +/** + * @brief Get the edge ids from an edge centrality result + * + * @param [in] result The result from an edge centrality algorithm + * @return type erased array of edge ids + */ +cugraph_type_erased_device_array_view_t* cugraph_edge_centrality_result_get_edge_ids( + cugraph_edge_centrality_result_t* result); + /** * @brief Get the centrality values from an edge centrality algorithm result * diff --git a/cpp/src/c_api/betweenness_centrality.cpp b/cpp/src/c_api/betweenness_centrality.cpp index 0387b050262..3cf3e92e960 100644 --- a/cpp/src/c_api/betweenness_centrality.cpp +++ b/cpp/src/c_api/betweenness_centrality.cpp @@ -144,7 +144,7 @@ struct edge_betweenness_centrality_functor : public cugraph::c_api::abstract_fun cugraph::c_api::cugraph_type_erased_device_array_view_t const* vertex_list_{}; bool_t normalized_{}; bool do_expensive_check_{}; - cugraph::c_api::cugraph_centrality_result_t* result_{}; + cugraph::c_api::cugraph_edge_centrality_result_t* result_{}; edge_betweenness_centrality_functor(cugraph_resource_handle_t const* handle, cugraph_graph_t* graph, @@ -190,6 +190,10 @@ struct edge_betweenness_centrality_functor : public cugraph::c_api::abstract_fun cugraph::edge_property_t, weight_t>*>(graph_->edge_weights_); + auto edge_ids = reinterpret_cast< + cugraph::edge_property_t, + edge_t>*>(graph_->edge_ids_); + auto number_map = reinterpret_cast*>(graph_->number_map_); rmm::device_uvector local_vertices(0, handle_.get_stream()); @@ -230,14 +234,24 @@ struct edge_betweenness_centrality_functor : public cugraph::c_api::abstract_fun normalized_, do_expensive_check_); - CUGRAPH_FAIL("Need to clean up return type"); + auto [src_ids, dst_ids, output_centralities, output_edge_ids] = + cugraph::decompress_to_edgelist( + handle_, + graph_view, + std::make_optional(centralities.view()), + (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt, + (number_map != nullptr) ? std::make_optional(raft::device_span{ + number_map->data(), number_map->size()}) + : std::nullopt); -#if 0 result_ = new cugraph::c_api::cugraph_edge_centrality_result_t{ new cugraph::c_api::cugraph_type_erased_device_array_t(src_ids, graph_->vertex_type_), new cugraph::c_api::cugraph_type_erased_device_array_t(dst_ids, graph_->vertex_type_), - new cugraph::c_api::cugraph_type_erased_device_array_t(centralities, graph_->weight_type_)}; -#endif + output_edge_ids ? 
new cugraph::c_api::cugraph_type_erased_device_array_t(*output_edge_ids, + graph_->edge_type_) + : nullptr, + new cugraph::c_api::cugraph_type_erased_device_array_t(*output_centralities, + graph_->weight_type_)}; } } }; diff --git a/cpp/src/c_api/centrality_result.hpp b/cpp/src/c_api/centrality_result.hpp index 068dd838c93..e0acde9cce3 100644 --- a/cpp/src/c_api/centrality_result.hpp +++ b/cpp/src/c_api/centrality_result.hpp @@ -31,6 +31,7 @@ struct cugraph_centrality_result_t { struct cugraph_edge_centrality_result_t { cugraph_type_erased_device_array_t* src_ids_{}; cugraph_type_erased_device_array_t* dst_ids_{}; + cugraph_type_erased_device_array_t* edge_ids_{}; cugraph_type_erased_device_array_t* values_{}; }; diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 5631fadde96..0a87531d6ca 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -16,8 +16,12 @@ #pragma once #include +#include +#include #include +#include #include +#include #include #include #include @@ -55,6 +59,24 @@ struct brandes_e_op_t { } }; +template +struct extract_edge_e_op_t { + vertex_t d{}; + + template + __device__ thrust::optional> operator()( + vertex_t src, + vertex_t dst, + thrust::tuple src_props, + thrust::tuple dst_props, + weight_t edge_centrality) + { + return ((thrust::get<0>(dst_props) == d) && (thrust::get<0>(src_props) == (d - 1))) + ? thrust::optional>{thrust::make_tuple(src, dst)} + : thrust::nullopt; + } +}; + } // namespace namespace cugraph { @@ -77,16 +99,16 @@ std::tuple, rmm::device_uvector> brandes_b constexpr int bucket_idx_cur{0}; constexpr int bucket_idx_next{1}; - rmm::device_uvector sigma(graph_view.local_vertex_partition_range_size(), - handle.get_stream()); - rmm::device_uvector distance(graph_view.local_vertex_partition_range_size(), - handle.get_stream()); - detail::scalar_fill(handle, distance.data(), distance.size(), invalid_distance); - detail::scalar_fill(handle, sigma.data(), sigma.size(), edge_t{0}); + rmm::device_uvector sigmas(graph_view.local_vertex_partition_range_size(), + handle.get_stream()); + rmm::device_uvector distances(graph_view.local_vertex_partition_range_size(), + handle.get_stream()); + detail::scalar_fill(handle, distances.data(), distances.size(), invalid_distance); + detail::scalar_fill(handle, sigmas.data(), sigmas.size(), edge_t{0}); - edge_src_property_t, edge_t> src_sigma( + edge_src_property_t, edge_t> src_sigmas( handle, graph_view); - edge_dst_property_t, vertex_t> dst_distance( + edge_dst_property_t, vertex_t> dst_distances( handle, graph_view); auto vertex_partition = @@ -97,7 +119,7 @@ std::tuple, rmm::device_uvector> brandes_b handle.get_thrust_policy(), vertex_frontier.bucket(bucket_idx_cur).begin(), vertex_frontier.bucket(bucket_idx_cur).end(), - [d_sigma = sigma.begin(), d_distance = distance.begin(), vertex_partition] __device__( + [d_sigma = sigmas.begin(), d_distance = distances.begin(), vertex_partition] __device__( auto v) { auto offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); d_distance[offset] = 0; @@ -108,15 +130,15 @@ std::tuple, rmm::device_uvector> brandes_b edge_t hop{0}; while (true) { - update_edge_src_property(handle, graph_view, sigma.begin(), src_sigma); - update_edge_dst_property(handle, graph_view, distance.begin(), dst_distance); + update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas); + update_edge_dst_property(handle, graph_view, 
distances.begin(), dst_distances); auto [new_frontier, new_sigma] = transform_reduce_v_frontier_outgoing_e_by_dst(handle, graph_view, vertex_frontier.bucket(bucket_idx_cur), - src_sigma.view(), - dst_distance.view(), + src_sigmas.view(), + dst_distances.view(), cugraph::edge_dummy_property_t{}.view(), brandes_e_op_t{}, reduce_op::plus()); @@ -127,8 +149,8 @@ std::tuple, rmm::device_uvector> brandes_b std::move(new_sigma), vertex_frontier, std::vector{bucket_idx_next}, - thrust::make_zip_iterator(distance.begin(), sigma.begin()), - thrust::make_zip_iterator(distance.begin(), sigma.begin()), + thrust::make_zip_iterator(distances.begin(), sigmas.begin()), + thrust::make_zip_iterator(distances.begin(), sigmas.begin()), [hop] __device__(auto v, auto old_values, auto v_sigma) { return thrust::make_tuple( thrust::make_optional(bucket_idx_next), @@ -143,7 +165,7 @@ std::tuple, rmm::device_uvector> brandes_b ++hop; } - return std::make_tuple(std::move(distance), std::move(sigma)); + return std::make_tuple(std::move(distances), std::move(sigmas)); } template @@ -152,8 +174,8 @@ void accumulate_vertex_results( graph_view_t const& graph_view, std::optional> edge_weight_view, raft::device_span centralities, - rmm::device_uvector&& distance, - rmm::device_uvector&& sigma, + rmm::device_uvector&& distances, + rmm::device_uvector&& sigmas, bool with_endpoints, bool do_expensive_check) { @@ -162,26 +184,26 @@ void accumulate_vertex_results( vertex_t diameter = transform_reduce_v( handle, graph_view, - distance.begin(), + distances.begin(), [] __device__(auto, auto d) { return (d == invalid_distance) ? vertex_t{0} : d; }, vertex_t{0}, reduce_op::maximum{}, do_expensive_check); - rmm::device_uvector delta(sigma.size(), handle.get_stream()); - detail::scalar_fill(handle, delta.data(), delta.size(), weight_t{0}); + rmm::device_uvector deltas(sigmas.size(), handle.get_stream()); + detail::scalar_fill(handle, deltas.data(), deltas.size(), weight_t{0}); if (with_endpoints) { vertex_t count = count_if_v( handle, graph_view, - distance.begin(), + distances.begin(), [] __device__(auto, auto d) { return (d != invalid_distance); }, do_expensive_check); thrust::transform(handle.get_thrust_policy(), - distance.begin(), - distance.end(), + distances.begin(), + distances.end(), centralities.begin(), centralities.begin(), [count] __device__(auto d, auto centrality) { @@ -205,12 +227,12 @@ void accumulate_vertex_results( update_edge_src_property( handle, graph_view, - thrust::make_zip_iterator(distance.begin(), sigma.begin(), delta.begin()), + thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), src_properties); update_edge_dst_property( handle, graph_view, - thrust::make_zip_iterator(distance.begin(), sigma.begin(), delta.begin()), + thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), dst_properties); // FIXME: To do this efficiently, I need a version of @@ -243,29 +265,167 @@ void accumulate_vertex_results( }, weight_t{0}, reduce_op::plus{}, - delta.begin(), + deltas.begin(), do_expensive_check); update_edge_src_property( handle, graph_view, - thrust::make_zip_iterator(distance.begin(), sigma.begin(), delta.begin()), + thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), src_properties); update_edge_dst_property( handle, graph_view, - thrust::make_zip_iterator(distance.begin(), sigma.begin(), delta.begin()), + thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), dst_properties); 
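+  // Brandes' dependency accumulation: delta(v) = sum over successors w on a
+  // shortest path of (sigma(v) / sigma(w)) * (1 + delta(w)); the transform
+  // below folds this level's deltas into the running centrality totals.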
thrust::transform(handle.get_thrust_policy(), centralities.begin(), centralities.end(), - delta.begin(), + deltas.begin(), centralities.begin(), thrust::plus()); } } +template +void accumulate_edge_results( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + edge_property_view_t centralities_view, + rmm::device_uvector&& distances, + rmm::device_uvector&& sigmas, + bool do_expensive_check) +{ + constexpr vertex_t invalid_distance = std::numeric_limits::max(); + + vertex_t diameter = transform_reduce_v( + handle, + graph_view, + distances.begin(), + [] __device__(auto, auto d) { return (d == invalid_distance) ? vertex_t{0} : d; }, + vertex_t{0}, + reduce_op::maximum{}, + do_expensive_check); + + rmm::device_uvector deltas(sigmas.size(), handle.get_stream()); + detail::scalar_fill(handle, deltas.data(), deltas.size(), weight_t{0}); + + edge_src_property_t, + thrust::tuple> + src_properties(handle, graph_view); + edge_dst_property_t, + thrust::tuple> + dst_properties(handle, graph_view); + + update_edge_src_property( + handle, + graph_view, + thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), + src_properties); + update_edge_dst_property( + handle, + graph_view, + thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), + dst_properties); + + // + // For now this will do a O(E) pass over all edges over the diameter + // of the graph. + // + // Based on Brandes algorithm, we want to follow back pointers in non-increasing + // distance from S to compute delta + // + for (vertex_t d = diameter; d > 0; --d) { + // + // Populate edge_list with edges where `thrust::get<0>(dst_props) == d` + // and `thrust::get<0>(dst_props) == (d-1)` + // + cugraph::edge_bucket_t edge_list(handle); + + { + auto [src, dst] = extract_transform_e(handle, + graph_view, + src_properties.view(), + dst_properties.view(), + centralities_view, + extract_edge_e_op_t{d}, + do_expensive_check); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(src.begin(), dst.begin()), + thrust::make_zip_iterator(src.end(), dst.end())); + + // Eliminate duplicates in case of a multi-graph + auto new_edgelist_end = thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(src.begin(), dst.begin()), + thrust::make_zip_iterator(src.end(), dst.end())); + + src.resize( + thrust::distance(thrust::make_zip_iterator(src.begin(), dst.begin()), new_edgelist_end), + handle.get_stream()); + dst.resize(src.size(), handle.get_stream()); + + edge_list.insert(src.begin(), src.end(), dst.begin()); + } + + transform_e( + handle, + graph_view, + edge_list, + src_properties.view(), + dst_properties.view(), + centralities_view, + [d] __device__(auto src, auto dst, auto src_props, auto dst_props, auto edge_centrality) { + if ((thrust::get<0>(dst_props) == d) && (thrust::get<0>(src_props) == (d - 1))) { + auto sigma_v = static_cast(thrust::get<1>(src_props)); + auto sigma_w = static_cast(thrust::get<1>(dst_props)); + auto delta_w = thrust::get<2>(dst_props); + + return edge_centrality + (sigma_v / sigma_w) * (1 + delta_w); + } else { + return edge_centrality; + } + }, + centralities_view, + do_expensive_check); + + per_v_transform_reduce_outgoing_e( + handle, + graph_view, + src_properties.view(), + dst_properties.view(), + cugraph::edge_dummy_property_t{}.view(), + [d] __device__(auto, auto, auto src_props, auto dst_props, auto) { + if ((thrust::get<0>(dst_props) == d) && (thrust::get<0>(src_props) == (d - 1))) { + auto sigma_v = 
static_cast(thrust::get<1>(src_props)); + auto sigma_w = static_cast(thrust::get<1>(dst_props)); + auto delta_w = thrust::get<2>(dst_props); + + return (sigma_v / sigma_w) * (1 + delta_w); + } else { + return weight_t{0}; + } + }, + weight_t{0}, + reduce_op::plus{}, + deltas.begin(), + do_expensive_check); + + update_edge_src_property( + handle, + graph_view, + thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), + src_properties); + update_edge_dst_property( + handle, + graph_view, + thrust::make_zip_iterator(distances.begin(), sigmas.begin(), deltas.begin()), + dst_properties); + } +} + template betweenness_centrality( // // Betweenness Centrality algorithm based on the Brandes Algorithm (2001) // - if (do_expensive_check) {} + if (do_expensive_check) { + auto vertex_partition = + vertex_partition_device_view_t(graph_view.local_vertex_partition_view()); + auto num_invalid_vertices = + thrust::count_if(handle.get_thrust_policy(), + vertices_begin, + vertices_end, + [vertex_partition] __device__(auto val) { + return !(vertex_partition.is_valid_vertex(val) && + vertex_partition.in_local_vertex_partition_range_nocheck(val)); + }); + if constexpr (multi_gpu) { + num_invalid_vertices = host_scalar_allreduce( + handle.get_comms(), num_invalid_vertices, raft::comms::op_t::SUM, handle.get_stream()); + } + CUGRAPH_EXPECTS(num_invalid_vertices == 0, + "Invalid input argument: sources have invalid vertex IDs."); + } rmm::device_uvector centralities(graph_view.local_vertex_partition_range_size(), handle.get_stream()); @@ -333,14 +510,14 @@ rmm::device_uvector betweenness_centrality( // FIXME: This has an inefficiency in early iterations, as it doesn't have enough work to // keep the GPUs busy. But we can't run too many at once or we will run out of // memory. Need to investigate options to improve this performance - auto [distance, sigma] = + auto [distances, sigmas] = brandes_bfs(handle, graph_view, edge_weight_view, vertex_frontier, do_expensive_check); accumulate_vertex_results(handle, graph_view, edge_weight_view, raft::device_span{centralities.data(), centralities.size()}, - std::move(distance), - std::move(sigma), + std::move(distances), + std::move(sigmas), include_endpoints, do_expensive_check); } @@ -379,7 +556,8 @@ template -rmm::device_uvector edge_betweenness_centrality( +edge_property_t, weight_t> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -388,10 +566,88 @@ rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check) { - CUGRAPH_FAIL("Not Implemented"); - // Edge betweenness is computed like vertex betweenness, but you accumulate - // centrality on each edge. We need to adapt this to support edge properties - // properly. 
+ // + // Betweenness Centrality algorithm based on the Brandes Algorithm (2001) + // + if (do_expensive_check) { + auto vertex_partition = + vertex_partition_device_view_t(graph_view.local_vertex_partition_view()); + auto num_invalid_vertices = + thrust::count_if(handle.get_thrust_policy(), + vertices_begin, + vertices_end, + [vertex_partition] __device__(auto val) { + return !(vertex_partition.is_valid_vertex(val) && + vertex_partition.in_local_vertex_partition_range_nocheck(val)); + }); + if constexpr (multi_gpu) { + num_invalid_vertices = host_scalar_allreduce( + handle.get_comms(), num_invalid_vertices, raft::comms::op_t::SUM, handle.get_stream()); + } + CUGRAPH_EXPECTS(num_invalid_vertices == 0, + "Invalid input argument: sources have invalid vertex IDs."); + } + + edge_property_t, weight_t> centralities( + handle, graph_view); + + fill_edge_property(handle, graph_view, weight_t{0}, centralities, do_expensive_check); + + size_t num_sources = thrust::distance(vertices_begin, vertices_end); + std::vector source_offsets{{0, num_sources}}; + int my_rank = 0; + + if constexpr (multi_gpu) { + auto source_counts = + host_scalar_allgather(handle.get_comms(), num_sources, handle.get_stream()); + + num_sources = std::accumulate(source_counts.begin(), source_counts.end(), 0); + source_offsets.resize(source_counts.size() + 1); + source_offsets[0] = 0; + std::inclusive_scan(source_counts.begin(), source_counts.end(), source_offsets.begin() + 1); + my_rank = handle.get_comms().get_rank(); + } + + // + // FIXME: This could be more efficient using something akin to the + // technique in WCC. Take the entire set of sources, insert them into + // a tagged frontier (tagging each source with itself). Then we can + // expand from multiple sources concurrently. The challenge is managing + // the memory explosion. + // + for (size_t source_idx = 0; source_idx < num_sources; ++source_idx) { + // + // BFS + // + constexpr size_t bucket_idx_cur = 0; + constexpr size_t num_buckets = 2; + + vertex_frontier_t vertex_frontier(handle, num_buckets); + + if ((source_idx >= source_offsets[my_rank]) && (source_idx < source_offsets[my_rank + 1])) { + vertex_frontier.bucket(bucket_idx_cur) + .insert(vertices_begin + (source_idx - source_offsets[my_rank]), + vertices_begin + (source_idx - source_offsets[my_rank]) + 1); + } + + // + // Now we need to do modified BFS + // + // FIXME: This has an inefficiency in early iterations, as it doesn't have enough work to + // keep the GPUs busy. But we can't run too many at once or we will run out of + // memory. 
Need to investigate options to improve this performance + auto [distances, sigmas] = + brandes_bfs(handle, graph_view, edge_weight_view, vertex_frontier, do_expensive_check); + accumulate_edge_results(handle, + graph_view, + edge_weight_view, + centralities.mutable_view(), + std::move(distances), + std::move(sigmas), + do_expensive_check); + } + + return centralities; } } // namespace detail @@ -431,7 +687,8 @@ rmm::device_uvector betweenness_centrality( } template -rmm::device_uvector edge_betweenness_centrality( +edge_property_t, weight_t> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, diff --git a/cpp/src/centrality/betweenness_centrality_mg.cu b/cpp/src/centrality/betweenness_centrality_mg.cu index 7bb1f4db6d1..2df843c95c9 100644 --- a/cpp/src/centrality/betweenness_centrality_mg.cu +++ b/cpp/src/centrality/betweenness_centrality_mg.cu @@ -73,7 +73,8 @@ template rmm::device_uvector betweenness_centrality( bool const include_endpoints, bool do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, float> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -81,7 +82,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, float> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -89,7 +91,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, float> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -97,7 +100,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, double> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -105,7 +109,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, double> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -113,7 +118,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, double> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, diff --git a/cpp/src/centrality/betweenness_centrality_sg.cu b/cpp/src/centrality/betweenness_centrality_sg.cu index 1d10b720d09..191857ff5dd 100644 --- a/cpp/src/centrality/betweenness_centrality_sg.cu +++ b/cpp/src/centrality/betweenness_centrality_sg.cu @@ -73,7 +73,8 @@ template rmm::device_uvector betweenness_centrality( bool const include_endpoints, bool do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, float> +edge_betweenness_centrality( const 
raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -81,7 +82,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, float> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -89,7 +91,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, float> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -97,7 +100,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, double> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -105,7 +109,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, double> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, @@ -113,7 +118,8 @@ template rmm::device_uvector edge_betweenness_centrality( bool const normalized, bool const do_expensive_check); -template rmm::device_uvector edge_betweenness_centrality( +template edge_property_t, double> +edge_betweenness_centrality( const raft::handle_t& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, diff --git a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh index 4823c1febf4..b5cfdf4b16b 100644 --- a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh +++ b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh @@ -293,9 +293,11 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v( GraphViewType::is_multi_gpu>(handle, edge_partition, std::nullopt, + std::nullopt, majors.data(), minors.data(), std::nullopt, + std::nullopt, segment_offsets); auto vertex_pair_first = diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 6dacbee2fb1..b8dc28d563e 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -168,9 +168,12 @@ decompress_edge_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, edge_partition, edge_partition_weight_view, + std::optional>{ + std::nullopt}, edgelist_majors.data(), edgelist_minors.data(), edgelist_weights ? 
std::optional{(*edgelist_weights).data()} : std::nullopt, + std::optional{std::nullopt}, segment_offsets); auto pair_first = diff --git a/cpp/src/structure/decompress_to_edgelist_impl.cuh b/cpp/src/structure/decompress_to_edgelist_impl.cuh index fb0ffdb96c1..d653307c620 100644 --- a/cpp/src/structure/decompress_to_edgelist_impl.cuh +++ b/cpp/src/structure/decompress_to_edgelist_impl.cuh @@ -52,11 +52,13 @@ template , rmm::device_uvector, - std::optional>>> + std::optional>, + std::optional>>> decompress_to_edgelist_impl( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check) { @@ -86,6 +88,9 @@ decompress_to_edgelist_impl( rmm::device_uvector edgelist_majors(number_of_local_edges, handle.get_stream()); rmm::device_uvector edgelist_minors(edgelist_majors.size(), handle.get_stream()); + auto edgelist_ids = edge_id_view ? std::make_optional>( + edgelist_majors.size(), handle.get_stream()) + : std::nullopt; auto edgelist_weights = edge_weight_view ? std::make_optional>( edgelist_majors.size(), handle.get_stream()) : std::nullopt; @@ -101,10 +106,15 @@ decompress_to_edgelist_impl( detail::edge_partition_edge_property_device_view_t>( (*edge_weight_view), i) : std::nullopt, + edge_id_view ? std::make_optional< + detail::edge_partition_edge_property_device_view_t>( + (*edge_id_view), i) + : std::nullopt, edgelist_majors.data() + cur_size, edgelist_minors.data() + cur_size, edgelist_weights ? std::optional{(*edgelist_weights).data() + cur_size} : std::nullopt, + edgelist_ids ? std::optional{(*edgelist_ids).data() + cur_size} : std::nullopt, graph_view.local_edge_partition_segment_offsets(i)); cur_size += edgelist_edge_counts[i]; } @@ -131,16 +141,34 @@ decompress_to_edgelist_impl( major_ptrs[i] = edgelist_majors.data() + cur_size; minor_ptrs[i] = edgelist_minors.data() + cur_size; if (edgelist_weights) { - thrust::sort_by_key(handle.get_thrust_policy(), - minor_ptrs[i], - minor_ptrs[i] + edgelist_edge_counts[i], - thrust::make_zip_iterator(thrust::make_tuple( - major_ptrs[i], (*edgelist_weights).data() + cur_size))); + if (edgelist_ids) { + thrust::sort_by_key( + handle.get_thrust_policy(), + minor_ptrs[i], + minor_ptrs[i] + edgelist_edge_counts[i], + thrust::make_zip_iterator(thrust::make_tuple(major_ptrs[i], + (*edgelist_ids).data() + cur_size, + (*edgelist_weights).data() + cur_size))); + } else { + thrust::sort_by_key(handle.get_thrust_policy(), + minor_ptrs[i], + minor_ptrs[i] + edgelist_edge_counts[i], + thrust::make_zip_iterator(thrust::make_tuple( + major_ptrs[i], (*edgelist_weights).data() + cur_size))); + } } else { - thrust::sort_by_key(handle.get_thrust_policy(), - minor_ptrs[i], - minor_ptrs[i] + edgelist_edge_counts[i], - major_ptrs[i]); + if (edgelist_ids) { + thrust::sort_by_key(handle.get_thrust_policy(), + minor_ptrs[i], + minor_ptrs[i] + edgelist_edge_counts[i], + thrust::make_zip_iterator(thrust::make_tuple( + major_ptrs[i], (*edgelist_ids).data() + cur_size))); + } else { + thrust::sort_by_key(handle.get_thrust_policy(), + minor_ptrs[i], + minor_ptrs[i] + edgelist_edge_counts[i], + major_ptrs[i]); + } } rmm::device_uvector d_segment_offsets(d_thresholds.size(), handle.get_stream()); thrust::lower_bound(handle.get_thrust_policy(), @@ -172,7 +200,8 @@ decompress_to_edgelist_impl( return std::make_tuple(store_transposed ? std::move(edgelist_minors) : std::move(edgelist_majors), store_transposed ? 
std::move(edgelist_majors) : std::move(edgelist_minors), - std::move(edgelist_weights)); + std::move(edgelist_weights), + std::move(edgelist_ids)); } template , rmm::device_uvector, - std::optional>>> + std::optional>, + std::optional>>> decompress_to_edgelist_impl( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check) { @@ -206,6 +237,9 @@ decompress_to_edgelist_impl( auto edgelist_weights = edge_weight_view ? std::make_optional>( edgelist_majors.size(), handle.get_stream()) : std::nullopt; + auto edgelist_ids = edge_id_view ? std::make_optional>( + edgelist_majors.size(), handle.get_stream()) + : std::nullopt; detail::decompress_edge_partition_to_edgelist( handle, edge_partition_device_view_t( @@ -215,9 +249,14 @@ decompress_to_edgelist_impl( detail::edge_partition_edge_property_device_view_t>( (*edge_weight_view), 0) : std::nullopt, + edge_id_view ? std::make_optional< + detail::edge_partition_edge_property_device_view_t>( + (*edge_id_view), 0) + : std::nullopt, edgelist_majors.data(), edgelist_minors.data(), edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, + edgelist_ids ? std::optional{(*edgelist_ids).data()} : std::nullopt, graph_view.local_edge_partition_segment_offsets()); if (renumber_map) { @@ -232,7 +271,8 @@ decompress_to_edgelist_impl( return std::make_tuple(store_transposed ? std::move(edgelist_minors) : std::move(edgelist_majors), store_transposed ? std::move(edgelist_majors) : std::move(edgelist_minors), - std::move(edgelist_weights)); + std::move(edgelist_weights), + std::move(edgelist_ids)); } } // namespace @@ -244,18 +284,20 @@ template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check) { CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); return decompress_to_edgelist_impl( - handle, graph_view, edge_weight_view, renumber_map, do_expensive_check); + handle, graph_view, edge_weight_view, edge_id_view, renumber_map, do_expensive_check); } } // namespace cugraph diff --git a/cpp/src/structure/decompress_to_edgelist_mg.cu b/cpp/src/structure/decompress_to_edgelist_mg.cu index 9f03570504b..fbe56ca9b04 100644 --- a/cpp/src/structure/decompress_to_edgelist_mg.cu +++ b/cpp/src/structure/decompress_to_edgelist_mg.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,121 +21,145 @@ namespace cugraph { template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> 
edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); diff --git a/cpp/src/structure/decompress_to_edgelist_sg.cu b/cpp/src/structure/decompress_to_edgelist_sg.cu index 296f39fdfd2..5b8e410e087 100644 --- a/cpp/src/structure/decompress_to_edgelist_sg.cu +++ b/cpp/src/structure/decompress_to_edgelist_sg.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,121 +21,145 @@ namespace cugraph { template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, 
std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); template std::tuple, rmm::device_uvector, - std::optional>> + std::optional>, + std::optional>> decompress_to_edgelist( raft::handle_t const& handle, graph_view_t const& graph_view, std::optional> edge_weight_view, + std::optional> edge_id_view, std::optional> renumber_map, bool do_expensive_check); diff --git a/cpp/src/structure/symmetrize_graph_impl.cuh b/cpp/src/structure/symmetrize_graph_impl.cuh index 4afa4122a06..3334e089ba3 100644 --- a/cpp/src/structure/symmetrize_graph_impl.cuh +++ b/cpp/src/structure/symmetrize_graph_impl.cuh @@ -73,12 +73,17 @@ symmetrize_graph_impl( auto is_multigraph = graph.is_multigraph(); - auto [edgelist_srcs, edgelist_dsts, edgelist_weights] = decompress_to_edgelist( + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector edgelist_dsts(0, handle.get_stream()); + std::optional> edgelist_weights{std::nullopt}; + + std::tie(edgelist_srcs, edgelist_dsts, edgelist_weights, std::ignore) = decompress_to_edgelist( handle, graph_view, edge_weights ? std::optional>{(*edge_weights).view()} : std::nullopt, + std::optional>{std::nullopt}, std::make_optional>((*renumber_map).data(), (*renumber_map).size())); graph = graph_t(handle); @@ -158,12 +163,17 @@ symmetrize_graph_impl( auto is_multigraph = graph.is_multigraph(); bool renumber = renumber_map.has_value(); - auto [edgelist_srcs, edgelist_dsts, edgelist_weights] = decompress_to_edgelist( + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector edgelist_dsts(0, handle.get_stream()); + std::optional> edgelist_weights{std::nullopt}; + + std::tie(edgelist_srcs, edgelist_dsts, edgelist_weights, std::ignore) = decompress_to_edgelist( handle, graph_view, edge_weights ? std::optional>{(*edge_weights).view()} : std::nullopt, + std::optional>{std::nullopt}, renumber_map ? std::make_optional>((*renumber_map).data(), (*renumber_map).size()) : std::nullopt); diff --git a/cpp/src/structure/transpose_graph_impl.cuh b/cpp/src/structure/transpose_graph_impl.cuh index c2609362b0b..5b418a15478 100644 --- a/cpp/src/structure/transpose_graph_impl.cuh +++ b/cpp/src/structure/transpose_graph_impl.cuh @@ -74,12 +74,17 @@ transpose_graph_impl( auto is_multigraph = graph.is_multigraph(); - auto [edgelist_srcs, edgelist_dsts, edgelist_weights] = decompress_to_edgelist( + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector edgelist_dsts(0, handle.get_stream()); + std::optional> edgelist_weights{std::nullopt}; + + std::tie(edgelist_srcs, edgelist_dsts, edgelist_weights, std::ignore) = decompress_to_edgelist( handle, graph_view, edge_weights ? 
std::optional>{(*edge_weights).view()} : std::nullopt, + std::optional>{std::nullopt}, std::make_optional>((*renumber_map).data(), (*renumber_map).size())); graph = graph_t(handle); @@ -165,12 +170,17 @@ transpose_graph_impl( auto is_multigraph = graph.is_multigraph(); bool renumber = renumber_map.has_value(); - auto [edgelist_srcs, edgelist_dsts, edgelist_weights] = decompress_to_edgelist( + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector edgelist_dsts(0, handle.get_stream()); + std::optional> edgelist_weights{std::nullopt}; + + std::tie(edgelist_srcs, edgelist_dsts, edgelist_weights, std::ignore) = decompress_to_edgelist( handle, graph_view, edge_weights ? std::optional>{(*edge_weights).view()} : std::nullopt, + std::optional>{std::nullopt}, renumber_map ? std::make_optional>((*renumber_map).data(), (*renumber_map).size()) : std::nullopt); diff --git a/cpp/src/structure/transpose_graph_storage_impl.cuh b/cpp/src/structure/transpose_graph_storage_impl.cuh index b34d2f67dcd..980c9b10c53 100644 --- a/cpp/src/structure/transpose_graph_storage_impl.cuh +++ b/cpp/src/structure/transpose_graph_storage_impl.cuh @@ -74,12 +74,17 @@ transpose_graph_storage_impl( // FIXME: if is_symmetric is true we can do this more efficiently, // since the graph contents should be exactly the same - auto [edgelist_srcs, edgelist_dsts, edgelist_weights] = decompress_to_edgelist( + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector edgelist_dsts(0, handle.get_stream()); + std::optional> edgelist_weights{std::nullopt}; + + std::tie(edgelist_srcs, edgelist_dsts, edgelist_weights, std::ignore) = decompress_to_edgelist( handle, graph_view, edge_weights ? std::optional>{(*edge_weights).view()} : std::nullopt, + std::optional>{std::nullopt}, std::make_optional>((*renumber_map).data(), (*renumber_map).size())); graph = graph_t(handle); @@ -170,12 +175,17 @@ transpose_graph_storage_impl( // FIXME: if is_symmetric is true we can do this more efficiently, // since the graph contents should be exactly the same - auto [edgelist_srcs, edgelist_dsts, edgelist_weights] = decompress_to_edgelist( + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector edgelist_dsts(0, handle.get_stream()); + std::optional> edgelist_weights{std::nullopt}; + + std::tie(edgelist_srcs, edgelist_dsts, edgelist_weights, std::ignore) = decompress_to_edgelist( handle, graph_view, edge_weights ? std::optional>{(*edge_weights).view()} : std::nullopt, + std::optional>{std::nullopt}, renumber_map ? 
std::make_optional>((*renumber_map).data(), (*renumber_map).size()) : std::nullopt); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7d4a2181af1..3bcd5546455 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -606,6 +606,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureCTestMG(MG_CAPI_KATZ_TEST c_api/mg_katz_test.c c_api/mg_test_utils.cpp) ConfigureCTestMG(MG_CAPI_EIGENVECTOR_CENTRALITY_TEST c_api/mg_eigenvector_centrality_test.c c_api/mg_test_utils.cpp) ConfigureCTestMG(MG_CAPI_BETWEENNESS_CENTRALITY_TEST c_api/mg_betweenness_centrality_test.c c_api/mg_test_utils.cpp) + ConfigureCTestMG(MG_CAPI_EDGE_BETWEENNESS_CENTRALITY_TEST c_api/mg_edge_betweenness_centrality_test.c c_api/mg_test_utils.cpp) ConfigureCTestMG(MG_CAPI_HITS_TEST c_api/mg_hits_test.c c_api/mg_test_utils.cpp) ConfigureCTestMG(MG_CAPI_UNIFORM_NEIGHBOR_SAMPLE_TEST c_api/mg_uniform_neighbor_sample_test.c c_api/mg_test_utils.cpp) ConfigureCTestMG(MG_CAPI_RANDOM_WALKS_TEST c_api/mg_random_walks_test.c c_api/mg_test_utils.cpp) @@ -654,6 +655,7 @@ ConfigureCTest(CAPI_PAGERANK_TEST c_api/pagerank_test.c) ConfigureCTest(CAPI_KATZ_TEST c_api/katz_test.c) ConfigureCTest(CAPI_EIGENVECTOR_CENTRALITY_TEST c_api/eigenvector_centrality_test.c) ConfigureCTest(CAPI_BETWEENNESS_CENTRALITY_TEST c_api/betweenness_centrality_test.c) +ConfigureCTest(CAPI_EDGE_BETWEENNESS_CENTRALITY_TEST c_api/edge_betweenness_centrality_test.c) ConfigureCTest(CAPI_HITS_TEST c_api/hits_test.c) ConfigureCTest(CAPI_BFS_TEST c_api/bfs_test.c) ConfigureCTest(CAPI_SSSP_TEST c_api/sssp_test.c) diff --git a/cpp/tests/c_api/edge_betweenness_centrality.c b/cpp/tests/c_api/edge_betweenness_centrality_test.c similarity index 51% rename from cpp/tests/c_api/edge_betweenness_centrality.c rename to cpp/tests/c_api/edge_betweenness_centrality_test.c index 7a56f90eac7..ab119288fab 100644 --- a/cpp/tests/c_api/edge_betweenness_centrality.c +++ b/cpp/tests/c_api/edge_betweenness_centrality_test.c @@ -29,9 +29,11 @@ typedef float weight_t; int generic_edge_betweenness_centrality_test(vertex_t* h_src, vertex_t* h_dst, weight_t* h_wgt, + vertex_t* h_seeds, weight_t* h_result, size_t num_vertices, size_t num_edges, + size_t num_seeds, bool_t store_transposed, size_t num_vertices_to_sample) { @@ -40,64 +42,102 @@ int generic_edge_betweenness_centrality_test(vertex_t* h_src, cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; cugraph_error_t* ret_error; - cugraph_resource_handle_t* p_handle = NULL; - cugraph_graph_t* p_graph = NULL; - cugraph_centrality_result_t* p_result = NULL; - cugraph_rng_state_t* rng_state = NULL; + cugraph_resource_handle_t* handle = NULL; + cugraph_graph_t* graph = NULL; + cugraph_edge_centrality_result_t* result = NULL; + cugraph_rng_state_t* rng_state = NULL; + cugraph_type_erased_device_array_t* seeds = NULL; + cugraph_type_erased_device_array_view_t* seeds_view = NULL; - p_handle = cugraph_create_resource_handle(NULL); - TEST_ASSERT(test_ret_value, p_handle != NULL, "resource handle creation failed."); + handle = cugraph_create_resource_handle(NULL); + TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed."); - ret_code = cugraph_rng_state_create(p_handle, 0, &rng_state, &ret_error); + ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "failed to create rng_state."); - ret_code = create_test_graph(p_handle, + ret_code = create_test_graph(handle, h_src, h_dst, h_wgt, num_edges, - rng_state, store_transposed, FALSE, FALSE, - 
&p_graph, + &graph, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + if (h_seeds == NULL) { + ret_code = cugraph_select_random_vertices( + handle, graph, rng_state, num_vertices_to_sample, &seeds, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "select random seeds failed."); + + seeds_view = cugraph_type_erased_device_array_view(seeds); + } else { + ret_code = + cugraph_type_erased_device_array_create(handle, num_seeds, INT32, &seeds, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "seeds create failed."); + + seeds_view = cugraph_type_erased_device_array_view(seeds); + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, seeds_view, (byte_t*)h_seeds, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "seeds copy_from_host failed."); + } + ret_code = cugraph_edge_betweenness_centrality( - p_handle, p_graph, num_vertices_to_sample, NULL, FALSE, FALSE, &p_result, &ret_error); + handle, graph, seeds_view, FALSE, FALSE, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT( test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_edge_betweenness_centrality failed."); - cugraph_type_erased_device_array_view_t* vertices; + cugraph_type_erased_device_array_view_t* srcs; + cugraph_type_erased_device_array_view_t* dsts; cugraph_type_erased_device_array_view_t* centralities; - vertices = cugraph_centrality_result_get_vertices(p_result); - centralities = cugraph_centrality_result_get_values(p_result); + srcs = cugraph_edge_centrality_result_get_src_vertices(result); + dsts = cugraph_edge_centrality_result_get_dst_vertices(result); + centralities = cugraph_edge_centrality_result_get_values(result); + + size_t num_local_edges = cugraph_type_erased_device_array_view_size(srcs); - vertex_t h_vertices[num_vertices]; - weight_t h_centralities[num_vertices]; + vertex_t h_cugraph_src[num_local_edges]; + vertex_t h_cugraph_dst[num_local_edges]; + weight_t h_centralities[num_local_edges]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_cugraph_src, srcs , &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); ret_code = cugraph_type_erased_device_array_view_copy_to_host( - p_handle, (byte_t*)h_vertices, vertices, &ret_error); + handle, (byte_t*)h_cugraph_dst, dsts, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); ret_code = cugraph_type_erased_device_array_view_copy_to_host( - p_handle, (byte_t*)h_centralities, centralities, &ret_error); + handle, (byte_t*)h_centralities, centralities, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - for (int i = 0; (i < num_vertices) && (test_ret_value == 0); ++i) { + weight_t M[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) { + M[i][j] = 0.0; + } + + for (int i = 0; i < num_edges; ++i) { + M[h_src[i]][h_dst[i]] = h_result[i]; + } + + for (int i = 0; (i < num_local_edges) && (test_ret_value == 0); ++i) { TEST_ASSERT(test_ret_value, - nearlyEqual(h_result[h_vertices[i]], h_centralities[i], 0.001), - "centralities results don't match"); + nearlyEqual(M[h_cugraph_src[i]][h_cugraph_dst[i]], h_centralities[i], 0.001), + "betweenness 
centrality results don't match"); } - cugraph_centrality_result_free(p_result); - cugraph_sg_graph_free(p_graph); - cugraph_free_resource_handle(p_handle); + cugraph_edge_centrality_result_free(result); + cugraph_sg_graph_free(graph); + cugraph_free_resource_handle(handle); cugraph_error_free(ret_error); return test_ret_value; @@ -112,14 +152,14 @@ int test_edge_betweenness_centrality() vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; weight_t h_wgt[] = { 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f, 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - weight_t h_result[] = {0.236325, 0.292055, 0.458457, 0.60533, 0.190498, 0.495942}; + weight_t h_result[] = { 0, 2, 3, 1.83333, 2, 2, 3, 2, 3.16667, 2.83333, 4.33333, 0, 2, 2.83333, 3.66667, 2.33333 }; double epsilon = 1e-6; size_t max_iterations = 200; // Eigenvector centrality wants store_transposed = TRUE return generic_edge_betweenness_centrality_test( - h_src, h_dst, h_wgt, h_result, num_vertices, num_edges, TRUE, 5); + h_src, h_dst, h_wgt, NULL, h_result, num_vertices, num_edges, 0, TRUE, 5); } /******************************************************************************/ diff --git a/cpp/tests/c_api/mg_edge_betweenness_centrality.c b/cpp/tests/c_api/mg_edge_betweenness_centrality_test.c similarity index 54% rename from cpp/tests/c_api/mg_edge_betweenness_centrality.c rename to cpp/tests/c_api/mg_edge_betweenness_centrality_test.c index 17ce717dcfe..13f0085be84 100644 --- a/cpp/tests/c_api/mg_edge_betweenness_centrality.c +++ b/cpp/tests/c_api/mg_edge_betweenness_centrality_test.c @@ -29,9 +29,11 @@ int generic_edge_betweenness_centrality_test(const cugraph_resource_handle_t* ha vertex_t* h_src, vertex_t* h_dst, weight_t* h_wgt, + vertex_t* h_seeds, weight_t* h_result, size_t num_vertices, size_t num_edges, + size_t num_seeds, bool_t store_transposed, size_t num_vertices_to_sample) { @@ -40,16 +42,43 @@ int generic_edge_betweenness_centrality_test(const cugraph_resource_handle_t* ha cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; cugraph_error_t* ret_error; - cugraph_graph_t* p_graph = NULL; - cugraph_centrality_result_t* p_result = NULL; + cugraph_graph_t* graph = NULL; + cugraph_edge_centrality_result_t* result = NULL; + cugraph_rng_state_t* rng_state = NULL; + cugraph_type_erased_device_array_t* seeds = NULL; + cugraph_type_erased_device_array_view_t* seeds_view = NULL; ret_code = create_mg_test_graph( - handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, &p_graph, &ret_error); + handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, &graph, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_mg_test_graph failed."); + int rank = cugraph_resource_handle_get_rank(handle); + + ret_code = cugraph_rng_state_create(handle, rank, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "failed to create rng_state."); + + if (h_seeds == NULL) { + ret_code = cugraph_select_random_vertices( + handle, graph, rng_state, num_vertices_to_sample, &seeds, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "select random seeds failed."); + + seeds_view = cugraph_type_erased_device_array_view(seeds); + } else { + if (rank > 0) num_seeds = 0; + + ret_code = + cugraph_type_erased_device_array_create(handle, num_seeds, INT32, &seeds, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "seeds create failed."); + + seeds_view = cugraph_type_erased_device_array_view(seeds); + ret_code = 
cugraph_type_erased_device_array_view_copy_from_host( + handle, seeds_view, (byte_t*)h_seeds, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "seeds copy_from_host failed."); + } + ret_code = cugraph_edge_betweenness_centrality( - handle, p_graph, num_vertices_to_sample, NULL, FALSE, FALSE, &p_result, &ret_error); + handle, graph, seeds_view, FALSE, FALSE, &result, &ret_error); TEST_ASSERT( test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_edge_betweenness_centrality failed."); @@ -57,33 +86,51 @@ int generic_edge_betweenness_centrality_test(const cugraph_resource_handle_t* ha // the returned values with the expected results for the entire // graph. Each GPU will have a subset of the total vertices, so // they will do a subset of the comparisons. - cugraph_type_erased_device_array_view_t* vertices; + cugraph_type_erased_device_array_view_t* srcs; + cugraph_type_erased_device_array_view_t* dsts; cugraph_type_erased_device_array_view_t* centralities; - vertices = cugraph_centrality_result_get_vertices(p_result); - centralities = cugraph_centrality_result_get_values(p_result); + srcs = cugraph_edge_centrality_result_get_src_vertices(result); + dsts = cugraph_edge_centrality_result_get_dst_vertices(result); + centralities = cugraph_edge_centrality_result_get_values(result); + + size_t num_local_edges = cugraph_type_erased_device_array_view_size(srcs); - vertex_t h_vertices[num_vertices]; - weight_t h_centralities[num_vertices]; + vertex_t h_cugraph_src[num_local_edges]; + vertex_t h_cugraph_dst[num_local_edges]; + weight_t h_centralities[num_local_edges]; ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_vertices, vertices, &ret_error); + handle, (byte_t*)h_cugraph_src, srcs , &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_cugraph_dst, dsts, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); ret_code = cugraph_type_erased_device_array_view_copy_to_host( handle, (byte_t*)h_centralities, centralities, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - size_t num_local_vertices = cugraph_type_erased_device_array_view_size(vertices); + weight_t M[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) { + M[i][j] = 0.0; + } - for (int i = 0; (i < num_local_vertices) && (test_ret_value == 0); ++i) { + for (int i = 0; i < num_edges; ++i) { + M[h_src[i]][h_dst[i]] = h_result[i]; + } + + for (int i = 0; (i < num_local_edges) && (test_ret_value == 0); ++i) { TEST_ASSERT(test_ret_value, - nearlyEqual(h_result[h_vertices[i]], h_centralities[i], 0.001), + nearlyEqual(M[h_cugraph_src[i]][h_cugraph_dst[i]], h_centralities[i], 0.001), "betweenness centrality results don't match"); } - cugraph_centrality_result_free(p_result); - cugraph_mg_graph_free(p_graph); + cugraph_edge_centrality_result_free(result); + cugraph_mg_graph_free(graph); cugraph_error_free(ret_error); return test_ret_value; @@ -98,14 +145,16 @@ int test_edge_betweenness_centrality(const cugraph_resource_handle_t* handle) vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; weight_t h_wgt[] = { 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f, 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - weight_t h_result[] = {0.236374, 0.292046, 0.458369, 0.605472, 0.190544, 0.495814}; + weight_t 
h_result[] = { 3.16667, 2.83333, 4.33333, 1.83333, 2, 2.83333, 3.66667, 2.33333, + 3.16667, 2.83333, 4.33333, 1.83333, 2, 2.83333, 3.66667, 2.33333 }; + double epsilon = 1e-6; size_t max_iterations = 200; // Eigenvector centrality wants store_transposed = TRUE return generic_edge_betweenness_centrality_test( - handle, h_src, h_dst, h_wgt, h_result, num_vertices, num_edges, TRUE, 5); + handle, h_src, h_dst, h_wgt, NULL, h_result, num_vertices, num_edges, 0, TRUE, 6); } /******************************************************************************/ diff --git a/cpp/tests/centrality/betweenness_centrality_reference.hpp b/cpp/tests/centrality/betweenness_centrality_reference.hpp index 9a86de934c3..3c60020265a 100644 --- a/cpp/tests/centrality/betweenness_centrality_reference.hpp +++ b/cpp/tests/centrality/betweenness_centrality_reference.hpp @@ -33,7 +33,7 @@ void ref_bfs(std::vector const& offsets, std::queue& Q, std::stack& S, std::vector& dist, - std::vector>& pred, + std::vector>>& pred, std::vector& sigmas, vertex_t source) { @@ -61,7 +61,7 @@ void ref_bfs(std::vector const& offsets, // Edge(v, w) on a shortest path? if (dist[nbr] == dist[v] + 1) { sigmas[nbr] += sigmas[v]; - pred[nbr].push_back(v); + pred[nbr].push_back(std::make_pair(v, nbr_idx)); } } } @@ -70,7 +70,7 @@ void ref_bfs(std::vector const& offsets, template void ref_accumulation(std::vector& result, std::stack& S, - std::vector>& pred, + std::vector>>& pred, std::vector& sigmas, std::vector& deltas, vertex_t source) @@ -80,8 +80,8 @@ void ref_accumulation(std::vector& result, while (!S.empty()) { vertex_t w = S.top(); S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); + for (auto v : pred[w]) { + deltas[v.first] += (sigmas[v.first] / sigmas[w]) * (1.0 + deltas[w]); } if (w != source) { result[w] += deltas[w]; } } @@ -90,7 +90,7 @@ void ref_accumulation(std::vector& result, template void ref_endpoints_accumulation(std::vector& result, std::stack& S, - std::vector>& pred, + std::vector>>& pred, std::vector& sigmas, std::vector& deltas, vertex_t source) @@ -101,17 +101,19 @@ void ref_endpoints_accumulation(std::vector& result, while (!S.empty()) { vertex_t w = S.top(); S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); + for (auto v : pred[w]) { + deltas[v.first] += (sigmas[v.first] / sigmas[w]) * (1.0 + deltas[w]); } if (w != source) { result[w] += deltas[w] + 1; } } } -template +template void ref_edge_accumulation(std::vector& result, + std::vector const& offsets, + std::vector const& indices, std::stack& S, - std::vector>& pred, + std::vector>>& pred, std::vector& sigmas, std::vector& deltas, vertex_t source) @@ -120,10 +122,12 @@ void ref_edge_accumulation(std::vector& result, while (!S.empty()) { vertex_t w = S.top(); S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); + for (auto v : pred[w]) { + double coefficient = (sigmas[v.first] / sigmas[w]) * (1.0 + deltas[w]); + + deltas[v.first] += coefficient; + result[v.second] += coefficient; } - if (w != source) { result[w] += deltas[w]; } } } @@ -181,7 +185,7 @@ std::vector betweenness_centrality_reference( std::stack S; std::vector dist(result.size()); - std::vector> pred(result.size()); + std::vector>> pred(result.size()); std::vector sigmas(result.size()); std::vector deltas(result.size()); @@ -220,14 +224,14 @@ std::vector edge_betweenness_centrality_reference( std::stack S; std::vector dist(offsets.size() - 1); - std::vector> 
pred(offsets.size() - 1); + std::vector>> pred(result.size()); std::vector sigmas(offsets.size() - 1); std::vector deltas(offsets.size() - 1); for (vertex_t s : seeds) { ref_bfs(offsets, indices, Q, S, dist, pred, sigmas, s); - ref_edge_accumulation(result, S, pred, sigmas, deltas, s); + ref_edge_accumulation(result, offsets, indices, S, pred, sigmas, deltas, s); } } return result; diff --git a/cpp/tests/centrality/edge_betweenness_centrality_test.cpp b/cpp/tests/centrality/edge_betweenness_centrality_test.cpp index bb223067e1c..e4d22ff069c 100644 --- a/cpp/tests/centrality/edge_betweenness_centrality_test.cpp +++ b/cpp/tests/centrality/edge_betweenness_centrality_test.cpp @@ -99,7 +99,6 @@ class Tests_EdgeBetweennessCentrality hr_timer.start("Edge betweenness centrality"); } -#if 0 auto d_centralities = cugraph::edge_betweenness_centrality( handle, graph_view, @@ -108,17 +107,6 @@ class Tests_EdgeBetweennessCentrality raft::device_span{d_seeds.data(), d_seeds.size()}), betweenness_usecase.normalized, do_expensive_check); -#else - EXPECT_THROW(cugraph::edge_betweenness_centrality( - handle, - graph_view, - edge_weight_view, - std::make_optional>( - raft::device_span{d_seeds.data(), d_seeds.size()}), - betweenness_usecase.normalized, - do_expensive_check), - cugraph::logic_error); -#endif if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -127,24 +115,34 @@ class Tests_EdgeBetweennessCentrality } if (betweenness_usecase.check_correctness) { -#if 0 - auto [h_offsets, h_indices, h_wgt] = cugraph::test::graph_to_host_csr(handle, graph_view, edge_weight_view); + // Compute reference edge betweenness result + auto [h_offsets, h_indices, h_wgt] = + cugraph::test::graph_to_host_csr(handle, graph_view, edge_weight_view); - auto h_seeds = cugraph::test::to_host(handle, d_seeds); + auto h_seeds = cugraph::test::to_host(handle, d_seeds); auto h_reference_centralities = - betweenness_centrality_reference(h_offsets, h_indices, h_wgt, h_seeds, betweenness_usecase.include_endpoints); + edge_betweenness_centrality_reference(h_offsets, h_indices, h_wgt, h_seeds); + + rmm::device_uvector d_reference_src_vertex_ids(0, handle.get_stream()); + rmm::device_uvector d_reference_dst_vertex_ids(0, handle.get_stream()); + + std::tie(d_reference_src_vertex_ids, d_reference_dst_vertex_ids, std::ignore) = + cugraph::test::graph_to_device_coo(handle, graph_view, edge_weight_view); auto d_reference_centralities = cugraph::test::to_device(handle, h_reference_centralities); - // Need to get edges in order... 
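// Edge betweenness produces one value per edge, and the edge order obtained
// by decompressing the graph is not guaranteed to match the order of the
// reference results, so the comparison below first materializes both sides
// as (src, dst, value) triples. A hedged host-side sketch of the same idea,
// using hypothetical COO arrays h_src/h_dst/h_expected for the reference and
// h_got_src/h_got_dst/h_got_vals for the computed result (assumes no
// multi-edges; the C tests earlier in this patch use a dense
// num_vertices x num_vertices matrix for the same purpose; needs <map>,
// <utility>, and gtest's EXPECT_NEAR):
//
//   std::map<std::pair<vertex_t, vertex_t>, weight_t> expected;
//   for (size_t i = 0; i < h_src.size(); ++i) {
//     expected[{h_src[i], h_dst[i]}] = h_expected[i];
//   }
//   for (size_t i = 0; i < h_got_src.size(); ++i) {
//     EXPECT_NEAR(expected[{h_got_src[i], h_got_dst[i]}], h_got_vals[i], 1e-3);
//   }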
+ auto [d_cugraph_src_vertex_ids, d_cugraph_dst_vertex_ids, d_cugraph_results] = + cugraph::test::graph_to_device_coo( + handle, graph_view, std::make_optional(d_centralities.view())); cugraph::test::edge_betweenness_centrality_validate(handle, - d_renumber_map_labels, - d_centralities, - std::nullopt, + d_cugraph_src_vertex_ids, + d_cugraph_dst_vertex_ids, + *d_cugraph_results, + d_reference_src_vertex_ids, + d_reference_dst_vertex_ids, d_reference_centralities); -#endif } } }; @@ -188,7 +186,6 @@ INSTANTIATE_TEST_SUITE_P( EdgeBetweennessCentrality_Usecase{20, false, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), - cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); INSTANTIATE_TEST_SUITE_P( diff --git a/cpp/tests/centrality/eigenvector_centrality_test.cpp b/cpp/tests/centrality/eigenvector_centrality_test.cpp index f3408d9b131..7cafcfbde85 100644 --- a/cpp/tests/centrality/eigenvector_centrality_test.cpp +++ b/cpp/tests/centrality/eigenvector_centrality_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -167,10 +167,15 @@ class Tests_EigenvectorCentrality } if (eigenvector_usecase.check_correctness) { - auto [dst_v, src_v, opt_wgt_v] = cugraph::decompress_to_edgelist( + rmm::device_uvector dst_v(0, handle.get_stream()); + rmm::device_uvector src_v(0, handle.get_stream()); + std::optional> opt_wgt_v{std::nullopt}; + + std::tie(dst_v, src_v, opt_wgt_v, std::ignore) = cugraph::decompress_to_edgelist( handle, graph_view, edge_weight_view, + std::optional>{std::nullopt}, std::optional>{std::nullopt}); auto h_src = cugraph::test::to_host(handle, src_v); diff --git a/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp b/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp index a1e73b6147b..ebc49e4a3e4 100644 --- a/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp +++ b/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp @@ -57,6 +57,8 @@ class Tests_MGEdgeBetweennessCentrality template void run_current_test(std::tuple const& param) { + constexpr bool do_expensive_check = false; + auto [betweenness_usecase, input_usecase] = param; HighResTimer hr_timer{}; @@ -83,7 +85,7 @@ class Tests_MGEdgeBetweennessCentrality mg_edge_weights ? 
std::make_optional((*mg_edge_weights).view()) : std::nullopt; raft::random::RngState rng_state(handle_->get_comms().get_rank()); - auto d_seeds = cugraph::select_random_vertices( + auto d_mg_seeds = cugraph::select_random_vertices( *handle_, mg_graph_view, std::optional>{std::nullopt}, @@ -98,24 +100,13 @@ class Tests_MGEdgeBetweennessCentrality hr_timer.start("MG edge betweenness centrality"); } -#if 0 auto d_centralities = cugraph::edge_betweenness_centrality( *handle_, mg_graph_view, mg_edge_weight_view, std::make_optional>( - raft::device_span{d_seeds.data(), d_seeds.size()}), + raft::device_span{d_mg_seeds.data(), d_mg_seeds.size()}), betweenness_usecase.normalized); -#else - EXPECT_THROW(cugraph::edge_betweenness_centrality( - *handle_, - mg_graph_view, - mg_edge_weight_view, - std::make_optional>( - raft::device_span{d_seeds.data(), d_seeds.size()}), - betweenness_usecase.normalized), - cugraph::logic_error); -#endif if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -125,22 +116,52 @@ class Tests_MGEdgeBetweennessCentrality } if (betweenness_usecase.check_correctness) { -#if 0 - d_centralities = cugraph::test::device_gatherv( - *handle_, raft::device_span(d_centralities.data(), d_centralities.size())); - d_seeds = cugraph::test::device_gatherv( - *handle_, raft::device_span(d_seeds.data(), d_seeds.size())); - - auto [h_src, h_dst, h_wgt] = cugraph::test::graph_to_host_coo(*handle_, graph_view); - - if (h_src.size() > 0) { - auto h_centralities = cugraph::test::to_host(*handle_, d_centralities); - auto h_seeds = cugraph::test::to_host(*handle_, d_seeds); - - cugraph::test::edge_betweenness_centrality_validate( - h_src, h_dst, h_wgt, h_centralities, h_seeds); + // Extract MG results + auto [d_cugraph_src_vertex_ids, d_cugraph_dst_vertex_ids, d_cugraph_results] = + cugraph::test::graph_to_device_coo( + *handle_, mg_graph_view, std::make_optional(d_centralities.view())); + + // Create SG graph so we can generate SG results + cugraph::graph_t sg_graph(*handle_); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph( + *handle_, + mg_graph_view, + mg_edge_weight_view, + std::optional>{std::nullopt}, + false); + + auto d_mg_aggregate_seeds = cugraph::test::device_gatherv( + *handle_, raft::device_span{d_mg_seeds.data(), d_mg_seeds.size()}); + + if (handle_->get_comms().get_rank() == 0) { + auto sg_edge_weights_view = + sg_edge_weights ? 
std::make_optional(sg_edge_weights->view()) : std::nullopt; + + // Generate SG results and compare + auto d_sg_centralities = cugraph::edge_betweenness_centrality( + *handle_, + sg_graph.view(), + sg_edge_weights_view, + std::make_optional>(raft::device_span{ + d_mg_aggregate_seeds.data(), d_mg_aggregate_seeds.size()}), + betweenness_usecase.normalized, + do_expensive_check); + + auto [d_sg_src_vertex_ids, d_sg_dst_vertex_ids, d_sg_reference_centralities] = + cugraph::test::graph_to_device_coo( + *handle_, sg_graph.view(), std::make_optional(d_sg_centralities.view())); + + cugraph::test::edge_betweenness_centrality_validate(*handle_, + d_cugraph_src_vertex_ids, + d_cugraph_dst_vertex_ids, + *d_cugraph_results, + d_sg_src_vertex_ids, + d_sg_dst_vertex_ids, + *d_sg_reference_centralities); } -#endif } } diff --git a/cpp/tests/community/egonet_validate.cu b/cpp/tests/community/egonet_validate.cu index 44b74090ec4..5fc94c5c07d 100644 --- a/cpp/tests/community/egonet_validate.cu +++ b/cpp/tests/community/egonet_validate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,11 +44,16 @@ egonet_reference( int radius) { #if 1 - auto [d_coo_src, d_coo_dst, d_coo_wgt] = - cugraph::decompress_to_edgelist(handle, - graph_view, - edge_weight_view, - std::optional>{std::nullopt}); + rmm::device_uvector d_coo_src(0, handle.get_stream()); + rmm::device_uvector d_coo_dst(0, handle.get_stream()); + std::optional> d_coo_wgt{std::nullopt}; + + std::tie(d_coo_src, d_coo_dst, d_coo_wgt, std::ignore) = cugraph::decompress_to_edgelist( + handle, + graph_view, + edge_weight_view, + std::optional>{std::nullopt}, + std::optional>{std::nullopt}); #else // FIXME: This should be faster (smaller list of edges to operate on), but uniform_nbr_sample // doesn't preserve multi-edges (which is probably a bug) diff --git a/cpp/tests/cores/k_core_validate.cu b/cpp/tests/cores/k_core_validate.cu index 687349dbbd7..b264ed53540 100644 --- a/cpp/tests/cores/k_core_validate.cu +++ b/cpp/tests/cores/k_core_validate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
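A second pattern worth noting, visible in the MG edge betweenness test above and in the test utilities later in this patch: rather than checking distributed results against a host reference, the test gathers everything to one rank and reruns the algorithm single-GPU. A condensed sketch of that flow, assuming the names from the test (`handle_`, `mg_graph_view`, `d_centralities`, `d_mg_seeds`) and assuming, as the utility's declaration later in the patch documents, that `graph_to_device_coo` leaves valid data only on rank 0 in the multi-GPU case:

    // 1. Materialize the MG result as (src, dst, value) triples on rank 0.
    auto [d_srcs, d_dsts, d_vals] = cugraph::test::graph_to_device_coo(
      *handle_, mg_graph_view, std::make_optional(d_centralities.view()));

    // 2. Gather the seed vertices so the SG rerun sees the same input.
    auto d_all_seeds = cugraph::test::device_gatherv(
      *handle_,
      raft::device_span<vertex_t const>{d_mg_seeds.data(), d_mg_seeds.size()});

    // 3. Rank 0 only: rebuild the graph single-GPU, rerun, and compare the
    //    two (src, dst, value) edge lists with the validate helper, as in
    //    the hunk above (mg_graph_to_sg_graph, edge_betweenness_centrality,
    //    edge_betweenness_centrality_validate).
    if (handle_->get_comms().get_rank() == 0) {
      // ... SG rerun and validation ...
    }

This sidesteps a slow host-side reference computation on large inputs and exercises the SG and MG code paths against each other.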
@@ -61,12 +61,17 @@ void check_correctness( EXPECT_EQ(error_count, 0) << "destination error count is non-zero"; - auto [graph_src, graph_dst, graph_wgt] = - cugraph::decompress_to_edgelist(handle, - graph_view, - edge_weight_view, - std::optional>{std::nullopt}, - false); + rmm::device_uvector graph_src(0, handle.get_stream()); + rmm::device_uvector graph_dst(0, handle.get_stream()); + std::optional> graph_wgt{std::nullopt}; + + std::tie(graph_src, graph_dst, graph_wgt, std::ignore) = cugraph::decompress_to_edgelist( + handle, + graph_view, + edge_weight_view, + std::optional>{std::nullopt}, + std::optional>{std::nullopt}, + false); // Now we'll count how many edges should be in the subgraph auto expected_edge_count = diff --git a/cpp/tests/prims/mg_transform_e.cu b/cpp/tests/prims/mg_transform_e.cu index 47def15fffc..ed29fb7c5e3 100644 --- a/cpp/tests/prims/mg_transform_e.cu +++ b/cpp/tests/prims/mg_transform_e.cu @@ -117,10 +117,11 @@ class Tests_MGTransformE { rmm::device_uvector srcs(0, handle_->get_stream()); rmm::device_uvector dsts(0, handle_->get_stream()); - std::tie(srcs, dsts, std::ignore) = cugraph::decompress_to_edgelist( + std::tie(srcs, dsts, std::ignore, std::ignore) = cugraph::decompress_to_edgelist( *handle_, mg_graph_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>{std::nullopt}); auto edge_first = thrust::make_zip_iterator( thrust::make_tuple(store_transposed ? dsts.begin() : srcs.begin(), diff --git a/cpp/tests/sampling/random_walks_check.cuh b/cpp/tests/sampling/random_walks_check.cuh index 4cd74f01bcb..f73891a1537 100644 --- a/cpp/tests/sampling/random_walks_check.cuh +++ b/cpp/tests/sampling/random_walks_check.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,11 +37,16 @@ void random_walks_validate( std::optional>&& d_weights, size_t max_length) { - auto [d_src, d_dst, d_wgt] = - cugraph::decompress_to_edgelist(handle, - graph_view, - edge_weight_view, - std::optional>{std::nullopt}); + rmm::device_uvector d_src(0, handle.get_stream()); + rmm::device_uvector d_dst(0, handle.get_stream()); + std::optional> d_wgt{std::nullopt}; + + std::tie(d_src, d_dst, d_wgt, std::ignore) = cugraph::decompress_to_edgelist( + handle, + graph_view, + edge_weight_view, + std::optional>{std::nullopt}, + std::optional>{std::nullopt}); if constexpr (multi_gpu) { d_src = cugraph::test::device_gatherv( diff --git a/cpp/tests/structure/mg_symmetrize_test.cpp b/cpp/tests/structure/mg_symmetrize_test.cpp index cdacff91403..f2d37170f76 100644 --- a/cpp/tests/structure/mg_symmetrize_test.cpp +++ b/cpp/tests/structure/mg_symmetrize_test.cpp @@ -123,11 +123,15 @@ class Tests_MGSymmetrize if (symmetrize_usecase.check_correctness) { // 4-1. decompress MG results + rmm::device_uvector d_mg_srcs(0, handle_->get_stream()); + rmm::device_uvector d_mg_dsts(0, handle_->get_stream()); + std::optional> d_mg_weights{std::nullopt}; - auto [d_mg_srcs, d_mg_dsts, d_mg_weights] = cugraph::decompress_to_edgelist( + std::tie(d_mg_srcs, d_mg_dsts, d_mg_weights, std::ignore) = cugraph::decompress_to_edgelist( *handle_, mg_graph.view(), mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, mg_renumber_map ? 
std::make_optional>( (*mg_renumber_map).data(), (*mg_renumber_map).size()) : std::nullopt); @@ -157,11 +161,15 @@ class Tests_MGSymmetrize ASSERT_FALSE(d_sg_renumber_map_labels.has_value()); // 4-4. decompress SG results + rmm::device_uvector d_sg_srcs(0, handle_->get_stream()); + rmm::device_uvector d_sg_dsts(0, handle_->get_stream()); + std::optional> d_sg_weights{std::nullopt}; - auto [d_sg_srcs, d_sg_dsts, d_sg_weights] = cugraph::decompress_to_edgelist( + std::tie(d_sg_srcs, d_sg_dsts, d_sg_weights, std::ignore) = cugraph::decompress_to_edgelist( *handle_, sg_graph.view(), sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, std::optional>{std::nullopt}); // 4-5. compare diff --git a/cpp/tests/structure/mg_transpose_storage_test.cpp b/cpp/tests/structure/mg_transpose_storage_test.cpp index b6033a7ab53..1adce8d102e 100644 --- a/cpp/tests/structure/mg_transpose_storage_test.cpp +++ b/cpp/tests/structure/mg_transpose_storage_test.cpp @@ -131,13 +131,17 @@ class Tests_MGTransposeStorage if (transpose_storage_usecase.check_correctness) { // 3-1. decompress MG results + rmm::device_uvector d_mg_srcs(0, handle_->get_stream()); + rmm::device_uvector d_mg_dsts(0, handle_->get_stream()); + std::optional> d_mg_weights{std::nullopt}; - auto [d_mg_srcs, d_mg_dsts, d_mg_weights] = cugraph::decompress_to_edgelist( + std::tie(d_mg_srcs, d_mg_dsts, d_mg_weights, std::ignore) = cugraph::decompress_to_edgelist( *handle_, mg_storage_transposed_graph.view(), mg_storage_transposed_edge_weights ? std::make_optional((*mg_storage_transposed_edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, mg_renumber_map ? std::make_optional>( (*mg_renumber_map).data(), (*mg_renumber_map).size()) : std::nullopt); @@ -156,11 +160,15 @@ class Tests_MGTransposeStorage if (handle_->get_comms().get_rank() == int{0}) { // 3-3. decompress SG results + rmm::device_uvector d_sg_srcs(0, handle_->get_stream()); + rmm::device_uvector d_sg_dsts(0, handle_->get_stream()); + std::optional> d_sg_weights{std::nullopt}; - auto [d_sg_srcs, d_sg_dsts, d_sg_weights] = cugraph::decompress_to_edgelist( + std::tie(d_sg_srcs, d_sg_dsts, d_sg_weights, std::ignore) = cugraph::decompress_to_edgelist( *handle_, sg_graph.view(), sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, std::optional>{std::nullopt}); // 3-4. compare diff --git a/cpp/tests/structure/mg_transpose_test.cpp b/cpp/tests/structure/mg_transpose_test.cpp index 3558e0f7d97..03a31e14ca9 100644 --- a/cpp/tests/structure/mg_transpose_test.cpp +++ b/cpp/tests/structure/mg_transpose_test.cpp @@ -121,11 +121,15 @@ class Tests_MGTranspose if (transpose_usecase.check_correctness) { // 4-1. decompress MG results + rmm::device_uvector d_mg_srcs(0, handle_->get_stream()); + rmm::device_uvector d_mg_dsts(0, handle_->get_stream()); + std::optional> d_mg_weights{std::nullopt}; - auto [d_mg_srcs, d_mg_dsts, d_mg_weights] = cugraph::decompress_to_edgelist( + std::tie(d_mg_srcs, d_mg_dsts, d_mg_weights, std::ignore) = cugraph::decompress_to_edgelist( *handle_, mg_graph.view(), mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, mg_renumber_map ? std::make_optional>( (*mg_renumber_map).data(), (*mg_renumber_map).size()) : std::nullopt); @@ -152,11 +156,15 @@ class Tests_MGTranspose std::optional>{std::nullopt}); // 4-4. 
decompress SG results + rmm::device_uvector d_sg_srcs(0, handle_->get_stream()); + rmm::device_uvector d_sg_dsts(0, handle_->get_stream()); + std::optional> d_sg_weights{std::nullopt}; - auto [d_sg_srcs, d_sg_dsts, d_sg_weights] = cugraph::decompress_to_edgelist( + std::tie(d_sg_srcs, d_sg_dsts, d_sg_weights, std::ignore) = cugraph::decompress_to_edgelist( *handle_, sg_graph.view(), sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, std::optional>{std::nullopt}); // 4-5. compare diff --git a/cpp/tests/structure/symmetrize_test.cpp b/cpp/tests/structure/symmetrize_test.cpp index 9673b29e389..89ff9ed139a 100644 --- a/cpp/tests/structure/symmetrize_test.cpp +++ b/cpp/tests/structure/symmetrize_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -211,14 +211,18 @@ class Tests_Symmetrize rmm::device_uvector d_org_srcs(0, handle.get_stream()); rmm::device_uvector d_org_dsts(0, handle.get_stream()); std::optional> d_org_weights{std::nullopt}; + if (symmetrize_usecase.check_correctness) { - std::tie(d_org_srcs, d_org_dsts, d_org_weights) = cugraph::decompress_to_edgelist( - handle, - graph.view(), - edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, - d_renumber_map_labels ? std::make_optional>( - (*d_renumber_map_labels).data(), (*d_renumber_map_labels).size()) - : std::nullopt); + std::tie(d_org_srcs, d_org_dsts, d_org_weights, std::ignore) = + cugraph::decompress_to_edgelist( + handle, + graph.view(), + edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, + d_renumber_map_labels + ? std::make_optional>((*d_renumber_map_labels).data(), + (*d_renumber_map_labels).size()) + : std::nullopt); } if (cugraph::test::g_perf) { @@ -240,13 +244,20 @@ class Tests_Symmetrize } if (symmetrize_usecase.check_correctness) { - auto [d_symm_srcs, d_symm_dsts, d_symm_weights] = cugraph::decompress_to_edgelist( - handle, - graph.view(), - edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, - d_renumber_map_labels ? std::make_optional>( - (*d_renumber_map_labels).data(), (*d_renumber_map_labels).size()) - : std::nullopt); + rmm::device_uvector d_symm_srcs(0, handle.get_stream()); + rmm::device_uvector d_symm_dsts(0, handle.get_stream()); + std::optional> d_symm_weights{std::nullopt}; + + std::tie(d_symm_srcs, d_symm_dsts, d_symm_weights, std::ignore) = + cugraph::decompress_to_edgelist( + handle, + graph.view(), + edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, + d_renumber_map_labels + ? std::make_optional>((*d_renumber_map_labels).data(), + (*d_renumber_map_labels).size()) + : std::nullopt); auto h_org_srcs = cugraph::test::to_host(handle, d_org_srcs); auto h_org_dsts = cugraph::test::to_host(handle, d_org_dsts); diff --git a/cpp/tests/structure/transpose_storage_test.cpp b/cpp/tests/structure/transpose_storage_test.cpp index 8c94e62d68b..a713abf7dae 100644 --- a/cpp/tests/structure/transpose_storage_test.cpp +++ b/cpp/tests/structure/transpose_storage_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -77,13 +77,16 @@ class Tests_TransposeStorage rmm::device_uvector d_org_dsts(0, handle.get_stream()); std::optional> d_org_weights{std::nullopt}; if (transpose_storage_usecase.check_correctness) { - std::tie(d_org_srcs, d_org_dsts, d_org_weights) = cugraph::decompress_to_edgelist( - handle, - graph.view(), - edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, - d_renumber_map_labels ? std::make_optional>( - (*d_renumber_map_labels).data(), (*d_renumber_map_labels).size()) - : std::nullopt); + std::tie(d_org_srcs, d_org_dsts, d_org_weights, std::ignore) = + cugraph::decompress_to_edgelist( + handle, + graph.view(), + edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, + d_renumber_map_labels + ? std::make_optional>((*d_renumber_map_labels).data(), + (*d_renumber_map_labels).size()) + : std::nullopt); } if (cugraph::test::g_perf) { @@ -107,13 +110,21 @@ class Tests_TransposeStorage } if (transpose_storage_usecase.check_correctness) { - auto [d_storage_transposed_srcs, d_storage_transposed_dsts, d_storage_transposed_weights] = + rmm::device_uvector d_storage_transposed_srcs(0, handle.get_stream()); + rmm::device_uvector d_storage_transposed_dsts(0, handle.get_stream()); + std::optional> d_storage_transposed_weights{std::nullopt}; + + std::tie(d_storage_transposed_srcs, + d_storage_transposed_dsts, + d_storage_transposed_weights, + std::ignore) = cugraph::decompress_to_edgelist( handle, storage_transposed_graph.view(), storage_transposed_edge_weights ? std::make_optional((*storage_transposed_edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, d_renumber_map_labels ? std::make_optional>((*d_renumber_map_labels).data(), (*d_renumber_map_labels).size()) diff --git a/cpp/tests/structure/transpose_test.cpp b/cpp/tests/structure/transpose_test.cpp index 39ae7d7635e..1cbefa21fcc 100644 --- a/cpp/tests/structure/transpose_test.cpp +++ b/cpp/tests/structure/transpose_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -77,13 +77,16 @@ class Tests_Transpose rmm::device_uvector d_org_dsts(0, handle.get_stream()); std::optional> d_org_weights{std::nullopt}; if (transpose_usecase.check_correctness) { - std::tie(d_org_srcs, d_org_dsts, d_org_weights) = cugraph::decompress_to_edgelist( - handle, - graph.view(), - edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, - d_renumber_map_labels ? std::make_optional>( - (*d_renumber_map_labels).data(), (*d_renumber_map_labels).size()) - : std::nullopt); + std::tie(d_org_srcs, d_org_dsts, d_org_weights, std::ignore) = + cugraph::decompress_to_edgelist( + handle, + graph.view(), + edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, + d_renumber_map_labels + ? 
std::make_optional>((*d_renumber_map_labels).data(), + (*d_renumber_map_labels).size()) + : std::nullopt); } if (cugraph::test::g_perf) { @@ -101,11 +104,16 @@ class Tests_Transpose } if (transpose_usecase.check_correctness) { - auto [d_transposed_srcs, d_transposed_dsts, d_transposed_weights] = + rmm::device_uvector d_transposed_srcs(0, handle.get_stream()); + rmm::device_uvector d_transposed_dsts(0, handle.get_stream()); + std::optional> d_transposed_weights{std::nullopt}; + + std::tie(d_transposed_srcs, d_transposed_dsts, d_transposed_weights, std::ignore) = cugraph::decompress_to_edgelist( handle, graph.view(), edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt, + std::optional>{std::nullopt}, d_renumber_map_labels ? std::make_optional>((*d_renumber_map_labels).data(), (*d_renumber_map_labels).size()) diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 615522a863b..1fa869ac2df 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -480,6 +480,20 @@ graph_to_host_coo( cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view); +// If multi-GPU, only the rank 0 GPU holds the valid data +template +std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + // If multi-GPU, only the rank 0 GPU holds the valid data template const& graph_view, std::optional> edge_weight_view) { - auto [d_src, d_dst, d_wgt] = - cugraph::decompress_to_edgelist(handle, - graph_view, - edge_weight_view, - std::optional>{std::nullopt}); + rmm::device_uvector d_src(0, handle.get_stream()); + rmm::device_uvector d_dst(0, handle.get_stream()); + std::optional> d_wgt{std::nullopt}; + + std::tie(d_src, d_dst, d_wgt, std::ignore) = cugraph::decompress_to_edgelist( + handle, + graph_view, + edge_weight_view, + std::optional>{std::nullopt}, + std::optional>{std::nullopt}); if constexpr (is_multi_gpu) { d_src = cugraph::test::device_gatherv( @@ -89,6 +94,53 @@ graph_to_host_coo( return std::make_tuple(std::move(h_src), std::move(h_dst), std::move(h_wgt)); } +template +std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view) +{ + rmm::device_uvector d_src(0, handle.get_stream()); + rmm::device_uvector d_dst(0, handle.get_stream()); + std::optional> d_wgt{std::nullopt}; + + std::tie(d_src, d_dst, d_wgt, std::ignore) = cugraph::decompress_to_edgelist( + handle, + graph_view, + edge_weight_view, + std::optional>{std::nullopt}, + std::optional>{std::nullopt}); + + if constexpr (is_multi_gpu) { + d_src = cugraph::test::device_gatherv( + handle, raft::device_span{d_src.data(), d_src.size()}); + d_dst = cugraph::test::device_gatherv( + handle, raft::device_span{d_dst.data(), d_dst.size()}); + if (d_wgt) + *d_wgt = cugraph::test::device_gatherv( + handle, raft::device_span{d_wgt->data(), d_wgt->size()}); + if (handle.get_comms().get_rank() != 0) { + d_src.resize(0, handle.get_stream()); + d_src.shrink_to_fit(handle.get_stream()); + d_dst.resize(0, handle.get_stream()); + d_dst.shrink_to_fit(handle.get_stream()); + if (d_wgt) { + (*d_wgt).resize(0, handle.get_stream()); + (*d_wgt).shrink_to_fit(handle.get_stream()); + } + } + } + + return std::make_tuple(std::move(d_src), std::move(d_dst), std::move(d_wgt)); +} + template const& graph_view, 
std::optional> edge_weight_view) { - auto [d_src, d_dst, d_wgt] = - cugraph::decompress_to_edgelist(handle, - graph_view, - edge_weight_view, - std::optional>{std::nullopt}); + rmm::device_uvector d_src(0, handle.get_stream()); + rmm::device_uvector d_dst(0, handle.get_stream()); + std::optional> d_wgt{std::nullopt}; + + std::tie(d_src, d_dst, d_wgt, std::ignore) = cugraph::decompress_to_edgelist( + handle, + graph_view, + edge_weight_view, + std::optional>{std::nullopt}, + std::optional>{std::nullopt}); if constexpr (is_multi_gpu) { d_src = cugraph::test::device_gatherv( @@ -184,8 +241,16 @@ mg_graph_to_sg_graph( std::optional> number_map, bool renumber) { - auto [d_src, d_dst, d_wgt] = - cugraph::decompress_to_edgelist(handle, graph_view, edge_weight_view, number_map); + rmm::device_uvector d_src(0, handle.get_stream()); + rmm::device_uvector d_dst(0, handle.get_stream()); + std::optional> d_wgt{std::nullopt}; + + std::tie(d_src, d_dst, d_wgt, std::ignore) = cugraph::decompress_to_edgelist( + handle, + graph_view, + edge_weight_view, + std::optional>{std::nullopt}, + number_map); d_src = cugraph::test::device_gatherv( handle, raft::device_span{d_src.data(), d_src.size()}); diff --git a/cpp/tests/utilities/test_utilities_mg.cu b/cpp/tests/utilities/test_utilities_mg.cu index b572f7df23a..7366a8376a4 100644 --- a/cpp/tests/utilities/test_utilities_mg.cu +++ b/cpp/tests/utilities/test_utilities_mg.cu @@ -90,6 +90,102 @@ graph_to_host_coo( cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view); +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + 
rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + template std::tuple, std::vector, std::optional>> graph_to_host_csr( raft::handle_t const& handle, diff --git a/cpp/tests/utilities/test_utilities_sg.cu b/cpp/tests/utilities/test_utilities_sg.cu index a5a4fecb4e5..aceff526f21 100644 --- a/cpp/tests/utilities/test_utilities_sg.cu +++ b/cpp/tests/utilities/test_utilities_sg.cu @@ -90,6 +90,102 @@ graph_to_host_coo( cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view); +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + +template std::tuple, + rmm::device_uvector, + std::optional>> +graph_to_device_coo( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view); + template std::tuple, std::vector, std::optional>> graph_to_host_csr( raft::handle_t const& handle, From 5ea63c0527f54d5a6b8aa95f016c580ee87c1948 Mon Sep 17 00:00:00 2001 From: Don Acosta <97529984+acostadon@users.noreply.github.com> Date: Tue, 27 Jun 2023 12:23:39 -0400 Subject: [PATCH 05/10] this fixes github links in cugraph, cugraph-dgl and cugraph-pyg 
(#3650) resolves #3635 AJ Schmidt came up with this fix. It does correct all the links in the cugraph projects. Authors: - Don Acosta (https://github.com/acostadon) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3650 --- docs/cugraph/source/conf.py | 5 +- docs/cugraph/source/sphinxext/github_link.py | 72 ++++++++------------ 2 files changed, 29 insertions(+), 48 deletions(-) diff --git a/docs/cugraph/source/conf.py b/docs/cugraph/source/conf.py index 394acf0e950..b64901772dc 100644 --- a/docs/cugraph/source/conf.py +++ b/docs/cugraph/source/conf.py @@ -204,6 +204,5 @@ def setup(app): # The following is used by sphinx.ext.linkcode to provide links to github linkcode_resolve = make_linkcode_resolve( - 'cugraph', 'https://github.com/rapidsai/' - 'cugraph/blob/{revision}/python/' - '{package}/{path}#L{lineno}') + "https://github.com/rapidsai/cugraph/blob/{revision}/python/{path}#L{lineno}" +) \ No newline at end of file diff --git a/docs/cugraph/source/sphinxext/github_link.py b/docs/cugraph/source/sphinxext/github_link.py index fa8fe3f5fe3..cc28dc6e897 100644 --- a/docs/cugraph/source/sphinxext/github_link.py +++ b/docs/cugraph/source/sphinxext/github_link.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,10 +16,8 @@ # license in /thirdparty/LICENSES/LICENSE.scikit_learn import inspect -import os import re import subprocess -import sys from functools import partial from operator import attrgetter @@ -56,7 +54,7 @@ def _get_git_revision(): return revision.decode('utf-8') -def _linkcode_resolve(domain, info, package, url_fmt, revision): +def _linkcode_resolve(domain, info, url_fmt, revision): """Determine a link to online source for a class/method/function This is called by sphinx.ext.linkcode @@ -73,7 +71,7 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision): if revision is None: return - if domain not in ('py', 'pyx'): + if domain != 'py': return if not info.get('module') or not info.get('fullname'): return @@ -89,41 +87,29 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision): fn: str = None lineno: str = None - try: - fn = inspect.getsourcefile(obj) - except Exception: - fn = None - if not fn: - try: - fn = inspect.getsourcefile(sys.modules[obj.__module__]) - except Exception: - fn = None - - if not fn: - # Possibly Cython code. Search docstring for source - m = source_regex.search(obj.__doc__) - - if (m is not None): - source_file = m.group(1) - lineno = m.group(2) - - # fn is expected to be the absolute path. 
- fn = os.path.relpath(source_file, start=package) - print("{}:{}".format( - os.path.abspath(os.path.join("..", "python", "cuml", fn)), - lineno)) - else: - return - else: - # Test if we are absolute or not (pyx are relative) - if (not os.path.isabs(fn)): - # Should be relative to docs right now - fn = os.path.abspath(os.path.join("..", "python", fn)) - - # Convert to relative from module root - fn = os.path.relpath(fn, - start=os.path.dirname( - __import__(package).__file__)) + obj_module = inspect.getmodule(obj) + if not obj_module: + print(f"could not infer source code link for: {info}") + return + module_name = obj_module.__name__.split('.')[0] + + module_dir_dict = { + "cugraph_dgl": "cugraph-dgl", + "cugraph_pyg": "cugraph-pyg", + "cugraph_service_client": "cugraph-service/client", + "cugraph_service_server": "cugraph-service/server", + "cugraph": "cugraph", + "pylibcugraph": "pylibcugraph", + } + module_dir = module_dir_dict.get(module_name) + if not module_dir: + print(f"no source path directory set for {module_name}") + return + + obj_path = "/".join(obj_module.__name__.split(".")[1:]) + obj_file_ext = obj_module.__file__.split('.')[-1] + source_ext = "pyx" if obj_file_ext == "so" else "py" + fn = f"{module_dir}/{module_name}/{obj_path}.{source_ext}" # Get the line number if we need it. (Can work without it) if (lineno is None): @@ -137,18 +123,15 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision): else: lineno = '' return url_fmt.format(revision=revision, - package=package, path=fn, lineno=lineno) -def make_linkcode_resolve(package, url_fmt): +def make_linkcode_resolve(url_fmt): """Returns a linkcode_resolve function for the given URL format revision is a git commit reference (hash or name) - package is the name of the root module of the package - url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 'blob/{revision}/{package}/' '{path}#L{lineno}') @@ -156,5 +139,4 @@ def make_linkcode_resolve(package, url_fmt): revision = _get_git_revision() return partial(_linkcode_resolve, revision=revision, - package=package, url_fmt=url_fmt) From 76190e2a18b0b0d2765397f1a7729e0f1477bec1 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Wed, 5 Jul 2023 19:28:48 +0200 Subject: [PATCH 06/10] [FIX] Rename `cugraph-ops` symbols (refactoring) and update GHA workflows to call pytest via `python -m pytest` (#3688) This PR: - renames `cugraph-ops` symbols and updates tests in `cugraph-dgl` and `-pyg` based on cugraph-ops refactoring - updates GHA workflows to call pytest via `python -m pytest`. This is to fix the `pytest not found error` in [log](https://github.com/rapidsai/cugraph/actions/runs/5420960384/jobs/9855784044#step:9:260). 
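
For context, this refactor collapses the former `StaticCSC`, `SampledCSC`, and `BipartiteCSC` classes (and their hetero counterparts) into single `CSC`/`HeteroCSC` classes. A minimal sketch of the new construction, using the same keyword arguments that appear in the diffs below (the tensor values here are illustrative, not taken from this PR):

```python
import torch
from pylibcugraphops.pytorch import CSC

# CSC layout of a small graph: `offsets` has num_dst_nodes + 1 entries,
# `indices` holds the source node of each incoming edge.
offsets = torch.tensor([0, 2, 3, 4], device="cuda")
indices = torch.tensor([1, 2, 0, 1], device="cuda")

graph = CSC(
    offsets=offsets,
    indices=indices,
    num_src_nodes=3,
    dst_max_in_degree=-1,  # -1 disables the sampling cap (old StaticCSC case)
    is_bipartite=False,    # True covers the old BipartiteCSC case
)
```

Passing a positive `dst_max_in_degree` covers the old `SampledCSC` case, which is why the conv layers below can simply forward `max_in_degree` (defaulting to -1) instead of branching on the graph type.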
Authors: - Naim (https://github.com/naimnv) - Matt Joux (https://github.com/MatthiasKohl) - Tingyu Wang (https://github.com/tingyu66) Approvers: - Matt Joux (https://github.com/MatthiasKohl) - Ray Douglass (https://github.com/raydouglass) - Chuck Hastings (https://github.com/ChuckHastings) - Seunghwa Kang (https://github.com/seunghwak) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3688 --- .github/workflows/pr.yaml | 4 +- .github/workflows/test.yaml | 4 +- cpp/src/sampling/neighborhood.cu | 22 ++++-- cpp/src/utilities/cugraph_ops_utils.hpp | 24 +++---- .../cugraph_dgl/nn/conv/gatconv.py | 71 +++++++------------ .../cugraph_dgl/nn/conv/transformerconv.py | 43 +++++------ .../tests/nn/test_transformerconv.py | 6 +- .../cugraph-pyg/cugraph_pyg/nn/conv/base.py | 56 +++++++-------- .../cugraph_pyg/nn/conv/gat_conv.py | 35 +++------ .../cugraph_pyg/nn/conv/gatv2_conv.py | 52 +++++--------- .../cugraph_pyg/nn/conv/transformer_conv.py | 8 +-- 11 files changed, 138 insertions(+), 187 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1518d7ba432..4d52cd26de4 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -94,7 +94,7 @@ jobs: with: build_type: pull-request package-name: pylibcugraph - test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest ./python/pylibcugraph/pylibcugraph/tests" + test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets python -m pytest ./python/pylibcugraph/pylibcugraph/tests" test-smoketest: "python ci/wheel_smoke_test_pylibcugraph.py" wheel-build-cugraph: needs: wheel-tests-pylibcugraph @@ -120,5 +120,5 @@ jobs: test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" # Skip dataset downloads on arm to save CI time -- arm only runs smoke tests. test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" - test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -m sg ./python/cugraph/cugraph/tests" + test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets python -m pytest -m sg ./python/cugraph/cugraph/tests" test-smoketest: "python ci/wheel_smoke_test_cugraph.py" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 33cc3f27825..d697b8f1649 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -39,7 +39,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} package-name: pylibcugraph - test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest ./python/pylibcugraph/pylibcugraph/tests" + test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets python -m pytest ./python/pylibcugraph/pylibcugraph/tests" wheel-tests-cugraph: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.08 @@ -52,4 +52,4 @@ jobs: # Always want to test against latest dask/distributed. 
test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" - test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -m sg ./python/cugraph/cugraph/tests" + test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets python -m pytest -m sg ./python/cugraph/cugraph/tests" diff --git a/cpp/src/sampling/neighborhood.cu b/cpp/src/sampling/neighborhood.cu index 0c0beb8d8b0..2f7b203a319 100644 --- a/cpp/src/sampling/neighborhood.cu +++ b/cpp/src/sampling/neighborhood.cu @@ -22,6 +22,8 @@ #include +#include + namespace cugraph { template @@ -34,14 +36,19 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle, size_t sampling_size, ops::graph::SamplingAlgoT sampling_algo) { - const auto [ops_graph, max_degree] = detail::get_graph_and_max_degree(graph_view); - return ops::graph::uniform_sample_csr(rng_state, + using base_vertex_t = std::decay_t; + using base_edge_t = std::decay_t; + static_assert(std::is_same_v, + "cugraph-ops sampling not yet implemented for different node and edge types"); + + const auto ops_graph = detail::get_graph(graph_view); + return ops::graph::uniform_sample_csc(rng_state, ops_graph, ptr_d_start, num_start_vertices, sampling_size, sampling_algo, - max_degree, + ops_graph.dst_max_in_degree, handle.get_stream()); } @@ -55,14 +62,19 @@ std::tuple, rmm::device_uvector> sample_ size_t sampling_size, ops::graph::SamplingAlgoT sampling_algo) { - const auto [ops_graph, max_degree] = detail::get_graph_and_max_degree(graph_view); + using base_vertex_t = std::decay_t; + using base_edge_t = std::decay_t; + static_assert(std::is_same_v, + "cugraph-ops sampling not yet implemented for different node and edge types"); + + const auto ops_graph = detail::get_graph(graph_view); return ops::graph::uniform_sample_coo(rng_state, ops_graph, ptr_d_start, num_start_vertices, sampling_size, sampling_algo, - max_degree, + ops_graph.dst_max_in_degree, handle.get_stream()); } diff --git a/cpp/src/utilities/cugraph_ops_utils.hpp b/cpp/src/utilities/cugraph_ops_utils.hpp index 1dbe930e4c9..9aea4183866 100644 --- a/cpp/src/utilities/cugraph_ops_utils.hpp +++ b/cpp/src/utilities/cugraph_ops_utils.hpp @@ -20,18 +20,20 @@ #include -#include - namespace cugraph { namespace detail { template -ops::graph::fg_csr get_graph( +ops::graph::csc get_graph( graph_view_t const& gview) { - ops::graph::fg_csr graph; - graph.n_nodes = gview.number_of_vertices(); - graph.n_indices = gview.number_of_edges(); + ops::graph::csc graph; + graph.n_src_nodes = gview.number_of_vertices(); + graph.n_dst_nodes = gview.number_of_vertices(); + graph.n_indices = gview.number_of_edges(); + // FIXME this is sufficient for now, but if there is a fast (cached) way + // of getting max degree, use that instead + graph.dst_max_in_degree = std::numeric_limits::max(); // FIXME: this is evil and is just temporary until we have a matching type in cugraph-ops // or we change the type accepted by the functions calling into cugraph-ops graph.offsets = const_cast(gview.local_edge_partition_view().offsets().data()); @@ -39,15 +41,5 @@ ops::graph::fg_csr get_graph( return graph; } -template -std::tuple, 
NodeTypeT> get_graph_and_max_degree( - graph_view_t const& gview) -{ - // FIXME this is sufficient for now, but if there is a fast (cached) way - // of getting max degree, use that instead - auto max_degree = std::numeric_limits::max(); - return std::make_tuple(get_graph(gview), max_degree); -} - } // namespace detail } // namespace cugraph diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py index e70f2d0c6d1..7825febc24b 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py @@ -19,8 +19,8 @@ from cugraph_dgl.nn.conv.base import BaseConv from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch import BipartiteCSC, SampledCSC, StaticCSC -from pylibcugraphops.pytorch.operators import mha_gat_n2n, mha_gat_n2n_bipartite +from pylibcugraphops.pytorch import CSC +from pylibcugraphops.pytorch.operators import mha_gat_n2n dgl = import_optional("dgl") torch = import_optional("torch") @@ -173,9 +173,20 @@ def forward( :math:`H` is the number of heads, and :math:`D_{out}` is size of output feature. """ + if max_in_degree is None: + max_in_degree = -1 + bipartite = not isinstance(nfeat, torch.Tensor) offsets, indices, _ = g.adj_tensors("csc") + graph = CSC( + offsets=offsets, + indices=indices, + num_src_nodes=g.num_src_nodes(), + dst_max_in_degree=max_in_degree, + is_bipartite=bipartite, + ) + if efeat is not None: if self.fc_edge is None: raise RuntimeError( @@ -191,23 +202,8 @@ def forward( f"integers to allow bipartite node features, but got " f"{self.in_feats}." ) - _graph = BipartiteCSC( - offsets=offsets, indices=indices, num_src_nodes=g.num_src_nodes() - ) nfeat_src = self.fc_src(nfeat[0]) nfeat_dst = self.fc_dst(nfeat[1]) - - out = mha_gat_n2n_bipartite( - src_feat=nfeat_src, - dst_feat=nfeat_dst, - attn_weights=self.attn_weights, - graph=_graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=efeat, - ) else: if not hasattr(self, "fc"): raise RuntimeError( @@ -215,36 +211,17 @@ def forward( f"integer, but got {self.in_feats}." 
) nfeat = self.fc(nfeat) - # Sampled primitive does not support edge features - if g.is_block and efeat is None: - if max_in_degree is None: - max_in_degree = g.in_degrees().max().item() - - if max_in_degree < self.MAX_IN_DEGREE_MFG: - _graph = SampledCSC( - offsets=offsets, - indices=indices, - max_num_neighbors=max_in_degree, - num_src_nodes=g.num_src_nodes(), - ) - else: - offsets = self.pad_offsets(offsets, g.num_src_nodes() + 1) - _graph = StaticCSC(offsets=offsets, indices=indices) - else: - if g.is_block: - offsets = self.pad_offsets(offsets, g.num_src_nodes() + 1) - _graph = StaticCSC(offsets=offsets, indices=indices) - - out = mha_gat_n2n( - feat=nfeat, - attn_weights=self.attn_weights, - graph=_graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=efeat, - )[: g.num_dst_nodes()] + + out = mha_gat_n2n( + (nfeat_src, nfeat_dst) if bipartite else nfeat, + self.attn_weights, + graph, + num_heads=self.num_heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat, + edge_feat=efeat, + )[: g.num_dst_nodes()] if self.concat: out = out.view(-1, self.num_heads, self.out_feats) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py index 1898f5159b1..141adc86069 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py @@ -15,7 +15,7 @@ from cugraph_dgl.nn.conv.base import BaseConv from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch import BipartiteCSC, StaticCSC +from pylibcugraphops.pytorch import CSC from pylibcugraphops.pytorch.operators import mha_simple_n2n dgl = import_optional("dgl") @@ -132,31 +132,34 @@ def forward( efeat: torch.Tensor, optional Edge feature tensor. Default: ``None``. """ - bipartite = not isinstance(nfeat, torch.Tensor) offsets, indices, _ = g.adj_tensors("csc") - - if bipartite: - src_feats, dst_feats = nfeat - _graph = BipartiteCSC( - offsets=offsets, indices=indices, num_src_nodes=g.num_src_nodes() - ) - else: - src_feats = dst_feats = nfeat - if g.is_block: - offsets = self.pad_offsets(offsets, g.num_src_nodes() + 1) - _graph = StaticCSC(offsets=offsets, indices=indices) - - query = self.lin_query(dst_feats) - key = self.lin_key(src_feats) - value = self.lin_value(src_feats) - if self.lin_edge is not None: + graph = CSC( + offsets=offsets, + indices=indices, + num_src_nodes=g.num_src_nodes(), + is_bipartite=True, + ) + + if isinstance(nfeat, torch.Tensor): + nfeat = (nfeat, nfeat) + + query = self.lin_query(nfeat[1][: g.num_dst_nodes()]) + key = self.lin_key(nfeat[0]) + value = self.lin_value(nfeat[0]) + + if efeat is not None: + if self.lin_edge is None: + raise RuntimeError( + f"{self.__class__.__name__}.edge_feats must be set to allow " + f"edge features." 
+ ) efeat = self.lin_edge(efeat) out = mha_simple_n2n( key_emb=key, query_emb=query, value_emb=value, - graph=_graph, + graph=graph, num_heads=self.num_heads, concat_heads=self.concat, edge_emb=efeat, @@ -165,7 +168,7 @@ def forward( )[: g.num_dst_nodes()] if self.root_weight: - res = self.lin_skip(dst_feats[: g.num_dst_nodes()]) + res = self.lin_skip(nfeat[1][: g.num_dst_nodes()]) if self.lin_beta is not None: beta = self.lin_beta(torch.cat([out, res, out - res], dim=-1)) beta = beta.sigmoid() diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index 64af795231c..00476b9f0bb 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -26,14 +26,14 @@ @pytest.mark.parametrize("beta", [False, True]) -@pytest.mark.parametrize("bipartite", [False, True]) +@pytest.mark.parametrize("bipartite_node_feats", [False, True]) @pytest.mark.parametrize("concat", [False, True]) @pytest.mark.parametrize("idtype_int", [False, True]) @pytest.mark.parametrize("num_heads", [1, 2, 3, 4]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_TransformerConv( - beta, bipartite, concat, idtype_int, num_heads, to_block, use_edge_feats + beta, bipartite_node_feats, concat, idtype_int, num_heads, to_block, use_edge_feats ): device = "cuda" g = create_graph1().to(device) @@ -44,7 +44,7 @@ def test_TransformerConv( if to_block: g = dgl.to_block(g) - if bipartite: + if bipartite_node_feats: in_node_feats = (5, 3) nfeat = ( torch.rand(g.num_src_nodes(), in_node_feats[0], device=device), diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py index bec50792131..207efcdace4 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py @@ -12,7 +12,7 @@ # limitations under the License. import warnings -from typing import Any, Optional, Tuple, Union +from typing import Optional, Tuple, Union from cugraph.utilities.utils import import_optional @@ -20,13 +20,7 @@ torch_geometric = import_optional("torch_geometric") try: # pragma: no cover - from pylibcugraphops.pytorch import ( - BipartiteCSC, - SampledCSC, - SampledHeteroCSC, - StaticCSC, - StaticHeteroCSC, - ) + from pylibcugraphops.pytorch import CSC, HeteroCSC HAS_PYLIBCUGRAPHOPS = True except ImportError: @@ -94,7 +88,7 @@ def get_cugraph( csc: Tuple[torch.Tensor, torch.Tensor, int], bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> Any: + ) -> CSC: r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation. Supports both bipartite and non-bipartite graphs. 
@@ -119,16 +113,16 @@ def get_cugraph( f"based processing (got CPU tensor)" ) - if bipartite: - return BipartiteCSC(colptr, row, num_src_nodes) + if max_num_neighbors is None: + max_num_neighbors = -1 - if num_src_nodes != colptr.numel() - 1: - if max_num_neighbors is None: - max_num_neighbors = int((colptr[1:] - colptr[:-1]).max()) - - return SampledCSC(colptr, row, max_num_neighbors, num_src_nodes) - - return StaticCSC(colptr, row) + return CSC( + offsets=colptr, + indices=row, + num_src_nodes=num_src_nodes, + dst_max_in_degree=max_num_neighbors, + is_bipartite=bipartite, + ) def get_typed_cugraph( self, @@ -137,7 +131,7 @@ def get_typed_cugraph( num_edge_types: Optional[int] = None, bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> Any: + ) -> HeteroCSC: r"""Constructs a typed :obj:`cugraph` graph object from a CSC representation where each edge corresponds to a given edge type. Supports both bipartite and non-bipartite graphs. @@ -162,21 +156,21 @@ def get_typed_cugraph( if num_edge_types is None: num_edge_types = int(edge_type.max()) + 1 + if max_num_neighbors is None: + max_num_neighbors = -1 + row, colptr, num_src_nodes = csc edge_type = edge_type.int() - if bipartite: - raise NotImplementedError - - if num_src_nodes != colptr.numel() - 1: - if max_num_neighbors is None: - max_num_neighbors = int((colptr[1:] - colptr[:-1]).max()) - - return SampledHeteroCSC( - colptr, row, edge_type, max_num_neighbors, num_src_nodes, num_edge_types - ) - - return StaticHeteroCSC(colptr, row, edge_type, num_edge_types) + return HeteroCSC( + offsets=colptr, + indices=row, + edge_types=edge_type, + num_src_nodes=num_src_nodes, + num_edge_types=num_edge_types, + dst_max_in_degree=max_num_neighbors, + is_bipartite=bipartite, + ) def forward( self, diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py index 4bf37cf3e72..23b7d50ba96 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py @@ -12,7 +12,7 @@ # limitations under the License. from typing import Optional, Tuple, Union -from pylibcugraphops.pytorch.operators import mha_gat_n2n, mha_gat_n2n_bipartite +from pylibcugraphops.pytorch.operators import mha_gat_n2n from cugraph.utilities.utils import import_optional @@ -203,19 +203,6 @@ def forward( ) x_src = self.lin_src(x[0]) x_dst = self.lin_dst(x[1]) - - out = mha_gat_n2n_bipartite( - x_src, - x_dst, - self.att, - graph, - num_heads=self.heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=edge_attr, - ) - else: if not hasattr(self, "lin"): raise RuntimeError( @@ -224,16 +211,16 @@ def forward( ) x = self.lin(x) - out = mha_gat_n2n( - x, - self.att, - graph, - num_heads=self.heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=edge_attr, - ) + out = mha_gat_n2n( + (x_src, x_dst) if bipartite else x, + self.att, + graph, + num_heads=self.heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat, + edge_feat=edge_attr, + ) if self.bias is not None: out = out + self.bias diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py index 66d962b3f86..d4c947b952a 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py @@ -12,7 +12,7 @@ # limitations under the License. 
from typing import Optional, Tuple, Union -from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n, mha_gat_v2_n2n_bipartite +from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n from cugraph.utilities.utils import import_optional @@ -187,8 +187,8 @@ def forward( representation to the desired format. edge_attr: (torch.Tensor, optional) The edge features. """ - bipartite = not isinstance(x, torch.Tensor) - graph = self.get_cugraph(csc, bipartite=bipartite or not self.share_weights) + bipartite = not isinstance(x, torch.Tensor) or not self.share_weights + graph = self.get_cugraph(csc, bipartite=bipartite) if edge_attr is not None: if self.lin_edge is None: @@ -200,38 +200,24 @@ def forward( edge_attr = edge_attr.view(-1, 1) edge_attr = self.lin_edge(edge_attr) - if not bipartite and self.share_weights: + if bipartite: + if isinstance(x, torch.Tensor): + x = (x, x) + x_src = self.lin_src(x[0]) + x_dst = self.lin_dst(x[1]) + else: x = self.lin_src(x) - out = mha_gat_v2_n2n( - x, - self.att, - graph, - num_heads=self.heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=edge_attr, - ) - else: - if bipartite: - x_src = self.lin_src(x[0]) - x_dst = self.lin_dst(x[1]) - else: - x_src = self.lin_src(x) - x_dst = self.lin_dst(x) - - out = mha_gat_v2_n2n_bipartite( - x_src, - x_dst, - self.att, - graph, - num_heads=self.heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=edge_attr, - ) + out = mha_gat_v2_n2n( + (x_src, x_dst) if bipartite else x, + self.att, + graph, + num_heads=self.heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat, + edge_feat=edge_attr, + ) if self.bias is not None: out = out + self.bias diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py index aeb51c028ae..f67756eb3fe 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py @@ -12,7 +12,7 @@ # limitations under the License. from typing import Optional, Tuple, Union -from pylibcugraphops.pytorch.operators import mha_simple_n2n as TransformerConvAgg +from pylibcugraphops.pytorch.operators import mha_simple_n2n from cugraph.utilities.utils import import_optional @@ -168,10 +168,10 @@ def forward( representation to the desired format. edge_attr: (torch.Tensor, optional) The edge features. """ - bipartite = not isinstance(x, torch.Tensor) + bipartite = True graph = self.get_cugraph(csc, bipartite=bipartite) - if not bipartite: + if isinstance(x, torch.Tensor): x = (x, x) query = self.lin_query(x[1]) @@ -186,7 +186,7 @@ def forward( ) edge_attr = self.lin_edge(edge_attr) - out = TransformerConvAgg( + out = mha_simple_n2n( key, query, value, From 3d1539b198ec63b4c77e1208f134a0985577cc0b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Thu, 6 Jul 2023 08:27:12 -0700 Subject: [PATCH 07/10] Include cuCollection public header for hash functions (#3694) Currently we are including `cuco/detail/hash_functions.cuh` but cuCollection now has `cuco/hash_functions.cuh`. Include the public one instead. 
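
A minimal sketch of the change applied to each file below (C++ to match the affected sources; this PR only swaps the include path, call sites are untouched):

```cpp
// Before: reaching into cuCollection internals
// #include <cuco/detail/hash_functions.cuh>

// After: the public header, which exposes the same hash functors
#include <cuco/hash_functions.cuh>
```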
Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Naim (https://github.com/naimnv) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/3694 --- cpp/src/detail/graph_partition_utils.cuh | 2 +- cpp/tests/prims/mg_count_if_e.cu | 2 +- cpp/tests/prims/mg_count_if_v.cu | 2 +- cpp/tests/prims/mg_extract_transform_e.cu | 2 +- cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu | 2 +- .../prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu | 2 +- cpp/tests/prims/mg_reduce_v.cu | 2 +- cpp/tests/prims/mg_transform_e.cu | 2 +- cpp/tests/prims/mg_transform_reduce_e.cu | 2 +- cpp/tests/prims/mg_transform_reduce_v.cu | 2 +- .../prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu | 2 +- cpp/tests/prims/property_generator.cuh | 2 +- cpp/tests/sampling/detail/nbr_sampling_utils.cuh | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/detail/graph_partition_utils.cuh b/cpp/src/detail/graph_partition_utils.cuh index 88e9623e043..67574719b45 100644 --- a/cpp/src/detail/graph_partition_utils.cuh +++ b/cpp/src/detail/graph_partition_utils.cuh @@ -19,7 +19,7 @@ #include -#include +#include #include #include #include diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu index bebb21bd720..449aa728d87 100644 --- a/cpp/tests/prims/mg_count_if_e.cu +++ b/cpp/tests/prims/mg_count_if_e.cu @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_count_if_v.cu b/cpp/tests/prims/mg_count_if_v.cu index f90f788cfae..3d745708401 100644 --- a/cpp/tests/prims/mg_count_if_v.cu +++ b/cpp/tests/prims/mg_count_if_v.cu @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu index 1c85b55e4be..b71fe5ddb5e 100644 --- a/cpp/tests/prims/mg_extract_transform_e.cu +++ b/cpp/tests/prims/mg_extract_transform_e.cu @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu index 3cd6bd243e1..4d9435dd344 100644 --- a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu +++ b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index 97d52c04114..677d6ce5022 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_reduce_v.cu b/cpp/tests/prims/mg_reduce_v.cu index 7080eb12da6..b6f8da48ef4 100644 --- a/cpp/tests/prims/mg_reduce_v.cu +++ b/cpp/tests/prims/mg_reduce_v.cu @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_transform_e.cu b/cpp/tests/prims/mg_transform_e.cu index ed29fb7c5e3..127eddd43c7 100644 --- a/cpp/tests/prims/mg_transform_e.cu +++ b/cpp/tests/prims/mg_transform_e.cu @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu index 
8dba488f23d..79aa3da54df 100644 --- a/cpp/tests/prims/mg_transform_reduce_e.cu +++ b/cpp/tests/prims/mg_transform_reduce_e.cu @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_transform_reduce_v.cu b/cpp/tests/prims/mg_transform_reduce_v.cu index 3ea7636a718..c9fc138ae1b 100644 --- a/cpp/tests/prims/mg_transform_reduce_v.cu +++ b/cpp/tests/prims/mg_transform_reduce_v.cu @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index 2eb270973f2..d0b97065da7 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/prims/property_generator.cuh b/cpp/tests/prims/property_generator.cuh index 24a21c1cb01..e7264cd276f 100644 --- a/cpp/tests/prims/property_generator.cuh +++ b/cpp/tests/prims/property_generator.cuh @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/sampling/detail/nbr_sampling_utils.cuh b/cpp/tests/sampling/detail/nbr_sampling_utils.cuh index 00c14009e86..8221073f556 100644 --- a/cpp/tests/sampling/detail/nbr_sampling_utils.cuh +++ b/cpp/tests/sampling/detail/nbr_sampling_utils.cuh @@ -31,7 +31,7 @@ #include #include -#include +#include #include From 037239686052ee4e07286dcf235d0aa1b0da0ff0 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Thu, 6 Jul 2023 17:54:29 -0400 Subject: [PATCH 08/10] [FIX] Fix the hang in cuGraph Python Uniform Neighbor Sample, Add Logging to Bulk Sampler (#3669) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some dask operations were not being done correctly, and time was being lost in broadcasting the rank and label arrays to all workers. This PR resolves those issues. Also pulls in the previously-experimental changes that add logging to the bulk sampler. Credit to @VibhuJawa for isolating and fixing the issues with the column merge in `uniform_neighbor_sample` and the new sampling notebook and shell script. This PR does modify the sampling APIs so it is breaking. The API changes are necessary to avoid unnecessary shuffling, and eventually, to improve batch id assignment. **Dataset:** `ogbn_papers100M x 2`; **Fanout:** `[25, 25]`; **Batch Size:** `512`; **Seeds Per Call:** `524288` Current runtime: 2.69 s ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each) Previous runtime: 4.51 s ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each) Speedup: 1.7x **Dataset:** `ogbn_papers100M x 4`; **Fanout:** `[25, 25]`; **Batch Size:** `512`; **Seeds Per Call:** `524288` Current runtime: 6.32 s ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each) Previous runtime: 10.7 s ± 0 ns per loop (mean ± std. dev. 
of 1 run, 10 loops each) Speedup: 1.7x Authors: - Alex Barghi (https://github.com/alexbarghi-nv) - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3669 --- .../bulk_sampling/benchmarking_script.ipynb | 1860 +++++++++++++++++ .../standalone/bulk_sampling/bulk_sampling.sh | 50 + .../bulk_sampling/cugraph_bulk_sampling.py | 16 +- mg_utils/run-dask-process.sh | 1 + .../dask/sampling/uniform_neighbor_sample.py | 425 +++- .../cugraph/gnn/data_loading/bulk_sampler.py | 106 +- .../gnn/data_loading/bulk_sampler_io.py | 10 +- .../sampling/uniform_neighbor_sample.py | 190 +- .../sampling/test_uniform_neighbor_sample.py | 33 +- .../test_uniform_neighbor_sample_mg.py | 142 +- 10 files changed, 2689 insertions(+), 144 deletions(-) create mode 100644 benchmarks/cugraph/standalone/bulk_sampling/benchmarking_script.ipynb create mode 100755 benchmarks/cugraph/standalone/bulk_sampling/bulk_sampling.sh diff --git a/benchmarks/cugraph/standalone/bulk_sampling/benchmarking_script.ipynb b/benchmarks/cugraph/standalone/bulk_sampling/benchmarking_script.ipynb new file mode 100644 index 00000000000..3ea158d1f61 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/benchmarking_script.ipynb @@ -0,0 +1,1860 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9b8d43d5-3005-4b0b-b418-b84af104bc3b", + "metadata": {}, + "outputs": [], + "source": [ + "!export RAPIDS_NO_INITIALIZE=\"1\"\n", + "!export CUDF_SPILL=\"1\"\n", + "!export LIBCUDF_CUFILE_POLICY=\"OFF\"\n", + "\n", + "from cugraph_bulk_sampling import start_dask_client, benchmark_cugraph_bulk_sampling, load_disk_dataset, construct_graph\n", + "from cugraph_bulk_sampling import sample_graph\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "id": "f3f04da7-c937-4dab-b432-fc569522f411", + "metadata": {}, + "source": [ + "# Setup Cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "26324a75-1b34-4c7b-8a26-23bac23e91b4", + "metadata": {}, + "outputs": [], + "source": [ + "dask_worker_devices='0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fc8d56ef-4036-4105-9764-1c6cbb2bdb15", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Dask client/cluster created using LocalCUDACluster\n" + ] + } + ], + "source": [ + "client, cluster = start_dask_client(dask_worker_devices=dask_worker_devices,\n", + " jit_unspill=False,\n", + " rmm_pool_size=28e9,\n", + " rmm_async=True)" + ] + }, + { + "cell_type": "markdown", + "id": "5335b115-eeb0-470d-9884-79990506ead7", + "metadata": {}, + "source": [ + "# Setup Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c9c8fb66-6bdd-45d7-8564-cc28e383d966", + "metadata": {}, + "outputs": [], + "source": [ + "dataset='ogbn_papers100M'\n", + "dataset_root=\".\"\n", + "output_root=\".\"\n", + "reverse_edges=True\n", + "add_edge_types=False\n", + "batch_size=512\n", + "seeds_per_call=524288\n", + "fanout=[25,25]\n", + "replication_factor=4\n", + "seed=123\n", + "\n", + "dataset_dir=dataset_root\n", + "output_path=output_root\n", + "persist=False\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "37ed06f6-ad06-443a-be12-61800d59d221", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading edge index for edge type 
paper__cites__paper\n", + "Loading node labels for node type paper (offset=0)\n", + "Number of input edges = 6,462,743,488\n", + "constructed graph\n" + ] + } + ], + "source": [ + "dask_edgelist_df, dask_label_df, node_offsets, edge_offsets, total_num_nodes = \\\n", + " load_disk_dataset(\n", + " dataset,\n", + " dataset_dir=dataset_dir,\n", + " reverse_edges=reverse_edges,\n", + " replication_factor=replication_factor,\n", + " persist=False,\n", + " add_edge_types=add_edge_types\n", + " )\n", + "num_input_edges = len(dask_edgelist_df)\n", + "print(\n", + "f\"Number of input edges = {num_input_edges:,}\"\n", + ")\n", + "\n", + "G = construct_graph(\n", + "dask_edgelist_df\n", + ")\n", + "del dask_edgelist_df\n", + "print('constructed graph')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f71cf5a3-7e4b-4497-9c14-a342cc5abbcd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/index.py:3139: FutureWarning: cudf.StringIndex is deprecated and will be removed from cudf in a future version. Use cudf.Index with the appropriate dtype instead.\n", + " warnings.warn(\n", + "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/core/index.py:3139: FutureWarning: cudf.StringIndex is deprecated and will be removed from cudf in a future version. Use cudf.Index with the appropriate dtype instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input memory: 103403895808\n" + ] + } + ], + "source": [ + "input_memory = G.edgelist.edgelist_df.memory_usage().sum().compute()\n", + "print(f'input memory: {input_memory}')\n", + "\n", + "output_subdir = os.path.join(output_path, f'{dataset}[{replication_factor}]_b{batch_size}_f{fanout}')\n", + "os.makedirs(output_subdir, exist_ok=True)\n", + "\n", + "output_sample_path = os.path.join(output_subdir, 'samples')\n", + "os.makedirs(output_sample_path, exist_ok=True)\n", + "\n", + "batches_per_partition = 200_000 // batch_size" + ] + }, + { + "cell_type": "markdown", + "id": "3d276c5c-65d6-4191-b2a5-37b30d2cd44b", + "metadata": {}, + "source": [ + "# Benchmarking Sample Graph" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "675b580c-6a7a-4571-88dd-0d4429f9e5ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "created batches\n", + "flushed all batches\n", + "function: sample_graph\n", + "function args: (, , '/tmp/ramdisk/ogbn_papers100M[4]_b512_f[25, 25]/samples') kwargs: {'seed': 123, 'batch_size': 512, 'seeds_per_call': 524288, 'batches_per_partition': 390, 'fanout': [25, 25], 'persist': False}\n", + "execution_time: 9.981931209564209\n", + "allocation_counts:\n", + "{ 'tcp://127.0.0.1:33343': { 'current_bytes': '51.1MB',\n", + " 'peak_bytes': '2.4GB',\n", + " 'total_bytes': '60.4GB'},\n", + " 'tcp://127.0.0.1:33565': { 'current_bytes': '58.6MB',\n", + " 'peak_bytes': '2.4GB',\n", + " 'total_bytes': '61.8GB'},\n", + " 'tcp://127.0.0.1:33977': { 'current_bytes': '59.0MB',\n", + " 'peak_bytes': '2.4GB',\n", + " 'total_bytes': '61.4GB'},\n", + " 'tcp://127.0.0.1:34603': { 'current_bytes': '149.8MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 'total_bytes': '60.2GB'},\n", + " 'tcp://127.0.0.1:36543': { 'current_bytes': '82.8MB',\n", + " 'peak_bytes': '2.6GB',\n", + " 'total_bytes': '59.5GB'},\n", + " 'tcp://127.0.0.1:39379': { 'current_bytes': '98.1MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': 
'61.5GB'},\n", + " 'tcp://127.0.0.1:40517': { 'current_bytes': '240.3MB',\n", + " 'peak_bytes': '2.6GB',\n", + " 'total_bytes': '61.2GB'},\n", + " 'tcp://127.0.0.1:40547': { 'current_bytes': '73.3MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': '61.8GB'},\n", + " 'tcp://127.0.0.1:40565': { 'current_bytes': '310.8MB',\n", + " 'peak_bytes': '2.6GB',\n", + " 'total_bytes': '62.3GB'},\n", + " 'tcp://127.0.0.1:40769': { 'current_bytes': '267.0MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 'total_bytes': '61.7GB'},\n", + " 'tcp://127.0.0.1:42093': { 'current_bytes': '80.4MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': '61.9GB'},\n", + " 'tcp://127.0.0.1:42897': { 'current_bytes': '131.8MB',\n", + " 'peak_bytes': '2.4GB',\n", + " 'total_bytes': '61.7GB'},\n", + " 'tcp://127.0.0.1:43245': { 'current_bytes': '205.2MB',\n", + " 'peak_bytes': '2.8GB',\n", + " 'total_bytes': '61.9GB'},\n", + " 'tcp://127.0.0.1:46157': { 'current_bytes': '288.0MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 'total_bytes': '62.4GB'},\n", + " 'tcp://127.0.0.1:46757': { 'current_bytes': '303.3MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': '61.7GB'},\n", + " 'tcp://127.0.0.1:46883': { 'current_bytes': '130.7MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': '62.0GB'}}\n", + "created batches\n", + "flushed all batches\n", + "function: sample_graph\n", + "function args: (, , '/tmp/ramdisk/ogbn_papers100M[4]_b512_f[25, 25]/samples') kwargs: {'seed': 123, 'batch_size': 512, 'seeds_per_call': 524288, 'batches_per_partition': 390, 'fanout': [25, 25], 'persist': False}\n", + "execution_time: 5.299846172332764\n", + "allocation_counts:\n", + "{ 'tcp://127.0.0.1:33343': { 'current_bytes': '252.3MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 'total_bytes': '60.4GB'},\n", + " 'tcp://127.0.0.1:33565': { 'current_bytes': '278.4MB',\n", + " 'peak_bytes': '2.8GB',\n", + " 'total_bytes': '61.8GB'},\n", + " 'tcp://127.0.0.1:33977': { 'current_bytes': '243.6MB',\n", + " 'peak_bytes': '2.4GB',\n", + " 'total_bytes': '61.4GB'},\n", + " 'tcp://127.0.0.1:34603': { 'current_bytes': '256.6MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 'total_bytes': '60.2GB'},\n", + " 'tcp://127.0.0.1:36543': { 'current_bytes': '330.7MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 'total_bytes': '59.5GB'},\n", + " 'tcp://127.0.0.1:39379': { 'current_bytes': '239.3MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': '61.5GB'},\n", + " 'tcp://127.0.0.1:40517': { 'current_bytes': '254.7MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': '61.2GB'},\n", + " 'tcp://127.0.0.1:40547': { 'current_bytes': '239.9MB',\n", + " 'peak_bytes': '2.6GB',\n", + " 'total_bytes': '61.8GB'},\n", + " 'tcp://127.0.0.1:40565': { 'current_bytes': '278.6MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': '62.2GB'},\n", + " 'tcp://127.0.0.1:40769': { 'current_bytes': '68.4MB',\n", + " 'peak_bytes': '2.5GB',\n", + " 'total_bytes': '61.7GB'},\n", + " 'tcp://127.0.0.1:42093': { 'current_bytes': '397.5MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 'total_bytes': '61.9GB'},\n", + " 'tcp://127.0.0.1:42897': { 'current_bytes': '79.0MB',\n", + " 'peak_bytes': '2.8GB',\n", + " 'total_bytes': '61.7GB'},\n", + " 'tcp://127.0.0.1:43245': { 'current_bytes': '127.0MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 'total_bytes': '61.9GB'},\n", + " 'tcp://127.0.0.1:46157': { 'current_bytes': '90.3MB',\n", + " 'peak_bytes': '2.8GB',\n", + " 'total_bytes': '62.4GB'},\n", + " 'tcp://127.0.0.1:46757': { 'current_bytes': '303.5MB',\n", + " 'peak_bytes': '2.7GB',\n", + " 
'total_bytes': '61.7GB'},\n",
+       "  'tcp://127.0.0.1:46883': { 'current_bytes': '53.5MB',\n",
+       "                             'peak_bytes': '2.5GB',\n",
+       "                             'total_bytes': '62.0GB'}}\n",
+       "[... output of the intervening sample_graph runs elided; each run logs 'created batches', 'flushed all batches', the same function args as below, an execution_time between roughly 5.2 s and 7.3 s, and a comparable per-worker allocation_counts map (peak_bytes ~2.3-2.9 GB, total_bytes ~59-62 GB per worker) ...]\n",
+       "created batches\n",
+       "flushed all batches\n",
+       "function: sample_graph\n",
+       "function args: (, , '/tmp/ramdisk/ogbn_papers100M[4]_b512_f[25, 25]/samples') kwargs: {'seed': 123, 'batch_size': 512, 'seeds_per_call': 524288, 'batches_per_partition': 390, 'fanout': [25, 25], 'persist': False}\n",
+       "execution_time: 6.985189437866211\n",
+       "allocation_counts:\n",
+       "{ 'tcp://127.0.0.1:33343': { 'current_bytes': '66.9MB',\n",
+       "                             'peak_bytes': '2.7GB',\n",
+       "                             'total_bytes': '60.4GB'},\n",
+       "  'tcp://127.0.0.1:33565': { 'current_bytes': '256.6MB',\n",
+       "                             'peak_bytes': '2.6GB',\n",
+       "                             'total_bytes': '61.8GB'},\n",
+       "  'tcp://127.0.0.1:33977': { 'current_bytes': '168.4MB',\n",
+       "                             'peak_bytes': '2.6GB',\n",
+       "                             'total_bytes': '61.4GB'},\n",
+       "  'tcp://127.0.0.1:34603': { 'current_bytes': '413.7MB',\n",
+       "                             'peak_bytes': '2.5GB',\n",
+       "                             'total_bytes': '60.2GB'},\n",
+       "  'tcp://127.0.0.1:36543': { 'current_bytes': '141.7MB',\n",
+       "                             'peak_bytes': '2.5GB',\n",
+       "                             'total_bytes': '59.5GB'},\n",
+       "  'tcp://127.0.0.1:39379': { 'current_bytes': '324.9MB',\n",
+       "                             'peak_bytes': '2.7GB',\n",
+       "                             'total_bytes': '61.5GB'},\n",
+       "  'tcp://127.0.0.1:40517': { 'current_bytes': '368.7MB',\n",
+       "                             'peak_bytes': '2.6GB',\n",
+       "                             'total_bytes': '61.2GB'},\n",
+       "  'tcp://127.0.0.1:40547': { 'current_bytes': '452.6MB',\n",
+       "                             'peak_bytes': '2.6GB',\n",
+       "                             'total_bytes': '61.8GB'},\n",
+       "  'tcp://127.0.0.1:40565': { 'current_bytes': '154.8MB',\n",
+       "                             'peak_bytes': '2.7GB',\n",
+       "                             'total_bytes': '62.3GB'},\n",
+       "  'tcp://127.0.0.1:40769': { 'current_bytes': '181.3MB',\n",
+       "                             'peak_bytes': '2.8GB',\n",
+       "                             'total_bytes': '61.7GB'},\n",
+       "  'tcp://127.0.0.1:42093': { 'current_bytes': '118.1MB',\n",
+       "                             'peak_bytes': '2.8GB',\n",
+       "                             'total_bytes': '61.9GB'},\n",
+       "  'tcp://127.0.0.1:42897': { 'current_bytes': '280.7MB',\n",
+       "                             'peak_bytes': '2.8GB',\n",
+       "                             'total_bytes': '61.7GB'},\n",
+       "  'tcp://127.0.0.1:43245': { 'current_bytes': '236.1MB',\n",
+       "                             'peak_bytes': '2.8GB',\n",
+       "                             'total_bytes': '61.9GB'},\n",
+       "  'tcp://127.0.0.1:46157': { 'current_bytes': '275.1MB',\n",
+       "                             'peak_bytes': '2.6GB',\n",
+       "                             'total_bytes': '62.4GB'},\n",
+       "  'tcp://127.0.0.1:46757': { 'current_bytes': '183.6MB',\n",
+       "                             'peak_bytes': '2.8GB',\n",
+       "                             'total_bytes': '61.7GB'},\n",
+       "  'tcp://127.0.0.1:46883': { 'current_bytes': '167.1MB',\n",
+       "                             'peak_bytes': '2.8GB',\n",
+       "                             'total_bytes': '62.0GB'}}\n",
+       "6.32 s ± 0 ns per loop (mean ± std. dev. of 1 run, 30 loops each)\n"
+      ]
+     }
+    ],
+    "source": [
+     "%%timeit -n30 -r1\n",
+     "\n",
+     "\n",
+     "execution_time, allocation_counts = sample_graph(\n",
+     "    G,\n",
+     "    dask_label_df,\n",
+     "    output_sample_path,\n",
+     "    seed=seed,\n",
+     "    batch_size=batch_size,\n",
+     "    seeds_per_call=seeds_per_call,\n",
+     "    batches_per_partition=batches_per_partition,\n",
+     "    fanout=fanout,\n",
+     "    persist=persist,\n",
+     ")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "27066cf3",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3.10.11 ('base')",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.11"
+   },
+   "vscode": {
+    "interpreter": {
+     "hash": "f708a36acfaef0acf74ccd43dfb58100269bf08fb79032a1e0a6f35bd9856f51"
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bulk_sampling.sh b/benchmarks/cugraph/standalone/bulk_sampling/bulk_sampling.sh
new file mode 100755
index 00000000000..e62cb3cda29
--- /dev/null
+++ b/benchmarks/cugraph/standalone/bulk_sampling/bulk_sampling.sh
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export RAPIDS_NO_INITIALIZE="1"
+export CUDF_SPILL="1"
+export LIBCUDF_CUFILE_POLICY=OFF
+
+
+dataset_name=$1
+dataset_root=$2
+output_root=$3
+batch_sizes=$4
+fanouts=$5
+reverse_edges=$6
+
+rm -rf $output_root
+mkdir -p $output_root
+
+# Change to 2 in Selene
+gpu_per_replica=4
+#--add_edge_ids \
+
+# Expand to 1, 4, 8 in Selene
+for i in 1 2 3 4
+do
+    for replication in 2;
+    do
+        dataset_name_with_replication="${dataset_name}[${replication}]"
+        dask_worker_devices=$(seq -s, 0 $((gpu_per_replica*replication-1)))
+        echo "Sampling dataset = $dataset_name_with_replication on devices = $dask_worker_devices"
+        python3 cugraph_bulk_sampling.py --datasets $dataset_name_with_replication \
+            --dataset_root $dataset_root \
+            --batch_sizes $batch_sizes \
+            --output_root $output_root \
+            --dask_worker_devices $dask_worker_devices \
+            --fanouts $fanouts \
+            --reverse_edges
+    done
+done
\ No newline at end of file
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
index 3cfd39afc98..d2a3716da8a 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py
@@ -33,8 +33,6 @@
 import cugraph
 
-from datetime import datetime
-
 import json
 import re
 import os
@@ -50,6 +48,7 @@
 import dask_cudf
 import dask.dataframe as ddf
 from dask.distributed import default_client
+from cugraph.dask import get_n_workers
 
 from typing import Optional, Union, Dict
 
@@ -173,6 +172,7 @@ def sample_graph(G, label_df, output_path,seed=42, batch_size=500, seeds_per_cal
         random_state=seed,
         seeds_per_call=seeds_per_call,
         batches_per_partition=batches_per_partition,
+        log_level=logging.INFO
     )
     n_workers = len(default_client().scheduler_info()['workers'])
@@ -182,10 +182,10 @@ def sample_graph(G, label_df, output_path,seed=42, batch_size=500, seeds_per_cal
         'batch': cudf.Series(dtype='int32')
     })
 
-    batch_df = label_df.map_partitions(_make_batch_ids, batch_size, n_workers, meta=meta)
+    batch_df = label_df.map_partitions(_make_batch_ids, batch_size, n_workers, meta=meta)
+    #batch_df = batch_df.sort_values(by='node')
 
-    # should always persist the batch dataframe or performace may be suboptimal
+    # should always persist the batch dataframe or performance may be suboptimal
    batch_df = batch_df.persist()
 
    del label_df
@@ -278,6 +278,8 @@ def load_disk_dataset(dataset, dataset_dir='.', reverse_edges=True, replication_
    path = Path(dataset_dir) / dataset
    parquet_path = path / 'parquet'
 
+    n_workers = get_n_workers()
+
    with open(os.path.join(path, 'meta.json')) as meta_file:
        meta = json.load(meta_file)
 
@@ -289,7 +291,9 @@ def load_disk_dataset(dataset, dataset_dir='.', reverse_edges=True, replication_
            print(f'Loading edge index for edge type {edge_type}')
 
            can_edge_type = tuple(edge_type.split('__'))
-            edge_index_dict[can_edge_type] = dask_cudf.read_parquet(os.path.join(os.path.join(parquet_path, edge_type), 'edge_index.parquet'))
+            edge_index_dict[can_edge_type] = dask_cudf.read_parquet(
+                Path(parquet_path) / edge_type / 'edge_index.parquet'
+            ).repartition(n_workers*2)
 
            edge_index_dict[can_edge_type]['src'] += node_offsets_replicated[can_edge_type[0]]
            edge_index_dict[can_edge_type]['dst'] += node_offsets_replicated[can_edge_type[-1]]
@@ -344,7 +348,7 @@ def load_disk_dataset(dataset, dataset_dir='.', reverse_edges=True, replication_
            print(f'Loading node labels for node type {node_type} (offset={offset})')
 
            node_label_path = os.path.join(os.path.join(parquet_path, node_type), 'node_label.parquet')
            if
os.path.exists(node_label_path): - node_labels[node_type] = dask_cudf.read_parquet(node_label_path).drop('label',axis=1).persist() + node_labels[node_type] = dask_cudf.read_parquet(node_label_path).repartition(n_workers).drop('label',axis=1).persist() node_labels[node_type]['node'] += offset node_labels[node_type] = node_labels[node_type].persist() diff --git a/mg_utils/run-dask-process.sh b/mg_utils/run-dask-process.sh index e5fa8fab332..b88abb685ec 100755 --- a/mg_utils/run-dask-process.sh +++ b/mg_utils/run-dask-process.sh @@ -102,6 +102,7 @@ function buildTcpArgs { " WORKER_ARGS="--rmm-pool-size=$WORKER_RMM_POOL_SIZE + --rmm-async --local-directory=/tmp/$LOGNAME --scheduler-file=$SCHEDULER_FILE --memory-limit=$DASK_HOST_MEMORY_LIMIT diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 7d8972a7385..d74a8df14eb 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -14,10 +14,11 @@ from __future__ import annotations +import warnings + import numpy from dask import delayed -from dask.distributed import wait, Lock, get_client -from cugraph.dask.common.input_utils import get_distributed_data +from dask.distributed import Lock, get_client, wait import dask_cudf import cudf @@ -26,12 +27,20 @@ from pylibcugraph import ResourceHandle from pylibcugraph import uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample +from pylibcugraph.utilities.api_tools import deprecated_warning_wrapper from cugraph.dask.comms import comms as Comms +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask import get_n_workers from typing import Sequence, List, Union, Tuple from typing import TYPE_CHECKING +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) + if TYPE_CHECKING: from cugraph import Graph @@ -150,7 +159,63 @@ def convert_to_cudf(cp_arrays, weight_t, with_edge_properties, return_offsets=Fa return df +def __get_label_to_output_comm_rank(min_batch_id, max_batch_id, n_workers): + num_batches = max_batch_id - min_batch_id + 1 + num_batches = int(num_batches) + z = cp.zeros(num_batches, dtype="int32") + s = cp.array_split(cp.arange(num_batches), n_workers) + for i, t in enumerate(s): + z[t] = i + + return z + + def _call_plc_uniform_neighbor_sample( + sID, + mg_graph_x, + st_x, + keep_batches_together, + n_workers, + min_batch_id, + max_batch_id, + fanout_vals, + with_replacement, + weight_t, + with_edge_properties, + random_state=None, + return_offsets=False, +): + st_x = st_x[0] + start_list_x = st_x[start_col_name] + batch_id_list_x = st_x[batch_col_name] if batch_col_name in st_x else None + + label_list = None + label_to_output_comm_rank = None + if keep_batches_together: + label_list = cp.arange(min_batch_id, max_batch_id + 1, dtype="int32") + label_to_output_comm_rank = __get_label_to_output_comm_rank( + min_batch_id, max_batch_id, n_workers + ) + + cp_arrays = pylibcugraph_uniform_neighbor_sample( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + input_graph=mg_graph_x, + start_list=start_list_x, + label_list=label_list, + label_to_output_comm_rank=label_to_output_comm_rank, + h_fan_out=fanout_vals, + with_replacement=with_replacement, + do_expensive_check=False, + with_edge_properties=with_edge_properties, + batch_id_list=batch_id_list_x, + random_state=random_state, + ) + return 
convert_to_cudf( + cp_arrays, weight_t, with_edge_properties, return_offsets=return_offsets + ) + + +def _call_plc_uniform_neighbor_sample_legacy( sID, mg_graph_x, st_x, @@ -183,7 +248,7 @@ def _call_plc_uniform_neighbor_sample( ) -def _mg_call_plc_uniform_neighbor_sample( +def _mg_call_plc_uniform_neighbor_sample_legacy( client, session_id, input_graph, @@ -200,7 +265,7 @@ def _mg_call_plc_uniform_neighbor_sample( ): result = [ client.submit( - _call_plc_uniform_neighbor_sample, + _call_plc_uniform_neighbor_sample_legacy, session_id, input_graph._plc_graph[w], ddf[w][0], @@ -247,7 +312,92 @@ def _mg_call_plc_uniform_neighbor_sample( return ddf -def uniform_neighbor_sample( +def _mg_call_plc_uniform_neighbor_sample( + client, + session_id, + input_graph, + ddf, + keep_batches_together, + min_batch_id, + max_batch_id, + fanout_vals, + with_replacement, + weight_t, + indices_t, + with_edge_properties, + random_state, + return_offsets=False, +): + n_workers = None + if keep_batches_together: + n_workers = get_n_workers() + + if hasattr(min_batch_id, "compute"): + min_batch_id = min_batch_id.compute() + if hasattr(max_batch_id, "compute"): + max_batch_id = max_batch_id.compute() + + result = [ + client.submit( + _call_plc_uniform_neighbor_sample, + session_id, + input_graph._plc_graph[w], + starts, + keep_batches_together, + n_workers, + min_batch_id, + max_batch_id, + fanout_vals, + with_replacement, + weight_t=weight_t, + with_edge_properties=with_edge_properties, + # FIXME accept and properly transmute a numpy/cupy random state. + random_state=hash((random_state, w)), + return_offsets=return_offsets, + allow_other_workers=False, + pure=False, + ) + for w, starts in ddf.items() + ] + del ddf + + empty_df = ( + create_empty_df_with_edge_props( + indices_t, weight_t, return_offsets=return_offsets + ) + if with_edge_properties + else create_empty_df(indices_t, weight_t) + ) + + wait(result) + + if return_offsets: + result_split = [delayed(lambda x: x, nout=2)(r) for r in result] + ddf = dask_cudf.from_delayed( + [r[0] for r in result_split], meta=empty_df[0], verify_meta=False + ).persist() + ddf_offsets = dask_cudf.from_delayed( + [r[1] for r in result_split], meta=empty_df[1], verify_meta=False + ).persist() + + wait([ddf, ddf_offsets]) + wait([r.release() for r in result_split]) + wait([r.release() for r in result]) + + del result + + return ddf, ddf_offsets + else: + ddf = dask_cudf.from_delayed(result, meta=empty_df, verify_meta=False).persist() + + wait(ddf) + wait([r.release() for r in result]) + del result + + return ddf + + +def _uniform_neighbor_sample_legacy( input_graph: Graph, start_list: Sequence, fanout_vals: List[int], @@ -259,6 +409,162 @@ def uniform_neighbor_sample( random_state: int = None, return_offsets: bool = False, _multiple_clients: bool = False, +) -> Union[dask_cudf.DataFrame, Tuple[dask_cudf.DataFrame, dask_cudf.DataFrame]]: + warnings.warn( + "The batch_id_list, label_list, and label_to_output_comm_rank " + "parameters are deprecated. Consider using with_batch_ids, " + "keep_batches_together, min_batch_id, and max_batch_id instead." + ) + + if isinstance(start_list, int): + start_list = [start_list] + + if isinstance(start_list, list): + start_list = cudf.Series( + start_list, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, + ) + + elif with_edge_properties and batch_id_list is None: + batch_id_list = cudf.Series(cp.zeros(len(start_list), dtype="int32")) + + # fanout_vals must be a host array! 
+    # FIXME: ensure other sequence types (eg. cudf Series) can be handled.
+    if isinstance(fanout_vals, list):
+        fanout_vals = numpy.asarray(fanout_vals, dtype="int32")
+    else:
+        raise TypeError("fanout_vals must be a list, " f"got: {type(fanout_vals)}")
+
+    if "value" in input_graph.edgelist.edgelist_df:
+        weight_t = input_graph.edgelist.edgelist_df["value"].dtype
+    else:
+        weight_t = "float32"
+
+    if "_SRC_" in input_graph.edgelist.edgelist_df:
+        indices_t = input_graph.edgelist.edgelist_df["_SRC_"].dtype
+    elif src_n in input_graph.edgelist.edgelist_df:
+        indices_t = input_graph.edgelist.edgelist_df[src_n].dtype
+    else:
+        indices_t = numpy.int32
+
+    start_list = start_list.rename(start_col_name)
+    if batch_id_list is not None:
+        batch_id_list = batch_id_list.rename(batch_col_name)
+        if hasattr(start_list, "compute"):
+            # mg input
+            start_list = start_list.to_frame()
+            batch_id_list = batch_id_list.to_frame()
+            ddf = start_list.merge(
+                batch_id_list,
+                how="left",
+                left_index=True,
+                right_index=True,
+            )
+        else:
+            # sg input
+            ddf = cudf.concat(
+                [
+                    start_list,
+                    batch_id_list,
+                ],
+                axis=1,
+            )
+    else:
+        ddf = start_list.to_frame()
+
+    if input_graph.renumbered:
+        ddf = input_graph.lookup_internal_vertex_id(ddf, column_name=start_col_name)
+
+    if hasattr(ddf, "compute"):
+        ddf = get_distributed_data(ddf)
+        wait(ddf)
+        ddf = ddf.worker_to_parts
+    else:
+        splits = cp.array_split(cp.arange(len(ddf)), len(Comms.get_workers()))
+        ddf = {w: [ddf.iloc[splits[i]]] for i, w in enumerate(Comms.get_workers())}
+
+    client = get_client()
+    session_id = Comms.get_session_id()
+    if _multiple_clients:
+        # Distributed centralized lock to allow
+        # two disconnected processes (clients) to coordinate a lock
+        # https://docs.dask.org/en/stable/futures.html?highlight=lock#distributed.Lock
+        lock = Lock("plc_graph_access")
+        if lock.acquire(timeout=100):
+            try:
+                ddf = _mg_call_plc_uniform_neighbor_sample_legacy(
+                    client=client,
+                    session_id=session_id,
+                    input_graph=input_graph,
+                    ddf=ddf,
+                    label_list=label_list,
+                    label_to_output_comm_rank=label_to_output_comm_rank,
+                    fanout_vals=fanout_vals,
+                    with_replacement=with_replacement,
+                    weight_t=weight_t,
+                    indices_t=indices_t,
+                    with_edge_properties=with_edge_properties,
+                    random_state=random_state,
+                    return_offsets=return_offsets,
+                )
+            finally:
+                lock.release()
+        else:
+            raise RuntimeError(
+                "Failed to acquire lock(plc_graph_access) while trying to sample"
+            )
+    else:
+        ddf = _mg_call_plc_uniform_neighbor_sample_legacy(
+            client=client,
+            session_id=session_id,
+            input_graph=input_graph,
+            ddf=ddf,
+            label_list=label_list,
+            label_to_output_comm_rank=label_to_output_comm_rank,
+            fanout_vals=fanout_vals,
+            with_replacement=with_replacement,
+            weight_t=weight_t,
+            indices_t=indices_t,
+            with_edge_properties=with_edge_properties,
+            random_state=random_state,
+            return_offsets=return_offsets,
+        )
+
+    if return_offsets:
+        ddf, offsets_ddf = ddf
+    if input_graph.renumbered:
+        ddf = input_graph.unrenumber(ddf, "sources", preserve_order=True)
+        ddf = input_graph.unrenumber(ddf, "destinations", preserve_order=True)
+
+    if return_offsets:
+        return ddf, offsets_ddf
+
+    return ddf
+
+
+uniform_neighbor_sample_legacy = deprecated_warning_wrapper(
+    _uniform_neighbor_sample_legacy
+)
+
+
+def uniform_neighbor_sample(
+    input_graph: Graph,
+    start_list: Sequence,
+    fanout_vals: List[int],
+    with_replacement: bool = True,
+    with_edge_properties: bool = False,
+    batch_id_list: Sequence = None,  # deprecated
+    label_list: Sequence = None,  # deprecated
+    label_to_output_comm_rank: Sequence = None,  # deprecated
+    with_batch_ids: bool = False,
+    keep_batches_together=False,
+    min_batch_id=None,
+    max_batch_id=None,
+    random_state: int = None,
+    return_offsets: bool = False,
+    _multiple_clients: bool = False,
 ) -> Union[dask_cudf.DataFrame, Tuple[dask_cudf.DataFrame, dask_cudf.DataFrame]]:
     """
     Does neighborhood sampling, which samples nodes from a graph based on the
@@ -285,20 +591,36 @@
     edge type, batch id, hop id) with the sampled edges.
 
     batch_id_list: cudf.Series or dask_cudf.Series (int32), optional (default=None)
+        Deprecated.
         List of batch ids that will be returned with the sampled edges if
         with_edge_properties is set to True.
 
     label_list: cudf.Series or dask_cudf.Series (int32), optional (default=None)
+        Deprecated.
         List of unique batch id labels.  Used along with
         label_to_output_comm_rank to assign batch ids to GPUs.
 
     label_to_output_comm_rank: cudf.Series or dask_cudf.Series (int32), optional (default=None)
+        Deprecated.
        List of output GPUs (by rank) corresponding to batch id labels in the
        label list.  Used to assign each batch id to a GPU.  Must be in
        ascending order (i.e. [0, 0, 1, 2]).
 
+    with_batch_ids: bool, optional (default=False)
+        Flag to specify whether batch ids are present in the start_list
+
+    keep_batches_together: bool (optional, default=False)
+        If True, will ensure that the returned samples for each batch are on the
+        same partition.
+
+    min_batch_id: int (optional, default=None)
+        Required for the keep_batches_together option.  The minimum batch id.
+
+    max_batch_id: int (optional, default=None)
+        Required for the keep_batches_together option.  The maximum batch id.
+
     random_state: int, optional
         Random seed to use when making sampling calls.
@@ -363,6 +685,25 @@
     Contains the offsets of each batch in the sampling result
     """
+    if (
+        batch_id_list is not None
+        or label_list is not None
+        or label_to_output_comm_rank is not None
+    ):
+        return uniform_neighbor_sample_legacy(
+            input_graph,
+            start_list,
+            fanout_vals,
+            with_replacement=with_replacement,
+            with_edge_properties=with_edge_properties,
+            batch_id_list=batch_id_list,
+            label_list=label_list,
+            label_to_output_comm_rank=label_to_output_comm_rank,
+            random_state=random_state,
+            return_offsets=return_offsets,
+            _multiple_clients=_multiple_clients,
+        )
+
     if isinstance(start_list, int):
         start_list = [start_list]
@@ -373,9 +714,21 @@
             input_graph.renumber_map.renumbered_src_col_name
         ].dtype,
        )
+    elif with_edge_properties and not with_batch_ids:
+        if isinstance(start_list, (cudf.DataFrame, dask_cudf.DataFrame)):
+            raise ValueError("expected 1d input for start list without batch ids")
 
-    elif with_edge_properties and batch_id_list is None:
-        batch_id_list = cudf.Series(cp.zeros(len(start_list), dtype="int32"))
+        start_list = start_list.to_frame()
+        start_list[batch_id_n] = cudf.Series(cp.zeros(len(start_list), dtype="int32"))
+
+    if keep_batches_together and min_batch_id is None:
+        raise ValueError(
+            "must provide min_batch_id if using keep_batches_together option"
+        )
+    if keep_batches_together and max_batch_id is None:
+        raise ValueError(
+            "must provide max_batch_id if using keep_batches_together option"
+        )
 
     # fanout_vals must be a host array!
     # FIXME: ensure other sequence types (eg. cudf Series) can be handled.
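For a concrete view of the migration described above, here is a minimal sketch of the new-style MG call; the graph `G` and the "start"/"batch" column names are assumed (batch ids travel as the last column of start_list instead of a separate batch_id_list):

import cudf
import cugraph.dask

# Seeds and their batch ids travel together in one frame; the last
# column is interpreted as the batch id when with_batch_ids=True.
start_df = cudf.DataFrame(
    {
        "start": cudf.Series([0, 4], dtype="int64"),
        "batch": cudf.Series([0, 1], dtype="int32"),
    }
)

samples, offsets = cugraph.dask.uniform_neighbor_sample(
    G,  # an existing cugraph MG Graph (assumed)
    start_list=start_df,
    fanout_vals=[2, 2],
    with_replacement=False,
    with_edge_properties=True,
    with_batch_ids=True,
    keep_batches_together=True,  # keep each batch's output on one partition
    min_batch_id=0,
    max_batch_id=1,
    return_offsets=True,
)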
@@ -396,44 +749,30 @@ def uniform_neighbor_sample( else: indices_t = numpy.int32 - start_list = start_list.rename(start_col_name) - if batch_id_list is not None: - batch_id_list = batch_id_list.rename(batch_col_name) - if hasattr(start_list, "compute"): - # mg input - start_list = start_list.to_frame() - batch_id_list = batch_id_list.to_frame() - ddf = start_list.merge( - batch_id_list, - how="left", - left_index=True, - right_index=True, - ) - else: - # sg input - ddf = cudf.concat( - [ - start_list, - batch_id_list, - ], - axis=1, - ) - else: + if isinstance(start_list, (cudf.Series, dask_cudf.Series)): + start_list = start_list.rename(start_col_name) ddf = start_list.to_frame() + else: + ddf = start_list + columns = ddf.columns + ddf = ddf.rename( + columns={columns[0]: start_col_name, columns[-1]: batch_col_name} + ) if input_graph.renumbered: ddf = input_graph.lookup_internal_vertex_id(ddf, column_name=start_col_name) - if hasattr(ddf, "compute"): - ddf = get_distributed_data(ddf) - wait(ddf) - ddf = ddf.worker_to_parts - else: - splits = cp.array_split(cp.arange(len(ddf)), len(Comms.get_workers())) - ddf = {w: [ddf.iloc[splits[i]]] for i, w in enumerate(Comms.get_workers())} - client = get_client() session_id = Comms.get_session_id() + n_workers = get_n_workers() + + if isinstance(ddf, cudf.DataFrame): + ddf = dask_cudf.from_cudf(ddf, npartitions=n_workers) + + ddf = ddf.repartition(npartitions=n_workers) + ddf = persist_dask_df_equal_parts_per_worker(ddf, client) + ddf = get_persisted_df_worker_map(ddf, client) + if _multiple_clients: # Distributed centralized lock to allow # two disconnected processes (clients) to coordinate a lock @@ -446,8 +785,9 @@ def uniform_neighbor_sample( session_id=session_id, input_graph=input_graph, ddf=ddf, - label_list=label_list, - label_to_output_comm_rank=label_to_output_comm_rank, + keep_batches_together=keep_batches_together, + min_batch_id=min_batch_id, + max_batch_id=max_batch_id, fanout_vals=fanout_vals, with_replacement=with_replacement, weight_t=weight_t, @@ -468,8 +808,9 @@ def uniform_neighbor_sample( session_id=session_id, input_graph=input_graph, ddf=ddf, - label_list=label_list, - label_to_output_comm_rank=label_to_output_comm_rank, + keep_batches_together=keep_batches_together, + min_batch_id=min_batch_id, + max_batch_id=max_batch_id, fanout_vals=fanout_vals, with_replacement=with_replacement, weight_t=weight_t, diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py index 33de5fdc185..a2b0a367d1d 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py @@ -15,16 +15,21 @@ from typing import Union -import cupy import cudf import dask_cudf -import cugraph.dask as dask_cugraph + +from dask.distributed import wait +from dask.distributed import futures_of import cugraph import pylibcugraph from cugraph.gnn.data_loading.bulk_sampler_io import write_samples +import warnings +import logging +import time + class EXPERIMENTAL__BulkSampler: start_col_name = "_START_" @@ -36,7 +41,8 @@ def __init__( output_path: str, graph, seeds_per_call: int = 200_000, - batches_per_partition=100, + batches_per_partition: int = 100, + log_level: int = None, **kwargs, ): """ @@ -55,13 +61,19 @@ def __init__( a single sampling call. batches_per_partition: int (optional, default=100) The number of batches outputted to a single parquet partition. 
+        log_level: int (optional, default=None)
+            The logging level to use for this sampler.  Supports the
+            standard levels (i.e. logging.INFO, logging.WARNING,
+            logging.ERROR).  If not provided, defaults to logging.WARNING.
         kwargs: kwargs
             Keyword arguments to be passed to the sampler (i.e. fanout).
         """
+        self.__logger = logging.getLogger(__name__)
+        self.__logger.setLevel(log_level or logging.WARNING)
+
         max_batches_per_partition = seeds_per_call // batch_size
         if batches_per_partition > max_batches_per_partition:
-            import warnings
 
             warnings.warn(
                 f"batches_per_partition ({batches_per_partition}) is >"
@@ -140,7 +152,7 @@ def add_batches(
         ...     start_col_name="start_vid",
         ...     batch_col_name="start_batch")
         """
-        df = df.rename(
+        df = df[[start_col_name, batch_col_name]].rename(
             columns={
                 start_col_name: self.start_col_name,
                 batch_col_name: self.batch_col_name,
@@ -163,6 +175,11 @@ def add_batches(
         )
 
         if self.size >= self.seeds_per_call:
+            self.__logger.info(
+                f"Number of input seeds ({self.size})"
+                f" is >= seeds per call ({self.seeds_per_call})."
+                " Calling flush() to compute and write minibatches."
+            )
             self.flush()
 
     def flush(self) -> None:
@@ -171,14 +188,16 @@ def flush(self) -> None:
         """
         if self.size == 0:
             return
-        self.__batches.reset_index(drop=True)
+
+        start_time_calc_batches = time.perf_counter()
         if isinstance(self.__batches, dask_cudf.DataFrame):
             self.__batches = self.__batches.persist()
 
         min_batch_id = self.__batches[self.batch_col_name].min()
         if isinstance(self.__batches, dask_cudf.DataFrame):
-            min_batch_id = min_batch_id.compute()
-        min_batch_id = int(min_batch_id)
+            min_batch_id = min_batch_id.persist()
+        else:
+            min_batch_id = int(min_batch_id)
 
         partition_size = self.batches_per_partition * self.batch_size
         partitions_per_call = (
@@ -187,7 +206,19 @@ def flush(self) -> None:
         npartitions = partitions_per_call
 
         max_batch_id = min_batch_id + npartitions * self.batches_per_partition - 1
+        if isinstance(self.__batches, dask_cudf.DataFrame):
+            max_batch_id = max_batch_id.persist()
+
         batch_id_filter = self.__batches[self.batch_col_name] <= max_batch_id
+        if isinstance(batch_id_filter, dask_cudf.Series):
+            batch_id_filter = batch_id_filter.persist()
+
+        end_time_calc_batches = time.perf_counter()
+        self.__logger.info(
+            f"Calculated batches to sample; min = {min_batch_id}"
+            f" and max = {max_batch_id};"
+            f" took {end_time_calc_batches - start_time_calc_batches:.4f} s"
+        )
 
         if isinstance(self.__graph._plc_graph, pylibcugraph.graphs.SGGraph):
             sample_fn = cugraph.uniform_neighbor_sample
@@ -196,31 +227,62 @@ def flush(self) -> None:
             self.__sample_call_args.update(
                 {
                     "_multiple_clients": True,
-                    "label_to_output_comm_rank": self.__get_label_to_output_comm_rank(
-                        min_batch_id, max_batch_id
-                    ),
-                    "label_list": cupy.arange(
-                        min_batch_id, max_batch_id + 1, dtype="int32"
-                    ),
+                    "keep_batches_together": True,
+                    "min_batch_id": min_batch_id,
+                    "max_batch_id": max_batch_id,
                }
            )
 
+        start_time_sample_call = time.perf_counter()
+
+        # Call uniform neighbor sample
        samples, offsets = sample_fn(
            self.__graph,
            **self.__sample_call_args,
-            start_list=self.__batches[self.start_col_name][batch_id_filter],
-            batch_id_list=self.__batches[self.batch_col_name][batch_id_filter],
+            start_list=self.__batches[[self.start_col_name, self.batch_col_name]][
+                batch_id_filter
+            ],
+            with_batch_ids=True,
            with_edge_properties=True,
            return_offsets=True,
        )
 
+        end_time_sample_call = time.perf_counter()
+        sample_runtime = end_time_sample_call - start_time_sample_call
+
+        self.__logger.info(
+            f"Called uniform neighbor sample, took {sample_runtime:.4f} s"
+ ) + + # Filter batches to remove those already processed self.__batches = self.__batches[~batch_id_filter] + del batch_id_filter if isinstance(self.__batches, dask_cudf.DataFrame): self.__batches = self.__batches.persist() + start_time_write = time.perf_counter() + + # Write batches to parquet self.__write(samples, offsets) + if isinstance(self.__batches, dask_cudf.DataFrame): + wait( + [f.release() for f in futures_of(samples)] + + [f.release() for f in futures_of(offsets)] + ) - if self.size > 0: + del samples + del offsets + + end_time_write = time.perf_counter() + write_runtime = end_time_write - start_time_write + self.__logger.info(f"Wrote samples to parquet, took {write_runtime} seconds") + + current_size = self.size + if current_size > 0: + self.__logger.info( + f"There are still {current_size} samples remaining, " + "calling flush() again..." + ) self.flush() def __write( @@ -232,13 +294,3 @@ def __write( write_samples( samples, offsets, self.__batches_per_partition, self.__output_path ) - - def __get_label_to_output_comm_rank(self, min_batch_id, max_batch_id): - num_workers = dask_cugraph.get_n_workers() - num_batches = max_batch_id - min_batch_id + 1 - z = cupy.zeros(num_batches, dtype="int32") - s = cupy.array_split(cupy.arange(num_batches), num_workers) - for i, t in enumerate(s): - z[t] = i - - return cudf.Series(z) diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index d7f1c136484..44c1185bbf1 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -24,7 +24,7 @@ def _write_samples_to_parquet( batches_per_partition: int, output_path: str, partition_info: Optional[Union[dict, str]] = None, -) -> None: +) -> cudf.Series: """ Writes the samples to parquet. results: cudf.DataFrame @@ -40,11 +40,13 @@ def _write_samples_to_parquet( Either a dictionary containing partition data from dask, the string 'sg' indicating that this is a single GPU write, or None indicating that this function should perform a no-op (required by dask). + + Returns an empty cudf series. """ # Required by dask; need to skip dummy partitions. 
if partition_info is None or len(results) == 0: - return + return cudf.Series(dtype="int64") if partition_info != "sg" and (not isinstance(partition_info, dict)): raise ValueError("Invalid value of partition_info") @@ -71,6 +73,8 @@ def _write_samples_to_parquet( ).values results_p.to_parquet(full_output_path, compression=None, index=False) + return cudf.Series(dtype="int64") + def write_samples( results: cudf.DataFrame, @@ -97,7 +101,9 @@ def write_samples( batches_per_partition, output_path, align_dataframes=False, + meta=cudf.Series(dtype="int64"), ).compute() + else: _write_samples_to_parquet( results, offsets, batches_per_partition, output_path, partition_info="sg" diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index d6acaa550eb..d239f92d485 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -15,6 +15,7 @@ from pylibcugraph import ResourceHandle from pylibcugraph import uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample +from pylibcugraph.utilities.api_tools import deprecated_warning_wrapper import numpy @@ -29,6 +30,10 @@ from cugraph import Graph +start_col_name = "_START_" +batch_col_name = "_BATCH_" + + # FIXME: Move this function to the utility module so that it can be # shared by other algos def ensure_valid_dtype(input_graph, start_list): @@ -50,7 +55,7 @@ def ensure_valid_dtype(input_graph, start_list): return start_list -def uniform_neighbor_sample( +def _uniform_neighbor_sample_legacy( G: Graph, start_list: Sequence, fanout_vals: List[int], @@ -60,6 +65,135 @@ def uniform_neighbor_sample( random_state: int = None, return_offsets: bool = False, ) -> Union[cudf.DataFrame, Tuple[cudf.DataFrame, cudf.DataFrame]]: + + warnings.warn( + "The batch_id_list parameter is deprecated. " + "Consider passing a DataFrame where the last column " + "is the batch ids and setting with_batch_ids=True" + ) + + if isinstance(start_list, int): + start_list = [start_list] + + if isinstance(start_list, list): + start_list = cudf.Series( + start_list, dtype=G.edgelist.edgelist_df[G.srcCol].dtype + ) + + if with_edge_properties and batch_id_list is None: + batch_id_list = cp.zeros(len(start_list), dtype="int32") + + # fanout_vals must be a host array! + # FIXME: ensure other sequence types (eg. cudf Series) can be handled. 
+ if isinstance(fanout_vals, list): + fanout_vals = numpy.asarray(fanout_vals, dtype="int32") + else: + raise TypeError("fanout_vals must be a list, " f"got: {type(fanout_vals)}") + + if "weights" in G.edgelist.edgelist_df: + weight_t = G.edgelist.edgelist_df["weights"].dtype + else: + weight_t = "float32" + + start_list = ensure_valid_dtype(G, start_list) + + if G.renumbered is True: + if isinstance(start_list, cudf.DataFrame): + start_list = G.lookup_internal_vertex_id(start_list, start_list.columns) + else: + start_list = G.lookup_internal_vertex_id(start_list) + + sampling_result = pylibcugraph_uniform_neighbor_sample( + resource_handle=ResourceHandle(), + input_graph=G._plc_graph, + start_list=start_list, + h_fan_out=fanout_vals, + with_replacement=with_replacement, + do_expensive_check=False, + with_edge_properties=with_edge_properties, + batch_id_list=batch_id_list, + random_state=random_state, + ) + + df = cudf.DataFrame() + + if with_edge_properties: + ( + sources, + destinations, + weights, + edge_ids, + edge_types, + batch_ids, + offsets, + hop_ids, + ) = sampling_result + + df["sources"] = sources + df["destinations"] = destinations + df["weight"] = weights + df["edge_id"] = edge_ids + df["edge_type"] = edge_types + df["hop_id"] = hop_ids + + if return_offsets: + offsets_df = cudf.DataFrame( + { + "batch_id": batch_ids, + "offsets": offsets[:-1], + } + ) + + else: + if len(batch_ids) > 0: + batch_ids = cudf.Series(batch_ids).repeat(cp.diff(offsets)) + batch_ids.reset_index(drop=True, inplace=True) + + df["batch_id"] = batch_ids + + else: + sources, destinations, indices = sampling_result + + df["sources"] = sources + df["destinations"] = destinations + + if indices is None: + df["indices"] = None + else: + df["indices"] = indices + if weight_t == "int32": + df["indices"] = indices.astype("int32") + elif weight_t == "int64": + df["indices"] = indices.astype("int64") + else: + df["indices"] = indices + + if G.renumbered: + df = G.unrenumber(df, "sources", preserve_order=True) + df = G.unrenumber(df, "destinations", preserve_order=True) + + if return_offsets: + return df, offsets_df + + return df + + +uniform_neighbor_sample_legacy = deprecated_warning_wrapper( + _uniform_neighbor_sample_legacy +) + + +def uniform_neighbor_sample( + G: Graph, + start_list: Sequence, + fanout_vals: List[int], + with_replacement: bool = True, + with_edge_properties: bool = False, + batch_id_list: Sequence = None, # deprecated + with_batch_ids: bool = False, + random_state: int = None, + return_offsets: bool = False, +) -> Union[cudf.DataFrame, Tuple[cudf.DataFrame, cudf.DataFrame]]: """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. @@ -85,9 +219,14 @@ def uniform_neighbor_sample( edge type, batch id, hop id) with the sampled edges. batch_id_list: list (int32) + Deprecated. List of batch ids that will be returned with the sampled edges if with_edge_properties is set to True. + with_batch_ids: bool, optional (default=False) + Flag to specify whether batch ids are present in the start_list + Assumes they are the last column in the start_list dataframe + random_state: int, optional Random seed to use when making sampling calls. 
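The single-GPU API follows the same convention; a minimal sketch, assuming an existing cugraph.Graph `G` and the same hypothetical "start"/"batch" column names:

import cudf
import cugraph

# Batch ids ride along as the last column of the start_list frame.
start_df = cudf.DataFrame(
    {
        "start": cudf.Series([0, 1, 2], dtype="int64"),
        "batch": cudf.Series([0, 0, 1], dtype="int32"),
    }
)

df = cugraph.uniform_neighbor_sample(
    G,  # an existing cugraph.Graph (assumed)
    start_list=start_df,
    fanout_vals=[2, 2],
    with_replacement=False,
    with_edge_properties=True,
    with_batch_ids=True,  # last column of start_list is the batch id
    random_state=42,
)
# With edge properties enabled, df carries sources, destinations, weight,
# edge_id, edge_type, hop_id and batch_id columns.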
@@ -148,6 +287,18 @@ def uniform_neighbor_sample( Contains the offsets of each batch in the sampling result """ + if batch_id_list is not None: + return uniform_neighbor_sample_legacy( + G, + start_list, + fanout_vals, + with_replacement=with_replacement, + with_edge_properties=with_edge_properties, + batch_id_list=batch_id_list, + random_state=random_state, + return_offsets=return_offsets, + ) + if isinstance(start_list, int): start_list = [start_list] @@ -156,8 +307,13 @@ def uniform_neighbor_sample( start_list, dtype=G.edgelist.edgelist_df[G.srcCol].dtype ) - if with_edge_properties and batch_id_list is None: - batch_id_list = cp.zeros(len(start_list), dtype="int32") + if with_edge_properties and not with_batch_ids: + if isinstance(start_list, cudf.Series): + start_list = start_list.to_frame() + + start_list[batch_col_name] = cudf.Series( + cp.zeros(len(start_list), dtype="int32") + ) # fanout_vals must be a host array! # FIXME: ensure other sequence types (eg. cudf Series) can be handled. @@ -173,21 +329,37 @@ def uniform_neighbor_sample( start_list = ensure_valid_dtype(G, start_list) - if G.renumbered is True: - if isinstance(start_list, cudf.DataFrame): - start_list = G.lookup_internal_vertex_id(start_list, start_list.columns) + if isinstance(start_list, cudf.Series): + start_list = start_list.rename(start_col_name) + start_list = start_list.to_frame() + + if G.renumbered: + start_list = G.lookup_internal_vertex_id(start_list, start_col_name) + else: + columns = start_list.columns + + if with_batch_ids: + if G.renumbered: + start_list = G.lookup_internal_vertex_id(start_list, columns[:-1]) + start_list = start_list.rename( + columns={columns[0]: start_col_name, columns[-1]: batch_col_name} + ) else: - start_list = G.lookup_internal_vertex_id(start_list) + if G.renumbered: + start_list = G.lookup_internal_vertex_id(start_list, columns) + start_list = start_list.rename(columns={columns[0]: start_col_name}) sampling_result = pylibcugraph_uniform_neighbor_sample( resource_handle=ResourceHandle(), input_graph=G._plc_graph, - start_list=start_list, + start_list=start_list[start_col_name], + batch_id_list=start_list[batch_col_name] + if batch_col_name in start_list + else None, h_fan_out=fanout_vals, with_replacement=with_replacement, do_expensive_check=False, with_edge_properties=with_edge_properties, - batch_id_list=batch_id_list, random_state=random_state, ) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index 5d2f050bce9..39d2fbea7dd 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -285,7 +285,7 @@ def test_uniform_neighbor_sample_unweighted(simple_unweighted_input_expected_out sampling_results = uniform_neighbor_sample( test_data["Graph"], - test_data["start_list"], + test_data["start_list"].astype("int64"), test_data["fanout_vals"], test_data["with_replacement"], ) @@ -330,11 +330,11 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets): sampling_results = uniform_neighbor_sample( G, - start_list=start_df["seed"], + start_list=start_df, fanout_vals=[2, 2], with_replacement=False, with_edge_properties=True, - batch_id_list=start_df["batch"], + with_batch_ids=True, return_offsets=return_offsets, ) if return_offsets: @@ -389,11 +389,16 @@ def test_uniform_neighbor_sample_edge_properties_self_loops(): sampling_results = cugraph.uniform_neighbor_sample( G, - 
start_list=cudf.Series([0, 1, 2]), - batch_id_list=cudf.Series([1, 1, 1], dtype="int32"), + start_list=cudf.DataFrame( + { + "start": cudf.Series([0, 1, 2]), + "batch": cudf.Series([1, 1, 1], dtype="int32"), + } + ), fanout_vals=[2, 2], with_replacement=False, with_edge_properties=True, + with_batch_ids=True, random_state=80, ) @@ -460,11 +465,16 @@ def test_uniform_neighbor_sample_hop_id_order_multi_batch(): sampling_results = cugraph.uniform_neighbor_sample( G, - cudf.Series([0, 1], dtype="int64"), + start_list=cudf.DataFrame( + { + "start": cudf.Series([0, 1], dtype="int64"), + "batch": cudf.Series([0, 1], dtype="int32"), + } + ), fanout_vals=[2, 2, 2], - batch_id_list=cudf.Series([0, 1], dtype="int32"), with_replacement=False, with_edge_properties=True, + with_batch_ids=True, ) for b in range(2): @@ -502,11 +512,16 @@ def test_uniform_neighbor_sample_empty_start_list(): sampling_results = cugraph.uniform_neighbor_sample( G, - start_list=cudf.Series([], dtype="int64"), - batch_id_list=cudf.Series([], dtype="int32"), + start_list=cudf.DataFrame( + { + "start_list": cudf.Series(dtype="int64"), + "batch_id_list": cudf.Series(dtype="int32"), + } + ), fanout_vals=[2, 2], with_replacement=False, with_edge_properties=True, + with_batch_ids=True, random_state=32, ) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py index 033b96487c4..4da3f3cf950 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py @@ -327,7 +327,8 @@ def test_mg_uniform_neighbor_sample_ensure_no_duplicates(dask_client): @pytest.mark.cugraph_ops @pytest.mark.parametrize("return_offsets", [True, False]) def test_uniform_neighbor_sample_edge_properties(dask_client, return_offsets): - if len(dask_client.scheduler_info()["workers"]) <= 1: + n_workers = len(dask_client.scheduler_info()["workers"]) + if n_workers <= 1: pytest.skip("Test only valid for MG environments") edgelist_df = dask_cudf.from_cudf( cudf.DataFrame( @@ -352,43 +353,58 @@ def test_uniform_neighbor_sample_edge_properties(dask_client, return_offsets): edge_attr=["w", "eid", "etp"], ) - dest_rank = [0, 1] sampling_results = cugraph.dask.uniform_neighbor_sample( G, - start_list=cudf.Series([0, 4], dtype="int64"), + start_list=cudf.DataFrame( + { + "start": cudf.Series([0, 4], dtype="int64"), + "batch": cudf.Series([0, 1], dtype="int32"), + } + ), fanout_vals=[-1, -1], with_replacement=False, with_edge_properties=True, - batch_id_list=cudf.Series([0, 1], dtype="int32"), - label_list=cudf.Series([0, 1], dtype="int32") if return_offsets else None, - label_to_output_comm_rank=cudf.Series(dest_rank, dtype="int32") - if return_offsets - else None, + with_batch_ids=True, + keep_batches_together=True, + min_batch_id=0, + max_batch_id=1, return_offsets=return_offsets, ) if return_offsets: sampling_results, sampling_offsets = sampling_results - df_p0 = sampling_results.get_partition(0).compute() - assert sorted(df_p0.sources.values_host.tolist()) == ( - [0, 0, 0, 1, 1, 2, 2, 2, 4, 4] - ) - assert sorted(df_p0.destinations.values_host.tolist()) == ( - [1, 1, 1, 2, 2, 3, 3, 4, 4, 4] - ) - - df_p1 = sampling_results.get_partition(1).compute() - assert sorted(df_p1.sources.values_host.tolist()) == ([1, 1, 3, 3, 4, 4]) - assert sorted(df_p1.destinations.values_host.tolist()) == ([1, 2, 2, 3, 3, 4]) - - offsets_p0 = sampling_offsets.get_partition(0).compute() - assert 
offsets_p0.batch_id.values_host.tolist() == [0]
-        assert offsets_p0.offsets.values_host.tolist() == [0]
-
-        offsets_p1 = sampling_offsets.get_partition(1).compute()
-        assert offsets_p1.batch_id.values_host.tolist() == [1]
-        assert offsets_p1.offsets.values_host.tolist() == [0]
+        batches_found = {0: 0, 1: 0}
+        for i in range(n_workers):
+            dfp = sampling_results.get_partition(i).compute()
+            if len(dfp) > 0:
+                offsets_p = sampling_offsets.get_partition(i).compute()
+                assert len(offsets_p) > 0
+
+                if offsets_p.batch_id.iloc[0] == 1:
+                    batches_found[1] += 1
+
+                    assert offsets_p.batch_id.values_host.tolist() == [1]
+                    assert offsets_p.offsets.values_host.tolist() == [0]
+
+                    assert sorted(dfp.sources.values_host.tolist()) == (
+                        [1, 1, 3, 3, 4, 4]
+                    )
+                    assert sorted(dfp.destinations.values_host.tolist()) == (
+                        [1, 2, 2, 3, 3, 4]
+                    )
+                elif offsets_p.batch_id.iloc[0] == 0:
+                    batches_found[0] += 1
+
+                    assert offsets_p.batch_id.values_host.tolist() == [0]
+                    assert offsets_p.offsets.values_host.tolist() == [0]
+
+                    assert sorted(dfp.sources.values_host.tolist()) == (
+                        [0, 0, 0, 1, 1, 2, 2, 2, 4, 4]
+                    )
+                    assert sorted(dfp.destinations.values_host.tolist()) == (
+                        [1, 1, 1, 2, 2, 3, 3, 4, 4, 4]
+                    )
 
     mdf = cudf.merge(
         sampling_results.compute(),
@@ -446,13 +462,19 @@ def test_uniform_neighbor_sample_edge_properties_self_loops(dask_client):
 
     sampling_results = cugraph.dask.uniform_neighbor_sample(
         G,
-        start_list=dask_cudf.from_cudf(cudf.Series([0, 1, 2]), npartitions=2),
-        batch_id_list=dask_cudf.from_cudf(
-            cudf.Series([1, 1, 1], dtype="int32"), npartitions=2
+        start_list=dask_cudf.from_cudf(
+            cudf.DataFrame(
+                {
+                    "start": cudf.Series([0, 1, 2], dtype="int64"),
+                    "batch": cudf.Series([1, 1, 1], dtype="int32"),
+                }
+            ),
+            npartitions=2,
        ),
        fanout_vals=[2, 2],
        with_replacement=False,
        with_edge_properties=True,
+        with_batch_ids=True,
    ).compute()
 
    assert sorted(sampling_results.sources.values_host.tolist()) == [0, 0, 1, 1, 2, 2]
@@ -526,23 +548,32 @@ def test_uniform_neighbor_sample_hop_id_order_multi_batch():
 
    sampling_results = cugraph.dask.uniform_neighbor_sample(
        G,
-        cudf.Series([0, 1], dtype="int64"),
+        dask_cudf.from_cudf(
+            cudf.DataFrame(
+                {
+                    "start": cudf.Series([0, 1], dtype="int64"),
+                    "batch": cudf.Series([0, 1], dtype="int32"),
+                }
+            ),
+            npartitions=2,
+        ),
        fanout_vals=[2, 2, 2],
-        batch_id_list=cudf.Series([0, 1], dtype="int32"),
        with_replacement=False,
        with_edge_properties=True,
+        with_batch_ids=True,
    )
 
    for p in range(sampling_results.npartitions):
        sampling_results_p = sampling_results.get_partition(p)
-        for b in range(2):
-            sampling_results_pb = sampling_results_p[
-                sampling_results_p.batch_id == b
-            ].compute()
-            assert (
-                sorted(sampling_results_pb.hop_id.values_host.tolist())
-                == sampling_results_pb.hop_id.values_host.tolist()
-            )
+        if len(sampling_results_p) > 0:
+            for b in range(2):
+                sampling_results_pb = sampling_results_p[
+                    sampling_results_p.batch_id == b
+                ].compute()
+                assert (
+                    sorted(sampling_results_pb.hop_id.values_host.tolist())
+                    == sampling_results_pb.hop_id.values_host.tolist()
+                )
 
 
@pytest.mark.mg
@@ -577,11 +608,19 @@ def test_uniform_neighbor_edge_properties_sample_small_start_list(
 
    cugraph.dask.uniform_neighbor_sample(
        G,
-        start_list=cudf.Series([0]),
+        start_list=dask_cudf.from_cudf(
+            cudf.DataFrame(
+                {
+                    "start": cudf.Series([0]),
+                    "batch": cudf.Series([10], dtype="int32"),
+                }
+            ),
+            npartitions=1,
+        ),
        fanout_vals=[10, 25],
        with_replacement=with_replacement,
        with_edge_properties=True,
-        batch_id_list=cudf.Series([10], dtype="int32"),
+
with_batch_ids=True, ) @@ -610,11 +649,16 @@ def test_uniform_neighbor_sample_without_dask_inputs(dask_client): sampling_results = cugraph.dask.uniform_neighbor_sample( G, - start_list=cudf.Series([0, 1, 2]), - batch_id_list=cudf.Series([1, 1, 1], dtype="int32"), + start_list=cudf.DataFrame( + { + "start": cudf.Series([0, 1, 2]), + "batch": cudf.Series([1, 1, 1], dtype="int32"), + } + ), fanout_vals=[2, 2], with_replacement=False, with_edge_properties=True, + with_batch_ids=True, ).compute() assert sorted(sampling_results.sources.values_host.tolist()) == [0, 0, 1, 1, 2, 2] @@ -664,24 +708,24 @@ def test_uniform_neighbor_sample_batched(dask_client, dataset, input_df, max_bat input_vertices = dask_cudf.concat([df.src, df.dst]).unique().compute() assert isinstance(input_vertices, cudf.Series) + input_vertices.name = "start" input_vertices.index = cupy.random.permutation(len(input_vertices)) + input_vertices = input_vertices.to_frame().reset_index(drop=True) - input_batch = cudf.Series( + input_vertices["batch"] = cudf.Series( cupy.random.randint(0, max_batches, len(input_vertices)), dtype="int32" ) - input_batch.index = cupy.random.permutation(len(input_vertices)) if input_df == dask_cudf.DataFrame: - input_batch = dask_cudf.from_cudf(input_batch, npartitions=num_workers) input_vertices = dask_cudf.from_cudf(input_vertices, npartitions=num_workers) sampling_results = cugraph.dask.uniform_neighbor_sample( G, start_list=input_vertices, - batch_id_list=input_batch, fanout_vals=[5, 5], with_replacement=False, with_edge_properties=True, + with_batch_ids=True, ) for batch_id in range(max_batches): @@ -693,7 +737,7 @@ def test_uniform_neighbor_sample_batched(dask_client, dataset, input_df, max_bat .compute() ) - input_starts_per_batch = len(input_batch[input_batch == batch_id]) + input_starts_per_batch = len(input_vertices[input_vertices.batch == batch_id]) # Should be <= to account for starts without outgoing edges assert output_starts_per_batch <= input_starts_per_batch From c33d4bb839fc9bd514849d0241eb70f835b0a7ac Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 18 Jul 2023 09:00:13 -0500 Subject: [PATCH 09/10] Only run cugraph conda CI for CUDA 11. (#3713) We are planning to enable CUDA 12 CI jobs on July 18, per https://github.com/rapidsai/shared-action-workflows/issues/112. This PR must be merged to disable CUDA 12 CI jobs for cugraph until #3271 / #3456 are merged. 
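For reference, the `matrix_filter` value added in the workflow hunks below is a jq expression evaluated against the CI test matrix; a rough Python illustration of its semantics, over a hypothetical matrix list, would be:

# Hypothetical matrix entries; only the CUDA_VER key is assumed here.
matrix = [
    {"CUDA_VER": "11.8.0", "ARCH": "amd64"},
    {"CUDA_VER": "12.0.1", "ARCH": "amd64"},
]

# map(select(.CUDA_VER | startswith("11"))) keeps only CUDA 11 entries.
cuda11_only = [entry for entry in matrix if entry["CUDA_VER"].startswith("11")]
assert cuda11_only == [{"CUDA_VER": "11.8.0", "ARCH": "amd64"}]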
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) --- .github/workflows/build.yaml | 4 +++- .github/workflows/pr.yaml | 8 ++++++-- .github/workflows/test.yaml | 2 ++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index de9caa0fabe..cf73e1d2d27 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -30,6 +30,7 @@ jobs: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 with: + matrix_filter: map(select(.CUDA_VER | startswith("11"))) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} @@ -39,6 +40,7 @@ jobs: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 with: + matrix_filter: map(select(.CUDA_VER | startswith("11"))) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} @@ -61,7 +63,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4d52cd26de4..d9029ea37a1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -35,6 +35,7 @@ jobs: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 with: + matrix_filter: map(select(.CUDA_VER | startswith("11"))) build_type: pull-request node_type: cpu16 conda-cpp-tests: @@ -42,18 +43,21 @@ jobs: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 with: + matrix_filter: map(select(.CUDA_VER | startswith("11"))) build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 with: + matrix_filter: map(select(.CUDA_VER | startswith("11"))) build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 with: + matrix_filter: map(select(.CUDA_VER | startswith("11"))) build_type: pull-request conda-notebook-tests: needs: conda-python-build @@ -63,7 +67,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -73,7 +77,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" run_script: "ci/build_docs.sh" wheel-build-pylibcugraph: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d697b8f1649..1b8cfaf25b7 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -18,6 +18,7 @@ jobs: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 with: + matrix_filter: map(select(.CUDA_VER | startswith("11"))) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} @@ -26,6 +27,7 
@@ jobs:
   secrets: inherit
   uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08
   with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}

From 32e6e5184cf63ac989356d6e2b3f93eab63073a4 Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Tue, 18 Jul 2023 16:53:18 +0200
Subject: [PATCH 10/10] [BUG] Fix namespace of default_hash and hash_functions
 (#3711)

Fix namespace of default_hash and hash_functions

Authors:
  - Naim (https://github.com/naimnv)
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/3711
---
 .../cugraph/utilities/device_functors.cuh    |  16 +-
 .../include/hash/concurrent_unordered_map.cuh |   5 +-
 cpp/src/community/detail/common_methods.cuh   |   6 +
 cpp/src/community/detail/refine_impl.cuh      |   5 +
 cpp/src/prims/detail/nbr_intersection.cuh     |   4 +-
 cpp/src/prims/kv_store.cuh                    | 641 +++++++++++++++---
 ..._v_pair_transform_dst_nbr_intersection.cuh |   2 +-
 ...r_v_random_select_transform_outgoing_e.cuh |  11 +-
 ...m_reduce_dst_key_aggregated_outgoing_e.cuh |   2 +-
 cpp/src/structure/relabel_impl.cuh            |   4 +-
 10 files changed, 568 insertions(+), 128 deletions(-)

diff --git a/cpp/include/cugraph/utilities/device_functors.cuh b/cpp/include/cugraph/utilities/device_functors.cuh
index d29e7c47d14..501e74cf47b 100644
--- a/cpp/include/cugraph/utilities/device_functors.cuh
+++ b/cpp/include/cugraph/utilities/device_functors.cuh
@@ -57,16 +57,28 @@ struct pack_bool_t {
   }
 };
 
-template <typename Iterator>
+template <typename index_t, typename Iterator>
 struct indirection_t {
   Iterator first{};
 
-  __device__ typename thrust::iterator_traits<Iterator>::value_type operator()(size_t i) const
+  __device__ typename thrust::iterator_traits<Iterator>::value_type operator()(index_t i) const
   {
     return *(first + i);
   }
 };
 
+template <typename index_t, typename Iterator>
+struct indirection_if_idx_valid_t {
+  Iterator first{};
+  index_t invalid_idx{};
+  typename thrust::iterator_traits<Iterator>::value_type invalid_value{};
+
+  __device__ typename thrust::iterator_traits<Iterator>::value_type operator()(index_t i) const
+  {
+    return (i != invalid_idx) ? *(first + i) : invalid_value;
+  }
+};
+
 template <typename T>
 struct not_equal_t {
   T compare{};
diff --git a/cpp/libcugraph_etl/include/hash/concurrent_unordered_map.cuh b/cpp/libcugraph_etl/include/hash/concurrent_unordered_map.cuh
index f097f9c43a2..ab14ff6c685 100644
--- a/cpp/libcugraph_etl/include/hash/concurrent_unordered_map.cuh
+++ b/cpp/libcugraph_etl/include/hash/concurrent_unordered_map.cuh
@@ -27,7 +27,8 @@
 #include
 #include
-#include
+#include
+#include
 #include
 #include
@@ -118,7 +119,7 @@ union pair_packer()>> {
  */
 template ,
-          typename Hasher    = cudf::detail::default_hash,
+          typename Hasher    = cudf::hashing::detail::default_hash,
           typename Equality  = equal_to,
           typename Allocator = default_allocator>>
 class concurrent_unordered_map {
diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh
index 62ede6eaafb..b388ba53e81 100644
--- a/cpp/src/community/detail/common_methods.cuh
+++ b/cpp/src/community/detail/common_methods.cuh
@@ -18,6 +18,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -42,6 +43,11 @@
 CUCO_DECLARE_BITWISE_COMPARABLE(float)
 CUCO_DECLARE_BITWISE_COMPARABLE(double)
 
+// FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched.
+namespace cuco { +template <> +struct is_bitwise_comparable> : std::true_type {}; +} // namespace cuco namespace cugraph { namespace detail { diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index bbd720131de..e811aafc776 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -48,6 +48,11 @@ CUCO_DECLARE_BITWISE_COMPARABLE(float) CUCO_DECLARE_BITWISE_COMPARABLE(double) +// FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched. +namespace cuco { +template <> +struct is_bitwise_comparable> : std::true_type {}; +} // namespace cuco namespace cugraph { namespace detail { diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh index 2d0d0a876e6..98453d46c3f 100644 --- a/cpp/src/prims/detail/nbr_intersection.cuh +++ b/cpp/src/prims/detail/nbr_intersection.cuh @@ -974,7 +974,7 @@ nbr_intersection(raft::handle_t const& handle, .get_stream()); // initially store minimum degrees (upper bound for intersection sizes) if (intersect_minor_nbr[0] && intersect_minor_nbr[1]) { auto second_element_to_idx_map = - detail::kv_cuco_store_device_view_t((*major_to_idx_map_ptr)->view()); + detail::kv_cuco_store_find_device_view_t((*major_to_idx_map_ptr)->view()); thrust::transform( handle.get_thrust_policy(), get_dataframe_buffer_begin(vertex_pair_buffer), @@ -1005,7 +1005,7 @@ nbr_intersection(raft::handle_t const& handle, handle.get_stream()); if (intersect_minor_nbr[0] && intersect_minor_nbr[1]) { auto second_element_to_idx_map = - detail::kv_cuco_store_device_view_t((*major_to_idx_map_ptr)->view()); + detail::kv_cuco_store_find_device_view_t((*major_to_idx_map_ptr)->view()); thrust::tabulate( handle.get_thrust_policy(), rx_v_pair_nbr_intersection_sizes.begin(), diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh index 8395fc55833..f20865c92dc 100644 --- a/cpp/src/prims/kv_store.cuh +++ b/cpp/src/prims/kv_store.cuh @@ -16,11 +16,14 @@ #pragma once #include +#include #include +#include #include #include +#include #include #include #include @@ -29,6 +32,7 @@ #include #include #include +#include #include #include @@ -45,7 +49,7 @@ namespace cugraph { namespace detail { template -struct binary_search_find_op_t { +struct kv_binary_search_find_op_t { using key_type = typename thrust::iterator_traits::value_type; using value_type = typename thrust::iterator_traits::value_type; @@ -67,7 +71,7 @@ struct binary_search_find_op_t { }; template -struct binary_search_contains_op_t { +struct kv_binary_search_contains_op_t { using key_type = typename thrust::iterator_traits::value_type; KeyIterator store_key_first{}; @@ -79,6 +83,105 @@ struct binary_search_contains_op_t { } }; +template +struct kv_cuco_insert_and_increment_t { + using key_type = typename thrust::iterator_traits::value_type; + using cuco_store_type = cuco::experimental::static_map< + key_type, + size_t, + cuco::experimental::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::experimental::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; + + typename cuco_store_type::ref_type device_ref{}; + KeyIterator key_first{}; + size_t* counter{nullptr}; + size_t invalid_idx{}; + + __device__ size_t operator()(size_t i) + { + auto pair = thrust::make_tuple(*(key_first + i), size_t{0} /* dummy */); + auto [iter, inserted] = device_ref.insert_and_find(pair); + if (inserted) { + cuda::atomic_ref atomic_counter(*counter); + 
auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); + using ref_type = typename cuco_store_type::ref_type; + cuda::atomic_ref ref((*iter).second); + ref.store(idx, cuda::std::memory_order_relaxed); + return idx; + } else { + return invalid_idx; + } + } +}; + +template +struct kv_cuco_insert_if_and_increment_t { + using key_type = typename thrust::iterator_traits::value_type; + using cuco_store_type = cuco::experimental::static_map< + key_type, + size_t, + cuco::experimental::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::experimental::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; + + typename cuco_store_type::ref_type device_ref{}; + KeyIterator key_first{}; + StencilIterator stencil_first{}; + PredOp pred_op{}; + size_t* counter{nullptr}; + size_t invalid_idx{}; + + __device__ size_t operator()(size_t i) + { + if (pred_op(*(stencil_first + i)) == false) { return invalid_idx; } + + auto pair = thrust::make_tuple(*(key_first + i), size_t{0} /* dummy */); + auto [iter, inserted] = device_ref.insert_and_find(pair); + if (inserted) { + cuda::atomic_ref atomic_counter(*counter); + auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); + using ref_type = typename cuco_store_type::ref_type; + cuda::atomic_ref ref((*iter).second); + ref.store(idx, cuda::std::memory_order_relaxed); + return idx; + } else { + return invalid_idx; + } + } +}; + +template +struct kv_cuco_insert_and_assign_t { + using cuco_store_type = cuco::experimental::static_map< + key_t, + std::conditional_t, value_t, size_t>, + cuco::experimental::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::experimental::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; + + typename cuco_store_type::ref_type device_ref{}; + + __device__ void operator()(thrust::tuple pair) + { + auto [iter, inserted] = device_ref.insert_and_find(pair); + if (!inserted) { + using ref_type = typename cuco_store_type::ref_type; + cuda::atomic_ref ref((*iter).second); + ref.store(thrust::get<1>(pair), cuda::std::memory_order_relaxed); + } + } +}; + template struct kv_binary_search_store_device_view_t { using key_type = typename ViewType::key_type; @@ -112,18 +215,19 @@ struct kv_binary_search_store_device_view_t { }; template -struct kv_cuco_store_device_view_t { - using key_type = typename ViewType::key_type; - using value_type = typename ViewType::value_type; - using cuco_store_device_view_type = typename ViewType::cuco_store_type::device_view; +struct kv_cuco_store_find_device_view_t { + using key_type = typename ViewType::key_type; + using value_type = typename ViewType::value_type; + using cuco_store_device_ref_type = + typename ViewType::cuco_store_type::ref_type; static_assert(!ViewType::binary_search); - __host__ kv_cuco_store_device_view_t(ViewType view) - : cuco_store_device_view(view.cuco_store_device_view()) + __host__ kv_cuco_store_find_device_view_t(ViewType view) + : cuco_store_device_ref(view.cuco_store_find_device_ref()) { if constexpr (std::is_arithmetic_v) { - invalid_value = cuco_store_device_view.get_empty_value_sentinel(); + invalid_value = cuco_store_device_ref.empty_value_sentinel(); } else { store_value_first = view.store_value_first(); invalid_value = view.invalid_value(); @@ -132,11 +236,11 @@ struct kv_cuco_store_device_view_t { __device__ value_type find(key_type key) const { - auto found = cuco_store_device_view.find(key); - if (found == 
cuco_store_device_view.end()) { + auto found = cuco_store_device_ref.find(key); + if (found == cuco_store_device_ref.end()) { return invalid_value; } else { - auto val = found->second.load(cuda::std::memory_order_relaxed); + auto val = (*found).second; if constexpr (std::is_arithmetic_v) { return val; } else { @@ -145,7 +249,7 @@ struct kv_cuco_store_device_view_t { } } - cuco_store_device_view_type cuco_store_device_view{}; + cuco_store_device_ref_type cuco_store_device_ref{}; std::conditional_t, typename ViewType::value_iterator, std::byte /* dummy */> @@ -185,7 +289,7 @@ class kv_binary_search_store_view_t { key_first, key_last, value_first, - binary_search_find_op_t{ + kv_binary_search_find_op_t{ store_key_first_, store_key_last_, store_value_first_, invalid_value_}); } @@ -195,11 +299,12 @@ class kv_binary_search_store_view_t { ResultValueIterator value_first, rmm::cuda_stream_view stream) const { - thrust::transform(rmm::exec_policy(stream), - key_first, - key_last, - value_first, - binary_search_contains_op_t{store_key_first_, store_key_last_}); + thrust::transform( + rmm::exec_policy(stream), + key_first, + key_last, + value_first, + kv_binary_search_contains_op_t{store_key_first_, store_key_last_}); } KeyIterator store_key_first() const { return store_key_first_; } @@ -227,31 +332,29 @@ class kv_cuco_store_view_t { static constexpr bool binary_search = false; - using cuco_store_type = - cuco::static_map, value_type, size_t>, - cuda::thread_scope_device, - rmm::mr::stream_allocator_adaptor>>; + using cuco_store_type = cuco::experimental::static_map< + key_t, + std::conditional_t, value_type, size_t>, + cuco::experimental::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::experimental::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; - // FIXME: const_cast as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/242 (cuco find() is not a const function) template kv_cuco_store_view_t(cuco_store_type const* store, std::enable_if_t, int32_t> = 0) - : cuco_store_(const_cast(store)) + : cuco_store_(store) { } - // FIXME: const_cast as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/242 (cuco find() is not a const function) template kv_cuco_store_view_t(cuco_store_type const* store, ValueIterator value_first, type invalid_value, std::enable_if_t, int32_t> = 0) - : cuco_store_(const_cast(store)), - store_value_first_(value_first), - invalid_value_(invalid_value) + : cuco_store_(store), store_value_first_(value_first), invalid_value_(invalid_value) { } @@ -262,34 +365,17 @@ class kv_cuco_store_view_t { rmm::cuda_stream_view stream) const { if constexpr (std::is_arithmetic_v) { - cuco_store_->find(key_first, - key_last, - value_first, - cuco::detail::MurmurHash3_32{}, - thrust::equal_to{}, - stream); + cuco_store_->find(key_first, key_last, value_first, stream.value()); } else { rmm::device_uvector indices(thrust::distance(key_first, key_last), stream); - cuco_store_->find(key_first, - key_last, - indices.begin(), - cuco::detail::MurmurHash3_32{}, - thrust::equal_to{}, - stream); - auto invalid_idx = cuco_store_->get_empty_value_sentinel(); - thrust::transform(rmm::exec_policy(stream), - indices.begin(), - indices.end(), - value_first, - [store_value_first = store_value_first_, - invalid_value = invalid_value_, - invalid_idx] __device__(auto idx) { - if (idx != invalid_idx) { - return *(store_value_first + idx); - } else { - return invalid_value; - } - }); + auto invalid_idx = 
cuco_store_->empty_value_sentinel(); + cuco_store_->find(key_first, key_last, indices.begin(), stream.value()); + thrust::transform( + rmm::exec_policy(stream), + indices.begin(), + indices.end(), + value_first, + indirection_if_idx_valid_t{store_value_first_, invalid_idx, invalid_value_}); } } @@ -299,15 +385,10 @@ class kv_cuco_store_view_t { ResultValueIterator value_first, rmm::cuda_stream_view stream) const { - cuco_store_->contains(key_first, - key_last, - value_first, - cuco::detail::MurmurHash3_32{}, - thrust::equal_to{}, - stream); + cuco_store_->contains(key_first, key_last, value_first, stream.value()); } - auto cuco_store_device_view() const { return cuco_store_->get_device_view(); } + auto cuco_store_find_device_ref() const { return cuco_store_->ref(cuco::experimental::find); } template std::enable_if_t, ValueIterator> store_value_first() const @@ -315,21 +396,19 @@ class kv_cuco_store_view_t { return store_value_first_; } - key_t invalid_key() const { return cuco_store_->get_empty_key_sentinel(); } + key_t invalid_key() const { return cuco_store_->empty_key_sentinel(); } value_type invalid_value() const { if constexpr (std::is_arithmetic_v) { - return cuco_store_->get_empty_value_sentinel(); + return cuco_store_->empty_value_sentinel(); } else { return invalid_value_; } } private: - // FIXME: cuco_store should be a const pointer but we can't due to - // https://github.com/NVIDIA/cuCollections/issues/242 (cuco find() is not a const function) - cuco_store_type* cuco_store_{}; + cuco_store_type const* cuco_store_{}; std::conditional_t, ValueIterator, std::byte /* dummy */> store_value_first_{}; @@ -395,6 +474,29 @@ class kv_binary_search_store_t { } } + auto retrieve_all(rmm::cuda_stream_view stream) + { + rmm::device_uvector tmp_store_keys(store_keys_.size(), stream); + auto tmp_store_values = + allocate_dataframe_buffer(size_dataframe_buffer(store_values_), stream); + thrust::copy( + rmm::exec_policy(stream), store_keys_.begin(), store_keys_.end(), tmp_store_keys.begin()); + thrust::copy(rmm::exec_policy(stream), + get_dataframe_buffer_begin(store_values_), + get_dataframe_buffer_end(store_values_), + get_dataframe_buffer_begin(tmp_store_values)); + return std::make_tuple(std::move(tmp_store_keys), std::move(tmp_store_values)); + } + + auto release(rmm::cuda_stream_view stream) + { + auto tmp_store_keys = std::move(store_keys_); + auto tmp_store_values = std::move(store_values_); + store_keys_ = rmm::device_uvector(0, stream); + store_values_ = allocate_dataframe_buffer(0, stream); + return std::make_tuple(std::move(tmp_store_keys), std::move(tmp_store_values)); + } + key_t const* store_key_first() const { return store_keys_.cbegin(); } key_t const* store_key_last() const { return store_keys_.cend(); } @@ -403,6 +505,10 @@ class kv_binary_search_store_t { value_t invalid_value() const { return invalid_value_; } + size_t size() const { return store_keys_.size(); } + + size_t capacity() const { return store_keys_.size(); } + private: rmm::device_uvector store_keys_; decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) store_values_; @@ -421,14 +527,28 @@ class kv_cuco_store_t { std::invoke_result_t), value_buffer_type&>; - using cuco_store_type = - cuco::static_map, value_t, size_t>, - cuda::thread_scope_device, - rmm::mr::stream_allocator_adaptor>>; + using cuco_store_type = cuco::experimental::static_map< + key_t, + std::conditional_t, value_t, size_t>, + cuco::experimental::extent, + cuda::thread_scope_device, + thrust::equal_to, + 
cuco::experimental::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; kv_cuco_store_t(rmm::cuda_stream_view stream) {} + kv_cuco_store_t(size_t capacity, + key_t invalid_key, + value_t invalid_value, + rmm::cuda_stream_view stream) + { + allocate(capacity, invalid_key, invalid_value, stream); + capacity_ = capacity; + size_ = 0; + } + template kv_cuco_store_t(KeyIterator key_first, KeyIterator key_last, @@ -437,51 +557,228 @@ class kv_cuco_store_t { value_t invalid_value, rmm::cuda_stream_view stream) { - double constexpr load_factor = 0.7; - auto num_keys = static_cast(thrust::distance(key_first, key_last)); - auto cuco_size = std::max( - static_cast(static_cast(num_keys) / load_factor), - static_cast(num_keys) + 1); // cuco::static_map requires at least one empty slot - auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( - rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); + auto num_keys = static_cast(thrust::distance(key_first, key_last)); + allocate(num_keys, invalid_key, invalid_value, stream); + if constexpr (!std::is_arithmetic_v) { invalid_value_ = invalid_value; } + capacity_ = num_keys; + size_ = 0; + + insert(key_first, key_last, value_first, stream); + } + + template + void insert(KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + rmm::cuda_stream_view stream) + { + auto num_keys = static_cast(thrust::distance(key_first, key_last)); + if (num_keys == 0) return; + if constexpr (std::is_arithmetic_v) { - cuco_store_ = - std::make_unique(cuco_size, - cuco::sentinel::empty_key{invalid_key}, - cuco::sentinel::empty_value{invalid_value}, - stream_adapter, - stream); + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(key_first, value_first)); + size_ += cuco_store_->insert(pair_first, pair_first + num_keys, stream.value()); + } else { + auto old_store_value_size = size_dataframe_buffer(store_values_); + // FIXME: we can use cuda::atomic instead but currently on a system with x86 + GPU, this + // requires placing the atomic variable on managed memory and this adds additional + // complication. 
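+    // The counter starts at the current size of the values buffer; device threads
+    // advance it via cuda::atomic_ref inside kv_cuco_insert_and_increment_t, and the
+    // host reads it back with counter.value(stream) before resizing the buffer and
+    // scattering the newly inserted values.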
+ rmm::device_scalar counter(old_store_value_size, stream); + auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + rmm::device_uvector store_value_offsets(num_keys, stream); + thrust::tabulate( + rmm::exec_policy(stream), + store_value_offsets.begin(), + store_value_offsets.end(), + kv_cuco_insert_and_increment_t{ + mutable_device_ref, key_first, counter.data(), std::numeric_limits::max()}); + size_ += counter.value(stream); + resize_dataframe_buffer(store_values_, size_, stream); + thrust::scatter_if(rmm::exec_policy(stream), + value_first, + value_first + num_keys, + store_value_offsets.begin() /* map */, + store_value_offsets.begin() /* stencil */, + get_dataframe_buffer_begin(store_values_), + not_equal_t{std::numeric_limits::max()}); + } + } + template + void insert_if(KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + StencilIterator stencil_first, + PredOp pred_op, + rmm::cuda_stream_view stream) + { + auto num_keys = static_cast(thrust::distance(key_first, key_last)); + if (num_keys == 0) return; + + if constexpr (std::is_arithmetic_v) { auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(key_first, value_first)); - cuco_store_->insert(pair_first, - pair_first + num_keys, - cuco::detail::MurmurHash3_32{}, - thrust::equal_to{}, - stream); + size_ += cuco_store_->insert_if( + pair_first, pair_first + num_keys, stencil_first, pred_op, stream.value()); } else { - cuco_store_ = std::make_unique( - cuco_size, - cuco::sentinel::empty_key{invalid_key}, - cuco::sentinel::empty_value{std::numeric_limits::max()}, - stream_adapter, - stream); - store_values_ = allocate_dataframe_buffer(num_keys, stream); - invalid_value_ = invalid_value; + auto old_store_value_size = size_dataframe_buffer(store_values_); + // FIXME: we can use cuda::atomic instead but currently on a system with x86 + GPU, this + // requires placing the atomic variable on managed memory and this adds additional + // complication. 
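+    // Same counter scheme as insert(); keys rejected by pred_op (or already present
+    // in the map) report invalid_idx, and the scatter_if below skips those slots.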
+ rmm::device_scalar counter(old_store_value_size, stream); + auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + rmm::device_uvector store_value_offsets(num_keys, stream); + thrust::tabulate(rmm::exec_policy(stream), + store_value_offsets.begin(), + store_value_offsets.end(), + kv_cuco_insert_if_and_increment_t{ + mutable_device_ref, + key_first, + stencil_first, + pred_op, + counter.data(), + std::numeric_limits::max()}); + size_ += counter.value(stream); + resize_dataframe_buffer(store_values_, size_, stream); + thrust::scatter_if(rmm::exec_policy(stream), + value_first, + value_first + num_keys, + store_value_offsets.begin() /* map */, + store_value_offsets.begin() /* stencil */, + get_dataframe_buffer_begin(store_values_), + not_equal_t{std::numeric_limits::max()}); + } + } + + template + void insert_and_assign(KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + rmm::cuda_stream_view stream) + { + auto num_keys = static_cast(thrust::distance(key_first, key_last)); + if (num_keys == 0) return; + if constexpr (std::is_arithmetic_v) { + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(key_first, value_first)); + // FIXME: a temporary solution till insert_and_assign is added to + // cuco::experimental::static_map + auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + thrust::for_each(rmm::exec_policy(stream), + pair_first, + pair_first + num_keys, + detail::kv_cuco_insert_and_assign_t{mutable_device_ref}); + // FIXME: this is an upper bound of size_, as some inserts may fail due to existing keys + size_ += num_keys; + } else { + auto old_store_value_size = size_dataframe_buffer(store_values_); + // FIXME: we can use cuda::atomic instead but currently on a system with x86 + GPU, this + // requires placing the atomic variable on managed memory and this adds additional + // complication. 
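+    // First run the same insert-and-increment pass as insert(); keys that already
+    // existed report invalid_idx here and are updated by the assign pass further down.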
+ rmm::device_scalar counter(old_store_value_size, stream); + auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + rmm::device_uvector store_value_offsets(num_keys, stream); + thrust::tabulate( + rmm::exec_policy(stream), + store_value_offsets.begin(), + store_value_offsets.end(), + kv_cuco_insert_and_increment_t{ + mutable_device_ref, key_first, counter.data(), std::numeric_limits::max()}); + size_ += counter.value(stream); + resize_dataframe_buffer(store_values_, size_, stream); + thrust::scatter_if(rmm::exec_policy(stream), + value_first, + value_first + num_keys, + store_value_offsets.begin() /* map */, + store_value_offsets.begin() /* stencil */, + get_dataframe_buffer_begin(store_values_), + not_equal_t{std::numeric_limits::max()}); + + // now perform assigns (for k,v pairs that failed to insert) + + rmm::device_uvector kv_indices(num_keys, stream); + thrust::sequence(rmm::exec_policy(), kv_indices.begin(), kv_indices.end(), size_t{0}); auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(key_first, thrust::make_counting_iterator(size_t{0}))); - cuco_store_->insert(pair_first, - pair_first + num_keys, - cuco::detail::MurmurHash3_32{}, - thrust::equal_to{}, - stream); - thrust::copy(rmm::exec_policy(stream), - value_first, - value_first + num_keys, - get_dataframe_buffer_begin(store_values_)); + thrust::make_tuple(store_value_offsets.begin(), kv_indices.begin())); + kv_indices.resize( + thrust::distance( + pair_first, + thrust::remove_if(rmm::exec_policy(stream), + pair_first, + pair_first + num_keys, + [invalid_idx = std::numeric_limits::max()] __device__( + auto pair) { return thrust::get<0>(pair) != invalid_idx; })), + stream); + store_value_offsets.resize(0, stream); + store_value_offsets.shrink_to_fit(stream); + + thrust::sort(rmm::exec_policy(stream), + kv_indices.begin(), + kv_indices.end(), + [key_first] __device__(auto lhs, auto rhs) { + return *(key_first + lhs) < *(key_first + rhs); + }); + kv_indices.resize(thrust::distance(kv_indices.begin(), + thrust::unique(rmm::exec_policy(stream), + kv_indices.begin(), + kv_indices.end(), + [key_first] __device__(auto lhs, auto rhs) { + return *(key_first + lhs) == + *(key_first + rhs); + })), + stream); + + thrust::for_each( + rmm::exec_policy(stream), + kv_indices.begin(), + kv_indices.end(), + [key_first, + value_first, + store_value_first = get_dataframe_buffer_begin(store_values_), + device_ref = cuco_store_->ref(cuco::experimental::find)] __device__(auto kv_idx) { + size_t store_value_offset{}; + auto found = device_ref.find(*(key_first + kv_idx)); + assert(found != device_ref.end()); + store_value_offset = (*found).second; + *(store_value_first + store_value_offset) = *(value_first + kv_idx); + }); } } + auto retrieve_all(rmm::cuda_stream_view stream) + { + rmm::device_uvector keys(size_, stream); + auto values = allocate_dataframe_buffer(0, stream); + if constexpr (std::is_arithmetic_v) { + values.resize(size_, stream); + auto pair_last = cuco_store_->retrieve_all(keys.begin(), values.begin(), stream.value()); + // FIXME: this resize (& shrink_to_fit) shouldn't be necessary if size_ is exact + keys.resize(thrust::distance(keys.begin(), std::get<0>(pair_last)), stream); + values.resize(keys.size(), stream); + } else { + rmm::device_uvector indices(size_, stream); + auto pair_last = cuco_store_->retrieve_all(keys.begin(), indices.begin(), stream.value()); + // FIXME: this resize (& shrink_to_fit) shouldn't be necessary if size_ is exact + keys.resize(thrust::distance(keys.begin(), 
std::get<0>(pair_last)), stream); + indices.resize(keys.size(), stream); + resize_dataframe_buffer(values, keys.size(), stream); + thrust::gather(rmm::exec_policy(stream), + indices.begin(), + indices.end(), + get_dataframe_buffer_begin(store_values_), + get_dataframe_buffer_begin(values)); + } + return std::make_tuple(std::move(keys), std::move(values)); + } + + auto release(rmm::cuda_stream_view stream) + { + auto [retrieved_keys, retrieved_values] = retrieve_all(stream); + allocate(0, invalid_key(), invalid_value(), stream); + capacity_ = 0; + size_ = 0; + return std::make_tuple(std::move(retrieved_keys), std::move(retrieved_values)); + } + cuco_store_type const* cuco_store_ptr() const { return cuco_store_.get(); } template @@ -490,18 +787,60 @@ class kv_cuco_store_t { return get_dataframe_buffer_cbegin(store_values_); } - key_t invalid_key() const { return cuco_store_.get_empty_key_sentinel(); } + key_t invalid_key() const { return cuco_store_->empty_key_sentinel(); } value_t invalid_value() const { if constexpr (std::is_arithmetic_v) { - return cuco_store_.get_empty_value_sentinel(); + return cuco_store_->empty_value_sentinel(); } else { return invalid_value_; } } + // FIXME: currently this returns an upper-bound + size_t size() const { return size_; } + + size_t capacity() const { return capacity_; } + private: + void allocate(size_t num_keys, + key_t invalid_key, + value_t invalid_value, + rmm::cuda_stream_view stream) + { + double constexpr load_factor = 0.7; + auto cuco_size = std::max( + static_cast(static_cast(num_keys) / load_factor), + static_cast(num_keys) + 1); // cuco::static_map requires at least one empty slot + + auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( + rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); + if constexpr (std::is_arithmetic_v) { + cuco_store_ = std::make_unique( + cuco_size, + cuco::sentinel::empty_key{invalid_key}, + cuco::sentinel::empty_value{invalid_value}, + thrust::equal_to{}, + cuco::experimental::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + stream_adapter, + stream.value()); + } else { + cuco_store_ = std::make_unique( + cuco_size, + cuco::sentinel::empty_key{invalid_key}, + cuco::sentinel::empty_value{std::numeric_limits::max()}, + thrust::equal_to{}, + cuco::experimental::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + stream_adapter, + stream); + store_values_ = allocate_dataframe_buffer(0, stream); + reserve_dataframe_buffer(store_values_, num_keys, stream); + } + } + std::unique_ptr cuco_store_{nullptr}; std::conditional_t, decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})), @@ -510,6 +849,12 @@ class kv_cuco_store_t { std::conditional_t, value_t, std::byte /* dummy */> invalid_value_{}; + size_t capacity_{0}; + size_t size_{ + 0}; // caching as cuco_store_->size() is expensive (this scans the entire slots to handle + // user inserts through a device reference (and currently this is an upper bound (this + // will become exact once we fully switch to cuco::experimental::static_map and use the + // static_map class's insert_and_assign function; this function will be added soon) }; } // namespace detail @@ -528,6 +873,23 @@ class kv_store_t { kv_store_t(rmm::cuda_stream_view stream) : store_(stream) {} + /* when use_binary_search = false */ + template + kv_store_t( + size_t capacity /* one can expect good performance till the capacity, the actual underlying + capacity can be larger (for performance & correctness reasons) */ + , + key_t invalid_key /* invalid 
key shouldn't appear in any *iter in [key_first, key_last) */, + value_t invalid_value /* invalid_value shouldn't appear in any *iter in [value_first, + value_first + thrust::distance(key_first, key_last)), invalid_value is + returned when match fails for the given key */ + , + rmm::cuda_stream_view stream, + std::enable_if_t = 0) + : store_(capacity, invalid_key, invalid_value, stream) + { + } + /* when use_binary_search = true */ template kv_store_t( @@ -576,6 +938,47 @@ class kv_store_t { { } + /* when use binary_search = false, this requires that the capacity is large enough */ + template + std::enable_if_t insert(KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + rmm::cuda_stream_view stream) + { + store_.insert(key_first, key_last, value_first, stream); + } + + /* when use binary_search = false, this requires that the capacity is large enough */ + template + std::enable_if_t insert_if(KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + StencilIterator stencil_first, + PredOp pred_op, + rmm::cuda_stream_view stream) + { + store_.insert_if(key_first, key_last, value_first, stencil_first, pred_op, stream); + } + + /* when use binary_search = false, this requires that the capacity is large enough */ + template + std::enable_if_t insert_and_assign(KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + rmm::cuda_stream_view stream) + { + store_.insert_and_assign(key_first, key_last, value_first, stream); + } + + auto retrieve_all(rmm::cuda_stream_view stream) const { return store_.retrieve_all(stream); } + + // kv_store_t becomes empty after release + auto release(rmm::cuda_stream_view stream) { return store_.release(stream); } + auto view() const { if constexpr (use_binary_search) { @@ -593,6 +996,18 @@ class kv_store_t { } } + template + std::enable_if_t invalid_key() const + { + return store_.invalid_key(); + } + + value_t invalid_value() const { return store_.invalid_value(); } + + size_t size() const { return store_.size(); } + + size_t capacity() const { return store_.capacity(); } + private: std::conditional_t, diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index f30de0750e3..d69bb8af25e 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -346,7 +346,7 @@ void per_v_pair_transform_dst_nbr_intersection( // partition? This may provide additional performance improvement opportunities??? 
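+  // indirection_t now takes the index type as an explicit first template parameter
+  // (see the device_functors.cuh hunk earlier in this patch) instead of hard-coding
+  // size_t, hence the call-site change below.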
auto chunk_vertex_pair_first = thrust::make_transform_iterator( chunk_vertex_pair_index_first, - detail::indirection_t{vertex_pair_first}); + detail::indirection_t{vertex_pair_first}); auto [intersection_offsets, intersection_indices] = detail::nbr_intersection(handle, graph_view, diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 69cce08d352..d7c094a2361 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -399,11 +399,12 @@ rmm::device_uvector get_sampling_index_without_replacement( if (retry_segment_indices) { retry_degrees = rmm::device_uvector((*retry_segment_indices).size(), handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), - (*retry_segment_indices).begin(), - (*retry_segment_indices).end(), - (*retry_degrees).begin(), - indirection_t{segment_degree_first}); + thrust::transform( + handle.get_thrust_policy(), + (*retry_segment_indices).begin(), + (*retry_segment_indices).end(), + (*retry_degrees).begin(), + indirection_t{segment_degree_first}); retry_sample_nbr_indices = rmm::device_uvector( (*retry_segment_indices).size() * high_partition_over_sampling_K, handle.get_stream()); retry_sample_indices = rmm::device_uvector( diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index a4d34443413..2e19adc34c4 100644 --- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -756,7 +756,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( : thrust::nullopt; std::conditional_t, - detail::kv_cuco_store_device_view_t> + detail::kv_cuco_store_find_device_view_t> dst_key_value_map_device_view( GraphViewType::is_multi_gpu ? multi_gpu_minor_key_value_map_ptr->view() : kv_store_view); thrust::transform(handle.get_thrust_policy(), diff --git a/cpp/src/structure/relabel_impl.cuh b/cpp/src/structure/relabel_impl.cuh index c4c34733a4d..192120e6b4c 100644 --- a/cpp/src/structure/relabel_impl.cuh +++ b/cpp/src/structure/relabel_impl.cuh @@ -142,7 +142,7 @@ void relabel(raft::handle_t const& handle, handle.get_stream()); if (skip_missing_labels) { - auto device_view = detail::kv_cuco_store_device_view_t(relabel_map_view); + auto device_view = detail::kv_cuco_store_find_device_view_t(relabel_map_view); thrust::transform( handle.get_thrust_policy(), rx_unique_old_labels.begin(), @@ -187,7 +187,7 @@ void relabel(raft::handle_t const& handle, handle.get_stream()); auto relabel_map_view = relabel_map.view(); if (skip_missing_labels) { - auto device_view = detail::kv_cuco_store_device_view_t(relabel_map_view); + auto device_view = detail::kv_cuco_store_find_device_view_t(relabel_map_view); thrust::transform( handle.get_thrust_policy(), labels,