From b8de24c1b68c7207ba427a71b4c76320a9e2579a Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Wed, 19 Jul 2023 08:38:54 -0400 Subject: [PATCH 1/4] Integrate C++ Sampling Source Behavior Updates (#3699) Updates pylibcugraph and cuGraph-Python to support the new flags at the C++/C API layer (`unique_sources`, `carry_over_sources`, `deduplicate_sources`. Adds appropriate tests. Fixes a couple cuGraph-Python bugs that the new tests uncovered as well, and properly exposes `return_hops`. Merge after #3696 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) - Vibhu Jawa (https://github.com/VibhuJawa) - Chuck Hastings (https://github.com/ChuckHastings) - Brad Rees (https://github.com/BradReesWork) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3699 --- .../dask/sampling/uniform_neighbor_sample.py | 66 +++++- .../sampling/uniform_neighbor_sample.py | 35 ++- .../sampling/test_uniform_neighbor_sample.py | 200 ++++++++++++++++ .../test_uniform_neighbor_sample_mg.py | 214 ++++++++++++++++++ .../pylibcugraph/_cugraph_c/algorithms.pxd | 43 ++++ .../_cugraph_c/sampling_algorithms.pxd | 18 ++ .../pylibcugraph/uniform_neighbor_sample.pyx | 71 +++++- 7 files changed, 640 insertions(+), 7 deletions(-) diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index d74a8df14eb..8cd7c5849d6 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -184,6 +184,9 @@ def _call_plc_uniform_neighbor_sample( with_edge_properties, random_state=None, return_offsets=False, + return_hops=True, + prior_sources_behavior=None, + deduplicate_sources=False, ): st_x = st_x[0] start_list_x = st_x[start_col_name] @@ -209,6 +212,9 @@ def 
_call_plc_uniform_neighbor_sample( with_edge_properties=with_edge_properties, batch_id_list=batch_id_list_x, random_state=random_state, + prior_sources_behavior=prior_sources_behavior, + deduplicate_sources=deduplicate_sources, + return_hops=return_hops, ) return convert_to_cudf( cp_arrays, weight_t, with_edge_properties, return_offsets=return_offsets @@ -227,6 +233,7 @@ def _call_plc_uniform_neighbor_sample_legacy( with_edge_properties, random_state=None, return_offsets=False, + return_hops=True, ): start_list_x = st_x[start_col_name] batch_id_list_x = st_x[batch_col_name] if batch_col_name in st_x else None @@ -242,6 +249,7 @@ def _call_plc_uniform_neighbor_sample_legacy( with_edge_properties=with_edge_properties, batch_id_list=batch_id_list_x, random_state=random_state, + return_hops=return_hops, ) return convert_to_cudf( cp_arrays, weight_t, with_edge_properties, return_offsets=return_offsets @@ -262,6 +270,7 @@ def _mg_call_plc_uniform_neighbor_sample_legacy( with_edge_properties, random_state, return_offsets=False, + return_hops=True, ): result = [ client.submit( @@ -281,6 +290,7 @@ def _mg_call_plc_uniform_neighbor_sample_legacy( allow_other_workers=False, pure=False, return_offsets=return_offsets, + return_hops=return_hops, ) for i, w in enumerate(Comms.get_workers()) ] @@ -327,6 +337,9 @@ def _mg_call_plc_uniform_neighbor_sample( with_edge_properties, random_state, return_offsets=False, + return_hops=True, + prior_sources_behavior=None, + deduplicate_sources=False, ): n_workers = None if keep_batches_together: @@ -354,6 +367,9 @@ def _mg_call_plc_uniform_neighbor_sample( # FIXME accept and properly transmute a numpy/cupy random state. 
random_state=hash((random_state, w)), return_offsets=return_offsets, + return_hops=return_hops, + prior_sources_behavior=prior_sources_behavior, + deduplicate_sources=deduplicate_sources, allow_other_workers=False, pure=False, ) @@ -408,6 +424,7 @@ def _uniform_neighbor_sample_legacy( label_to_output_comm_rank: bool = None, random_state: int = None, return_offsets: bool = False, + return_hops: bool = False, _multiple_clients: bool = False, ) -> Union[dask_cudf.DataFrame, Tuple[dask_cudf.DataFrame, dask_cudf.DataFrame]]: warnings.warn( @@ -508,6 +525,7 @@ def _uniform_neighbor_sample_legacy( with_edge_properties=with_edge_properties, random_state=random_state, return_offsets=return_offsets, + return_hops=return_hops, ) finally: lock.release() @@ -530,6 +548,7 @@ def _uniform_neighbor_sample_legacy( with_edge_properties=with_edge_properties, random_state=random_state, return_offsets=return_offsets, + return_hops=return_hops, ) if return_offsets: @@ -564,6 +583,9 @@ def uniform_neighbor_sample( max_batch_id=None, random_state: int = None, return_offsets: bool = False, + return_hops: bool = True, + prior_sources_behavior: str = None, + deduplicate_sources: bool = False, _multiple_clients: bool = False, ) -> Union[dask_cudf.DataFrame, Tuple[dask_cudf.DataFrame, dask_cudf.DataFrame]]: """ @@ -630,6 +652,24 @@ def uniform_neighbor_sample( dataframes, one with sampling results and one with batch ids and their start offsets per rank. + return_hops: bool, optional (default=True) + Whether to return the sampling results with hop ids + corresponding to the hop where the edge appeared. + Defaults to True. + + prior_sources_behavior: str (Optional) + Options are "carryover", and "exclude". + Default will leave the source list as-is. + Carryover will carry over sources from previous hops to the + current hop. + Exclude will exclude sources from previous hops from reappearing + as sources in future hops. 
+ + deduplicate_sources: bool, optional (default=False) + Whether to first deduplicate the list of possible sources + from the previous destinations before performing next + hop. + _multiple_clients: bool, optional (default=False) internal flag to ensure sampling works with multiple dask clients set to True to prevent hangs in multi-client environment @@ -690,6 +730,14 @@ def uniform_neighbor_sample( or label_list is not None or label_to_output_comm_rank is not None ): + if prior_sources_behavior or deduplicate_sources: + raise ValueError( + "unique sources, carry_over_sources, and deduplicate_sources" + " are not supported with batch_id_list, label_list, and" + " label_to_output_comm_rank. Consider using with_batch_ids" + " and keep_batches_together instead." + ) + return uniform_neighbor_sample_legacy( input_graph, start_list, @@ -701,6 +749,7 @@ def uniform_neighbor_sample( label_to_output_comm_rank=label_to_output_comm_rank, random_state=random_state, return_offsets=return_offsets, + return_hops=return_hops, _multiple_clients=_multiple_clients, ) @@ -719,7 +768,16 @@ def uniform_neighbor_sample( raise ValueError("expected 1d input for start list without batch ids") start_list = start_list.to_frame() - start_list[batch_id_n] = cudf.Series(cp.zeros(len(start_list), dtype="int32")) + if isinstance(start_list, dask_cudf.DataFrame): + start_list = start_list.map_partitions( + lambda df: df.assign( + **{batch_id_n: cudf.Series(cp.zeros(len(df), dtype="int32"))} + ) + ).persist() + else: + start_list = start_list.reset_index(drop=True).assign( + **{batch_id_n: cudf.Series(cp.zeros(len(start_list), dtype="int32"))} + ) if keep_batches_together and min_batch_id is None: raise ValueError( @@ -795,6 +853,9 @@ def uniform_neighbor_sample( with_edge_properties=with_edge_properties, random_state=random_state, return_offsets=return_offsets, + return_hops=return_hops, + prior_sources_behavior=prior_sources_behavior, + deduplicate_sources=deduplicate_sources, ) finally: 
lock.release() @@ -818,6 +879,9 @@ def uniform_neighbor_sample( with_edge_properties=with_edge_properties, random_state=random_state, return_offsets=return_offsets, + return_hops=return_hops, + prior_sources_behavior=prior_sources_behavior, + deduplicate_sources=deduplicate_sources, ) if return_offsets: diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index d239f92d485..05715762365 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -64,6 +64,7 @@ def _uniform_neighbor_sample_legacy( batch_id_list: Sequence = None, random_state: int = None, return_offsets: bool = False, + return_hops: bool = True, ) -> Union[cudf.DataFrame, Tuple[cudf.DataFrame, cudf.DataFrame]]: warnings.warn( @@ -112,6 +113,7 @@ def _uniform_neighbor_sample_legacy( do_expensive_check=False, with_edge_properties=with_edge_properties, batch_id_list=batch_id_list, + return_hops=return_hops, random_state=random_state, ) @@ -193,6 +195,9 @@ def uniform_neighbor_sample( with_batch_ids: bool = False, random_state: int = None, return_offsets: bool = False, + return_hops: bool = True, + prior_sources_behavior: str = None, + deduplicate_sources: bool = False, ) -> Union[cudf.DataFrame, Tuple[cudf.DataFrame, cudf.DataFrame]]: """ Does neighborhood sampling, which samples nodes from a graph based on the @@ -236,6 +241,24 @@ def uniform_neighbor_sample( dataframes, one with sampling results and one with batch ids and their start offsets. + return_hops: bool, optional (default=True) + Whether to return the sampling results with hop ids + corresponding to the hop where the edge appeared. + Defaults to True. + + prior_sources_behavior: str, optional (default=None) + Options are "carryover", and "exclude". + Default will leave the source list as-is. + Carryover will carry over sources from previous hops to the + current hop. 
+ Exclude will exclude sources from previous hops from reappearing + as sources in future hops. + + deduplicate_sources: bool, optional (default=False) + Whether to first deduplicate the list of possible sources + from the previous destinations before performing next + hop. + Returns ------- result : cudf.DataFrame or Tuple[cudf.DataFrame, cudf.DataFrame] @@ -288,6 +311,12 @@ def uniform_neighbor_sample( """ if batch_id_list is not None: + if prior_sources_behavior or deduplicate_sources: + raise ValueError( + "prior_sources_behavior and deduplicate_sources" + " are not supported with batch_id_list." + " Consider using with_batch_ids instead." + ) return uniform_neighbor_sample_legacy( G, start_list, @@ -297,6 +326,7 @@ def uniform_neighbor_sample( batch_id_list=batch_id_list, random_state=random_state, return_offsets=return_offsets, + return_hops=return_hops, ) if isinstance(start_list, int): @@ -309,7 +339,7 @@ def uniform_neighbor_sample( if with_edge_properties and not with_batch_ids: if isinstance(start_list, cudf.Series): - start_list = start_list.to_frame() + start_list = start_list.reset_index(drop=True).to_frame() start_list[batch_col_name] = cudf.Series( cp.zeros(len(start_list), dtype="int32") @@ -361,6 +391,9 @@ def uniform_neighbor_sample( do_expensive_check=False, with_edge_properties=with_edge_properties, random_state=random_state, + prior_sources_behavior=prior_sources_behavior, + deduplicate_sources=deduplicate_sources, + return_hops=return_hops, ) df = cudf.DataFrame() diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index 39d2fbea7dd..1781ce17753 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -528,6 +528,206 @@ def test_uniform_neighbor_sample_empty_start_list(): assert sampling_results.empty +@pytest.mark.sg +def 
test_uniform_neighbor_sample_exclude_sources_basic(): + df = cudf.DataFrame( + { + "src": [0, 4, 1, 2, 3, 5, 4, 1, 0], + "dst": [1, 1, 2, 4, 3, 1, 5, 0, 2], + "eid": [9, 8, 7, 6, 5, 4, 3, 2, 1], + } + ) + + G = cugraph.MultiGraph(directed=True) + G.from_cudf_edgelist(df, source="src", destination="dst", edge_id="eid") + + sampling_results = cugraph.uniform_neighbor_sample( + G, + cudf.DataFrame( + { + "seed": cudf.Series([0, 4, 1], dtype="int64"), + "batch": cudf.Series([1, 1, 1], dtype="int32"), + } + ), + [2, 3, 3], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=True, + random_state=62, + prior_sources_behavior="exclude", + ).sort_values(by="hop_id") + + expected_hop_0 = [1, 2, 1, 5, 2, 0] + assert sorted( + sampling_results[sampling_results.hop_id == 0].destinations.values_host.tolist() + ) == sorted(expected_hop_0) + + next_sources = set( + sampling_results[sampling_results.hop_id > 0].sources.values_host.tolist() + ) + for v in [0, 4, 1]: + assert v not in next_sources + + next_sources = set( + sampling_results[sampling_results.hop_id > 1].sources.values_host.tolist() + ) + for v in sampling_results[ + sampling_results.hop_id == 1 + ].sources.values_host.tolist(): + assert v not in next_sources + + +@pytest.mark.sg +def test_uniform_neighbor_sample_exclude_sources_email_eu_core(): + el = email_Eu_core.get_edgelist() + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.001 * len(el))) + + sampling_results = cugraph.uniform_neighbor_sample( + G, + seeds, + [5, 4, 3, 2, 1], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + prior_sources_behavior="exclude", + ) + + for hop in range(5): + current_sources = set( + sampling_results[ + sampling_results.hop_id == hop + ].sources.values_host.tolist() + ) + future_sources = set( + sampling_results[sampling_results.hop_id > hop].sources.values_host.tolist() + ) + + for s in 
current_sources: + assert s not in future_sources + + +@pytest.mark.sg +def test_uniform_neighbor_sample_carry_over_sources_basic(): + df = cudf.DataFrame( + { + "src": [0, 4, 1, 2, 3, 5, 4, 1, 0, 6], + "dst": [1, 1, 2, 4, 6, 1, 5, 0, 2, 2], + "eid": [9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + } + ) + + G = cugraph.MultiGraph(directed=True) + G.from_cudf_edgelist(df, source="src", destination="dst", edge_id="eid") + + sampling_results = cugraph.uniform_neighbor_sample( + G, + cudf.DataFrame( + { + "seed": cudf.Series([0, 4, 3], dtype="int64"), + "batch": cudf.Series([1, 1, 1], dtype="int32"), + } + ), + [2, 3, 3], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=True, + random_state=62, + prior_sources_behavior="carryover", + ).sort_values(by="hop_id")[["sources", "destinations", "hop_id"]] + + assert ( + len( + sampling_results[ + (sampling_results.hop_id == 2) & (sampling_results.sources == 6) + ] + ) + == 2 + ) + + for hop in range(2): + sources_current_hop = set( + sampling_results[ + sampling_results.hop_id == hop + ].sources.values_host.tolist() + ) + sources_next_hop = set( + sampling_results[ + sampling_results.hop_id == (hop + 1) + ].sources.values_host.tolist() + ) + + for s in sources_current_hop: + assert s in sources_next_hop + + +@pytest.mark.sg +def test_uniform_neighbor_sample_carry_over_sources_email_eu_core(): + el = email_Eu_core.get_edgelist() + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.001 * len(el))) + + sampling_results = cugraph.uniform_neighbor_sample( + G, + seeds, + [5, 4, 3, 2, 1], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + prior_sources_behavior="carryover", + ) + + for hop in range(4): + sources_current_hop = set( + sampling_results[ + sampling_results.hop_id == hop + ].sources.values_host.tolist() + ) + sources_next_hop = set( + sampling_results[ + sampling_results.hop_id == (hop + 1) + 
].sources.values_host.tolist() + ) + + for s in sources_current_hop: + assert s in sources_next_hop + + +@pytest.mark.sg +def test_uniform_neighbor_sample_deduplicate_sources_email_eu_core(): + el = email_Eu_core.get_edgelist() + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.001 * len(el))) + + sampling_results = cugraph.uniform_neighbor_sample( + G, + seeds, + [5, 4, 3, 2, 1], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + ) + + for hop in range(5): + counts_current_hop = ( + sampling_results[sampling_results.hop_id == hop] + .sources.value_counts() + .values_host.tolist() + ) + for c in counts_current_hop: + assert c <= 5 - hop + + @pytest.mark.sg @pytest.mark.skip(reason="needs to be written!") def test_multi_client_sampling(): diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py index 4da3f3cf950..c0f8b087dec 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py @@ -743,6 +743,220 @@ def test_uniform_neighbor_sample_batched(dask_client, dataset, input_df, max_bat assert output_starts_per_batch <= input_starts_per_batch +@pytest.mark.mg +def test_uniform_neighbor_sample_exclude_sources_basic(dask_client): + df = dask_cudf.from_cudf( + cudf.DataFrame( + { + "src": [0, 4, 1, 2, 3, 5, 4, 1, 0], + "dst": [1, 1, 2, 4, 3, 1, 5, 0, 2], + "eid": [9, 8, 7, 6, 5, 4, 3, 2, 1], + } + ), + npartitions=1, + ) + + G = cugraph.MultiGraph(directed=True) + G.from_dask_cudf_edgelist(df, source="src", destination="dst", edge_id="eid") + + sampling_results = ( + cugraph.dask.uniform_neighbor_sample( + G, + cudf.DataFrame( + { + "seed": cudf.Series([0, 4, 1], dtype="int64"), + "batch": cudf.Series([1, 1, 1], 
dtype="int32"), + } + ), + [2, 3, 3], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=True, + random_state=62, + prior_sources_behavior="exclude", + ) + .sort_values(by="hop_id") + .compute() + ) + + expected_hop_0 = [1, 2, 1, 5, 2, 0] + assert sorted( + sampling_results[sampling_results.hop_id == 0].destinations.values_host.tolist() + ) == sorted(expected_hop_0) + + next_sources = set( + sampling_results[sampling_results.hop_id > 0].sources.values_host.tolist() + ) + for v in [0, 4, 1]: + assert v not in next_sources + + next_sources = set( + sampling_results[sampling_results.hop_id > 1].sources.values_host.tolist() + ) + for v in sampling_results[ + sampling_results.hop_id == 1 + ].sources.values_host.tolist(): + assert v not in next_sources + + +@pytest.mark.mg +def test_uniform_neighbor_sample_exclude_sources_email_eu_core(dask_client): + el = dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=8) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.001 * len(el))) + + sampling_results = cugraph.dask.uniform_neighbor_sample( + G, + seeds, + [5, 4, 3, 2, 1], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + prior_sources_behavior="exclude", + ).compute() + + for hop in range(5): + current_sources = set( + sampling_results[ + sampling_results.hop_id == hop + ].sources.values_host.tolist() + ) + future_sources = set( + sampling_results[sampling_results.hop_id > hop].sources.values_host.tolist() + ) + + for s in current_sources: + assert s not in future_sources + + +@pytest.mark.mg +def test_uniform_neighbor_sample_carry_over_sources_basic(dask_client): + df = dask_cudf.from_cudf( + cudf.DataFrame( + { + "src": [0, 4, 1, 2, 3, 5, 4, 1, 0, 6], + "dst": [1, 1, 2, 4, 6, 1, 5, 0, 2, 2], + "eid": [9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + } + ), + npartitions=4, + ) + + G = cugraph.MultiGraph(directed=True) + 
G.from_dask_cudf_edgelist(df, source="src", destination="dst", edge_id="eid") + + sampling_results = ( + cugraph.dask.uniform_neighbor_sample( + G, + cudf.DataFrame( + { + "seed": cudf.Series([0, 4, 3], dtype="int64"), + "batch": cudf.Series([1, 1, 1], dtype="int32"), + } + ), + [2, 3, 3], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=True, + random_state=62, + prior_sources_behavior="carryover", + ) + .sort_values(by="hop_id")[["sources", "destinations", "hop_id"]] + .compute() + ) + + assert ( + len( + sampling_results[ + (sampling_results.hop_id == 2) & (sampling_results.sources == 6) + ] + ) + == 2 + ) + + for hop in range(2): + sources_current_hop = set( + sampling_results[ + sampling_results.hop_id == hop + ].sources.values_host.tolist() + ) + sources_next_hop = set( + sampling_results[ + sampling_results.hop_id == (hop + 1) + ].sources.values_host.tolist() + ) + + for s in sources_current_hop: + assert s in sources_next_hop + + +@pytest.mark.mg +def test_uniform_neighbor_sample_carry_over_sources_email_eu_core(dask_client): + el = dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=8) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.001 * len(el))) + + sampling_results = cugraph.dask.uniform_neighbor_sample( + G, + seeds, + [5, 4, 3, 2, 1], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + prior_sources_behavior="carryover", + ).compute() + + for hop in range(4): + sources_current_hop = set( + sampling_results[ + sampling_results.hop_id == hop + ].sources.values_host.tolist() + ) + sources_next_hop = set( + sampling_results[ + sampling_results.hop_id == (hop + 1) + ].sources.values_host.tolist() + ) + + for s in sources_current_hop: + assert s in sources_next_hop + + +@pytest.mark.mg +def test_uniform_neighbor_sample_deduplicate_sources_email_eu_core(dask_client): + el = 
dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=8) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.001 * len(el))) + + sampling_results = cugraph.dask.uniform_neighbor_sample( + G, + seeds, + [5, 4, 3, 2, 1], + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + ).compute() + + for hop in range(5): + counts_current_hop = ( + sampling_results[sampling_results.hop_id == hop] + .sources.value_counts() + .values_host.tolist() + ) + for c in counts_current_hop: + assert c <= 5 - hop + + # ============================================================================= # Benchmarks # ============================================================================= diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index 8ced11aa4d7..9d2447466e8 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -231,6 +231,49 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_sample_result_t** result, cugraph_error_t** error ) + + ctypedef struct cugraph_sampling_options_t: + pass + + ctypedef enum cugraph_prior_sources_behavior_t: + DEFAULT + CARRY_OVER + EXCLUDE + + cdef cugraph_error_code_t \ + cugraph_sampling_options_create( + cugraph_sampling_options_t** options, + cugraph_error_t** error, + ) + + cdef void \ + cugraph_sampling_set_with_replacement( + cugraph_sampling_options_t* options, + bool_t value, + ) + + cdef void \ + cugraph_sampling_set_return_hops( + cugraph_sampling_options_t* options, + bool_t value, + ) + + cdef void \ + cugraph_sampling_set_prior_sources_behavior( + cugraph_sampling_options_t* options, + cugraph_prior_sources_behavior_t value + ) + + cdef void \ + cugraph_sampling_set_dedupe_sources( + cugraph_sampling_options_t* 
options, + bool_t value, + ) + + cdef void \ + cugraph_sampling_options_free( + cugraph_sampling_options_t* options, + ) # uniform random walks cdef cugraph_error_code_t \ diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd index ad8a8cd33a0..91cc11d6b1c 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd @@ -31,6 +31,7 @@ from pylibcugraph._cugraph_c.graph cimport ( ) from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sample_result_t, + cugraph_sampling_options_t, ) from pylibcugraph._cugraph_c.random cimport ( cugraph_rng_state_t, @@ -41,6 +42,8 @@ from pylibcugraph._cugraph_c.array cimport ( cdef extern from "cugraph_c/sampling_algorithms.h": ########################################################################### + + # deprecated, should migrate to cugraph_uniform_neighbor_sample cdef cugraph_error_code_t cugraph_uniform_neighbor_sample_with_edge_properties( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, @@ -57,6 +60,21 @@ cdef extern from "cugraph_c/sampling_algorithms.h": cugraph_error_t** error ) + cdef cugraph_error_code_t cugraph_uniform_neighbor_sample( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* start_vertices, + const cugraph_type_erased_device_array_view_t* start_vertex_labels, + const cugraph_type_erased_device_array_view_t* label_list, + const cugraph_type_erased_device_array_view_t* label_to_comm_rank, + const cugraph_type_erased_host_array_view_t* fan_out, + cugraph_rng_state_t* rng_state, + const cugraph_sampling_options_t* options, + bool_t do_expensive_check, + cugraph_sample_result_t** result, + cugraph_error_t** error + ) + cdef cugraph_error_code_t cugraph_test_uniform_neighborhood_sample_result_create( const cugraph_resource_handle_t* handle, 
const cugraph_type_erased_device_array_view_t* srcs, diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index a3f5dfb273f..a1832948f28 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -42,9 +42,20 @@ from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sample_result_get_destinations, cugraph_sample_result_get_index, cugraph_sample_result_free, + + cugraph_prior_sources_behavior_t, + cugraph_sampling_options_t, + cugraph_sampling_options_create, + cugraph_sampling_options_free, + cugraph_sampling_set_with_replacement, + cugraph_sampling_set_return_hops, + cugraph_sampling_set_prior_sources_behavior, + cugraph_sampling_set_dedupe_sources, + ) from pylibcugraph._cugraph_c.sampling_algorithms cimport ( - cugraph_uniform_neighbor_sample_with_edge_properties, + cugraph_uniform_neighbor_sample, + ) from pylibcugraph.resource_handle cimport ( ResourceHandle, @@ -75,12 +86,16 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, _GPUGraph input_graph, start_list, h_fan_out, + *, bool_t with_replacement, bool_t do_expensive_check, bool_t with_edge_properties=False, batch_id_list=None, label_list=None, label_to_output_comm_rank=None, + prior_sources_behavior=None, + bool_t deduplicate_sources=False, + bool_t return_hops=False, random_state=None): """ Does neighborhood sampling, which samples nodes from a graph based on the @@ -118,6 +133,27 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, batch_id_list: list[int32] (Optional) List of int32 batch ids that is returned with each edge. Optional argument, defaults to NULL, returning nothing. + + label_list: list[int32] (Optional) + List of unique int32 batch ids. Required if also passing the + label_to_output_comm_rank flag. 
Default to NULL (does nothing) + + label_to_output_comm_rank: list[int32] (Optional) + Maps the unique batch ids in label_list to the rank of the + worker that should hold results for that batch id. + Defaults to NULL (does nothing) + + prior_sources_behavior: str (Optional) + Options are "carryover", and "exclude". + Default will leave the source list as-is. + Carryover will carry over sources from previous hops to the + current hop. + Exclude will exclude sources from previous hops from reappearing + as sources in future hops. + + deduplicate_sources: bool (Optional) + If True, will deduplicate the source list before sampling. + Defaults to False. random_state: int (Optional) Random state to use when generating samples. Optional argument, @@ -212,7 +248,30 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cdef cugraph_rng_state_t* rng_state_ptr = \ cg_rng_state.rng_state_ptr - error_code = cugraph_uniform_neighbor_sample_with_edge_properties( + + cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e + if prior_sources_behavior is None: + prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT + elif prior_sources_behavior == 'carryover': + prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER + elif prior_sources_behavior == 'exclude': + prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE + else: + raise ValueError( + f'Invalid option {prior_sources_behavior}' + ' for prior sources behavior' + ) + + cdef cugraph_sampling_options_t* sampling_options + error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr) + assert_success(error_code, error_ptr, "cugraph_sampling_options_create") + + cugraph_sampling_set_with_replacement(sampling_options, with_replacement) + cugraph_sampling_set_return_hops(sampling_options, return_hops) + cugraph_sampling_set_dedupe_sources(sampling_options, deduplicate_sources) + cugraph_sampling_set_prior_sources_behavior(sampling_options, 
prior_sources_behavior_e) + + error_code = cugraph_uniform_neighbor_sample( c_resource_handle_ptr, c_graph_ptr, start_ptr, @@ -221,12 +280,14 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, label_to_output_comm_rank_ptr, fan_out_ptr, rng_state_ptr, - with_replacement, - True, # return_hops + sampling_options, do_expensive_check, &result_ptr, &error_ptr) - assert_success(error_code, error_ptr, "cugraph_uniform_neighbor_sample_with_edge_properties") + assert_success(error_code, error_ptr, "cugraph_uniform_neighbor_sample") + + # Free the sampling options + cugraph_sampling_options_free(sampling_options) # Free the two input arrays that are no longer needed. cugraph_type_erased_device_array_view_free(start_ptr) From 59b0eb70c4e3157c3184128bda52104c925bef25 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Wed, 19 Jul 2023 13:39:29 +0100 Subject: [PATCH 2/4] [BUG] Unsupported graph for similiarity algos (#3710) This PR update the docstrings raises an error when running any similarity algos with vertices from a graph that are unrenumbered. 
Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3710 --- .../cugraph/link_prediction/jaccard.py | 34 +++++++++++++++++-- .../cugraph/link_prediction/overlap.py | 33 ++++++++++++++++-- .../cugraph/link_prediction/sorensen.py | 32 +++++++++++++++-- .../cugraph/link_prediction/wjaccard.py | 28 +++++++++++++-- .../cugraph/link_prediction/woverlap.py | 25 +++++++++++++- .../cugraph/link_prediction/wsorensen.py | 26 +++++++++++++- .../tests/link_prediction/test_jaccard.py | 12 +++++++ .../tests/link_prediction/test_overlap.py | 11 ++++++ .../tests/link_prediction/test_sorensen.py | 12 +++++++ .../tests/link_prediction/test_wjaccard.py | 11 ++++++ .../tests/link_prediction/test_woverlap.py | 11 ++++++ .../tests/link_prediction/test_wsorensen.py | 11 ++++++ 12 files changed, 234 insertions(+), 12 deletions(-) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 1c4fed7a8f9..dd411fa889d 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,7 +20,7 @@ ) -def jaccard(input_graph, vertex_pair=None): +def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): """ Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -36,6 +36,10 @@ def jaccard(input_graph, vertex_pair=None): of cugraph.jaccard is different from the behavior of networkx.jaccard_coefficient. 
+ This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + cugraph.jaccard, in the absence of a specified vertex pair list, will use the edges of the graph to construct a vertex pair list and will return the jaccard coefficient for those vertex pairs. @@ -80,6 +84,10 @@ def jaccard(input_graph, vertex_pair=None): current implementation computes the jaccard coefficient for all adjacent vertices in the graph. + do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. + Returns ------- df : cudf.DataFrame @@ -104,6 +112,22 @@ def jaccard(input_graph, vertex_pair=None): >>> df = cugraph.jaccard(G) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") + if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") if type(vertex_pair) == cudf.DataFrame: vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) @@ -120,10 +144,14 @@ def jaccard(input_graph, vertex_pair=None): return df -def jaccard_coefficient(G, ebunch=None): +def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): """ For NetworkX Compatibility. See `jaccard` + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices.
+ Parameters ---------- graph : cugraph.Graph diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index ba9f225062e..e05e0c944fe 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,10 +20,14 @@ ) -def overlap_coefficient(G, ebunch=None): +def overlap_coefficient(G, ebunch=None, do_expensive_check=True): """ For NetworkX Compatibility. See `overlap` + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + """ vertex_pair = None @@ -42,7 +46,7 @@ def overlap_coefficient(G, ebunch=None): return df -def overlap(input_graph, vertex_pair=None): +def overlap(input_graph, vertex_pair=None, do_expensive_check=True): """ Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -54,6 +58,10 @@ def overlap(input_graph, vertex_pair=None): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- input_graph : cugraph.Graph @@ -66,6 +74,10 @@ def overlap(input_graph, vertex_pair=None): vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
+ do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. + Returns ------- df : cudf.DataFrame @@ -90,6 +102,21 @@ def overlap(input_graph, vertex_pair=None): >>> df = cugraph.overlap(G) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") if type(vertex_pair) == cudf.DataFrame: vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 20238e10464..0f35f868b7c 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -21,7 +21,7 @@ ) -def sorensen(input_graph, vertex_pair=None): +def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -30,6 +30,10 @@ def sorensen(input_graph, vertex_pair=None): If first is specified but second is not, or vice versa, an exception will be thrown. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numebred vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + cugraph.sorensen, in the absence of a specified vertex pair list, will use the edges of the graph to construct a vertex pair list and will return the sorensen coefficient for those vertex pairs. 
@@ -50,6 +54,10 @@ def sorensen(input_graph, vertex_pair=None): current implementation computes the Sorensen coefficient for all adjacent vertices in the graph. + do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. + Returns ------- df : cudf.DataFrame @@ -76,6 +84,22 @@ def sorensen(input_graph, vertex_pair=None): >>> df = cugraph.sorensen(G) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") + if type(input_graph) is not Graph: raise TypeError("input graph must a Graph") @@ -94,10 +118,14 @@ def sorensen(input_graph, vertex_pair=None): return df -def sorensen_coefficient(G, ebunch=None): +def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): """ For NetworkX Compatability. See `sorensen` + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numebred vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- G : cugraph.Graph diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index b8ef33d926f..fc6edae8d3e 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,7 +17,7 @@ from cugraph.utilities import renumber_vertex_pair -def jaccard_w(input_graph, weights, vertex_pair=None): +def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ Compute the weighted Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -29,6 +29,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- input_graph : cugraph.Graph @@ -51,6 +55,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None): vertices. If provided, the jaccard coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. + do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. 
+ Returns ------- df : cudf.DataFrame @@ -87,6 +95,22 @@ def jaccard_w(input_graph, weights, vertex_pair=None): >>> df = cugraph.jaccard_w(G, weights) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") + if type(input_graph) is not Graph: raise TypeError("input graph must a Graph") diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index c7d4f56a428..27fb7d608ca 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -16,7 +16,7 @@ from cugraph.utilities import renumber_vertex_pair -def overlap_w(input_graph, weights, vertex_pair=None): +def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ Compute the weighted Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -28,6 +28,10 @@ def overlap_w(input_graph, weights, vertex_pair=None): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numebred vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- input_graph : cugraph.Graph @@ -51,6 +55,10 @@ def overlap_w(input_graph, weights, vertex_pair=None): vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
+ do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. + Returns ------- df : cudf.DataFrame @@ -88,6 +96,21 @@ def overlap_w(input_graph, weights, vertex_pair=None): ... len(weights['vertex']))] >>> df = cugraph.overlap_w(G, weights) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") if type(vertex_pair) == cudf.DataFrame: vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index c017463a294..c27e4f66a02 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -17,13 +17,17 @@ from cugraph.utilities import renumber_vertex_pair -def sorensen_w(input_graph, weights, vertex_pair=None): +def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ Compute the weighted Sorensen similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Sorensen coefficient is defined between two sets as the ratio of twice the volume of their intersection divided by the volume of each set. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. 
+ Parameters ---------- input_graph : cugraph.Graph @@ -47,6 +51,10 @@ def sorensen_w(input_graph, weights, vertex_pair=None): vertices. If provided, the sorensen coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. + do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. + Returns ------- df : cudf.DataFrame @@ -85,6 +93,22 @@ def sorensen_w(input_graph, weights, vertex_pair=None): >>> df = cugraph.sorensen_w(G, weights) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") + if type(input_graph) is not Graph: raise TypeError("input graph must a Graph") diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index b04c4c741b1..43077126827 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -202,6 +202,7 @@ def test_nx_jaccard_time(read_csv, gpubenchmark): @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) +@pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") def test_jaccard_edgevals(gpubenchmark, graph_file): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) @@ -326,3 +327,14 @@ def test_weighted_exp_jaccard(): use_weight = True with pytest.raises(ValueError): exp_jaccard(G, use_weight=use_weight) + + +@pytest.mark.sg +def test_invalid_datasets_jaccard(): + 
karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.jaccard(G) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 68f879dacdb..03bee451f3c 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -225,3 +225,14 @@ def test_weighted_exp_overlap(): use_weight = True with pytest.raises(ValueError): exp_overlap(G, use_weight=use_weight) + + +@pytest.mark.sg +def test_invalid_datasets_overlap(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.overlap(G) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 3457627ed7d..14d84784161 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -187,6 +187,7 @@ def test_nx_sorensen_time(gpubenchmark, read_csv): @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) +@pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") def test_sorensen_edgevals(gpubenchmark, graph_file): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) @@ -288,3 +289,14 @@ def test_weighted_exp_sorensen(): use_weight = True with pytest.raises(ValueError): exp_sorensen(G, use_weight=use_weight) + + +@pytest.mark.sg +def test_invalid_datasets_sorensen(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", 
destination="dst") + with pytest.raises(ValueError): + cugraph.sorensen(G) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py index 22ace93c0e4..2bc39b877ea 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py @@ -176,3 +176,14 @@ def test_wjaccard_multi_column(read_csv): actual = df_res.sort_values("0_first").reset_index() expected = df_exp.sort_values("first").reset_index() assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) + + +@pytest.mark.sg +def test_invalid_datasets_jaccard_w(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.jaccard_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py index f4fab9d0faa..5e35bb66f07 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py @@ -159,3 +159,14 @@ def test_woverlap_multi_column(graph_file): actual = df_res.sort_values("0_first").reset_index() expected = df_exp.sort_values("first").reset_index() assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) + + +@pytest.mark.sg +def test_invalid_datasets_overlap_w(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.overlap_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py index 0cf775d666c..cca2363d2d6 100644 --- 
a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py @@ -180,3 +180,14 @@ def test_wsorensen_multi_column(read_csv): actual = df_res.sort_values("0_first").reset_index() expected = df_exp.sort_values("first").reset_index() assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) + + +@pytest.mark.sg +def test_invalid_datasets_sorensen_w(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.sorensen_w(G, None) From a280986afc6bd9bdb61f990726f61483d5b30620 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 19 Jul 2023 10:34:06 -0700 Subject: [PATCH 3/4] [WIP] Make edge ids optional (#3702) This PR makes edge ids optional for cugraph-dgl dataloaders Todo: - [ ] Add tests Authors: - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3702 --- .../dataloading/utils/sampling_helpers.py | 20 +++++++++++++------ .../gnn/data_loading/bulk_sampler_io.py | 2 ++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index 02052c9841d..051464f08bb 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -52,9 +52,11 @@ def _get_tensor_ls_from_sampled_df(df): batch_indices = torch.searchsorted(batch_id_tensor, batch_indices) split_d = {} + for column in ["sources", "destinations", "edge_id", "hop_id"]: - tensor = cast_to_tensor(df[column]) - split_d[column] = torch.tensor_split(tensor, batch_indices.cpu()) + if column in 
df.columns: + tensor = cast_to_tensor(df[column]) + split_d[column] = torch.tensor_split(tensor, batch_indices.cpu()) result_tensor_ls = [] for i, hop_id_tensor in enumerate(split_d["hop_id"]): @@ -66,7 +68,11 @@ def _get_tensor_ls_from_sampled_df(df): hop_indices = torch.searchsorted(hop_id_tensor, hop_indices) s = torch.tensor_split(split_d["sources"][i], hop_indices.cpu()) d = torch.tensor_split(split_d["destinations"][i], hop_indices.cpu()) - eid = torch.tensor_split(split_d["edge_id"][i], hop_indices.cpu()) + if "edge_id" in split_d: + eid = torch.tensor_split(split_d["edge_id"][i], hop_indices.cpu()) + else: + eid = [None] * len(s) + result_tensor_ls.append((x, y, z) for x, y, z in zip(s, d, eid)) return result_tensor_ls @@ -125,7 +131,7 @@ def _create_homogeneous_sampled_graphs_from_tensors_perhop( def create_homogeneous_dgl_block_from_tensors_ls( src_ids: torch.Tensor, dst_ids: torch.Tensor, - edge_ids: torch.Tensor, + edge_ids: Optional[torch.Tensor], seed_nodes: Optional[torch.Tensor], total_number_of_nodes: int, ): @@ -133,7 +139,8 @@ def create_homogeneous_dgl_block_from_tensors_ls( (src_ids, dst_ids), num_nodes=total_number_of_nodes, ) - sampled_graph.edata[dgl.EID] = edge_ids + if edge_ids is not None: + sampled_graph.edata[dgl.EID] = edge_ids # TODO: Check if unique is needed if seed_nodes is None: seed_nodes = dst_ids.unique() @@ -144,7 +151,8 @@ def create_homogeneous_dgl_block_from_tensors_ls( src_nodes=src_ids.unique(), include_dst_in_src=True, ) - block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] + if edge_ids is not None: + block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] return block diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index 44c1185bbf1..002b214e783 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -51,6 +51,8 @@ def _write_samples_to_parquet( raise 
ValueError("Invalid value of partition_info") max_batch_id = offsets.batch_id.max() + results.dropna(axis=1, how="all", inplace=True) + results["hop_id"] = results["hop_id"].astype("uint8") for p in range(0, len(offsets), batches_per_partition): offsets_p = offsets.iloc[p : p + batches_per_partition] From 145a92bb691b3e290b50cc7d7678d8cbff137616 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:45:51 -0400 Subject: [PATCH 4/4] Reorder dependencies.yaml channels (#3721) This PR reorders the channels to be compatible with other RAPIDS libs. #3693 added two new channels, but they were ordered differently than [other libraries](https://github.com/rapidsai/cudf/blob/branch-23.08/dependencies.yaml#L163-L169). This caused [an error](https://github.com/rapidsai/docker/actions/runs/5591817313/jobs/10223438072?pr=545#step:10:349) in the `conda-merge` script in https://github.com/rapidsai/docker/pull/545. It's also worth noting that docker overhaul requiring the channel ordering being consistent across libraries' `dependencies.yaml` seems very fragile and will undoubtedly cause more issues down the line. Curious if anyone has thoughts on making this more reliable. 
Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cugraph/pull/3721 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 ++-- dependencies.yaml | 4 ++-- python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 9c428ef9d07..1540a85bcc8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -4,10 +4,10 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- conda-forge -- nvidia - pytorch - dglteam/label/cu118 +- conda-forge +- nvidia dependencies: - aiohttp - c-compiler diff --git a/dependencies.yaml b/dependencies.yaml index 9b858999743..d6e36a9643c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -181,10 +181,10 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev - - conda-forge - - nvidia - pytorch - dglteam/label/cu118 + - conda-forge + - nvidia dependencies: checks: common: diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index a252d5e0c78..6961a485742 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -4,10 +4,10 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- conda-forge -- nvidia - pytorch - dglteam/label/cu118 +- conda-forge +- nvidia dependencies: - cugraph==23.8.* - dgl>=1.1.0.cu*