Merge in updates

rapidsai · Jul 26, 2023 · 927c7b5 · 927c7b5
2 parents 0063362 + e07f6cd
commit 927c7b5
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 39 deletions.
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
@@ -170,7 +170,8 @@ def __init__(
             fanout_vals=num_neighbors,
             with_replacement=replace,
             batches_per_partition=self.__batches_per_partition,
-            renumber=renumber**kwargs,
+            renumber=renumber,
+            **kwargs,
         )
 
         # Make sure indices are in cupy
@@ -253,7 +254,7 @@ def __next__(self):
             raw_sample_data = cudf.read_parquet(parquet_path)
             if "map" in raw_sample_data.columns:
                 self.__renumber_map = raw_sample_data["map"]
-                raw_sample_data = raw_sample_data.drop("map", axis=1)
+                raw_sample_data.drop("map", axis=1, inplace=True)
             else:
                 self.__renumber_map = None
 

diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py
@@ -51,6 +51,7 @@ def test_neighbor_sample(dask_client, basic_graph_1):
 
     out = _sampler_output_from_sampling_results(
         sampling_results=sampling_results,
+        renumber_map=None,
         graph_store=cugraph_store,
         metadata=torch.arange(6, dtype=torch.int64),
     )
@@ -106,6 +107,7 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph
 
     out = _sampler_output_from_sampling_results(
         sampling_results=sampling_results,
+        renumber_map=None,
         graph_store=cugraph_store,
         metadata=torch.arange(6, dtype=torch.int64),
     )
@@ -183,7 +185,7 @@ def test_neighbor_sample_mock_sampling_results(dask_client):
     )
 
     out = _sampler_output_from_sampling_results(
-        mock_sampling_results, graph_store, None
+        mock_sampling_results, None, graph_store, None
     )
 
     assert out.metadata is None

diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py
@@ -44,6 +44,7 @@ def test_neighbor_sample(basic_graph_1):
 
     out = _sampler_output_from_sampling_results(
         sampling_results=sampling_results,
+        renumber_map=None,
         graph_store=cugraph_store,
         metadata=torch.arange(6, dtype=torch.int64),
     )
@@ -94,6 +95,7 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1):
 
     out = _sampler_output_from_sampling_results(
         sampling_results=sampling_results,
+        renumber_map=None,
         graph_store=cugraph_store,
         metadata=torch.arange(6, dtype=torch.int64),
     )
@@ -144,7 +146,7 @@ def test_neighbor_sample_mock_sampling_results(abc_graph):
     )
 
     out = _sampler_output_from_sampling_results(
-        mock_sampling_results, graph_store, None
+        mock_sampling_results, None, graph_store, None
     )
 
     assert out.metadata is None

diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py
@@ -118,6 +118,7 @@ def _write_samples_to_parquet(
 
             if len(final_map_series) > len(results_p):
                 # this should rarely happen and only occurs on small graphs/samples
+                # TODO: drop the sort_index() call to improve performance on small graphs
                 final_map_series.name = "map"
                 results_p = results_p.join(final_map_series, how="outer").sort_index()
             else:

diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py
@@ -959,29 +959,21 @@ def test_uniform_neighbor_sample_deduplicate_sources_email_eu_core(dask_client):
 
 @pytest.mark.mg
 @pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]])
+@pytest.mark.tags("runme")
 def test_uniform_neighbor_sample_renumber(dask_client, hops):
+    # FIXME: This test is weak because much of the sampling behavior
+    # remains non-deterministic even when a random seed is passed.
+    # For now, tests in cuGraph-DGL and cuGraph-PyG provide better
+    # coverage, but a stronger test should eventually be written to
+    # augment or replace this one.
+
     el = dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=4)
 
     G = cugraph.Graph(directed=True)
     G.from_dask_cudf_edgelist(el, source="src", destination="dst")
 
     seeds = G.select_random_vertices(62, int(0.0001 * len(el)))
 
-    sampling_results_unrenumbered = cugraph.dask.uniform_neighbor_sample(
-        G,
-        seeds,
-        hops,
-        with_replacement=False,
-        with_edge_properties=True,
-        with_batch_ids=False,
-        deduplicate_sources=True,
-        renumber=False,
-        random_state=62,
-        keep_batches_together=True,
-        min_batch_id=0,
-        max_batch_id=0,
-    ).compute()
-
     sampling_results_renumbered, renumber_map = cugraph.dask.uniform_neighbor_sample(
         G,
         seeds,
@@ -999,28 +991,17 @@ def test_uniform_neighbor_sample_renumber(dask_client, hops):
     sampling_results_renumbered = sampling_results_renumbered.compute()
     renumber_map = renumber_map.compute()
 
-    print(
-        "\n\n",
-        sampling_results_renumbered,
-        "\n\n",
-        renumber_map,
-        "\n\n",
-        sampling_results_unrenumbered,
-    )
-
-    sources_hop_0 = sampling_results_unrenumbered[
-        sampling_results_unrenumbered.hop_id == 0
+    sources_hop_0 = sampling_results_renumbered[
+        sampling_results_renumbered.hop_id == 0
     ].sources
-    for hop in range(len(hops)):
-        destinations_hop = sampling_results_unrenumbered[
-            sampling_results_unrenumbered.hop_id <= hop
-        ].destinations
-        expected_renumber_map = cudf.concat([sources_hop_0, destinations_hop]).unique()
-
-        assert sorted(expected_renumber_map.values_host.tolist()) == sorted(
-            renumber_map.map[0 : len(expected_renumber_map)].values_host.tolist()
-        )
+
     assert (renumber_map.batch_id == 0).all()
+    assert (
+        renumber_map.map.nunique()
+        == cudf.concat(
+            [sources_hop_0, sampling_results_renumbered.destinations]
+        ).nunique()
+    )
 
 
 # =============================================================================