Skip to content

Commit

Permalink
Merge in updates
Browse files: browse the repository at this point in the history
  • Loading branch information
VibhuJawa committed Jul 26, 2023
2 parents: 0063362 + e07f6cd; commit 927c7b5
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 39 deletions.
5 changes: 3 additions & 2 deletions python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,8 @@ def __init__(
fanout_vals=num_neighbors,
with_replacement=replace,
batches_per_partition=self.__batches_per_partition,
renumber=renumber**kwargs,
renumber=renumber,
**kwargs,
)

# Make sure indices are in cupy
Expand Down Expand Up @@ -253,7 +254,7 @@ def __next__(self):
raw_sample_data = cudf.read_parquet(parquet_path)
if "map" in raw_sample_data.columns:
self.__renumber_map = raw_sample_data["map"]
raw_sample_data = raw_sample_data.drop("map", axis=1)
raw_sample_data.drop("map", axis=1, inplace=True)
else:
self.__renumber_map = None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_neighbor_sample(dask_client, basic_graph_1):

out = _sampler_output_from_sampling_results(
sampling_results=sampling_results,
renumber_map=None,
graph_store=cugraph_store,
metadata=torch.arange(6, dtype=torch.int64),
)
Expand Down Expand Up @@ -106,6 +107,7 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph

out = _sampler_output_from_sampling_results(
sampling_results=sampling_results,
renumber_map=None,
graph_store=cugraph_store,
metadata=torch.arange(6, dtype=torch.int64),
)
Expand Down Expand Up @@ -183,7 +185,7 @@ def test_neighbor_sample_mock_sampling_results(dask_client):
)

out = _sampler_output_from_sampling_results(
mock_sampling_results, graph_store, None
mock_sampling_results, None, graph_store, None
)

assert out.metadata is None
Expand Down
4 changes: 3 additions & 1 deletion python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def test_neighbor_sample(basic_graph_1):

out = _sampler_output_from_sampling_results(
sampling_results=sampling_results,
renumber_map=None,
graph_store=cugraph_store,
metadata=torch.arange(6, dtype=torch.int64),
)
Expand Down Expand Up @@ -94,6 +95,7 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1):

out = _sampler_output_from_sampling_results(
sampling_results=sampling_results,
renumber_map=None,
graph_store=cugraph_store,
metadata=torch.arange(6, dtype=torch.int64),
)
Expand Down Expand Up @@ -144,7 +146,7 @@ def test_neighbor_sample_mock_sampling_results(abc_graph):
)

out = _sampler_output_from_sampling_results(
mock_sampling_results, graph_store, None
mock_sampling_results, None, graph_store, None
)

assert out.metadata is None
Expand Down
1 change: 1 addition & 0 deletions python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def _write_samples_to_parquet(

if len(final_map_series) > len(results_p):
# this should rarely happen and only occurs on small graphs/samples
# TODO remove the sort_index to improve performance on small graphs
final_map_series.name = "map"
results_p = results_p.join(final_map_series, how="outer").sort_index()
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -959,29 +959,21 @@ def test_uniform_neighbor_sample_deduplicate_sources_email_eu_core(dask_client):

@pytest.mark.mg
@pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]])
@pytest.mark.tags("runme")
def test_uniform_neighbor_sample_renumber(dask_client, hops):
# FIXME This test is not very good because there is a lot of
# non-deterministic behavior that still exists despite passing
# a random seed. Right now, there are tests in cuGraph-DGL and
# cuGraph-PyG that provide better coverage, but a better test
# should eventually be written to augment or replace this one.

el = dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=4)

G = cugraph.Graph(directed=True)
G.from_dask_cudf_edgelist(el, source="src", destination="dst")

seeds = G.select_random_vertices(62, int(0.0001 * len(el)))

sampling_results_unrenumbered = cugraph.dask.uniform_neighbor_sample(
G,
seeds,
hops,
with_replacement=False,
with_edge_properties=True,
with_batch_ids=False,
deduplicate_sources=True,
renumber=False,
random_state=62,
keep_batches_together=True,
min_batch_id=0,
max_batch_id=0,
).compute()

sampling_results_renumbered, renumber_map = cugraph.dask.uniform_neighbor_sample(
G,
seeds,
Expand All @@ -999,28 +991,17 @@ def test_uniform_neighbor_sample_renumber(dask_client, hops):
sampling_results_renumbered = sampling_results_renumbered.compute()
renumber_map = renumber_map.compute()

print(
"\n\n",
sampling_results_renumbered,
"\n\n",
renumber_map,
"\n\n",
sampling_results_unrenumbered,
)

sources_hop_0 = sampling_results_unrenumbered[
sampling_results_unrenumbered.hop_id == 0
sources_hop_0 = sampling_results_renumbered[
sampling_results_renumbered.hop_id == 0
].sources
for hop in range(len(hops)):
destinations_hop = sampling_results_unrenumbered[
sampling_results_unrenumbered.hop_id <= hop
].destinations
expected_renumber_map = cudf.concat([sources_hop_0, destinations_hop]).unique()

assert sorted(expected_renumber_map.values_host.tolist()) == sorted(
renumber_map.map[0 : len(expected_renumber_map)].values_host.tolist()
)

assert (renumber_map.batch_id == 0).all()
assert (
renumber_map.map.nunique()
== cudf.concat(
[sources_hop_0, sampling_results_renumbered.destinations]
).nunique()
)


# =============================================================================
Expand Down

0 comments on commit 927c7b5

Please sign in to comment.