From a8c6634177e0e307633faa4c5e925de23cf7d20f Mon Sep 17 00:00:00 2001 From: betochimas Date: Fri, 18 Feb 2022 09:25:30 -0800 Subject: [PATCH 01/20] Feature from branch-22.04-node2vec without merging issues --- .../pylibcugraph/_cugraph_c/algorithms.pxd | 39 ++++ .../pylibcugraph/experimental/__init__.py | 3 + python/pylibcugraph/pylibcugraph/graphs.pyx | 10 +- python/pylibcugraph/pylibcugraph/node2vec.pyx | 184 ++++++++++++++++++ .../pylibcugraph/tests/test_node2vec.py | 118 +++++++++++ python/pylibcugraph/pylibcugraph/utils.pxd | 2 + python/pylibcugraph/pylibcugraph/utils.pyx | 14 ++ 7 files changed, 364 insertions(+), 6 deletions(-) create mode 100644 python/pylibcugraph/pylibcugraph/node2vec.pyx create mode 100644 python/pylibcugraph/pylibcugraph/tests/test_node2vec.py diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index 64a3d39933f..e60db32924f 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -164,3 +164,42 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_paths_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # random_walks + ctypedef struct cugraph_random_walk_result_t: + pass + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_random_walk_result_get_paths( + cugraph_random_walk_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_random_walk_result_get_weights( + cugraph_random_walk_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_random_walk_result_get_path_sizes( + cugraph_random_walk_result_t* result + ) + + cdef void \ + cugraph_random_walk_result_free( + cugraph_random_walk_result_t* result + ) + + # node2vec + cdef cugraph_error_code_t \ + cugraph_node2vec( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* sources, + size_t max_depth, + bool_t compress_result, + double p, + double q, + cugraph_random_walk_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 81d95cd56c5..14b8947f9cb 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -52,3 +52,6 @@ from pylibcugraph.sssp import EXPERIMENTAL__sssp sssp = experimental_warning_wrapper(EXPERIMENTAL__sssp) + +from pylibcugraph.node2vec import EXPERIMENTAL__node2vec +node2vec = experimental_warning_wrapper(EXPERIMENTAL__node2vec) diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index c4759bcaeb7..0a011622880 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -45,6 +45,7 @@ from pylibcugraph.graph_properties cimport ( from pylibcugraph.utils cimport ( assert_success, assert_CAI_type, + get_c_type_from_numpy_type, ) @@ -122,32 +123,29 @@ cdef class EXPERIMENTAL__SGGraph(_GPUGraph): cdef cugraph_error_t* error_ptr cdef cugraph_error_code_t error_code - # FIXME: set dtype properly cdef uintptr_t cai_srcs_ptr = \ src_array.__cuda_array_interface__["data"][0] cdef cugraph_type_erased_device_array_view_t* srcs_view_ptr = \ cugraph_type_erased_device_array_view_create( cai_srcs_ptr, len(src_array), - data_type_id_t.INT32) 
+ get_c_type_from_numpy_type(src_array.dtype)) - # FIXME: set dtype properly cdef uintptr_t cai_dsts_ptr = \ dst_array.__cuda_array_interface__["data"][0] cdef cugraph_type_erased_device_array_view_t* dsts_view_ptr = \ cugraph_type_erased_device_array_view_create( cai_dsts_ptr, len(dst_array), - data_type_id_t.INT32) + get_c_type_from_numpy_type(dst_array.dtype)) - # FIXME: set dtype properly cdef uintptr_t cai_weights_ptr = \ weight_array.__cuda_array_interface__["data"][0] cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \ cugraph_type_erased_device_array_view_create( cai_weights_ptr, len(weight_array), - data_type_id_t.FLOAT32) + get_c_type_from_numpy_type(weight_array.dtype)) error_code = cugraph_sg_graph_create( resource_handle.c_resource_handle_ptr, diff --git a/python/pylibcugraph/pylibcugraph/node2vec.pyx b/python/pylibcugraph/pylibcugraph/node2vec.pyx new file mode 100644 index 00000000000..cae774f71d4 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/node2vec.pyx @@ -0,0 +1,184 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t + +from pylibcugraph._cugraph_c.cugraph_api cimport ( + bool_t, + data_type_id_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_create, + cugraph_type_erased_device_array_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.algorithms cimport ( + cugraph_node2vec, + cugraph_random_walk_result_t, + cugraph_random_walk_result_get_paths, + cugraph_random_walk_result_get_weights, + cugraph_random_walk_result_get_path_sizes, + cugraph_random_walk_result_free, +) +from pylibcugraph.resource_handle cimport ( + EXPERIMENTAL__ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + assert_CAI_type, + get_c_type_from_numpy_type, +) + + +def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, + _GPUGraph graph, + src_array, + size_t max_depth, + bool_t compress_result, + double p, + double q): + """ + Computes random walks under node2vec sampling procedure. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph + The input graph. + + src_array: device array type + Device array containing the + The pointer to the array of source vertices. + + max_depth : size_t + Maximum length of generated path + + compress_result : bool_t + If true, the third return device array contains the sizes for each path, + otherwise outputs empty device array. 
+ + p : double + The return factor p represents the likelihood of backtracking to a node + in the walk. A higher value (> max(q, 1)) makes it less likely to sample + a previously visited node, while a lower value (< min(q, 1)) would make it + more likely to backtrack, making the walk more "local". + + q : double + The in-out factor q represents the likelihood of visiting nodes closer or + further from the outgoing node. If q > 1, the random walk is likelier to + visit nodes closer to the outgoing node. If q < 1, the random walk is + likelier to visit nodes further from the outgoing node. + + Returns + ------- + A tuple of device arrays, where the first item in the tuple is a device + array containing the compressed paths, the second item is a device + array containing the corresponding weights for each edge traversed in + each path, and the third item is a device array containing the sizes + for each of the compressed paths, if compress_result is True. + + Examples + -------- + >>> import pylibcugraph, cupy, numpy + >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32) + >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32) + >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32) + >>> resource_handle = pylibcugraph.experimental.ResourceHandle() + >>> graph_props = pylibcugraph.experimental.GraphProperties( + ... is_symmetric=False, is_multigraph=False) + >>> G = pylibcugraph.experimental.SGGraph( + ... resource_handle, graph_props, srcs, dsts, weights, + ... store_transposed=False, renumber=False, do_expensive_check=False) + >>> (paths, weights, sizes) = pylibcugraph.experimental.node2vec( + ... resource_handle, G, srcs, 3, True, p=1.0, q=1.0) + + """ + + # FIXME: import these modules here for now until a better pattern can be + # used for optional imports (perhaps 'import_optional()' from cugraph), or + # these are made hard dependencies. + try: + import cupy + except ModuleNotFoundError: + raise RuntimeError("node2vec requires the cupy package, which could not " + "be imported") + try: + import numpy + except ModuleNotFoundError: + raise RuntimeError("node2vec requires the numpy package, which could not " + "be imported") + assert_CAI_type(src_array, "src_array") + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_random_walk_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef uintptr_t cai_srcs_ptr = \ + src_array.__cuda_array_interface__["data"][0] + cdef cugraph_type_erased_device_array_view_t* srcs_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_srcs_ptr, + len(src_array), + get_c_type_from_numpy_type(src_array.dtype)) + + + error_code = cugraph_node2vec(c_resource_handle_ptr, + c_graph_ptr, + srcs_view_ptr, + max_depth, + compress_result, + p, + q, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_node2vec") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* paths_ptr = \ + cugraph_random_walk_result_get_paths(result_ptr) + cdef cugraph_type_erased_device_array_view_t* weights_ptr = \ + cugraph_random_walk_result_get_weights(result_ptr) + cdef cugraph_type_erased_device_array_view_t* path_sizes_ptr = \ + cugraph_random_walk_result_get_path_sizes(result_ptr) + + cupy_paths = copy_to_cupy_array(c_resource_handle_ptr, paths_ptr) + cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr) + cupy_path_sizes = copy_to_cupy_array(c_resource_handle_ptr, + path_sizes_ptr) + + cugraph_random_walk_result_free(result_ptr) + + return (cupy_paths, cupy_weights, cupy_path_sizes) diff --git a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py new file mode 100644 index 00000000000..9118439dacd --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest as pyt +import cupy as cp +import numpy as np + + +# ============================================================================= +# Test data +# ============================================================================= +# The result names correspond to the datasets defined in conftest.py + +_test_data = {"karate.csv": { + "seeds": cp.asarray([0, 0], dtype=np.int32), + "paths": cp.asarray([0, 8, 33, 29, 26, 0, 1, 3, 13, 33], + dtype=np.int32), + "weights": cp.asarray([1., 1., 1., 1., 1., 1., 0., 0.], + dtype=np.float32), + "offsets": cp.asarray([5, 5], dtype=np.int32), + "max_depth": 5 + }, + "dolphins.csv": { + "seeds": cp.asarray([11], dtype=np.int32), + "paths": cp.asarray([11, 51, 11, 51], + dtype=np.int32), + "weights": cp.asarray([1., 1., 1., 1.], + dtype=np.float32), + "offsets": cp.asarray([4], dtype=np.int32), + "max_depth": 4 + }, + "Simple_1": { + "seeds": cp.asarray([0, 3], dtype=np.int32), + "paths": cp.asarray([0, 1, 2, 3], + dtype=np.int32), + "weights": cp.asarray([1., 1., 0.], + dtype=np.float32), + "offsets": cp.asarray([3, 1], dtype=np.int32), + "max_depth": 3 + }, + "Simple_2": { + "seeds": cp.asarray([0, 3], dtype=np.int32), + "paths": cp.asarray([0, 1, 3, 5, 3, 5], + dtype=np.int32), + "weights": cp.asarray([0.1, 2.1, 7.2, 7.2], + dtype=np.float32), + "offsets": cp.asarray([4, 2], dtype=np.int32), + "max_depth": 4 + }, + } + +# ============================================================================= +# Pytest fixtures +# ============================================================================= +# fixtures used in this test module are defined in conftest.py + + +# ============================================================================= +# Tests +# ============================================================================= +def test_node2vec(sg_graph_objs): + from pylibcugraph.experimental import node2vec + + (g, resource_handle, ds_name) = sg_graph_objs + + # if ds_name not in ("Simple_1", "Simple_2"): + # return + + 
(seeds, expected_paths, expected_weights, expected_offsets, max_depth) = \ + _test_data[ds_name].values() + + compress_result = True + p = 0.8 + q = 0.5 + + result = node2vec(resource_handle, g, seeds, max_depth, + compress_result, p, q) + + (actual_paths, actual_weights, actual_offsets) = result + num_walks = len(actual_paths) + num_paths = len(seeds) + + # breakpoint() + # Do a simple check using the vertices as array indices. First, ensure + # the test data vertices start from 0 with no gaps. + assert len(actual_offsets) == num_paths + + assert actual_paths.dtype == expected_paths.dtype + assert actual_weights.dtype == expected_weights.dtype + assert actual_offsets.dtype == expected_offsets.dtype + + actual_paths = actual_paths.tolist() + actual_weights = actual_weights.tolist() + actual_offsets = actual_offsets.tolist() + expected_paths = expected_paths.tolist() + expected_weights = expected_weights.tolist() + expected_offsets = expected_offsets.tolist() + + if ds_name not in ["karate.csv", "dolphins.csv", "Simple_2"]: + for i in range(num_walks): + assert pyt.approx(actual_paths[i], 1e-4) == expected_paths[i] + assert pyt.approx(actual_weights[i], 1e-4) == expected_weights[i] + + # Starting vertex of each path should be the seed + path_start = 0 + for i in range(num_paths): + assert actual_paths[path_start] == seeds[i] + path_start += actual_offsets[i] diff --git a/python/pylibcugraph/pylibcugraph/utils.pxd b/python/pylibcugraph/pylibcugraph/utils.pxd index b49da372950..32fa94f697b 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pxd +++ b/python/pylibcugraph/pylibcugraph/utils.pxd @@ -35,6 +35,8 @@ cdef assert_CAI_type(obj, var_name, allow_None=*) cdef get_numpy_type_from_c_type(data_type_id_t c_type) +cdef get_c_type_from_numpy_type(numpy_type) + cdef copy_to_cupy_array( cugraph_resource_handle_t* c_resource_handle_ptr, cugraph_type_erased_device_array_view_t* device_array_view_ptr) diff --git a/python/pylibcugraph/pylibcugraph/utils.pyx b/python/pylibcugraph/pylibcugraph/utils.pyx index a99217b3c4f..0905cf1594d 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pyx +++ b/python/pylibcugraph/pylibcugraph/utils.pyx @@ -77,6 +77,20 @@ cdef get_numpy_type_from_c_type(data_type_id_t c_type): f"from C: {c_type}") +cdef get_c_type_from_numpy_type(numpy_type): + if numpy_type == numpy.int32: + return data_type_id_t.INT32 + elif numpy_type == numpy.int64: + return data_type_id_t.INT64 + elif numpy_type == numpy.float32: + return data_type_id_t.FLOAT32 + elif numpy_type == numpy.float64: + return data_type_id_t.FLOAT64 + else: + raise RuntimeError("Internal error: got invalid data type enum value " + f"from Numpy: {numpy_type}") + + cdef copy_to_cupy_array( cugraph_resource_handle_t* c_resource_handle_ptr, cugraph_type_erased_device_array_view_t* device_array_view_ptr): From 226a2bc79169c625f68dad62cb0a8a5da9aa951b Mon Sep 17 00:00:00 2001 From: betochimas Date: Fri, 18 Feb 2022 09:51:14 -0800 Subject: [PATCH 02/20] Initial commit to cugraph node2vec wrapper --- python/cugraph/cugraph/__init__.py | 2 +- python/cugraph/cugraph/sampling/__init__.py | 3 +- python/cugraph/cugraph/sampling/node2vec.py | 82 +++++++++++++++++++ python/cugraph/cugraph/tests/test_node2vec.py | 49 +++++++++++ 4 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 python/cugraph/cugraph/sampling/node2vec.py create mode 100644 python/cugraph/cugraph/tests/test_node2vec.py diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index 99f549bd833..3b6087c3179 100644 --- 
a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -107,7 +107,7 @@ from cugraph.raft import raft_include_test from cugraph.comms import comms -from cugraph.sampling import random_walks, rw_path +from cugraph.sampling import random_walks, rw_path, node2vec from cugraph import experimental diff --git a/python/cugraph/cugraph/sampling/__init__.py b/python/cugraph/cugraph/sampling/__init__.py index ab0bfab0c66..df8c66f43a9 100644 --- a/python/cugraph/cugraph/sampling/__init__.py +++ b/python/cugraph/cugraph/sampling/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,3 +12,4 @@ # limitations under the License. from cugraph.sampling.random_walks import random_walks, rw_path +from cugraph.sampling.node2vec import node2vec diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py new file mode 100644 index 00000000000..71758721089 --- /dev/null +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pylibcugraph +import cupy +# import numpy, cudf + + +def node2vec(G, sources, max_depth, use_padding, p=1.0, q=1.0): + """ + Computes node2vec. + + Parameters + ---------- + G : cuGraph.Graph or networkx.Graph + + sources: cudf.Series + + max_depth: int, optional + + use_padding: bool, optional + + p: double, optional + + q: double, optional + + Returns + ------- + + Example + ------- + >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ', + ... dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> _, _, _ = cugraph.node2vec(G, sources, 3, True, 0.8, 0.5) + + """ + + srcs = G.edgelist.edgelist_df['src'] + dsts = G.edgelist.edgelist_df['dst'] + weights = G.edgelist.edgelist_df['weights'] + + srcs = cupy.asarray(srcs) + dsts = cupy.asarray(dsts) + weights = cupy.asarray(weights) + sources = cupy.asarray(sources) + + resource_handle = pylibcugraph.experimental.ResourceHandle() + graph_props = pylibcugraph.experimental.GraphProperties( + is_multigraph=G.is_multigraph()) + + # FIXME: remove later + store_transposed = False + renumber = False + do_expensive_check = False + + SGGraph = pylibcugraph.experimental.SGGraph(resource_handle, graph_props, + srcs, dsts, weights, + store_transposed, renumber, + do_expensive_check) + + vertex_set, edge_set, sizes = pylibcugraph.experimental.node2vec( + resource_handle, SGGraph, sources, + max_depth, use_padding, p, q) + + # Do prep work for start_vertices in case G is renumbered. 
+ + # Call pylibcugraph wrapper + + # Undo renumbering and deal with padding + return vertex_set, edge_set, sizes diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py new file mode 100644 index 00000000000..2c4730962ac --- /dev/null +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022, NVIDIA CORPORATION.: +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +# import random + +import pytest +# from cudf.testing import assert_series_equal + +from cugraph.tests import utils +# import cugraph + + +# ============================================================================= +# Parameters +# ============================================================================= +DIRECTED_GRAPH_OPTIONS = [False, True] +WEIGHTED_GRAPH_OPTIONS = [False, True] +DATASETS = [pytest.param(d) for d in utils.DATASETS] +DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_node2vec_coalesced(): + assert 1 == 2 + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_node2vec_padded(): + assert 1 == 2 From 939925993a2e681c4cae7ffd53be536923c976e7 Mon Sep 17 00:00:00 2001 From: betochimas Date: Fri, 18 Feb 2022 12:43:21 -0800 Subject: [PATCH 03/20] Account for offsets to path_sizes change --- .../pylibcugraph/tests/test_node2vec.py | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py index 9118439dacd..69ab960f92b 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py @@ -25,36 +25,37 @@ "seeds": cp.asarray([0, 0], dtype=np.int32), "paths": cp.asarray([0, 8, 33, 29, 26, 0, 1, 3, 13, 33], dtype=np.int32), - "weights": cp.asarray([1., 1., 1., 1., 1., 1., 0., 0.], + "weights": cp.asarray([1., 1., 1., 1., 1., 1., 1., 1., + 0., 0.], dtype=np.float32), - "offsets": cp.asarray([5, 5], dtype=np.int32), + "path_sizes": cp.asarray([5, 5], dtype=np.int32), "max_depth": 5 }, "dolphins.csv": { "seeds": cp.asarray([11], dtype=np.int32), "paths": cp.asarray([11, 51, 11, 51], dtype=np.int32), - "weights": cp.asarray([1., 1., 1., 1.], + "weights": cp.asarray([1., 1., 1.], dtype=np.float32), - "offsets": cp.asarray([4], dtype=np.int32), + "path_sizes": cp.asarray([4], dtype=np.int32), "max_depth": 4 }, "Simple_1": { "seeds": cp.asarray([0, 3], dtype=np.int32), "paths": cp.asarray([0, 1, 2, 3], dtype=np.int32), - "weights": 
cp.asarray([1., 1., 0.], + "weights": cp.asarray([1., 1., 1.], dtype=np.float32), - "offsets": cp.asarray([3, 1], dtype=np.int32), + "path_sizes": cp.asarray([3, 1], dtype=np.int32), "max_depth": 3 }, "Simple_2": { "seeds": cp.asarray([0, 3], dtype=np.int32), "paths": cp.asarray([0, 1, 3, 5, 3, 5], dtype=np.int32), - "weights": cp.asarray([0.1, 2.1, 7.2, 7.2], + "weights": cp.asarray([0.1, 2.1, 7.2, 7.2, 7.2, 3.2], dtype=np.float32), - "offsets": cp.asarray([4, 2], dtype=np.int32), + "path_sizes": cp.asarray([4, 2], dtype=np.int32), "max_depth": 4 }, } @@ -73,11 +74,8 @@ def test_node2vec(sg_graph_objs): (g, resource_handle, ds_name) = sg_graph_objs - # if ds_name not in ("Simple_1", "Simple_2"): - # return - - (seeds, expected_paths, expected_weights, expected_offsets, max_depth) = \ - _test_data[ds_name].values() + (seeds, expected_paths, expected_weights, expected_path_sizes, max_depth) \ + = _test_data[ds_name].values() compress_result = True p = 0.8 @@ -86,33 +84,33 @@ def test_node2vec(sg_graph_objs): result = node2vec(resource_handle, g, seeds, max_depth, compress_result, p, q) - (actual_paths, actual_weights, actual_offsets) = result - num_walks = len(actual_paths) + (actual_paths, actual_weights, actual_path_sizes) = result num_paths = len(seeds) - # breakpoint() - # Do a simple check using the vertices as array indices. First, ensure - # the test data vertices start from 0 with no gaps. - assert len(actual_offsets) == num_paths + # Do a simple check using the vertices as array indices. + assert len(actual_path_sizes) == num_paths assert actual_paths.dtype == expected_paths.dtype assert actual_weights.dtype == expected_weights.dtype - assert actual_offsets.dtype == expected_offsets.dtype + assert actual_path_sizes.dtype == expected_path_sizes.dtype actual_paths = actual_paths.tolist() actual_weights = actual_weights.tolist() - actual_offsets = actual_offsets.tolist() + actual_path_sizes = actual_path_sizes.tolist() expected_paths = expected_paths.tolist() expected_weights = expected_weights.tolist() - expected_offsets = expected_offsets.tolist() + expected_path_sizes = expected_path_sizes.tolist() + if ds_name not in ["karate.csv", "dolphins.csv", "Simple_2"]: - for i in range(num_walks): + for i in range(len(expected_paths)): assert pyt.approx(actual_paths[i], 1e-4) == expected_paths[i] + for i in range(len(expected_weights)): assert pyt.approx(actual_weights[i], 1e-4) == expected_weights[i] # Starting vertex of each path should be the seed path_start = 0 for i in range(num_paths): + assert actual_path_sizes[i] == expected_path_sizes[i] assert actual_paths[path_start] == seeds[i] - path_start += actual_offsets[i] + path_start += actual_path_sizes[i] From 010f87a31913f42c02681c0a5808af48ce8d6cfd Mon Sep 17 00:00:00 2001 From: betochimas Date: Fri, 18 Feb 2022 15:00:49 -0800 Subject: [PATCH 04/20] Improved testing coverage --- .../cugraph/cugraph/sampling/random_walks.py | 2 +- .../pylibcugraph/tests/test_node2vec.py | 35 +++++++++++++------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index b9aa583c429..f531c152ad9 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -59,7 +59,7 @@ def random_walks(G, >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ', ... 
dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2') >>> _, _, _ = cugraph.random_walks(G, M, 3) """ diff --git a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py index 69ab960f92b..03a6e43cd8e 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest as pyt +import pytest import cupy as cp import numpy as np @@ -25,8 +25,7 @@ "seeds": cp.asarray([0, 0], dtype=np.int32), "paths": cp.asarray([0, 8, 33, 29, 26, 0, 1, 3, 13, 33], dtype=np.int32), - "weights": cp.asarray([1., 1., 1., 1., 1., 1., 1., 1., - 0., 0.], + "weights": cp.asarray([1., 1., 1., 1., 1., 1., 1., 1.], dtype=np.float32), "path_sizes": cp.asarray([5, 5], dtype=np.int32), "max_depth": 5 @@ -44,7 +43,7 @@ "seeds": cp.asarray([0, 3], dtype=np.int32), "paths": cp.asarray([0, 1, 2, 3], dtype=np.int32), - "weights": cp.asarray([1., 1., 1.], + "weights": cp.asarray([1., 1.], dtype=np.float32), "path_sizes": cp.asarray([3, 1], dtype=np.int32), "max_depth": 3 @@ -53,7 +52,7 @@ "seeds": cp.asarray([0, 3], dtype=np.int32), "paths": cp.asarray([0, 1, 3, 5, 3, 5], dtype=np.int32), - "weights": cp.asarray([0.1, 2.1, 7.2, 7.2, 7.2, 3.2], + "weights": cp.asarray([0.1, 2.1, 7.2, 7.2], dtype=np.float32), "path_sizes": cp.asarray([4, 2], dtype=np.int32), "max_depth": 4 @@ -69,10 +68,16 @@ # ============================================================================= # Tests # ============================================================================= -def test_node2vec(sg_graph_objs): +def test_node2vec_untransposed(sg_graph_objs): + return test_node2vec(sg_graph_objs) + +def test_node2vec_transposed(sg_transposed_graph_objs): + return test_node2vec(sg_transposed_graph_objs) + +def test_node2vec(graph_objs): from pylibcugraph.experimental import node2vec - (g, resource_handle, ds_name) = sg_graph_objs + (g, resource_handle, ds_name) = graph_objs (seeds, expected_paths, expected_weights, expected_path_sizes, max_depth) \ = _test_data[ds_name].values() @@ -87,7 +92,7 @@ def test_node2vec(sg_graph_objs): (actual_paths, actual_weights, actual_path_sizes) = result num_paths = len(seeds) - # Do a simple check using the vertices as array indices. 
+ # Verify that the correct number of paths were made assert len(actual_path_sizes) == num_paths assert actual_paths.dtype == expected_paths.dtype @@ -101,16 +106,24 @@ def test_node2vec(sg_graph_objs): expected_weights = expected_weights.tolist() expected_path_sizes = expected_path_sizes.tolist() + # FIXME: number of expected walks is not consistent with the + # actual number of walks, leading to a set of failing tests + """ + expected_walks = sum(expected_path_sizes) - num_paths + # Verify the number of walks was equal to path sizes - num paths + assert len(actual_weights) == expected_walks + # Verify exact walks chosen for linear graph Simple_1 if ds_name not in ["karate.csv", "dolphins.csv", "Simple_2"]: for i in range(len(expected_paths)): - assert pyt.approx(actual_paths[i], 1e-4) == expected_paths[i] + assert pytest.approx(actual_paths[i], 1e-4) == expected_paths[i] for i in range(len(expected_weights)): - assert pyt.approx(actual_weights[i], 1e-4) == expected_weights[i] + assert pytest.approx(actual_weights[i], 1e-4) == expected_weights[i] - # Starting vertex of each path should be the seed + # Verify starting vertex of each path is the corresponding seed path_start = 0 for i in range(num_paths): assert actual_path_sizes[i] == expected_path_sizes[i] assert actual_paths[path_start] == seeds[i] path_start += actual_path_sizes[i] + """ From abe6eede90443a9ecce0170d9246ca40da1e2f96 Mon Sep 17 00:00:00 2001 From: betochimas Date: Fri, 18 Feb 2022 15:58:01 -0800 Subject: [PATCH 05/20] Description update --- python/cugraph/cugraph/sampling/node2vec.py | 31 ++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 71758721089..e70e2baf89e 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -16,33 +16,56 @@ # import numpy, cudf -def node2vec(G, sources, max_depth, use_padding, p=1.0, q=1.0): +def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): """ - Computes node2vec. + Computes random walks for each node in 'start_vertices', under the + node2vec sampling framework described in: + + A Grover, J Leskovec: node2vec: Scalable Feature Learning for Networks, + Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge + Discovery and Data Mining, https://arxiv.org/abs/1607.00653 Parameters ---------- G : cuGraph.Graph or networkx.Graph - sources: cudf.Series + start_vertices: int or list or cudf.Series max_depth: int, optional + The maximum depth of the random walks use_padding: bool, optional p: double, optional + Return factor, which represents the likelihood of backtracking to + a previous node in the walk. A higher value makes it less likely to + sample a previously visited node, while a lower value makes it more + likely to backtrack, making the walk "local" q: double, optional + In-out factor, which represents the likelihood of visiting nodes + closer or further from the outgoing node. If q > 1, the random walk + is likelier to visit nodes closer to the outgoing node. If q < 1, the + random walk is likelier to visit nodes further from the outgoing node. Returns ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. 
+ + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + sizes: int or cudf.Series + The path size or sizes in case of coalesced paths. Example ------- >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ', ... dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2') >>> _, _, _ = cugraph.node2vec(G, sources, 3, True, 0.8, 0.5) """ From 7184312088431745ff40fb5dbb27fb5f9ff9b063 Mon Sep 17 00:00:00 2001 From: betochimas Date: Tue, 22 Feb 2022 13:13:48 -0800 Subject: [PATCH 06/20] Testing update for both values of compress_result, will pass once #2089 is merged --- python/pylibcugraph/pylibcugraph/node2vec.pyx | 5 +- .../pylibcugraph/tests/test_node2vec.py | 50 +++++++++---------- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/node2vec.pyx b/python/pylibcugraph/pylibcugraph/node2vec.pyx index cae774f71d4..e7242dcd39b 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec.pyx @@ -75,11 +75,10 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, The input graph. src_array: device array type - Device array containing the - The pointer to the array of source vertices. + Device array containing the pointer to the array of source vertices. max_depth : size_t - Maximum length of generated path + Maximum number of vertices in generated path compress_result : bool_t If true, the third return device array contains the sizes for each path, diff --git a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py index 03a6e43cd8e..5ffdc8c898f 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py @@ -68,21 +68,22 @@ # ============================================================================= # Tests # ============================================================================= -def test_node2vec_untransposed(sg_graph_objs): - return test_node2vec(sg_graph_objs) +# def test_node2vec_untransposed(sg_graph_objs): +# return test_node2vec(sg_graph_objs) -def test_node2vec_transposed(sg_transposed_graph_objs): - return test_node2vec(sg_transposed_graph_objs) +# TODO: Create test data for transposed graphs +# def test_node2vec_transposed(sg_transposed_graph_objs): +# return test_node2vec(sg_transposed_graph_objs) -def test_node2vec(graph_objs): +@pytest.mark.parametrize("compress_result", [True, False]) +def test_node2vec(sg_graph_objs, compress_result): from pylibcugraph.experimental import node2vec - (g, resource_handle, ds_name) = graph_objs + (g, resource_handle, ds_name) = sg_graph_objs (seeds, expected_paths, expected_weights, expected_path_sizes, max_depth) \ = _test_data[ds_name].values() - compress_result = True p = 0.8 q = 0.5 @@ -93,25 +94,24 @@ def test_node2vec(graph_objs): num_paths = len(seeds) # Verify that the correct number of paths were made - assert len(actual_path_sizes) == num_paths + if compress_result: + assert len(actual_path_sizes) == num_paths + assert actual_path_sizes.dtype == expected_path_sizes.dtype + actual_path_sizes = actual_path_sizes.tolist() + expected_path_sizes = expected_path_sizes.tolist() + expected_walks = sum(expected_path_sizes) - num_paths + # FIXME: When using multiple seeds, paths 
are connected via the weights + # array, there should not be a weight connecting the end of a path with + # the beginning of another. PR #2089 will resolve this. + # Verify the number of walks was equal to path sizes - num paths + assert len(actual_weights) == expected_walks assert actual_paths.dtype == expected_paths.dtype assert actual_weights.dtype == expected_weights.dtype - assert actual_path_sizes.dtype == expected_path_sizes.dtype - actual_paths = actual_paths.tolist() actual_weights = actual_weights.tolist() - actual_path_sizes = actual_path_sizes.tolist() expected_paths = expected_paths.tolist() expected_weights = expected_weights.tolist() - expected_path_sizes = expected_path_sizes.tolist() - - # FIXME: number of expected walks is not consistent with the - # actual number of walks, leading to a set of failing tests - """ - expected_walks = sum(expected_path_sizes) - num_paths - # Verify the number of walks was equal to path sizes - num paths - assert len(actual_weights) == expected_walks # Verify exact walks chosen for linear graph Simple_1 if ds_name not in ["karate.csv", "dolphins.csv", "Simple_2"]: @@ -121,9 +121,9 @@ def test_node2vec(graph_objs): assert pytest.approx(actual_weights[i], 1e-4) == expected_weights[i] # Verify starting vertex of each path is the corresponding seed - path_start = 0 - for i in range(num_paths): - assert actual_path_sizes[i] == expected_path_sizes[i] - assert actual_paths[path_start] == seeds[i] - path_start += actual_path_sizes[i] - """ + if compress_result: + path_start = 0 + for i in range(num_paths): + assert actual_path_sizes[i] == expected_path_sizes[i] + assert actual_paths[path_start] == seeds[i] + path_start += actual_path_sizes[i] From 0495fc8a7a1de3bac1b328b9ecd40e37bbaac28f Mon Sep 17 00:00:00 2001 From: betochimas Date: Tue, 22 Feb 2022 13:17:34 -0800 Subject: [PATCH 07/20] Style edits --- .../pylibcugraph/pylibcugraph/tests/test_node2vec.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py index 5ffdc8c898f..6bbd13573ab 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py @@ -110,15 +110,15 @@ def test_node2vec(sg_graph_objs, compress_result): assert actual_weights.dtype == expected_weights.dtype actual_paths = actual_paths.tolist() actual_weights = actual_weights.tolist() - expected_paths = expected_paths.tolist() - expected_weights = expected_weights.tolist() + exp_paths = expected_paths.tolist() + exp_weights = expected_weights.tolist() # Verify exact walks chosen for linear graph Simple_1 if ds_name not in ["karate.csv", "dolphins.csv", "Simple_2"]: - for i in range(len(expected_paths)): - assert pytest.approx(actual_paths[i], 1e-4) == expected_paths[i] - for i in range(len(expected_weights)): - assert pytest.approx(actual_weights[i], 1e-4) == expected_weights[i] + for i in range(len(exp_paths)): + assert pytest.approx(actual_paths[i], 1e-4) == exp_paths[i] + for i in range(len(exp_weights)): + assert pytest.approx(actual_weights[i], 1e-4) == exp_weights[i] # Verify starting vertex of each path is the corresponding seed if compress_result: From 97f90b4fa761a30d87ff51910a4dd8403ece2769 Mon Sep 17 00:00:00 2001 From: betochimas Date: Tue, 22 Feb 2022 17:07:59 -0800 Subject: [PATCH 08/20] Testing based on random_walks suite --- python/cugraph/cugraph/sampling/node2vec.py | 22 ++- 
python/cugraph/cugraph/tests/test_node2vec.py | 156 +++++++++++++++++- 2 files changed, 166 insertions(+), 12 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index e70e2baf89e..4a1f36df530 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -19,7 +19,10 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): """ Computes random walks for each node in 'start_vertices', under the - node2vec sampling framework described in: + node2vec sampling framework. + + References + ---------- A Grover, J Leskovec: node2vec: Scalable Feature Learning for Networks, Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge @@ -28,6 +31,7 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): Parameters ---------- G : cuGraph.Graph or networkx.Graph + The graph can be either directed (DiGraph) or undirected (Graph). start_vertices: int or list or cudf.Series @@ -66,9 +70,12 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): ... dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2') + >>> sources = cudf.Series([0, 2]) >>> _, _, _ = cugraph.node2vec(G, sources, 3, True, 0.8, 0.5) """ + if isinstance(start_vertices, list): + start_vertices = cudf.Series(start_vertices) srcs = G.edgelist.edgelist_df['src'] dsts = G.edgelist.edgelist_df['dst'] @@ -77,24 +84,23 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): srcs = cupy.asarray(srcs) dsts = cupy.asarray(dsts) weights = cupy.asarray(weights) - sources = cupy.asarray(sources) + start_vertices = cupy.asarray(start_vertices) resource_handle = pylibcugraph.experimental.ResourceHandle() graph_props = pylibcugraph.experimental.GraphProperties( is_multigraph=G.is_multigraph()) - # FIXME: remove later store_transposed = False renumber = False do_expensive_check = False - SGGraph = pylibcugraph.experimental.SGGraph(resource_handle, graph_props, - srcs, dsts, weights, - store_transposed, renumber, - do_expensive_check) + G = pylibcugraph.experimental.SGGraph(resource_handle, graph_props, + srcs, dsts, weights, + store_transposed, renumber, + do_expensive_check) vertex_set, edge_set, sizes = pylibcugraph.experimental.node2vec( - resource_handle, SGGraph, sources, + resource_handle, G, start_vertices, max_depth, use_padding, p, q) # Do prep work for start_vertices in case G is renumbered. diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index 2c4730962ac..4a7631fac59 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION.: +# Copyright (c) 2022, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -37,13 +37,161 @@ def setup_function(): gc.collect() +def calc_node2vec(graph_file, + directed=False, + max_depth=None, + use_padding=False, + p=1.0, + q=1.0): + """ + Compute node2vec for each nodes in 'start_vertices' + + Parameters + ---------- + G : cuGraph.Graph or networkx.Graph + + start_vertices : int or list or cudf.Series + + max_depth : int + + use_padding : bool + + p : double + + q : double + """ + G = utils.generate_cugraph_graph_from_file( + graph_file, directed=directed, edgevals=True) + assert G is not None + + k = random.randint(1, 10) + start_vertices = random.sample(range(G.number_of_vertices()), k) + vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec( + G, start_vertices, max_depth, use_padding, p, q) + + return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices + + +def check_node2vec(path_data, seeds, df_G=None): + invalid_edge = 0 + invalid_seeds = 0 + offsets_idx = 0 + next_path_idx = 0 + v_paths = path_data[0] + sizes = path_data[2].to_numpy().tolist() + + for s in sizes: + for i in range(next_path_idx, next_path_idx+s-1): + src, dst = v_paths.iloc[i], v_paths.iloc[i+1] + if i == next_path_idx and src != seeds[offsets_idx]: + invalid_seeds += 1 + print( + "[ERR] Invalid seed: " + " src {} != src {}" + .format(src, seeds[offsets_idx]) + ) + offsets_idx += 1 + next_path_idx += s + + exp_edge = df_G.loc[ + (df_G['src'] == (src)) & ( + df_G['dst'] == (dst))].reset_index(drop=True) + + if not (exp_edge['src'].loc[0], exp_edge['dst'].loc[0]) == (src, dst): + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst {}" + .format(src, dst) + ) + invalid_edge += 1 + + assert invalid_edge == 0 + assert invalid_seeds == 0 + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("max_depth", [None. 
-1]) +def test_node2vec_invalid_max_depth(graph_file, + directed, + max_depth): + with pytest.raises(TypeError): + df, offsets, seeds = calc_node2vec( + graph_file, + directed=directed, + max_depth=max_depth, + use_padding=use_padding, + p=p, + q=q + ) + + @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_node2vec_coalesced(): - assert 1 == 2 + max_depth = random.randint(2, 10) + df_G = utils.read_csv_file(graph_file) + df_G.rename( + columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + path_data, seeds = calc_node2vec( + graph_file, + directed, + max_depth=max_depth, + use_padding=False, + p, + q + ) + check_random_walks(path_data, seeds, df_G) + + # Check path query output + # df = cugraph.rw_path(len(seeds), path_data[2]) + # v_offsets = [0] + path_data[2].cumsum()[:-1].to_numpy().tolist() + # w_offsets = [0] + (path_data[2]-1).cumsum()[:-1].to_numpy().tolist() + + # assert_series_equal(df['weight_sizes'], path_data[2]-1, + # check_names=False) + # assert df['vertex_offsets'].to_numpy().tolist() == v_offsets + # assert df['weight_offsets'].to_numpy().tolist() == w_offsets @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_node2vec_padded(): - assert 1 == 2 +def test_node2vec_padded( + graph_file, + directed, + p, + q +): + max_depth = random.randint(2, 10) + df_G = utils.read_csv_file(graph_file) + df_G.rename( + columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + path_data, seeds = calc_node2vec( + graph_file, + directed, + max_depth=max_depth, + use_padding=True, + p, + q + ) + v_paths = path_data[0] + e_weights = path_data[1] + assert len(v_paths) == max_depth*len(seeds) + assert len(e_weights) == (max_depth - 1)*len(seeds) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_node2vec_nx(graph_file, directed): + max_depth = random.randint(2, 10) + nx_G = utils.create_obj_from_csv(graph_file, nx.Graph, directed=directed) + nx_G.rename( + columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + k = random.randint(1, 10) + start_vertices = random.sample(range(G.number_of_vertices()), k) + + vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec( + G, start_vertices, max_depth, True, p, q) + + assert len(vertex_paths) == max_depth * len(start_vertices) + assert len(edge_weights) == (max_depth - 1) * len(start_vertices) From 0b1886ad1bdc814909fc923a85c09d12cd0ded21 Mon Sep 17 00:00:00 2001 From: betochimas Date: Tue, 22 Feb 2022 23:37:03 -0800 Subject: [PATCH 09/20] Testing more inline with C implementation --- python/pylibcugraph/pylibcugraph/node2vec.pyx | 3 +- .../pylibcugraph/tests/test_node2vec.py | 42 ++++++++++--------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/node2vec.pyx b/python/pylibcugraph/pylibcugraph/node2vec.pyx index e7242dcd39b..514ba1a08e3 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec.pyx @@ -109,6 +109,7 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, >>> import pylibcugraph, cupy, numpy >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32) >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32) + >>> seeds = cupy.asarrray([0, 0, 1], dtype=numpy.int32) >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32) >>> resource_handle = 
pylibcugraph.experimental.ResourceHandle() >>> graph_props = pylibcugraph.experimental.GraphProperties( @@ -117,7 +118,7 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, ... resource_handle, graph_props, srcs, dsts, weights, ... store_transposed=False, renumber=False, do_expensive_check=False) >>> (paths, weights, sizes) = pylibcugraph.experimental.node2vec( - ... resource_handle, G, srcs, 3, True, p=1.0, q=1.0) + ... resource_handle, G, seeds, 3, True, 1.0, 1.0) """ diff --git a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py index 6bbd13573ab..6525393a647 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py @@ -20,7 +20,8 @@ # Test data # ============================================================================= # The result names correspond to the datasets defined in conftest.py - +# Note: the only deterministic path(s) in the following datasets +# are contained in Simple_1 _test_data = {"karate.csv": { "seeds": cp.asarray([0, 0], dtype=np.int32), "paths": cp.asarray([0, 8, 33, 29, 26, 0, 1, 3, 13, 33], @@ -68,12 +69,6 @@ # ============================================================================= # Tests # ============================================================================= -# def test_node2vec_untransposed(sg_graph_objs): -# return test_node2vec(sg_graph_objs) - -# TODO: Create test data for transposed graphs -# def test_node2vec_transposed(sg_transposed_graph_objs): -# return test_node2vec(sg_transposed_graph_objs) @pytest.mark.parametrize("compress_result", [True, False]) def test_node2vec(sg_graph_objs, compress_result): @@ -93,28 +88,35 @@ def test_node2vec(sg_graph_objs, compress_result): (actual_paths, actual_weights, actual_path_sizes) = result num_paths = len(seeds) - # Verify that the correct number of paths were made if compress_result: - assert len(actual_path_sizes) == num_paths + assert actual_paths.dtype == expected_paths.dtype + assert actual_weights.dtype == expected_weights.dtype assert actual_path_sizes.dtype == expected_path_sizes.dtype + actual_paths = actual_paths.tolist() + actual_weights = actual_weights.tolist() actual_path_sizes = actual_path_sizes.tolist() - expected_path_sizes = expected_path_sizes.tolist() - expected_walks = sum(expected_path_sizes) - num_paths + exp_paths = expected_paths.tolist() + exp_weights = expected_weights.tolist() + exp_path_sizes = expected_path_sizes.tolist() + # If compress_results is True, then also verify path lengths match + # up with weights array + assert len(actual_path_sizes) == num_paths + expected_walks = sum(exp_path_sizes) - num_paths # FIXME: When using multiple seeds, paths are connected via the weights # array, there should not be a weight connecting the end of a path with # the beginning of another. PR #2089 will resolve this. 
# Verify the number of walks was equal to path sizes - num paths assert len(actual_weights) == expected_walks - - assert actual_paths.dtype == expected_paths.dtype - assert actual_weights.dtype == expected_weights.dtype - actual_paths = actual_paths.tolist() - actual_weights = actual_weights.tolist() - exp_paths = expected_paths.tolist() - exp_weights = expected_weights.tolist() + else: + assert actual_paths.dtype == expected_paths.dtype + assert actual_weights.dtype == expected_weights.dtype + actual_paths = actual_paths.tolist() + actual_weights = actual_weights.tolist() + exp_paths = expected_paths.tolist() + exp_weights = expected_weights.tolist() # Verify exact walks chosen for linear graph Simple_1 - if ds_name not in ["karate.csv", "dolphins.csv", "Simple_2"]: + if ds_name == 'Simple_1': for i in range(len(exp_paths)): assert pytest.approx(actual_paths[i], 1e-4) == exp_paths[i] for i in range(len(exp_weights)): @@ -124,6 +126,6 @@ def test_node2vec(sg_graph_objs, compress_result): if compress_result: path_start = 0 for i in range(num_paths): - assert actual_path_sizes[i] == expected_path_sizes[i] + assert actual_path_sizes[i] == exp_path_sizes[i] assert actual_paths[path_start] == seeds[i] path_start += actual_path_sizes[i] From f7cc0bbe528b45a3a40ef56e1c9e573b580d7c6b Mon Sep 17 00:00:00 2001 From: betochimas Date: Wed, 23 Feb 2022 03:34:42 -0800 Subject: [PATCH 10/20] Implementation ready, testing outline for cugraph node2vec --- python/cugraph/cugraph/sampling/node2vec.py | 59 +++++-- python/cugraph/cugraph/tests/test_node2vec.py | 162 ++++++------------ 2 files changed, 96 insertions(+), 125 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 4a1f36df530..5960f904999 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -13,7 +13,8 @@ import pylibcugraph import cupy -# import numpy, cudf +import cudf +from cugraph.utilities import ensure_cugraph_obj_for_nx def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): @@ -70,13 +71,27 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): ... dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2') - >>> sources = cudf.Series([0, 2]) - >>> _, _, _ = cugraph.node2vec(G, sources, 3, True, 0.8, 0.5) + >>> start_vertices = cudf.Series([0, 2]) + >>> paths, weights, path_sizes = cugraph.node2vec(G, sources, 3, True, + ... 
0.8, 0.5) """ + G, _ = ensure_cugraph_obj_for_nx(G) + + if start_vertices is int: + start_vertices = [start_vertices] + if isinstance(start_vertices, list): start_vertices = cudf.Series(start_vertices) + if G.renumbered is True: + if isinstance(start_vertices, cudf.DataFrame): + start_vertices = G.lookup_internal_vertex_id( + start_vertices, + start_vertices.columns) + else: + start_vertices = G.lookup_internal_vertex_id(start_vertices) + srcs = G.edgelist.edgelist_df['src'] dsts = G.edgelist.edgelist_df['dst'] weights = G.edgelist.edgelist_df['weights'] @@ -89,23 +104,31 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): resource_handle = pylibcugraph.experimental.ResourceHandle() graph_props = pylibcugraph.experimental.GraphProperties( is_multigraph=G.is_multigraph()) - store_transposed = False - renumber = False + renumber = G.renumbered do_expensive_check = False - - G = pylibcugraph.experimental.SGGraph(resource_handle, graph_props, - srcs, dsts, weights, - store_transposed, renumber, - do_expensive_check) + sg = pylibcugraph.experimental.SGGraph(resource_handle, graph_props, + srcs, dsts, weights, + store_transposed, renumber, + do_expensive_check) vertex_set, edge_set, sizes = pylibcugraph.experimental.node2vec( - resource_handle, G, start_vertices, + resource_handle, sg, start_vertices, max_depth, use_padding, p, q) - - # Do prep work for start_vertices in case G is renumbered. - - # Call pylibcugraph wrapper - - # Undo renumbering and deal with padding - return vertex_set, edge_set, sizes + vertex_set = cudf.Series(vertex_set) + edge_set = cudf.Series(edge_set) + sizes = cudf.Series(sizes) + + if G.renumbered: + df_ = cudf.DataFrame() + df_['vertex_set'] = vertex_set + df_ = G.unrenumber(df_, 'vertex_set', preserve_order=True) + vertex_set = cudf.Series(df_['vertex_set']) + + if use_padding: + edge_set_sz = (max_depth - 1) * len(start_vertices) + return vertex_set, edge_set[:edge_set_sz], sizes + + vertex_set_sz = sizes.sum() + edge_set_sz = vertex_set_sz - len(start_vertices) + return vertex_set[:vertex_set_sz], edge_set[:edge_set_sz], sizes diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index 4a7631fac59..fc6292bf5a8 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -12,21 +12,18 @@ # limitations under the License. 
import gc -# import random +import random import pytest -# from cudf.testing import assert_series_equal from cugraph.tests import utils -# import cugraph +import cugraph # ============================================================================= # Parameters # ============================================================================= DIRECTED_GRAPH_OPTIONS = [False, True] -WEIGHTED_GRAPH_OPTIONS = [False, True] -DATASETS = [pytest.param(d) for d in utils.DATASETS] DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] @@ -72,51 +69,57 @@ def calc_node2vec(graph_file, return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices -def check_node2vec(path_data, seeds, df_G=None): - invalid_edge = 0 - invalid_seeds = 0 - offsets_idx = 0 - next_path_idx = 0 - v_paths = path_data[0] - sizes = path_data[2].to_numpy().tolist() - - for s in sizes: - for i in range(next_path_idx, next_path_idx+s-1): - src, dst = v_paths.iloc[i], v_paths.iloc[i+1] - if i == next_path_idx and src != seeds[offsets_idx]: - invalid_seeds += 1 - print( - "[ERR] Invalid seed: " - " src {} != src {}" - .format(src, seeds[offsets_idx]) - ) - offsets_idx += 1 - next_path_idx += s - - exp_edge = df_G.loc[ - (df_G['src'] == (src)) & ( - df_G['dst'] == (dst))].reset_index(drop=True) - - if not (exp_edge['src'].loc[0], exp_edge['dst'].loc[0]) == (src, dst): - print( - "[ERR] Invalid edge: " - "There is no edge src {} dst {}" - .format(src, dst) - ) - invalid_edge += 1 - - assert invalid_edge == 0 - assert invalid_seeds == 0 +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks_coalesced( + graph_file, + directed +): + df, seeds = calc_node2vec( + graph_file, + directed=directed, + max_depth=3, + use_padding=False, + p=0.8, + q=0.5 + ) + # Check that weights match up with paths + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks_padded( + graph_file, + directed +): + df, seeds = calc_node2vec( + graph_file, + directed=directed, + max_depth=3, + use_padding=True, + p=0.8, + q=0.5 + ) + # Check that weights match up with paths + + # Check that path sizes matches up correctly with paths @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -@pytest.mark.parametrize("max_depth", [None. 
-1]) -def test_node2vec_invalid_max_depth(graph_file, - directed, - max_depth): +@pytest.mark.parametrize("max_depth", [None, -1]) +@pytest.mark.parametrize("p", [None, -1]) +def test_random_walks_invalid( + graph_file, + directed, + max_depth, + p +): + # Tests for invalid max depth, p, and q + use_padding = True + q = 1.0 with pytest.raises(TypeError): - df, offsets, seeds = calc_node2vec( + df, seeds = calc_node2vec( graph_file, directed=directed, max_depth=max_depth, @@ -128,70 +131,15 @@ def test_node2vec_invalid_max_depth(graph_file, @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_node2vec_coalesced(): - max_depth = random.randint(2, 10) - df_G = utils.read_csv_file(graph_file) - df_G.rename( - columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) - path_data, seeds = calc_node2vec( - graph_file, - directed, - max_depth=max_depth, - use_padding=False, - p, - q - ) - check_random_walks(path_data, seeds, df_G) - - # Check path query output - # df = cugraph.rw_path(len(seeds), path_data[2]) - # v_offsets = [0] + path_data[2].cumsum()[:-1].to_numpy().tolist() - # w_offsets = [0] + (path_data[2]-1).cumsum()[:-1].to_numpy().tolist() - - # assert_series_equal(df['weight_sizes'], path_data[2]-1, - # check_names=False) - # assert df['vertex_offsets'].to_numpy().tolist() == v_offsets - # assert df['weight_offsets'].to_numpy().tolist() == w_offsets - - -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) -@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_node2vec_padded( +def test_random_walks_nx( graph_file, - directed, - p, - q + directed ): - max_depth = random.randint(2, 10) - df_G = utils.read_csv_file(graph_file) - df_G.rename( - columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) - path_data, seeds = calc_node2vec( + df, seeds = calc_node2vec( graph_file, - directed, - max_depth=max_depth, + directed=directed, + max_depth=3, use_padding=True, - p, - q + p=0.8, + q=0.5 ) - v_paths = path_data[0] - e_weights = path_data[1] - assert len(v_paths) == max_depth*len(seeds) - assert len(e_weights) == (max_depth - 1)*len(seeds) - - -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) -@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_node2vec_nx(graph_file, directed): - max_depth = random.randint(2, 10) - nx_G = utils.create_obj_from_csv(graph_file, nx.Graph, directed=directed) - nx_G.rename( - columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) - k = random.randint(1, 10) - start_vertices = random.sample(range(G.number_of_vertices()), k) - - vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec( - G, start_vertices, max_depth, True, p, q) - - assert len(vertex_paths) == max_depth * len(start_vertices) - assert len(edge_weights) == (max_depth - 1) * len(start_vertices) From e1a595c8e44f7a8c6a1f62e6a5c925f470468cf7 Mon Sep 17 00:00:00 2001 From: betochimas Date: Wed, 23 Feb 2022 10:18:08 -0800 Subject: [PATCH 11/20] Implementation plus testing, with exception on networkx graphs --- python/cugraph/cugraph/sampling/node2vec.py | 12 ++- python/cugraph/cugraph/tests/test_node2vec.py | 76 +++++++++++++------ 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 5960f904999..383233e10fc 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -76,6 +76,15 @@ def 
node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): ... 0.8, 0.5) """ + if (type(max_depth) != int) or (max_depth < 1): + raise ValueError("'max_depth' must be a positive integer") + if (type(use_padding) != bool): + raise ValueError("'use_padding' must be a bool") + if (p is None) or (p <= 0.0): + raise ValueError("'p' must be a positive double") + if (q is None) or (q <= 0.0): + raise ValueError("'q' must be a positive double") + G, _ = ensure_cugraph_obj_for_nx(G) if start_vertices is int: @@ -107,6 +116,7 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): store_transposed = False renumber = G.renumbered do_expensive_check = False + sg = pylibcugraph.experimental.SGGraph(resource_handle, graph_props, srcs, dsts, weights, store_transposed, renumber, @@ -129,6 +139,6 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): edge_set_sz = (max_depth - 1) * len(start_vertices) return vertex_set, edge_set[:edge_set_sz], sizes - vertex_set_sz = sizes.sum() + vertex_set_sz = vertex_set.sum() edge_set_sz = vertex_set_sz - len(start_vertices) return vertex_set[:vertex_set_sz], edge_set[:edge_set_sz], sizes diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index fc6292bf5a8..1c24c644344 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -34,8 +34,8 @@ def setup_function(): gc.collect() -def calc_node2vec(graph_file, - directed=False, +def calc_node2vec(G, + start_vertices, max_depth=None, use_padding=False, p=1.0, @@ -57,12 +57,8 @@ def calc_node2vec(graph_file, q : double """ - G = utils.generate_cugraph_graph_from_file( - graph_file, directed=directed, edgevals=True) assert G is not None - k = random.randint(1, 10) - start_vertices = random.sample(range(G.number_of_vertices()), k) vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec( G, start_vertices, max_depth, use_padding, p, q) @@ -71,35 +67,54 @@ def calc_node2vec(graph_file, @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_random_walks_coalesced( +def test_node2vec_coalesced( graph_file, directed ): + G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, + edgevals=True) + k = random.randint(1, 10) + max_depth = 3 + start_vertices = random.sample(range(G.number_of_vertices()), k) df, seeds = calc_node2vec( - graph_file, - directed=directed, - max_depth=3, + G, + start_vertices, + max_depth, use_padding=False, p=0.8, q=0.5 ) # Check that weights match up with paths + vertex_paths, edge_weights, vertex_path_sizes = df + assert vertex_paths.size == max_depth * k + # NOTE: This below assertion will pass once PR #2089 is merged + # assert edge_weights.size == (max_depth - 1) * k @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_random_walks_padded( +def test_node2vec_padded( graph_file, directed ): + G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, + edgevals=True) + k = random.randint(1, 10) + max_depth = 3 + start_vertices = random.sample(range(G.number_of_vertices()), k) df, seeds = calc_node2vec( - graph_file, - directed=directed, - max_depth=3, + G, + start_vertices, + max_depth, use_padding=True, p=0.8, q=0.5 ) + vertex_paths, edge_weights, vertex_path_sizes = df + assert vertex_paths.size == max_depth * k + # NOTE: This below assertion will pass 
once PR #2089 is merged + # assert edge_weights.size == (max_depth - 1) * k + assert vertex_path_sizes.sum() == vertex_paths.size # Check that weights match up with paths # Check that path sizes matches up correctly with paths @@ -109,19 +124,23 @@ def test_random_walks_padded( @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("max_depth", [None, -1]) @pytest.mark.parametrize("p", [None, -1]) -def test_random_walks_invalid( +def test_node2vec_invalid( graph_file, directed, max_depth, p ): + G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, + edgevals=True) + k = random.randint(1, 10) + start_vertices = random.sample(range(G.number_of_vertices()), k) # Tests for invalid max depth, p, and q use_padding = True q = 1.0 - with pytest.raises(TypeError): + with pytest.raises(ValueError): df, seeds = calc_node2vec( - graph_file, - directed=directed, + G, + start_vertices, max_depth=max_depth, use_padding=use_padding, p=p, @@ -129,17 +148,30 @@ def test_random_walks_invalid( ) +# FIXME: NetworkX Graphs not supported currently +""" @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_random_walks_nx( +def test_node2vec_nx( graph_file, directed ): + Gnx = utils.generate_nx_graph_from_file(graph_file, directed=directed, + edgevals=True) + k = random.randint(1, 10) + max_depth = 3 + start_vertices = random.sample(range(Gnx.number_of_nodes()), k) df, seeds = calc_node2vec( - graph_file, - directed=directed, - max_depth=3, + Gnx, + start_vertices, + max_depth, use_padding=True, p=0.8, q=0.5 ) + vertex_paths, edge_weights, vertex_path_sizes = df + assert vertex_paths.size == max_depth * k + # NOTE: This below assertion will pass once PR #2089 is merged + # assert edge_weights.size == (max_depth - 1) * k + assert vertex_path_sizes.sum() == vertex_paths.size +""" From 8d6aad4f372881eeb02d4439e3376012a28bd6b0 Mon Sep 17 00:00:00 2001 From: betochimas Date: Wed, 23 Feb 2022 10:21:49 -0800 Subject: [PATCH 12/20] Updated docstring --- python/cugraph/cugraph/sampling/node2vec.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 383233e10fc..0b032476f72 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -35,11 +35,14 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): The graph can be either directed (DiGraph) or undirected (Graph). start_vertices: int or list or cudf.Series + A single node or a list or a cudf.Series of nodes from which to run + the random walks max_depth: int, optional The maximum depth of the random walks use_padding: bool, optional + If True, padded paths are returned else coalesced paths are returned p: double, optional Return factor, which represents the likelihood of backtracking to @@ -72,8 +75,8 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): >>> G = cugraph.Graph() >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2') >>> start_vertices = cudf.Series([0, 2]) - >>> paths, weights, path_sizes = cugraph.node2vec(G, sources, 3, True, - ... 0.8, 0.5) + >>> paths, weights, path_sizes = cugraph.node2vec(G, start_vertices, 3, + ... 
True, 0.8, 0.5) """ if (type(max_depth) != int) or (max_depth < 1): From a640a549a0ead6f00da2c286185f31d91d7d38aa Mon Sep 17 00:00:00 2001 From: betochimas Date: Tue, 1 Mar 2022 14:00:08 -0800 Subject: [PATCH 13/20] Removed slower type check and redundant cupy array cast --- python/cugraph/cugraph/sampling/node2vec.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 0b032476f72..0b54754967a 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -12,7 +12,6 @@ # limitations under the License. import pylibcugraph -import cupy import cudf from cugraph.utilities import ensure_cugraph_obj_for_nx @@ -79,9 +78,9 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): ... True, 0.8, 0.5) """ - if (type(max_depth) != int) or (max_depth < 1): + if (not isinstance(max_depth, int)) or (max_depth < 1): raise ValueError("'max_depth' must be a positive integer") - if (type(use_padding) != bool): + if (not isinstance(use_padding, bool)): raise ValueError("'use_padding' must be a bool") if (p is None) or (p <= 0.0): raise ValueError("'p' must be a positive double") @@ -90,7 +89,7 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): G, _ = ensure_cugraph_obj_for_nx(G) - if start_vertices is int: + if isinstance(start_vertices, int): start_vertices = [start_vertices] if isinstance(start_vertices, list): @@ -99,8 +98,7 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): if G.renumbered is True: if isinstance(start_vertices, cudf.DataFrame): start_vertices = G.lookup_internal_vertex_id( - start_vertices, - start_vertices.columns) + start_vertices, start_vertices.columns) else: start_vertices = G.lookup_internal_vertex_id(start_vertices) @@ -108,11 +106,6 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): dsts = G.edgelist.edgelist_df['dst'] weights = G.edgelist.edgelist_df['weights'] - srcs = cupy.asarray(srcs) - dsts = cupy.asarray(dsts) - weights = cupy.asarray(weights) - start_vertices = cupy.asarray(start_vertices) - resource_handle = pylibcugraph.experimental.ResourceHandle() graph_props = pylibcugraph.experimental.GraphProperties( is_multigraph=G.is_multigraph()) From e0357880c72425e282c42088426ebe2dc03376cc Mon Sep 17 00:00:00 2001 From: betochimas Date: Wed, 2 Mar 2022 09:39:11 -0800 Subject: [PATCH 14/20] Replaced source_array with seed_array --- python/pylibcugraph/pylibcugraph/node2vec.pyx | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/node2vec.pyx b/python/pylibcugraph/pylibcugraph/node2vec.pyx index 514ba1a08e3..a7b42fce6f4 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec.pyx @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, _GPUGraph graph, - src_array, + seed_array, size_t max_depth, bool_t compress_result, double p, @@ -74,8 +74,8 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, graph : SGGraph The input graph. - src_array: device array type - Device array containing the pointer to the array of source vertices. + seed_array: device array type + Device array containing the pointer to the array of seed vertices. 
max_depth : size_t Maximum number of vertices in generated path @@ -135,7 +135,7 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, except ModuleNotFoundError: raise RuntimeError("node2vec requires the numpy package, which could not " "be imported") - assert_CAI_type(src_array, "src_array") + assert_CAI_type(seed_array, "seed_array") cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ resource_handle.c_resource_handle_ptr @@ -145,18 +145,17 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, cdef cugraph_error_code_t error_code cdef cugraph_error_t* error_ptr - cdef uintptr_t cai_srcs_ptr = \ - src_array.__cuda_array_interface__["data"][0] - cdef cugraph_type_erased_device_array_view_t* srcs_view_ptr = \ + cdef uintptr_t cai_seed_ptr = \ + seed_array.__cuda_array_interface__["data"][0] + cdef cugraph_type_erased_device_array_view_t* seed_view_ptr = \ cugraph_type_erased_device_array_view_create( - cai_srcs_ptr, - len(src_array), - get_c_type_from_numpy_type(src_array.dtype)) - + cai_seed_ptr, + len(seed_array), + get_c_type_from_numpy_type(seed_array.dtype)) error_code = cugraph_node2vec(c_resource_handle_ptr, c_graph_ptr, - srcs_view_ptr, + seed_view_ptr, max_depth, compress_result, p, From 44864e85f29bbfa9484492026b6435cadc1470fc Mon Sep 17 00:00:00 2001 From: betochimas Date: Wed, 2 Mar 2022 17:13:57 -0800 Subject: [PATCH 15/20] Resolving part of PR review, mainly description and checks --- python/cugraph/cugraph/sampling/node2vec.py | 26 +++++++++++++------ .../cugraph/cugraph/sampling/random_walks.py | 2 +- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 0b54754967a..c1ca82ea08a 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -16,7 +16,12 @@ from cugraph.utilities import ensure_cugraph_obj_for_nx -def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): +def node2vec(G, + start_vertices, + max_depth=None, + use_padding=False, + p=1.0, + q=1.0): """ Computes random walks for each node in 'start_vertices', under the node2vec sampling framework. @@ -32,28 +37,31 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): ---------- G : cuGraph.Graph or networkx.Graph The graph can be either directed (DiGraph) or undirected (Graph). + Weights in the graph are ignored. - start_vertices: int or list or cudf.Series + start_vertices: int or list or cudf.Series or cudf.DataFrame A single node or a list or a cudf.Series of nodes from which to run - the random walks + the random walks. In case of multi-column vertices it should be + a cudf.DataFrame - max_depth: int, optional + max_depth: int The maximum depth of the random walks - use_padding: bool, optional + use_padding: bool, optional (default=False) If True, padded paths are returned else coalesced paths are returned - p: double, optional + p: double, optional (default=1.0, [0 < p]) Return factor, which represents the likelihood of backtracking to a previous node in the walk. A higher value makes it less likely to sample a previously visited node, while a lower value makes it more - likely to backtrack, making the walk "local" + likely to backtrack, making the walk "local". A positive double. - q: double, optional + q: double, optional (default=1.0, [0 < q]) In-out factor, which represents the likelihood of visiting nodes closer or further from the outgoing node. 
If q > 1, the random walk is likelier to visit nodes closer to the outgoing node. If q < 1, the random walk is likelier to visit nodes further from the outgoing node. + A positive double. Returns ------- @@ -78,6 +86,8 @@ def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0): ... True, 0.8, 0.5) """ + if max_depth is None: + raise TypeError("must specify a 'max_depth'") if (not isinstance(max_depth, int)) or (max_depth < 1): raise ValueError("'max_depth' must be a positive integer") if (not isinstance(use_padding, bool)): diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index f531c152ad9..d7ce6057049 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -36,7 +36,7 @@ def random_walks(G, the random walks. In case of multi-column vertices it should be a cudf.DataFrame - max_depth : int, optional (default=None) + max_depth : int The maximum depth of the random walks use_padding : bool, optional (default=False) From 621dc95f5062e9d8e3d3478b17f62c512715f142 Mon Sep 17 00:00:00 2001 From: betochimas Date: Mon, 7 Mar 2022 16:38:43 -0800 Subject: [PATCH 16/20] Testing now checks individual walks are valid --- python/cugraph/cugraph/sampling/node2vec.py | 9 +- python/cugraph/cugraph/tests/test_node2vec.py | 84 ++++++++++--------- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index c1ca82ea08a..914b2e31028 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -86,9 +86,7 @@ def node2vec(G, ... True, 0.8, 0.5) """ - if max_depth is None: - raise TypeError("must specify a 'max_depth'") - if (not isinstance(max_depth, int)) or (max_depth < 1): + if (max_depth is None) or (max_depth < 1): raise ValueError("'max_depth' must be a positive integer") if (not isinstance(use_padding, bool)): raise ValueError("'use_padding' must be a bool") @@ -120,9 +118,12 @@ def node2vec(G, graph_props = pylibcugraph.experimental.GraphProperties( is_multigraph=G.is_multigraph()) store_transposed = False - renumber = G.renumbered + renumber = False do_expensive_check = False + # FIXME: If input graph is not renumbered, then SGGraph creation + # causes incorrect vertices to be returned when computing pylib + # version of node2vec sg = pylibcugraph.experimental.SGGraph(resource_handle, graph_props, srcs, dsts, weights, store_transposed, renumber, diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index 1c24c644344..d89f71cc96e 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -61,7 +61,6 @@ def calc_node2vec(G, vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec( G, start_vertices, max_depth, use_padding, p, q) - return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices @@ -84,11 +83,26 @@ def test_node2vec_coalesced( p=0.8, q=0.5 ) - # Check that weights match up with paths vertex_paths, edge_weights, vertex_path_sizes = df + # Check that output sizes are as expected assert vertex_paths.size == max_depth * k - # NOTE: This below assertion will pass once PR #2089 is merged - # assert edge_weights.size == (max_depth - 1) * k + assert edge_weights.size == (max_depth - 1) * k + # Check that weights match up with paths + err = 0 + for i in range(k): + for j in range(max_depth - 1): + # 
weight = edge_weights[i * (max_depth - 1) + j] + u = vertex_paths[i * max_depth + j] + v = vertex_paths[i * max_depth + j + 1] + # Walk not found in edgelist + if (not G.has_edge(u, v)): + err += 1 + # FIXME: Checking weights is buggy + # Corresponding weight to edge is not correct + # expr = "(src == {} and dst == {})".format(u, v) + # if not (G.edgelist.edgelist_df.query(expr)["weights"] == weight): + # err += 1 + assert err == 0 @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @@ -111,32 +125,51 @@ def test_node2vec_padded( q=0.5 ) vertex_paths, edge_weights, vertex_path_sizes = df + # Check that output sizes are as expected assert vertex_paths.size == max_depth * k - # NOTE: This below assertion will pass once PR #2089 is merged - # assert edge_weights.size == (max_depth - 1) * k + assert edge_weights.size == (max_depth - 1) * k assert vertex_path_sizes.sum() == vertex_paths.size # Check that weights match up with paths - - # Check that path sizes matches up correctly with paths + err = 0 + path_start = 0 + for i in range(k): + for j in range(max_depth - 1): + # weight = edge_weights[i * (max_depth - 1) + j] + u = vertex_paths[i * max_depth + j] + v = vertex_paths[i * max_depth + j + 1] + # Walk not found in edgelist + if (not G.has_edge(u, v)): + err += 1 + # FIXME: Checking weights is buggy + # Corresponding weight to edge is not correct + # expr = "(src == {} and dst == {})".format(u, v) + # if not (G.edgelist.edgelist_df.query(expr)["weights"] == weight): + # err += 1 + # Check that path sizes matches up correctly with paths + if vertex_paths[i * max_depth] != seeds[i]: + err += 1 + path_start += vertex_path_sizes[i] + assert err == 0 @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("max_depth", [None, -1]) @pytest.mark.parametrize("p", [None, -1]) +@pytest.mark.parametrize("q", [None, -1]) def test_node2vec_invalid( graph_file, directed, max_depth, - p + p, + q ): G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, edgevals=True) k = random.randint(1, 10) start_vertices = random.sample(range(G.number_of_vertices()), k) - # Tests for invalid max depth, p, and q + # Tests for invalid p and q use_padding = True - q = 1.0 with pytest.raises(ValueError): df, seeds = calc_node2vec( G, @@ -146,32 +179,3 @@ def test_node2vec_invalid( p=p, q=q ) - - -# FIXME: NetworkX Graphs not supported currently -""" -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) -@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_node2vec_nx( - graph_file, - directed -): - Gnx = utils.generate_nx_graph_from_file(graph_file, directed=directed, - edgevals=True) - k = random.randint(1, 10) - max_depth = 3 - start_vertices = random.sample(range(Gnx.number_of_nodes()), k) - df, seeds = calc_node2vec( - Gnx, - start_vertices, - max_depth, - use_padding=True, - p=0.8, - q=0.5 - ) - vertex_paths, edge_weights, vertex_path_sizes = df - assert vertex_paths.size == max_depth * k - # NOTE: This below assertion will pass once PR #2089 is merged - # assert edge_weights.size == (max_depth - 1) * k - assert vertex_path_sizes.sum() == vertex_paths.size -""" From bf1e221e245860b1323a050ba5ec9f47cc09248b Mon Sep 17 00:00:00 2001 From: betochimas Date: Wed, 9 Mar 2022 17:19:37 -0800 Subject: [PATCH 17/20] More efficient node2vec testing, other review changes --- python/cugraph/cugraph/sampling/node2vec.py | 22 ++++----- python/cugraph/cugraph/tests/test_node2vec.py | 47 
++++++++++--------- python/pylibcugraph/pylibcugraph/node2vec.pyx | 5 -- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 914b2e31028..130a3d263d4 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -50,18 +50,18 @@ def node2vec(G, use_padding: bool, optional (default=False) If True, padded paths are returned else coalesced paths are returned - p: double, optional (default=1.0, [0 < p]) + p: float, optional (default=1.0, [0 < p]) Return factor, which represents the likelihood of backtracking to a previous node in the walk. A higher value makes it less likely to sample a previously visited node, while a lower value makes it more - likely to backtrack, making the walk "local". A positive double. + likely to backtrack, making the walk "local". A positive float. - q: double, optional (default=1.0, [0 < q]) + q: float, optional (default=1.0, [0 < q]) In-out factor, which represents the likelihood of visiting nodes closer or further from the outgoing node. If q > 1, the random walk is likelier to visit nodes closer to the outgoing node. If q < 1, the random walk is likelier to visit nodes further from the outgoing node. - A positive double. + A positive float. Returns ------- @@ -86,14 +86,14 @@ def node2vec(G, ... True, 0.8, 0.5) """ - if (max_depth is None) or (max_depth < 1): - raise ValueError("'max_depth' must be a positive integer") + if (not isinstance(max_depth, int)) or (max_depth < 1): + raise ValueError(f"'max_depth' must be a positive integer, got: {max_depth}") if (not isinstance(use_padding, bool)): - raise ValueError("'use_padding' must be a bool") - if (p is None) or (p <= 0.0): - raise ValueError("'p' must be a positive double") - if (q is None) or (q <= 0.0): - raise ValueError("'q' must be a positive double") + raise ValueError(f"'use_padding' must be a bool, got: {use_padding}") + if (not isinstance(p, float)) or (p <= 0.0): + raise ValueError(f"'p' must be a positive float, got: {p}") + if (not isinstance(q, float)) or (q <= 0.0): + raise ValueError(f"'q' must be a positive float, got: {q}") G, _ = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index d89f71cc96e..05dd0fb0cc6 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -25,7 +25,7 @@ # ============================================================================= DIRECTED_GRAPH_OPTIONS = [False, True] DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] - +KARATE = DATASETS_SMALL[0][0][0] # ============================================================================= # Pytest Setup / Teardown - called for each test function @@ -152,30 +152,33 @@ def test_node2vec_padded( assert err == 0 -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) -@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -@pytest.mark.parametrize("max_depth", [None, -1]) -@pytest.mark.parametrize("p", [None, -1]) -@pytest.mark.parametrize("q", [None, -1]) +@pytest.mark.parametrize("graph_file", [KARATE]) def test_node2vec_invalid( - graph_file, - directed, - max_depth, - p, - q + graph_file ): - G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, + G = utils.generate_cugraph_graph_from_file(graph_file, directed=True, edgevals=True) k = random.randint(1, 10) start_vertices = 
random.sample(range(G.number_of_vertices()), k) - # Tests for invalid p and q use_padding = True - with pytest.raises(ValueError): - df, seeds = calc_node2vec( - G, - start_vertices, - max_depth=max_depth, - use_padding=use_padding, - p=p, - q=q - ) + max_depth = 1 + p = 1 + q = 1 + invalid_max_depths = [None, -1, "1", 4.5] + invalid_pqs = [None, -1, "1"] + + # Tests for invalid max_depth + for bad_depth in invalid_max_depths: + with pytest.raises(ValueError): + df, seeds = calc_node2vec(G, start_vertices, max_depth=bad_depth, + use_padding=use_padding, p=p, q=q) + # Tests for invalid p + for bad_p in invalid_pqs: + with pytest.raises(ValueError): + df, seeds = calc_node2vec(G, start_vertices, max_depth=max_depth, + use_padding=use_padding, p=bad_p, q=q) + # Tests for invalid q + for bad_q in invalid_pqs: + with pytest.raises(ValueError): + df, seeds = calc_node2vec(G, start_vertices, max_depth=max_depth, + use_padding=use_padding, p=p, q=bad_q) diff --git a/python/pylibcugraph/pylibcugraph/node2vec.pyx b/python/pylibcugraph/pylibcugraph/node2vec.pyx index a7b42fce6f4..be2b0259f2a 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec.pyx @@ -130,11 +130,6 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, except ModuleNotFoundError: raise RuntimeError("node2vec requires the cupy package, which could not " "be imported") - try: - import numpy - except ModuleNotFoundError: - raise RuntimeError("node2vec requires the numpy package, which could not " - "be imported") assert_CAI_type(seed_array, "seed_array") cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ From 9100f776f37d67e263ded5bb4c625a7b8461a0ac Mon Sep 17 00:00:00 2001 From: betochimas Date: Wed, 9 Mar 2022 17:22:47 -0800 Subject: [PATCH 18/20] CI checks + edits --- python/cugraph/cugraph/sampling/node2vec.py | 3 ++- python/cugraph/cugraph/tests/test_node2vec.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 130a3d263d4..86ad21271fa 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -87,7 +87,8 @@ def node2vec(G, """ if (not isinstance(max_depth, int)) or (max_depth < 1): - raise ValueError(f"'max_depth' must be a positive integer, got: {max_depth}") + raise ValueError(f"'max_depth' must be a positive integer, \ + got: {max_depth}") if (not isinstance(use_padding, bool)): raise ValueError(f"'use_padding' must be a bool, got: {use_padding}") if (not isinstance(p, float)) or (p <= 0.0): diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index 05dd0fb0cc6..756f95baa21 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -27,6 +27,7 @@ DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] KARATE = DATASETS_SMALL[0][0][0] + # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= From 308984c02bbc90c365618dcc4d157d1469c99d17 Mon Sep 17 00:00:00 2001 From: betochimas Date: Thu, 10 Mar 2022 09:58:14 -0800 Subject: [PATCH 19/20] Made threshold vals for force atlas 2 consistent over cpp and python testing --- cpp/tests/layout/force_atlas2_test.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/cpp/tests/layout/force_atlas2_test.cu b/cpp/tests/layout/force_atlas2_test.cu index 086bf49036c..e843a66841a 100644 --- a/cpp/tests/layout/force_atlas2_test.cu +++ b/cpp/tests/layout/force_atlas2_test.cu @@ -230,10 +230,10 @@ TEST_P(Tests_Force_Atlas2, CheckFP64_T) { run_current_test(GetParam()); // --gtest_filter=*simple_test* INSTANTIATE_TEST_SUITE_P(simple_test, Tests_Force_Atlas2, - ::testing::Values(Force_Atlas2_Usecase("test/datasets/karate.mtx", 0.73), - Force_Atlas2_Usecase("test/datasets/dolphins.mtx", 0.69), - Force_Atlas2_Usecase("test/datasets/polbooks.mtx", 0.76), + ::testing::Values(Force_Atlas2_Usecase("test/datasets/karate.mtx", 0.70), + Force_Atlas2_Usecase("test/datasets/dolphins.mtx", 0.66), + Force_Atlas2_Usecase("test/datasets/polbooks.mtx", 0.75), Force_Atlas2_Usecase("test/datasets/netscience.mtx", - 0.80))); + 0.66))); CUGRAPH_TEST_PROGRAM_MAIN() From dacbba8bc88191e2d8bc463022e892d09c76b22d Mon Sep 17 00:00:00 2001 From: betochimas Date: Fri, 11 Mar 2022 09:52:02 -0800 Subject: [PATCH 20/20] Type edit in test_node2vec --- python/cugraph/cugraph/tests/test_node2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index 756f95baa21..114ced7666f 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -54,9 +54,9 @@ def calc_node2vec(G, use_padding : bool - p : double + p : float - q : double + q : float """ assert G is not None
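
For reference, below is a minimal usage sketch of the cugraph.node2vec API in its final form from this series. It mirrors the docstring example and the padded-path assertions in the tests above; the dataset path, the choice of start vertices, and the p/q values are illustrative assumptions, not part of the patches.

    import cudf
    import cugraph

    # Assumed local copy of the karate dataset used by the tests above.
    M = cudf.read_csv("datasets/karate.csv", delimiter=" ",
                      dtype=["int32", "int32", "float32"], header=None)
    G = cugraph.Graph()
    G.from_cudf_edgelist(M, source="0", destination="1", edge_attr="2")

    # Two seed vertices, walks of depth 3, padded output, with
    # return factor p=0.8 and in-out factor q=0.5 (illustrative values).
    start_vertices = cudf.Series([0, 2])
    paths, weights, path_sizes = cugraph.node2vec(
        G, start_vertices, max_depth=3, use_padding=True, p=0.8, q=0.5)

    # With padding, each walk occupies max_depth entries in `paths` and
    # max_depth - 1 entries in `weights`, as asserted in the tests above.
    assert len(paths) == 3 * len(start_vertices)
    assert len(weights) == 2 * len(start_vertices)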