From e4eb22bf21a189c937736cd74642199410e8d788 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Thu, 27 Jul 2023 14:43:59 +0100 Subject: [PATCH] Refactor edge betweenness centrality (#3672) This PR refactor edge betweenness centrality by enabling it to follow the PLC path closes #3147 Authors: - Joseph Nke (https://github.com/jnke2016) - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3672 --- cpp/src/c_api/centrality_result.cpp | 10 + .../betweenness_centrality_impl.cuh | 29 ++ .../betweenness_centrality_reference.hpp | 38 ++- .../edge_betweenness_centrality_test.cpp | 19 +- python/cugraph/CMakeLists.txt | 1 - .../cugraph/cugraph/centrality/CMakeLists.txt | 25 -- .../centrality/betweenness_centrality.py | 138 ++++----- .../edge_betweenness_centrality.pxd | 31 -- .../edge_betweenness_centrality_wrapper.pyx | 224 -------------- python/cugraph/cugraph/dask/__init__.py | 1 + .../cugraph/dask/centrality/__init__.py | 17 ++ .../dask/centrality/betweenness_centrality.py | 285 +++++++++++++++--- .../simpleDistributedGraph.py | 14 +- .../cugraph/cugraph/structure/symmetrize.py | 3 + ...st_batch_edge_betweenness_centrality_mg.py | 16 +- .../centrality/test_betweenness_centrality.py | 2 +- .../test_edge_betweenness_centrality.py | 56 ++-- .../test_edge_betweenness_centrality_mg.py | 231 ++++++++++++++ .../cugraph/tests/structure/test_graph.py | 7 +- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 1 + python/pylibcugraph/pylibcugraph/__init__.py | 2 + .../_cugraph_c/centrality_algorithms.pxd | 42 +++ .../edge_betweenness_centrality.pyx | 197 ++++++++++++ python/pylibcugraph/pylibcugraph/graphs.pxd | 4 +- python/pylibcugraph/pylibcugraph/graphs.pyx | 20 +- .../tests/test_edge_betweenness_centrality.py | 145 +++++++++ 26 files changed, 1101 insertions(+), 457 deletions(-) delete mode 
100644 python/cugraph/cugraph/centrality/CMakeLists.txt delete mode 100644 python/cugraph/cugraph/centrality/edge_betweenness_centrality.pxd delete mode 100644 python/cugraph/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx create mode 100644 python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py create mode 100644 python/pylibcugraph/pylibcugraph/edge_betweenness_centrality.pyx create mode 100644 python/pylibcugraph/pylibcugraph/tests/test_edge_betweenness_centrality.py diff --git a/cpp/src/c_api/centrality_result.cpp b/cpp/src/c_api/centrality_result.cpp index 08e7c0341f2..75f10fcbbdb 100644 --- a/cpp/src/c_api/centrality_result.cpp +++ b/cpp/src/c_api/centrality_result.cpp @@ -81,6 +81,15 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_edge_centrality_resu internal_pointer->values_->view()); } +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edge_centrality_result_get_edge_ids( + cugraph_edge_centrality_result_t* result) +{ + auto internal_pointer = + reinterpret_cast(result); + return reinterpret_cast( + internal_pointer->edge_ids_->view()); +} + extern "C" void cugraph_edge_centrality_result_free(cugraph_edge_centrality_result_t* result) { auto internal_pointer = @@ -88,5 +97,6 @@ extern "C" void cugraph_edge_centrality_result_free(cugraph_edge_centrality_resu delete internal_pointer->src_ids_; delete internal_pointer->dst_ids_; delete internal_pointer->values_; + delete internal_pointer->edge_ids_; delete internal_pointer; } diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 0a87531d6ca..e496344583c 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -647,6 +647,35 @@ edge_betweenness_centrality( do_expensive_check); } + std::optional scale_factor{std::nullopt}; + + if (normalized) { + weight_t n = static_cast(graph_view.number_of_vertices()); + scale_factor 
= n * (n - 1); + } else if (graph_view.is_symmetric()) + scale_factor = weight_t{2}; + + if (scale_factor) { + if (graph_view.number_of_vertices() > 1) { + if (static_cast(num_sources) < graph_view.number_of_vertices()) { + (*scale_factor) *= static_cast(num_sources) / + static_cast(graph_view.number_of_vertices()); + } + + auto firsts = centralities.view().value_firsts(); + auto counts = centralities.view().edge_counts(); + auto mutable_firsts = centralities.mutable_view().value_firsts(); + for (size_t k = 0; k < counts.size(); k++) { + thrust::transform( + handle.get_thrust_policy(), + firsts[k], + firsts[k] + counts[k], + mutable_firsts[k], + [sf = *scale_factor] __device__(auto centrality) { return centrality / sf; }); + } + } + } + return centralities; } diff --git a/cpp/tests/centrality/betweenness_centrality_reference.hpp b/cpp/tests/centrality/betweenness_centrality_reference.hpp index 3c60020265a..0f1a4d6adf3 100644 --- a/cpp/tests/centrality/betweenness_centrality_reference.hpp +++ b/cpp/tests/centrality/betweenness_centrality_reference.hpp @@ -166,6 +166,37 @@ void reference_rescale(result_t* result, } } +template +void reference_edge_rescale(result_t* result, + bool directed, + bool normalize, + size_t const number_of_vertices, + size_t const number_of_edges, + size_t const number_of_sources) +{ + result_t rescale_factor = static_cast(1); + result_t casted_number_of_vertices = static_cast(number_of_vertices); + result_t casted_number_of_sources = static_cast(number_of_sources); + + if (normalize) { + if (number_of_edges > 2) { + rescale_factor /= ((casted_number_of_vertices) * (casted_number_of_vertices - 1)); + } + } else { + if (!directed) { rescale_factor /= static_cast(2); } + } + + if (rescale_factor != result_t{1}) { + if (number_of_sources > 0) { + rescale_factor *= (casted_number_of_vertices / casted_number_of_sources); + } + + for (auto idx = 0; idx < number_of_edges; ++idx) { + result[idx] *= rescale_factor; + } + } +} + template std::vector 
betweenness_centrality_reference( std::vector const& offsets, @@ -213,7 +244,9 @@ std::vector edge_betweenness_centrality_reference( std::vector const& offsets, std::vector const& indices, std::optional> const& wgt, - std::vector const& seeds) + std::vector const& seeds, + bool directed, + bool normalize) { std::vector result; if (indices.size() > 0) { @@ -234,6 +267,9 @@ std::vector edge_betweenness_centrality_reference( ref_edge_accumulation(result, offsets, indices, S, pred, sigmas, deltas, s); } } + + reference_edge_rescale( + result.data(), directed, normalize, offsets.size() - 1, indices.size(), seeds.size()); return result; } } // namespace diff --git a/cpp/tests/centrality/edge_betweenness_centrality_test.cpp b/cpp/tests/centrality/edge_betweenness_centrality_test.cpp index e4d22ff069c..be153bb41f8 100644 --- a/cpp/tests/centrality/edge_betweenness_centrality_test.cpp +++ b/cpp/tests/centrality/edge_betweenness_centrality_test.cpp @@ -122,7 +122,12 @@ class Tests_EdgeBetweennessCentrality auto h_seeds = cugraph::test::to_host(handle, d_seeds); auto h_reference_centralities = - edge_betweenness_centrality_reference(h_offsets, h_indices, h_wgt, h_seeds); + edge_betweenness_centrality_reference(h_offsets, + h_indices, + h_wgt, + h_seeds, + !graph_view.is_symmetric(), + betweenness_usecase.normalized); rmm::device_uvector d_reference_src_vertex_ids(0, handle.get_stream()); rmm::device_uvector d_reference_dst_vertex_ids(0, handle.get_stream()); @@ -183,7 +188,9 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( // enable correctness checks ::testing::Values(EdgeBetweennessCentrality_Usecase{20, false, false, true}, - EdgeBetweennessCentrality_Usecase{20, false, true, true}), + EdgeBetweennessCentrality_Usecase{20, false, true, true}, + EdgeBetweennessCentrality_Usecase{20, true, false, true}, + EdgeBetweennessCentrality_Usecase{20, true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), 
cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); @@ -194,7 +201,9 @@ INSTANTIATE_TEST_SUITE_P( // enable correctness checks ::testing::Combine( ::testing::Values(EdgeBetweennessCentrality_Usecase{50, false, false, true}, - EdgeBetweennessCentrality_Usecase{50, false, true, true}), + EdgeBetweennessCentrality_Usecase{50, false, true, true}, + EdgeBetweennessCentrality_Usecase{50, true, false, true}, + EdgeBetweennessCentrality_Usecase{50, true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); INSTANTIATE_TEST_SUITE_P( @@ -207,7 +216,9 @@ INSTANTIATE_TEST_SUITE_P( // disable correctness checks for large graphs ::testing::Combine( ::testing::Values(EdgeBetweennessCentrality_Usecase{500, false, false, false}, - EdgeBetweennessCentrality_Usecase{500, false, true, false}), + EdgeBetweennessCentrality_Usecase{500, false, true, false}, + EdgeBetweennessCentrality_Usecase{500, true, false, false}, + EdgeBetweennessCentrality_Usecase{500, true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index 48815792553..9094a73fdeb 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -82,7 +82,6 @@ endif() rapids_cython_init() -add_subdirectory(cugraph/centrality) add_subdirectory(cugraph/community) add_subdirectory(cugraph/components) add_subdirectory(cugraph/dask/comms) diff --git a/python/cugraph/cugraph/centrality/CMakeLists.txt b/python/cugraph/cugraph/centrality/CMakeLists.txt deleted file mode 100644 index f5036cca0a8..00000000000 --- a/python/cugraph/cugraph/centrality/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources - edge_betweenness_centrality_wrapper.pyx -) -set(linked_libraries cugraph::cugraph) - -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX centrality_ - ASSOCIATED_TARGETS cugraph -) diff --git a/python/cugraph/cugraph/centrality/betweenness_centrality.py b/python/cugraph/cugraph/centrality/betweenness_centrality.py index 56fa7fea9a7..63af410e06c 100644 --- a/python/cugraph/cugraph/centrality/betweenness_centrality.py +++ b/python/cugraph/cugraph/centrality/betweenness_centrality.py @@ -13,9 +13,9 @@ from pylibcugraph import ( betweenness_centrality as pylibcugraph_betweenness_centrality, + edge_betweenness_centrality as pylibcugraph_edge_betweenness_centrality, ResourceHandle, ) -from cugraph.centrality import edge_betweenness_centrality_wrapper from cugraph.utilities import ( df_edge_score_to_dictionary, @@ -25,7 +25,6 @@ import cudf import warnings import numpy as np -import random from typing import Union @@ -49,25 +48,24 @@ def betweenness_centrality( To improve performance. rather than doing an all-pair shortest path, a sample of k starting vertices can be used. - CuGraph does not currently support the 'endpoints' and 'weight' parameters - as seen in the corresponding networkX call. + CuGraph does not currently support 'weight' parameters. 
Parameters ---------- G : cuGraph.Graph or networkx.Graph The graph can be either directed (Graph(directed=True)) or undirected. - Weights in the graph are ignored, the current implementation uses a parallel - variation of the Brandes Algorithm (2001) to compute exact or approximate - betweenness. If weights are provided in the edgelist, they will not be - used. + The current implementation uses a parallel variation of the Brandes + Algorithm (2001) to compute exact or approximate betweenness. + If weights are provided in the edgelist, they will not be used. k : int, list or cudf object or None, optional (default=None) - If k is not None, use k node samples to estimate betweenness. Higher - values give better approximation. If k is either a list or a cudf, use its - content for estimation: it contain vertex identifiers. If k is None - (the default), all the vertices are used to estimate betweenness. Vertices - obtained through sampling or defined as a list will be used as sources for - traversals inside the algorithm. + If k is not None, use k node samples to estimate betweenness. Higher + values give better approximation. If k is either a list, a cudf DataFrame, + or a dask_cudf DataFrame, then its contents are assumed to be vertex + identifiers to be used for estimation. If k is None (the default), all the + vertices are used to estimate betweenness. Vertices obtained through + sampling or defined as a list will be used as sources for traversals inside + the algorithm. normalized : bool, optional (default=True) If true, the betweenness values are normalized by @@ -137,7 +135,6 @@ def betweenness_centrality( G, isNx = ensure_cugraph_obj_for_nx(G) - # FIXME: Should we raise an error if the graph created is weighted? 
if weight is not None: raise NotImplementedError( "weighted implementation of betweenness " @@ -218,29 +215,28 @@ def edge_betweenness_centrality( To improve performance, rather than doing an all-pair shortest path, a sample of k starting vertices can be used. - CuGraph does not currently support the 'weight' parameter - as seen in the corresponding networkX call. + CuGraph does not currently support the 'weight' parameter. Parameters ---------- G : cuGraph.Graph or networkx.Graph The graph can be either directed (Graph(directed=True)) or undirected. - Weights in the graph are ignored, the current implementation uses - BFS traversals. Use weight parameter if weights need to be considered - (currently not supported) + The current implementation uses BFS traversals. Use weight parameter + if weights need to be considered (currently not supported). k : int or list or None, optional (default=None) - If k is not None, use k node samples to estimate betweenness. Higher - values give better approximation. - If k is a list, use the content of the list for estimation: the list - should contain vertices identifiers. - Vertices obtained through sampling or defined as a list will be used as - sources for traversals inside the algorithm. + If k is not None, use k node samples to estimate betweenness. Higher + values give better approximation. If k is either a list, a cudf DataFrame, + or a dask_cudf DataFrame, then its contents are assumed to be vertex + identifiers to be used for estimation. If k is None (the default), all the + vertices are used to estimate betweenness. Vertices obtained through + sampling or defined as a list will be used as sources for traversals inside + the algorithm. 
normalized : bool, optional (default=True) If true, the betweenness values are normalized by - 2 / (n * (n - 1)) for undirected Graphs, and - 1 / (n * (n - 1)) for directed Graphs + __2 / (n * (n - 1))__ for undirected Graphs, and + __1 / (n * (n - 1))__ for directed Graphs where n is the number of nodes in G. Normalization will ensure that values are in [0, 1], this normalization scales for the highest possible value where one @@ -278,13 +274,11 @@ def edge_betweenness_centrality( df['dst'] : cudf.Series Contains the vertex identifiers of the destination of each edge - df['edge_betweenness_centrality'] : cudf.Series + df['betweenness_centrality'] : cudf.Series Contains the betweenness centrality of edges - When using undirected graphs, 'src' and 'dst' only contains elements - such that 'src' < 'dst', which might differ from networkx and user's - input. Namely edge (1 -> 0) is transformed into (0 -> 1) but - contains the betweenness centrality of edge (1 -> 0). + df["edge_id"] : cudf.Series + Contains the edge ids of edges if present. 
Examples @@ -303,16 +297,47 @@ def edge_betweenness_centrality( raise TypeError("result type can only be np.float32 or np.float64") G, isNx = ensure_cugraph_obj_for_nx(G) - vertices = _initialize_vertices(G, k, seed) - df = edge_betweenness_centrality_wrapper.edge_betweenness_centrality( - G, normalized, weight, vertices, result_dtype + if not isinstance(k, (cudf.DataFrame, cudf.Series)): + if isinstance(k, list): + vertex_dtype = G.edgelist.edgelist_df.dtypes[0] + k = cudf.Series(k, dtype=vertex_dtype) + + if isinstance(k, (cudf.DataFrame, cudf.Series)): + if G.renumbered: + k = G.lookup_internal_vertex_id(k) + + # FIXME: src, dst and edge_ids need to be of the same type which should not + # be the case + + ( + src_vertices, + dst_vertices, + values, + edge_ids, + ) = pylibcugraph_edge_betweenness_centrality( + resource_handle=ResourceHandle(), + graph=G._plc_graph, + k=k, + random_state=seed, + normalized=normalized, + do_expensive_check=False, ) + df = cudf.DataFrame() + df["src"] = src_vertices + df["dst"] = dst_vertices + df["betweenness_centrality"] = values + if edge_ids is not None: + df["edge_id"] = edge_ids + if G.renumbered: df = G.unrenumber(df, "src") df = G.unrenumber(df, "dst") + if df["betweenness_centrality"].dtype != result_dtype: + df["betweenness_centrality"] = df["betweenness_centrality"].astype(result_dtype) + if G.is_directed() is False: # select the lower triangle of the df based on src/dst vertex value lower_triangle = df["src"] >= df["dst"] @@ -332,44 +357,3 @@ def edge_betweenness_centrality( return df_edge_score_to_dictionary(df, "betweenness_centrality") else: return df - - -# In order to compare with pre-set sources, -# k can either be a list or an integer or None -# int: Generate an random sample with k elements -# list: k become the length of the list and vertices become the content -# None: All the vertices are considered -def _initialize_vertices(G, k: Union[int, list], seed: int) -> np.ndarray: - vertices = None - numpy_vertices = 
None - if k is not None: - if isinstance(k, int): - vertices = _initialize_vertices_from_indices_sampling(G, k, seed) - elif isinstance(k, list): - vertices = _initialize_vertices_from_identifiers_list(G, k) - numpy_vertices = np.array(vertices, dtype=np.int32) - else: - numpy_vertices = np.arange(G.number_of_vertices(), dtype=np.int32) - return numpy_vertices - - -# NOTE: We do not renumber in case k is an int, the sampling is -# not operating on the valid vertices identifiers but their -# indices: -# Example: -# - vertex '2' is missing -# - vertices '0' '1' '3' '4' exist -# - There is a vertex at index 2 (there is not guarantee that it is -# vertice '3' ) -def _initialize_vertices_from_indices_sampling(G, k: int, seed: int) -> list: - random.seed(seed) - vertices = random.sample(range(G.number_of_vertices()), k) - return vertices - - -def _initialize_vertices_from_identifiers_list(G, identifiers: list) -> np.ndarray: - vertices = identifiers - if G.renumbered: - vertices = G.lookup_internal_vertex_id(cudf.Series(vertices)).to_numpy() - - return vertices diff --git a/python/cugraph/cugraph/centrality/edge_betweenness_centrality.pxd b/python/cugraph/cugraph/centrality/edge_betweenness_centrality.pxd deleted file mode 100644 index 7ec20e35b83..00000000000 --- a/python/cugraph/cugraph/centrality/edge_betweenness_centrality.pxd +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * -from libcpp cimport bool - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void edge_betweenness_centrality[VT, ET, WT, result_t]( - const handle_t &handle, - const GraphCSRView[VT, ET, WT] &graph, - result_t *result, - bool normalized, - const WT *weight, - VT k, - const VT *vertices) except + diff --git a/python/cugraph/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx b/python/cugraph/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx deleted file mode 100644 index 8c64dcbf952..00000000000 --- a/python/cugraph/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.centrality.edge_betweenness_centrality cimport edge_betweenness_centrality as c_edge_betweenness_centrality -from cugraph.structure import graph_primtypes_wrapper -from cugraph.structure.graph_primtypes cimport * -from libc.stdint cimport uintptr_t -from libcpp cimport bool -import cudf -import numpy as np -from cugraph.dask.common.mg_utils import get_client -import cugraph.dask.comms.comms as Comms -import dask.distributed - - -def get_output_df(indices, result_dtype): - number_of_edges = len(indices) - df = cudf.DataFrame() - df['src'] = cudf.Series(np.zeros(number_of_edges, dtype=np.int32)) - df['dst'] = indices.copy() - df['betweenness_centrality'] = cudf.Series(np.zeros(number_of_edges, - dtype=result_dtype)) - return df - - -def get_batch(sources, number_of_workers, current_worker): - batch_size = len(sources) // number_of_workers - begin = current_worker * batch_size - end = (current_worker + 1) * batch_size - if current_worker == (number_of_workers - 1): - end = len(sources) - batch = sources[begin:end] - return batch - - -def run_mg_work(input_data, normalized, weights, sources, - result_dtype, session_id): - result = None - - number_of_workers = Comms.get_n_workers(session_id) - worker_idx = Comms.get_worker_id(session_id) - handle = Comms.get_handle(session_id) - - batch = get_batch(sources, number_of_workers, worker_idx) - - result = run_internal_work(handle, input_data, normalized, weights, - batch, result_dtype) - return result - - -def run_internal_work(handle, input_data, normalized, weights, batch, - result_dtype): - cdef uintptr_t c_handle = NULL - cdef uintptr_t c_graph = NULL - cdef uintptr_t c_src_identifier = NULL - cdef uintptr_t c_dst_identifier = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_betweenness = NULL - cdef uintptr_t c_batch = NULL - - cdef uintptr_t c_offsets = NULL - cdef uintptr_t 
c_indices = NULL - cdef uintptr_t c_graph_weights = NULL - - cdef GraphCSRViewDouble graph_double - cdef GraphCSRViewFloat graph_float - - (offsets, indices, graph_weights), is_directed = input_data - - if graph_weights is not None: - c_graph_weights = graph_weights.__cuda_array_interface__['data'][0] - c_offsets = offsets.__cuda_array_interface__['data'][0] - c_indices = indices.__cuda_array_interface__['data'][0] - - number_of_vertices = len(offsets) - 1 - number_of_edges = len(indices) - - result_df = get_output_df(indices, result_dtype) - c_src_identifier = result_df['src'].__cuda_array_interface__['data'][0] - c_dst_identifier = result_df['dst'].__cuda_array_interface__['data'][0] - c_betweenness = result_df['betweenness_centrality'].__cuda_array_interface__['data'][0] - - number_of_sources_in_batch = len(batch) - if result_dtype == np.float64: - graph_double = GraphCSRView[int, int, double]( c_offsets, - c_indices, - c_graph_weights, - number_of_vertices, - number_of_edges) - graph_double.prop.directed = is_directed - c_graph = &graph_double - elif result_dtype == np.float32: - graph_float = GraphCSRView[int, int, float](c_offsets, - c_indices, - c_graph_weights, - number_of_vertices, - number_of_edges) - graph_float.prop.directed = is_directed - c_graph = &graph_float - else: - raise ValueError("result_dtype can only be np.float64 or np.float32") - - if weights is not None: - c_weights = weights.__cuda_array_interface__['data'][0] - c_batch = batch.__array_interface__['data'][0] - c_handle = handle.getHandle() - - run_c_edge_betweenness_centrality(c_handle, - c_graph, - c_betweenness, - normalized, - c_weights, - number_of_sources_in_batch, - c_batch, - result_dtype) - return result_df - - -cdef void run_c_edge_betweenness_centrality(uintptr_t c_handle, - uintptr_t c_graph, - uintptr_t c_betweenness, - bool normalized, - uintptr_t c_weights, - int number_of_sources_in_batch, - uintptr_t c_batch, - result_dtype): - if result_dtype == np.float64: - 
c_edge_betweenness_centrality[int, int, double, double](( c_handle)[0], - ( c_graph)[0], - c_betweenness, - normalized, - c_weights, - number_of_sources_in_batch, - c_batch) - elif result_dtype == np.float32: - c_edge_betweenness_centrality[int, int, float, float](( c_handle)[0], - ( c_graph)[0], - c_betweenness, - normalized, - c_weights, - number_of_sources_in_batch, - c_batch) - else: - raise ValueError("result_dtype can only be np.float64 or np.float32") - -def batch_edge_betweenness_centrality(input_graph, - normalized, - weights, vertices, result_dtype): - client = get_client() - comms = Comms.get_comms() - replicated_adjlists = input_graph.batch_adjlists - work_futures = [client.submit(run_mg_work, - (data, input_graph.is_directed()), - normalized, - weights, - vertices, - result_dtype, - comms.sessionId, - workers=[worker]) for - (worker, data) in replicated_adjlists.items()] - dask.distributed.wait(work_futures) - df = work_futures[0].result() - return df - - -def sg_edge_betweenness_centrality(input_graph, normalized, weights, - vertices, result_dtype): - if not input_graph.adjlist: - input_graph.view_adj_list() - - handle = Comms.get_default_handle() - adjlist = input_graph.adjlist - input_data = ((adjlist.offsets, adjlist.indices, adjlist.weights), - input_graph.is_directed()) - df = run_internal_work(handle, input_data, normalized, weights, - vertices, result_dtype) - return df - - -def edge_betweenness_centrality(input_graph, normalized, weights, - vertices, result_dtype): - """ - Call betweenness centrality - """ - cdef GraphCSRViewDouble graph_double - cdef GraphCSRViewFloat graph_float - - - df = None - - if not input_graph.adjlist: - input_graph.view_adj_list() - - if Comms.is_initialized() and input_graph.batch_enabled == True: - df = batch_edge_betweenness_centrality(input_graph, normalized, - weights, vertices, - result_dtype) - else: - df = sg_edge_betweenness_centrality(input_graph, normalized, - weights, vertices, result_dtype) - - if 
result_dtype == np.float64: - graph_double = get_graph_view[GraphCSRViewDouble](input_graph) - graph_double.get_source_indices((df['src'].__cuda_array_interface__['data'][0])) - elif result_dtype == np.float32: - graph_float = get_graph_view[GraphCSRViewFloat](input_graph) - graph_float.get_source_indices((df['src'].__cuda_array_interface__['data'][0])) - - return df diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index f639856f929..a6958aaaf49 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -28,6 +28,7 @@ from .centrality.eigenvector_centrality import eigenvector_centrality from .cores.core_number import core_number from .centrality.betweenness_centrality import betweenness_centrality +from .centrality.betweenness_centrality import edge_betweenness_centrality from .cores.k_core import k_core from .link_prediction.jaccard import jaccard from .link_prediction.sorensen import sorensen diff --git a/python/cugraph/cugraph/dask/centrality/__init__.py b/python/cugraph/cugraph/dask/centrality/__init__.py index e69de29bb2d..3cbf91040d4 100644 --- a/python/cugraph/cugraph/dask/centrality/__init__.py +++ b/python/cugraph/cugraph/dask/centrality/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from cugraph.centrality.betweenness_centrality import ( + betweenness_centrality, + edge_betweenness_centrality, +) diff --git a/python/cugraph/cugraph/dask/centrality/betweenness_centrality.py b/python/cugraph/cugraph/dask/centrality/betweenness_centrality.py index e048c91f34d..6aa708ea585 100644 --- a/python/cugraph/cugraph/dask/centrality/betweenness_centrality.py +++ b/python/cugraph/cugraph/dask/centrality/betweenness_centrality.py @@ -17,6 +17,7 @@ from pylibcugraph import ( ResourceHandle, betweenness_centrality as pylibcugraph_betweenness_centrality, + edge_betweenness_centrality as pylibcugraph_edge_betweenness_centrality, ) import cugraph.dask.comms.comms as Comms from cugraph.dask.common.input_utils import get_distributed_data @@ -28,14 +29,23 @@ from typing import Union -def convert_to_cudf(cp_arrays: cp.ndarray) -> cudf.DataFrame: +def convert_to_cudf(cp_arrays: cp.ndarray, edge_bc: bool) -> cudf.DataFrame: """ create a cudf DataFrame from cupy arrays """ - cupy_vertices, cupy_values = cp_arrays df = cudf.DataFrame() - df["vertex"] = cupy_vertices - df["betweenness_centrality"] = cupy_values + if edge_bc: + cupy_src_vertices, cupy_dst_vertices, cupy_values, cupy_edge_ids = cp_arrays + df["src"] = cupy_src_vertices + df["dst"] = cupy_dst_vertices + df["betweenness_centrality"] = cupy_values + if cupy_edge_ids is not None: + df["edge_id"] = cupy_edge_ids + + else: + cupy_vertices, cupy_values = cp_arrays + df["vertex"] = cupy_vertices + df["betweenness_centrality"] = cupy_values return df @@ -47,18 +57,29 @@ def _call_plc_betweenness_centrality( normalized: bool, endpoints: bool, do_expensive_check: bool, + edge_bc: bool, ) -> cudf.DataFrame: - cp_arrays = pylibcugraph_betweenness_centrality( - resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), - graph=mg_graph_x, - k=k, - random_state=random_state, - normalized=normalized, - include_endpoints=endpoints, - do_expensive_check=do_expensive_check, - ) - return convert_to_cudf(cp_arrays) + 
if edge_bc: + cp_arrays = pylibcugraph_edge_betweenness_centrality( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + k=k, + random_state=random_state, + normalized=normalized, + do_expensive_check=do_expensive_check, + ) + else: + cp_arrays = pylibcugraph_betweenness_centrality( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + k=k, + random_state=random_state, + normalized=normalized, + include_endpoints=endpoints, + do_expensive_check=do_expensive_check, + ) + return convert_to_cudf(cp_arrays, edge_bc) def _mg_call_plc_betweenness_centrality( @@ -68,8 +89,9 @@ def _mg_call_plc_betweenness_centrality( k: dict, random_state: int, normalized: bool, - endpoints: bool, do_expensive_check: bool, + endpoints: bool = False, + edge_bc: bool = False, ) -> dask_cudf.DataFrame: result = [ @@ -82,6 +104,7 @@ def _mg_call_plc_betweenness_centrality( normalized, endpoints, do_expensive_check, + edge_bc, workers=[w], allow_other_workers=False, pure=False, @@ -89,6 +112,8 @@ def _mg_call_plc_betweenness_centrality( for i, w in enumerate(Comms.get_workers()) ] + wait(result) + ddf = dask_cudf.from_delayed(result, verify_meta=False).persist() wait(ddf) wait([r.release() for r in result]) @@ -101,6 +126,7 @@ def betweenness_centrality( int, list, cudf.Series, cudf.DataFrame, dask_cudf.Series, dask_cudf.DataFrame ] = None, normalized: bool = True, + weight: cudf.DataFrame = None, endpoints: bool = False, random_state: int = None, ) -> dask_cudf.DataFrame: @@ -114,28 +140,39 @@ def betweenness_centrality( To improve performance. rather than doing an all-pair shortest path, a sample of k starting vertices can be used. - CuGraph does not currently support the 'endpoints' and 'weight' parameters - as seen in the corresponding networkX call. + CuGraph does not currently support 'weight' parameters. 
Parameters ---------- input_graph: cuGraph.Graph The graph can be either directed (Graph(directed=True)) or undirected. - Weights in the graph are ignored, the current implementation uses a parallel - variation of the Brandes Algorithm (2001) to compute exact or approximate - betweenness. If weights are provided in the edgelist, they will not be - used. + The current implementation uses a parallel variation of the Brandes + Algorithm (2001) to compute exact or approximate betweenness. + If weights are provided in the edgelist, they will not be used. k : int, list or (dask)cudf object or None, optional (default=None) - If k is not None, use k node samples to estimate betweenness. Higher - values give better approximation. If k is either a list or a (dask)cudf, - use its content for estimation: it contain vertex identifiers. If k is None - (the default), all the vertices are used to estimate betweenness. Vertices - obtained through sampling or defined as a list will be used as sources for - traversals inside the algorithm. + If k is not None, use k node samples to estimate betweenness. Higher + values give better approximation. If k is either a list, a cudf DataFrame, + or a dask_cudf DataFrame, then its contents are assumed to be vertex + identifiers to be used for estimation. If k is None (the default), all the + vertices are used to estimate betweenness. Vertices obtained through + sampling or defined as a list will be used as sources for traversals inside + the algorithm. normalized : bool, optional (default=True) - If True normalize the resulting betweenness centrality values + If True, normalize the resulting betweenness centrality values by + __2 / ((n - 1) * (n - 2))__ for undirected Graphs, and + __1 / ((n - 1) * (n - 2))__ for directed Graphs + where n is the number of nodes in G. + Normalization will ensure that values are in [0, 1], + this normalization scales for the highest possible value where one + node is crossed by every single shortest path. 
+ + weight : (dask)cudf.DataFrame, optional (default=None) + Specifies the weights to be used for each edge. + Should contain a mapping between + edges and weights. + (Not Supported) endpoints : bool, optional (default=False) If true, include the endpoints in the shortest path counts. @@ -184,6 +221,12 @@ def betweenness_centrality( ) warnings.warn(warning_msg, UserWarning) + if weight is not None: + raise NotImplementedError( + "weighted implementation of betweenness " + "centrality not currently supported" + ) + if not isinstance(k, (dask_cudf.DataFrame, dask_cudf.Series)): if isinstance(k, (cudf.DataFrame, cudf.Series, list)): if isinstance(k, list): @@ -216,17 +259,187 @@ def betweenness_centrality( client = get_client() ddf = _mg_call_plc_betweenness_centrality( - input_graph, - client, - Comms.get_session_id(), - k, - random_state, - normalized, - endpoints, - do_expensive_check, + input_graph=input_graph, + client=client, + sID=Comms.get_session_id(), + k=k, + random_state=random_state, + normalized=normalized, + endpoints=endpoints, + do_expensive_check=do_expensive_check, ) if input_graph.renumbered: return input_graph.unrenumber(ddf, "vertex") return ddf + + +def edge_betweenness_centrality( + input_graph, + k: Union[ + int, list, cudf.Series, cudf.DataFrame, dask_cudf.Series, dask_cudf.DataFrame + ] = None, + normalized: bool = True, + weight: cudf.DataFrame = None, + random_state: int = None, +) -> dask_cudf.DataFrame: + """ + Compute the edge betweenness centrality for all edges of the graph G. + Betweenness centrality is a measure of the number of shortest paths + that pass over an edge. An edge with a high betweenness centrality + score has more paths passing over it and is therefore believed to be + more important. + + To improve performance. rather than doing an all-pair shortest path, + a sample of k starting vertices can be used. + + CuGraph does not currently support the 'weight' parameter. 
+ + Parameters + ---------- + input_graph: cuGraph.Graph + The graph can be either directed (Graph(directed=True)) or undirected. + The current implementation uses a parallel variation of the Brandes + Algorithm (2001) to compute exact or approximate betweenness. + If weights are provided in the edgelist, they will not be used. + + k : int, list or (dask)cudf object or None, optional (default=None) + If k is not None, use k node samples to estimate betweenness. Higher + values give better approximation. If k is either a list, a cudf DataFrame, + or a dask_cudf DataFrame, then its contents are assumed to be vertex + identifiers to be used for estimation. If k is None (the default), all the + vertices are used to estimate betweenness. Vertices obtained through + sampling or defined as a list will be used as sources for traversals inside + the algorithm. + + normalized : bool, optional (default=True) + If True, normalize the resulting betweenness centrality values by + __2 / (n * (n - 1))__ for undirected Graphs, and + __1 / (n * (n - 1))__ for directed Graphs + where n is the number of nodes in G. + Normalization will ensure that values are in [0, 1], + this normalization scales for the highest possible value where one + edge is crossed by every single shortest path. + + weight : (dask)cudf.DataFrame, optional (default=None) + Specifies the weights to be used for each edge. + Should contain a mapping between + edges and weights. + (Not Supported) + + random_state : int, optional (default=None) + If k is specified and k is an integer, use random_state to initialize the + random number generator. + Using None defaults to a hash of process id, time, and hostname. + If k is either None or list or cudf objects: random_state parameter is + ignored. + + Returns + ------- + betweenness_centrality : dask_cudf.DataFrame + GPU distributed data frame holding, for each edge, its source and + destination vertex identifiers and its betweenness centrality value. 
+ + ddf['src'] : dask_cudf.Series + Contains the vertex identifiers of the source of each edge + + ddf['dst'] : dask_cudf.Series + Contains the vertex identifiers of the destination of each edge + + ddf['betweenness_centrality'] : dask_cudf.Series + Contains the betweenness centrality of edges + + ddf["edge_id"] : dask_cudf.Series + Contains the edge ids of edges if present. + + Examples + -------- + >>> import cugraph.dask as dcg + >>> import dask_cudf + >>> # ... Init a DASK Cluster + >>> # see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html + >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/.. + >>> chunksize = dcg.get_chunksize(datasets_path / "karate.csv") + >>> ddf = dask_cudf.read_csv(datasets_path / "karate.csv", + ... chunksize=chunksize, delimiter=" ", + ... names=["src", "dst", "value"], + ... dtype=["int32", "int32", "float32"]) + >>> dg = cugraph.Graph(directed=True) + >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst') + >>> pr = dcg.edge_betweenness_centrality(dg) + + """ + + if input_graph.store_transposed is True: + warning_msg = ( + "Betweenness centrality expects the 'store_transposed' flag " + "to be set to 'False' for optimal performance during " + "the graph creation" + ) + warnings.warn(warning_msg, UserWarning) + + if weight is not None: + raise NotImplementedError( + "weighted implementation of edge betweenness " + "centrality not currently supported" + ) + + if not isinstance(k, (dask_cudf.DataFrame, dask_cudf.Series)): + if isinstance(k, (cudf.DataFrame, cudf.Series, list)): + if isinstance(k, list): + k_dtype = input_graph.nodes().dtype + k = cudf.Series(k, dtype=k_dtype) + + if isinstance(k, (cudf.Series, cudf.DataFrame)): + splits = cp.array_split(cp.arange(len(k)), len(Comms.get_workers())) + k = {w: [k.iloc[splits[i]]] for i, w in enumerate(Comms.get_workers())} + + else: + if k is not None: + k = get_distributed_data(k) + wait(k) + k = k.worker_to_parts + + if 
input_graph.renumbered: + if isinstance(k, dask_cudf.DataFrame): + tmp_col_names = k.columns + + elif isinstance(k, dask_cudf.Series): + tmp_col_names = None + + if isinstance(k, (dask_cudf.DataFrame, dask_cudf.Series)): + k = input_graph.lookup_internal_vertex_id(k, tmp_col_names) + + # FIXME: should we add this parameter as an option? + do_expensive_check = False + + client = get_client() + + ddf = _mg_call_plc_betweenness_centrality( + input_graph=input_graph, + client=client, + sID=Comms.get_session_id(), + k=k, + random_state=random_state, + normalized=normalized, + do_expensive_check=do_expensive_check, + edge_bc=True, + ) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(input_graph.unrenumber(ddf, "src"), "dst") + + if input_graph.is_directed() is False: + # swap the src and dst vertices for the lower triangle only. Because + # this is a symmetrized graph, this operation results in a df with + # multiple src/dst entries. + ddf["src"], ddf["dst"] = ddf[["src", "dst"]].min(axis=1), ddf[ + ["src", "dst"] + ].max(axis=1) + # overwrite the df with the sum of the values for all alike src/dst + # vertex pairs, resulting in half the edges of the original df from the + # symmetrized graph. 
+ ddf = ddf.groupby(by=["src", "dst"]).sum().reset_index() + + return ddf diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index c0efb425b75..ae2c57f5ef3 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -201,14 +201,12 @@ def __from_edgelist( value_col_names = [self.edgeWeightCol] elif len(edge_attr) == 3: weight_col, id_col, type_col = edge_attr - input_ddf = input_ddf.rename( - columns={ - weight_col: self.edgeWeightCol, - id_col: self.edgeIdCol, - type_col: self.edgeTypeCol, - } - ) - + input_ddf = input_ddf[ddf_columns + [weight_col, id_col, type_col]] + input_ddf.columns = ddf_columns + [ + self.edgeWeightCol, + self.edgeIdCol, + self.edgeTypeCol, + ] value_col_names = [self.edgeWeightCol, self.edgeIdCol, self.edgeTypeCol] else: raise ValueError("Only 1 or 3 values may be provided" "for edge_attr") diff --git a/python/cugraph/cugraph/structure/symmetrize.py b/python/cugraph/cugraph/structure/symmetrize.py index 4b159b279c0..15011fa8dbc 100644 --- a/python/cugraph/cugraph/structure/symmetrize.py +++ b/python/cugraph/cugraph/structure/symmetrize.py @@ -230,6 +230,9 @@ def symmetrize( """ + if "edge_id" in input_df.columns and symmetrize: + raise ValueError("Edge IDs are not supported on undirected graphs") + csg.null_check(input_df[source_col_name]) csg.null_check(input_df[dest_col_name]) diff --git a/python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py index 6b30d9fcb2b..dedf85a034b 100644 --- a/python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py @@ -18,7 +18,7 @@ from 
cugraph.dask.common.mg_utils import is_single_gpu -from cugraph.experimental.datasets import karate +from cugraph.experimental.datasets import karate, netscience # Get parameters from standard betwenness_centrality_test # As tests directory is not a module, we need to add it to the path @@ -29,7 +29,6 @@ NORMALIZED_OPTIONS, DEFAULT_EPSILON, SUBSET_SIZE_OPTIONS, - SUBSET_SEED_OPTIONS, ) from test_edge_betweenness_centrality import ( @@ -40,11 +39,11 @@ # ============================================================================= # Parameters # ============================================================================= -DATASETS = [karate] +DATASETS = [karate, netscience] # FIXME: The "preset_gpu_count" from 21.08 and below are not supported and have # been removed -RESULT_DTYPE_OPTIONS = [np.float64] +RESULT_DTYPE_OPTIONS = [np.float32, np.float64] # ============================================================================= @@ -54,6 +53,7 @@ def setup_function(): gc.collect() +# FIXME: Fails for directed = False(bc score twice as much) and normalized = True. 
@pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") @pytest.mark.parametrize( @@ -62,16 +62,12 @@ def setup_function(): @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) -@pytest.mark.parametrize("weight", [None]) -@pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) def test_mg_edge_betweenness_centrality( graph_file, directed, subset_size, normalized, - weight, - subset_seed, result_dtype, dask_client, ): @@ -80,8 +76,8 @@ def test_mg_edge_betweenness_centrality( directed=directed, normalized=normalized, k=subset_size, - weight=weight, - seed=subset_seed, + weight=None, + seed=42, result_dtype=result_dtype, multi_gpu_batch=True, ) diff --git a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py index 759ed01a7eb..c9e31e804d4 100644 --- a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py +++ b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py @@ -313,7 +313,7 @@ def compare_scores(sorted_df, first_key, second_key, epsilon=DEFAULT_EPSILON): @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) -def test_betweenness_centrality_0( +def test_betweenness_centrality( graph_file, directed, subset_size, diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality.py b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality.py index 0717925216a..12e9dd4c0a5 100644 --- a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality.py +++ 
b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality.py @@ -45,7 +45,6 @@ DEFAULT_EPSILON = 0.0001 SUBSET_SIZE_OPTIONS = [4, None] -SUBSET_SEED_OPTIONS = [42] # NOTE: The following is not really being exploited in the tests as the # datasets that are used are too small to compare, but it ensures that both @@ -157,6 +156,14 @@ def calc_edge_betweenness_centrality( return sorted_df +def _rescale_e(betweenness, num_nodes, k): + + for e in betweenness: + betweenness[e] *= num_nodes / k + + return betweenness + + def _calc_bc_subset(G, Gnx, normalized, weight, k, seed, result_dtype): # NOTE: Networkx API does not allow passing a list of vertices # And the sampling is operated on Gnx.nodes() directly @@ -180,6 +187,10 @@ def _calc_bc_subset(G, Gnx, normalized, weight, k, seed, result_dtype): Gnx, k=k, normalized=normalized, weight=weight, seed=seed ) + if normalized or not Gnx.is_directed(): + if k is not None: + nx_bc_dict = _rescale_e(nx_bc_dict, len(Gnx.nodes()), k) + nx_df = generate_nx_result(nx_bc_dict, type(Gnx) is nx.DiGraph).rename( columns={"betweenness_centrality": "ref_bc"}, copy=False ) @@ -200,9 +211,9 @@ def _calc_bc_subset_fixed(G, Gnx, normalized, weight, k, seed, result_dtype): # In the fixed set we compare cu_bc against itself as we random.seed(seed) # on the same seed and then sample on the number of vertices themselves if seed is None: - seed = 123 # random.seed(None) uses time, but we want same sources - random.seed(seed) # It will be called again in cugraph's call - sources = random.sample(range(G.number_of_vertices()), k) + seed = 123 # We want the same sources so we use the same seed when + # randomly selecting vertices both below and internally(plc) + sources = G.select_random_vertices(seed, k) if G.renumbered: sources_df = cudf.DataFrame({"src": sources}) @@ -316,7 +327,6 @@ def generate_upper_triangle(dataframe): @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", 
NORMALIZED_OPTIONS) @pytest.mark.parametrize("weight", [None]) -@pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) def test_edge_betweenness_centrality( @@ -325,7 +335,6 @@ def test_edge_betweenness_centrality( subset_size, normalized, weight, - subset_seed, result_dtype, edgevals, ): @@ -335,7 +344,7 @@ def test_edge_betweenness_centrality( normalized=normalized, k=subset_size, weight=weight, - seed=subset_seed, + seed=42, result_dtype=result_dtype, edgevals=edgevals, ) @@ -348,18 +357,15 @@ def test_edge_betweenness_centrality( @pytest.mark.parametrize("subset_size", [None]) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("weight", [None]) -@pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("use_k_full", [True]) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) -@pytest.mark.skip(reason="Skipping large tests") def test_edge_betweenness_centrality_k_full( graph_file, directed, subset_size, normalized, weight, - subset_seed, result_dtype, use_k_full, edgevals, @@ -372,7 +378,7 @@ def test_edge_betweenness_centrality_k_full( normalized=normalized, k=subset_size, weight=weight, - seed=subset_seed, + seed=42, result_dtype=result_dtype, use_k_full=use_k_full, edgevals=edgevals, @@ -390,17 +396,14 @@ def test_edge_betweenness_centrality_k_full( @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("weight", [None]) -@pytest.mark.parametrize("subset_seed", [None]) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) -@pytest.mark.skip(reason="Skipping large tests") def test_edge_betweenness_centrality_fixed_sample( graph_file, directed, 
subset_size, normalized, weight, - subset_seed, result_dtype, edgevals, ): @@ -414,7 +417,7 @@ def test_edge_betweenness_centrality_fixed_sample( k=subset_size, normalized=normalized, weight=weight, - seed=subset_seed, + seed=None, result_dtype=result_dtype, edgevals=edgevals, ) @@ -427,17 +430,14 @@ def test_edge_betweenness_centrality_fixed_sample( @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("weight", [[]]) -@pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) -@pytest.mark.skip(reason="Skipping large tests") def test_edge_betweenness_centrality_weight_except( graph_file, directed, subset_size, normalized, weight, - subset_seed, result_dtype, edgevals, ): @@ -453,7 +453,7 @@ def test_edge_betweenness_centrality_weight_except( k=subset_size, normalized=normalized, weight=weight, - seed=subset_seed, + seed=42, result_dtype=result_dtype, edgevals=edgevals, ) @@ -466,7 +466,6 @@ def test_edge_betweenness_centrality_weight_except( @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("weight", [None]) -@pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", [str]) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) def test_edge_betweenness_invalid_dtype( @@ -475,7 +474,6 @@ def test_edge_betweenness_invalid_dtype( subset_size, normalized, weight, - subset_seed, result_dtype, edgevals, ): @@ -488,7 +486,7 @@ def test_edge_betweenness_invalid_dtype( k=subset_size, normalized=normalized, weight=weight, - seed=subset_seed, + seed=42, result_dtype=result_dtype, edgevals=edgevals, ) @@ -499,13 +497,14 @@ def test_edge_betweenness_invalid_dtype( @pytest.mark.parametrize("graph_file", DATASETS_SMALL) 
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) -def test_edge_betweenness_centrality_nx(graph_file, directed, edgevals): +@pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) +def test_edge_betweenness_centrality_nx(graph_file, directed, edgevals, normalized): dataset_path = graph_file.get_path() Gnx = utils.generate_nx_graph_from_file(dataset_path, directed, edgevals) assert nx.is_directed(Gnx) == directed - nx_bc = nx.edge_betweenness_centrality(Gnx) - cu_bc = cugraph.edge_betweenness_centrality(Gnx) + nx_bc = nx.edge_betweenness_centrality(Gnx, normalized=normalized) + cu_bc = cugraph.edge_betweenness_centrality(Gnx, normalized=normalized) # Calculating mismatch networkx_bc = sorted(nx_bc.items(), key=lambda x: x[0]) @@ -519,6 +518,11 @@ def test_edge_betweenness_centrality_nx(graph_file, directed, edgevals): and cugraph_bc[i][0] == networkx_bc[i][0] ): err = err + 1 - print(f"{cugraph_bc[i][1]} and {cugraph_bc[i][1]}") + print( + "type c_bc = ", + type(cugraph_bc[i][1]), + " type nx_bc = ", + type(networkx_bc[i][1]), + ) print("Mismatches:", err) assert err < (0.01 * len(cugraph_bc)) diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py new file mode 100644 index 00000000000..aa41f8e1c82 --- /dev/null +++ b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py @@ -0,0 +1,231 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import pytest + +import dask_cudf +from pylibcugraph.testing.utils import gen_fixture_params_product +from cugraph.experimental.datasets import DATASETS_UNDIRECTED, email_Eu_core + +import cugraph +import cugraph.dask as dcg + +# from cugraph.dask.common.mg_utils import is_single_gpu + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def setup_function(): + gc.collect() + + +IS_DIRECTED = [True, False] +INCLUDE_WEIGHTS = [False, True] +INCLUDE_EDGE_IDS = [False, True] +NORMALIZED_OPTIONS = [False, True] +SUBSET_SIZE_OPTIONS = [4, None] + + +# email_Eu_core is too expensive to test +datasets = DATASETS_UNDIRECTED + [email_Eu_core] + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= + + +fixture_params = gen_fixture_params_product( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), + (INCLUDE_WEIGHTS, "include_weights"), + (INCLUDE_EDGE_IDS, "include_edgeids"), + (NORMALIZED_OPTIONS, "normalized"), + (SUBSET_SIZE_OPTIONS, "subset_size"), +) + + +@pytest.fixture(scope="module", params=fixture_params) +def input_combo(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. 
+ """ + parameters = dict( + zip( + ( + "graph_file", + "directed", + "include_weights", + "include_edge_ids", + "normalized", + "subset_size", + "subset_seed", + ), + request.param, + ) + ) + + return parameters + + +@pytest.fixture(scope="module") +def input_expected_output(input_combo): + """ + This fixture returns the inputs and expected results from the edge + betweenness centrality algo. + (based on cuGraph edge betweenness centrality) which can be used + for validation. + """ + directed = input_combo["directed"] + normalized = input_combo["normalized"] + k = input_combo["subset_size"] + subset_seed = 42 + edge_ids = input_combo["include_edge_ids"] + weight = input_combo["include_weights"] + + df = input_combo["graph_file"].get_edgelist() + if edge_ids: + if not directed: + # Edge ids not supported for undirected graph + return + dtype = df.dtypes[0] + edge_id = "edge_id" + df["edge_id"] = df.index + df = df.astype(dtype) + + else: + edge_id = None + + G = cugraph.Graph(directed=directed) + G.from_cudf_edgelist( + df, source="src", destination="dst", weight="wgt", edge_id=edge_id + ) + if isinstance(k, int): + k = G.select_random_vertices(subset_seed, k) + + input_combo["k"] = k + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ sg_cugraph_edge_bc = ( + cugraph.edge_betweenness_centrality(G, k, normalized) + .sort_values(["src", "dst"]) + .reset_index(drop=True) + ) + + input_data_path = input_combo["graph_file"].get_path() + + input_combo["sg_cugraph_results"] = sg_cugraph_edge_bc + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + if weight: + weight = ddf + else: + weight = None + + if edge_ids: + dtype = ddf.dtypes[0] + edge_id = "edge_id" + ddf = ddf.assign(idx=1) + ddf["edge_id"] = ddf.idx.cumsum().astype(dtype) - 1 + else: + edge_id = None + + dg = cugraph.Graph(directed=directed) + + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + weight="value", + edge_id=edge_id, + renumber=True, + ) + + input_combo["MGGraph"] = dg + input_combo["include_weights"] = weight + + return input_combo + + +# ============================================================================= +# Tests +# ============================================================================= + + +# @pytest.mark.skipif( +# is_single_gpu(), reason="skipping MG testing on Single GPU system" +# ) +@pytest.mark.mg +def test_dask_edge_betweenness_centrality( + dask_client, benchmark, input_expected_output +): + if input_expected_output is not None: + dg = input_expected_output["MGGraph"] + k = input_expected_output["k"] + normalized = input_expected_output["normalized"] + weight = input_expected_output["include_weights"] + if weight is not None: + with pytest.raises(NotImplementedError): + result_edge_bc = benchmark( + dcg.edge_betweenness_centrality, dg, k, normalized, weight=weight + ) + + else: + result_edge_bc = benchmark( + dcg.edge_betweenness_centrality, dg, k, normalized, weight=weight + ) + result_edge_bc = ( + result_edge_bc.compute() + .sort_values(["src", "dst"]) + .reset_index(drop=True) + 
.rename(columns={"betweenness_centrality": "mg_betweenness_centrality"}) + ) + + if len(result_edge_bc.columns) > 3: + result_edge_bc = result_edge_bc.rename( + columns={"edge_id": "mg_edge_id"} + ) + + expected_output = input_expected_output["sg_cugraph_results"].reset_index( + drop=True + ) + result_edge_bc["betweenness_centrality"] = expected_output[ + "betweenness_centrality" + ] + if len(expected_output.columns) > 3: + result_edge_bc["edge_id"] = expected_output["edge_id"] + edge_id_diff = result_edge_bc.query("mg_edge_id != edge_id") + assert len(edge_id_diff) == 0 + + edge_bc_diffs1 = result_edge_bc.query( + "mg_betweenness_centrality - betweenness_centrality > 0.01" + ) + edge_bc_diffs2 = result_edge_bc.query( + "betweenness_centrality - mg_betweenness_centrality > 0.01" + ) + + assert len(edge_bc_diffs1) == 0 + assert len(edge_bc_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index 02219002a7e..1b883b91e92 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -774,9 +774,12 @@ def test_create_graph_with_edge_ids(graph_file): edge_attr=["2", "id", "etype"], ) - H = G.to_undirected() assert G.is_directed() - assert not H.is_directed() + + # 'edge_ids are not supported for undirected graph" + with pytest.raises(ValueError): + G.to_undirected() + # assert not H.is_directed() @pytest.mark.sg diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 6a09c3de0da..2f7e63b5c55 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -25,6 +25,7 @@ set(cython_sources bfs.pyx core_number.pyx ecg.pyx + edge_betweenness_centrality.pyx egonet.pyx eigenvector_centrality.pyx generate_rmat_edgelist.pyx diff --git a/python/pylibcugraph/pylibcugraph/__init__.py 
b/python/pylibcugraph/pylibcugraph/__init__.py index c39075ce3fb..6f99d128938 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -81,6 +81,8 @@ from pylibcugraph.select_random_vertices import select_random_vertices +from pylibcugraph.edge_betweenness_centrality import edge_betweenness_centrality + from pylibcugraph.generate_rmat_edgelist import generate_rmat_edgelist from pylibcugraph.generate_rmat_edgelists import generate_rmat_edgelists diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/centrality_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/centrality_algorithms.pxd index 6cd02ed6f17..532df624c99 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/centrality_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/centrality_algorithms.pxd @@ -212,3 +212,45 @@ cdef extern from "cugraph_c/centrality_algorithms.h": cugraph_centrality_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # edge betweenness centrality + + ctypedef struct cugraph_edge_centrality_result_t: + pass + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edge_centrality_result_get_src_vertices( + cugraph_edge_centrality_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edge_centrality_result_get_dst_vertices( + cugraph_edge_centrality_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edge_centrality_result_get_edge_ids( + cugraph_edge_centrality_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edge_centrality_result_get_values( + cugraph_edge_centrality_result_t* result + ) + + cdef void \ + cugraph_edge_centrality_result_free( + cugraph_edge_centrality_result_t* result + ) + + cdef cugraph_error_code_t \ + cugraph_edge_betweenness_centrality( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + 
const cugraph_type_erased_device_array_view_t* vertex_list, + bool_t normalized, + bool_t do_expensive_check, + cugraph_edge_centrality_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/edge_betweenness_centrality.pyx b/python/pylibcugraph/pylibcugraph/edge_betweenness_centrality.pyx new file mode 100644 index 00000000000..c88c9fe8a67 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/edge_betweenness_centrality.pyx @@ -0,0 +1,197 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.centrality_algorithms cimport (
+    cugraph_edge_centrality_result_t,
+    cugraph_edge_betweenness_centrality,
+    cugraph_edge_centrality_result_get_src_vertices,
+    cugraph_edge_centrality_result_get_dst_vertices,
+    cugraph_edge_centrality_result_get_values,
+    cugraph_edge_centrality_result_get_edge_ids,
+    cugraph_edge_centrality_result_free,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+    create_cugraph_type_erased_device_array_view_from_py_obj,
+)
+from pylibcugraph.select_random_vertices import (
+    select_random_vertices
+)
+
+
+def edge_betweenness_centrality(ResourceHandle resource_handle,
+                                _GPUGraph graph,
+                                k,
+                                random_state,
+                                bool_t normalized,
+                                bool_t do_expensive_check):
+    """
+    Compute the edge betweenness centrality for all edges of the graph G.
+    Betweenness centrality is a measure of the number of shortest paths
+    that pass over an edge.  An edge with a high betweenness centrality
+    score has more paths passing over it and is therefore believed to be
+    more important.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    k : int or device array type or None
+        If k is an int, k vertices are selected at random (see random_state)
+        and used to estimate the edge betweenness.  Higher values give better
+        approximation.  If k is a device array type, the contents are assumed
+        to be vertex identifiers to be used for estimation.  If k is None, all
+        the vertices are used to estimate the edge betweenness.  Vertices
+        obtained through sampling or defined as a list will be used as sources
+        for traversals inside the algorithm.
+
+    random_state : int or None
+        If k is an integer, random_state is forwarded to
+        select_random_vertices() to initialize the random number generator.
+        Using None defaults to a hash of process id, time, and hostname.
+        If k is either None or a device array, random_state is ignored.
+
+    normalized : bool_t
+        Normalization will ensure that values are in [0, 1].
+
+    do_expensive_check : bool_t
+        A flag to run expensive checks for input arguments if True.
+
+    Returns
+    -------
+    A tuple of four items (sources, destinations, scores, edge_ids).  The
+    first three are device arrays holding, for each edge, its source vertex,
+    its destination vertex and its edge betweenness centrality score.  The
+    fourth is a device array of edge ids, or None if the graph was created
+    without edge ids.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5],
+    ...                     dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4],
+    ...                     dtype=numpy.int32)
+    >>> edge_ids = cupy.asarray(
+    ...     [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    ...     dtype=numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, store_transposed=False,
+    ...     renumber=False, do_expensive_check=False, edge_id_array=edge_ids)
+    >>> (srcs, dsts, values, edge_ids) = pylibcugraph.edge_betweenness_centrality(
+    ...     resource_handle, G, None, None, True, False)
+    >>> srcs
+    [0 0 1 1 1 1 2 2 2 3 3 3 4 4 5 5]
+    >>> dsts
+    [1 2 0 2 3 4 0 1 3 1 2 5 1 5 3 4]
+    >>> values
+    [0.10555556 0.06111111 0.10555556 0.06666667 0.09444445 0.14444445
+     0.06111111 0.06666667 0.09444445 0.09444445 0.09444445 0.12222222
+     0.14444445 0.07777778 0.12222222 0.07777778]
+    >>> edge_ids
+    [ 0 11  8 12  1  2  3  4  5  9 13  6 10  7 14 15]
+
+    """
+
+    if isinstance(k, int):
+        # randomly select vertices
+
+        # 'select_random_vertices' internally creates a
+        # 'pylibcugraph.random.CuGraphRandomState'
+        vertex_list = select_random_vertices(
+            resource_handle, graph, random_state, k)
+    else:
+        # FIXME: Add CAPI check ensuring that k is a cuda array interface
+        vertex_list = k
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_edge_centrality_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    # Views into the result; declared here because cdef statements are not
+    # allowed inside the try block below.
+    cdef cugraph_type_erased_device_array_view_t* src_ptr
+    cdef cugraph_type_erased_device_array_view_t* dst_ptr
+    cdef cugraph_type_erased_device_array_view_t* values_ptr
+    cdef cugraph_type_erased_device_array_view_t* edge_ids_ptr
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        vertex_list_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                vertex_list)
+
+    error_code = cugraph_edge_betweenness_centrality(c_resource_handle_ptr,
+                                                     c_graph_ptr,
+                                                     vertex_list_view_ptr,
+                                                     normalized,
+                                                     do_expensive_check,
+                                                     &result_ptr,
+                                                     &error_ptr)
+
+    # The vertex list view is not referenced again after the call above, so
+    # release it before assert_success() gets a chance to raise; otherwise a
+    # failed call would leak it.  The guard matches how graphs.pyx frees its
+    # optional views.
+    if vertex_list_view_ptr is not NULL:
+        cugraph_type_erased_device_array_view_free(vertex_list_view_ptr)
+
+    assert_success(error_code, error_ptr, "cugraph_edge_betweenness_centrality")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.  The result is freed even if a copy raises.
+    try:
+        src_ptr = cugraph_edge_centrality_result_get_src_vertices(result_ptr)
+        dst_ptr = cugraph_edge_centrality_result_get_dst_vertices(result_ptr)
+        values_ptr = cugraph_edge_centrality_result_get_values(result_ptr)
+
+        # NOTE(review): graphs.pyx frees edge_id_view_ptr once the graph has
+        # been built without resetting it, so this pointer may only be
+        # compared against NULL here and must never be dereferenced.
+        if graph.edge_id_view_ptr is NULL:
+            cupy_edge_ids = None
+        else:
+            edge_ids_ptr = \
+                cugraph_edge_centrality_result_get_edge_ids(result_ptr)
+            cupy_edge_ids = copy_to_cupy_array(
+                c_resource_handle_ptr, edge_ids_ptr)
+
+        cupy_src_vertices = copy_to_cupy_array(c_resource_handle_ptr, src_ptr)
+        cupy_dst_vertices = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr)
+        cupy_values = copy_to_cupy_array(c_resource_handle_ptr, values_ptr)
+    finally:
+        cugraph_edge_centrality_result_free(result_ptr)
+
+    return (cupy_src_vertices, cupy_dst_vertices, cupy_values, cupy_edge_ids)
diff --git a/python/pylibcugraph/pylibcugraph/graphs.pxd b/python/pylibcugraph/pylibcugraph/graphs.pxd
index e468738f529..4e52ed557ed 100644
--- a/python/pylibcugraph/pylibcugraph/graphs.pxd
+++ b/python/pylibcugraph/pylibcugraph/graphs.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at @@ -16,6 +16,7 @@ from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, + cugraph_type_erased_device_array_view_t, ) @@ -23,6 +24,7 @@ from pylibcugraph._cugraph_c.graph cimport ( # This is not visible in python cdef class _GPUGraph: cdef cugraph_graph_t* c_graph_ptr + cdef cugraph_type_erased_device_array_view_t* edge_id_view_ptr cdef class SGGraph(_GPUGraph): pass diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index 49b9747f0b3..fb4692bf3a8 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -171,8 +171,8 @@ cdef class SGGraph(_GPUGraph): weight_array ) - cdef cugraph_type_erased_device_array_view_t* edge_id_view_ptr = \ - create_cugraph_type_erased_device_array_view_from_py_obj( + + self.edge_id_view_ptr = create_cugraph_type_erased_device_array_view_from_py_obj( edge_id_array ) @@ -188,7 +188,7 @@ cdef class SGGraph(_GPUGraph): srcs_or_offsets_view_ptr, dsts_or_indices_view_ptr, weights_view_ptr, - edge_id_view_ptr, + self.edge_id_view_ptr, edge_type_view_ptr, store_transposed, renumber, @@ -206,7 +206,7 @@ cdef class SGGraph(_GPUGraph): srcs_or_offsets_view_ptr, dsts_or_indices_view_ptr, weights_view_ptr, - edge_id_view_ptr, + self.edge_id_view_ptr, edge_type_view_ptr, store_transposed, renumber, @@ -225,8 +225,8 @@ cdef class SGGraph(_GPUGraph): cugraph_type_erased_device_array_view_free(srcs_or_offsets_view_ptr) cugraph_type_erased_device_array_view_free(dsts_or_indices_view_ptr) cugraph_type_erased_device_array_view_free(weights_view_ptr) - if edge_id_view_ptr is not NULL: - cugraph_type_erased_device_array_view_free(edge_id_view_ptr) + if self.edge_id_view_ptr is not NULL: + cugraph_type_erased_device_array_view_free(self.edge_id_view_ptr) if edge_type_view_ptr is not NULL: cugraph_type_erased_device_array_view_free(edge_type_view_ptr) @@ -341,7 +341,7 @@ cdef class MGGraph(_GPUGraph): 
create_cugraph_type_erased_device_array_view_from_py_obj( weight_array ) - cdef cugraph_type_erased_device_array_view_t* edge_id_view_ptr = \ + self.edge_id_view_ptr = \ create_cugraph_type_erased_device_array_view_from_py_obj( edge_id_array ) @@ -356,7 +356,7 @@ cdef class MGGraph(_GPUGraph): srcs_view_ptr, dsts_view_ptr, weights_view_ptr, - edge_id_view_ptr, + self.edge_id_view_ptr, edge_type_view_ptr, store_transposed, num_edges, @@ -370,8 +370,8 @@ cdef class MGGraph(_GPUGraph): cugraph_type_erased_device_array_view_free(srcs_view_ptr) cugraph_type_erased_device_array_view_free(dsts_view_ptr) cugraph_type_erased_device_array_view_free(weights_view_ptr) - if edge_id_view_ptr is not NULL: - cugraph_type_erased_device_array_view_free(edge_id_view_ptr) + if self.edge_id_view_ptr is not NULL: + cugraph_type_erased_device_array_view_free(self.edge_id_view_ptr) if edge_type_view_ptr is not NULL: cugraph_type_erased_device_array_view_free(edge_type_view_ptr) diff --git a/python/pylibcugraph/pylibcugraph/tests/test_edge_betweenness_centrality.py b/python/pylibcugraph/pylibcugraph/tests/test_edge_betweenness_centrality.py new file mode 100644 index 00000000000..fa92147842c --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/tests/test_edge_betweenness_centrality.py @@ -0,0 +1,145 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pytest
+import cupy as cp
+import numpy as np
+from pylibcugraph import (
+    ResourceHandle,
+    GraphProperties,
+    SGGraph,
+    edge_betweenness_centrality,
+)
+from pylibcugraph.testing import utils
+
+
+TOY = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "toy_graph.csv"
+
+
+# =============================================================================
+# Test helpers
+# =============================================================================
+def _get_param_args(param_name, param_values):
+    """
+    Returns a tuple of (<param_name>, <pytest.param list>) which can be applied
+    as the args to pytest.mark.parametrize(). The pytest.param list also
+    contains param id string formed from the param name and values.
+    """
+    return (param_name, [pytest.param(v, id=f"{param_name}={v}") for v in param_values])
+
+
+def _generic_edge_betweenness_centrality_test(
+    src_arr,
+    dst_arr,
+    edge_id_arr,
+    result_score_arr,
+    result_edge_id_arr,
+    num_edges,
+    store_transposed,
+    k,
+    random_state,
+    normalized,
+):
+    """
+    Builds a graph from the input arrays and runs edge bc using the other args,
+    similar to how edge bc is tested in libcugraph.
+
+    The results are compared positionally against result_score_arr and
+    result_edge_id_arr, so both must be given in the order in which the
+    algorithm returns the edges (which is not the order of src_arr/dst_arr).
+    """
+    resource_handle = ResourceHandle()
+    graph_props = GraphProperties(is_symmetric=False, is_multigraph=False)
+    G = SGGraph(
+        resource_handle,
+        graph_props,
+        src_arr,
+        dst_arr,
+        store_transposed=store_transposed,
+        renumber=False,
+        do_expensive_check=True,
+        edge_id_array=edge_id_arr,
+    )
+
+    (srcs, dsts, values, edge_ids) = edge_betweenness_centrality(
+        resource_handle, G, k, random_state, normalized, do_expensive_check=False
+    )
+
+    result_score_arr = result_score_arr.get()
+    result_edge_id_arr = result_edge_id_arr.get()
+    # Keep the returned endpoints as well: the output is reordered relative to
+    # the input, so failure messages must name the returned edge rather than
+    # src_arr[idx]/dst_arr[idx].
+    srcs = srcs.get()
+    dsts = dsts.get()
+    centralities = values.get()
+    edge_ids = edge_ids.get()
+
+    # Guard the positional comparison below against missing or extra edges.
+    assert len(centralities) == num_edges, (
+        f"Expected {num_edges} centrality scores, got {len(centralities)}"
+    )
+    assert len(edge_ids) == num_edges, (
+        f"Expected {num_edges} edge ids, got {len(edge_ids)}"
+    )
+
+    for idx in range(num_edges):
+        expected_result_score = result_score_arr[idx]
+        actual_result_score = centralities[idx]
+
+        expected_result_edge_id = result_edge_id_arr[idx]
+        actual_result_edge_id = edge_ids[idx]
+
+        assert pytest.approx(expected_result_score, 1e-4) == actual_result_score, (
+            f"Edge {srcs[idx]} {dsts[idx]} has centrality {actual_result_score},"
+            f" should have been {expected_result_score}"
+        )
+
+        # Edge ids are integers: compare them exactly, not with a tolerance.
+        assert expected_result_edge_id == actual_result_edge_id, (
+            f"Edge {srcs[idx]} {dsts[idx]} has id {actual_result_edge_id},"
+            f" should have been {expected_result_edge_id}"
+        )
+
+
+def test_edge_betweenness_centrality():
+    """
+    Runs normalized edge bc over every vertex of the toy graph and checks the
+    scores and edge ids against hard-coded expected values.
+    """
+    num_edges = 16
+
+    graph_data = np.genfromtxt(TOY, delimiter=" ")
+    src = cp.asarray(graph_data[:, 0], dtype=np.int32)
+    dst = cp.asarray(graph_data[:, 1], dtype=np.int32)
+    edge_id = cp.array(
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], dtype=np.int32
+    )
+    # Expected values are listed in the order the algorithm returns the edges.
+    result_score = cp.asarray(
+        [
+            0.10555556,
+            0.06111111,
+            0.10555556,
+            0.06666667,
+            0.09444445,
+            0.14444445,
+            0.06111111,
+            0.06666667,
+            0.09444445,
+            0.09444445,
+            0.09444445,
+            0.12222222,
+            0.14444445,
+            0.07777778,
+            0.12222222,
+            0.07777778,
+        ],
+        dtype=np.float32,
+    )
+    result_edge_ids = cp.asarray([0, 11, 8, 12, 1, 2, 3, 4, 5, 9, 13, 6, 10, 7, 14, 15])
+
+    store_transposed = False
+    k = None
+    random_state = None
+    normalized = True
+
+    _generic_edge_betweenness_centrality_test(
+        src,
+        dst,
+        edge_id,
+        result_score,
+        result_edge_ids,
+        num_edges,
+        store_transposed,
+        k,
+        random_state,
+        normalized,
+    )